import commit
This commit is contained in:
commit
bccfe68192
39 changed files with 6837 additions and 0 deletions
512
meta_data_analysis/.Rhistory
Normal file
512
meta_data_analysis/.Rhistory
Normal file
|
@ -0,0 +1,512 @@
|
|||
, stringsAsFactors = F)
|
||||
x = as.numeric(grepl(i,raw_data$all_muts_pza))
|
||||
# DV: pyrazinamide 0 or 1
|
||||
y = as.numeric(raw_data$pyrazinamide)
|
||||
table(y,x)
|
||||
# run glm model
|
||||
model = glm(y ~ x, family = binomial)
|
||||
#model = glm(y ~ x, family = binomial(link = "logit"))
|
||||
summary(model)
|
||||
#**********
|
||||
# extract relevant model output
|
||||
#**********
|
||||
# extract log OR i.e the Beta estimate of the logistic model for a given snp
|
||||
my_logor = summary(model)$coefficients[2,1]
|
||||
print(paste0('Beta:', my_logor))
|
||||
# extract SE of the logistic model for a given snp
|
||||
my_se = summary(model)$coefficients[2,2]
|
||||
print(paste0('SE:', my_se))
|
||||
# extract Z of the logistic model for a given snp
|
||||
my_zval = summary(model)$coefficients[2,3]
|
||||
print(paste0('Z-value:', my_zval))
|
||||
# Derive OR i.e. exp(my_logor) from the logistic model for a given snp
|
||||
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
|
||||
my_or = exp(summary(model)$coefficients[2,1])
|
||||
print(paste0('OR:', my_or))
|
||||
# sanity check : should be True
|
||||
log(my_or) == my_logor
|
||||
# extract P-value of the logistic model for a given snp
|
||||
my_pval = summary(model)$coefficients[2,4]
|
||||
print(paste0('P-value:', my_pval))
|
||||
# extract confidence interval of snp (2 steps, since the output is a named number)
|
||||
ci_mod = exp(confint(model))[2,]
|
||||
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
|
||||
print(paste0('CI:', my_ci))
|
||||
#*************
|
||||
# Assign the regression output in the original df
|
||||
# you can use ('=' or '<-/->')
|
||||
#*************
|
||||
#pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i] = my_logor
|
||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i]
|
||||
my_logor
|
||||
pnca_snps_or$Mutationinformation == i
|
||||
View(pnca_snps_or)
|
||||
#===============
|
||||
# Step 4: Calculate for one snp
|
||||
# using i, when you run the loop, it is easy
|
||||
#===============
|
||||
i = "pnca_p.trp68gly"
|
||||
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
|
||||
, stringsAsFactors = F
|
||||
, header = T) #2133
|
||||
# uncomment as necessary
|
||||
pnca_snps_or = pnca_snps_or[1:5,]
|
||||
pnca_snps_or = pnca_snps_or[c(1:5),]
|
||||
#===============
|
||||
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
|
||||
, stringsAsFactors = F
|
||||
, header = T) #2133
|
||||
pnca_snps_or = pnca_snps_or[1:5,]
|
||||
pnca_snps_or = pnca_snps_or[c(1:5),]
|
||||
pnca_snps_or = pnca_snps_or[1:5]
|
||||
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
|
||||
, stringsAsFactors = F
|
||||
, header = T) #2133
|
||||
pnca_snps_or = pnca_snps_or[1:5]
|
||||
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
|
||||
, stringsAsFactors = F
|
||||
, header = T) #2133
|
||||
foo = pnca_snps_or[c(1:5,)]
|
||||
foo = pnca_snps_or[c(1:5),]
|
||||
foo = as.data.frame(pnca_snps_or[c(1:5),])
|
||||
View(foo)
|
||||
# create an empty dataframe
|
||||
pnca_snps_or = as.data.frame(pnca_snps_or[c(1:5),])
|
||||
# IV: corresponds to each unique snp (extracted using grep)
|
||||
x = as.numeric(grepl(i,raw_data$all_muts_pza))
|
||||
# DV: pyrazinamide 0 or 1
|
||||
y = as.numeric(raw_data$pyrazinamide)
|
||||
table(y,x)
|
||||
# run glm model
|
||||
model = glm(y ~ x, family = binomial)
|
||||
#model = glm(y ~ x, family = binomial(link = "logit"))
|
||||
summary(model)
|
||||
my_logor = summary(model)$coefficients[2,1]
|
||||
print(paste0('Beta:', my_logor))
|
||||
# extract SE of the logistic model for a given snp
|
||||
my_se = summary(model)$coefficients[2,2]
|
||||
print(paste0('SE:', my_se))
|
||||
# extract Z of the logistic model for a given snp
|
||||
my_zval = summary(model)$coefficients[2,3]
|
||||
print(paste0('Z-value:', my_zval))
|
||||
# Derive OR i.e. exp(my_logor) from the logistic model for a given snp
|
||||
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
|
||||
my_or = exp(summary(model)$coefficients[2,1])
|
||||
print(paste0('OR:', my_or))
|
||||
# sanity check : should be True
|
||||
log(my_or) == my_logor
|
||||
# extract P-value of the logistic model for a given snp
|
||||
my_pval = summary(model)$coefficients[2,4]
|
||||
print(paste0('P-value:', my_pval))
|
||||
# extract confidence interval of snp (2 steps, since the output is a named number)
|
||||
ci_mod = exp(confint(model))[2,]
|
||||
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
|
||||
print(paste0('CI:', my_ci))
|
||||
#*************
|
||||
# Assign the regression output in the original df
|
||||
# you can use ('=' or '<-/->')
|
||||
#*************
|
||||
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
|
||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
|
||||
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
|
||||
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
|
||||
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
|
||||
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
|
||||
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
|
||||
#===============
|
||||
# Step 4: Iterate through this unique list
|
||||
# and calculate OR, but only for one snp
|
||||
# this is a test before you apply it to all others
|
||||
#===============
|
||||
pnca_snps_or$mutation == i
|
||||
View(pnca_snps_or)
|
||||
# create an empty dataframe
|
||||
pnca_snps_or = data.frame(mutation = i)
|
||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
|
||||
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
|
||||
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
|
||||
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
|
||||
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
|
||||
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
|
||||
View(pnca_snps_or_copy)
|
||||
#===============
|
||||
# Step 4: Iterate through this unique list
|
||||
# and calculate OR, but only for one snp
|
||||
# this is a test before you apply it to all others
|
||||
#===============
|
||||
#reset original df so you don't make a mistake
|
||||
pnca_snps_or = pnca_snps_or_copy
|
||||
for (i in pnca_snps_unique){
|
||||
print(i)
|
||||
}
|
||||
pnca_snps_or = pnca_snps_or_copy #2133, 1
|
||||
#........................................
|
||||
# create an empty dataframe : uncomment as necessary
|
||||
pnca_snps_or = data.frame(mutation = c(i, "blank_mut")
|
||||
#........................................
|
||||
# create an empty dataframe : uncomment as necessary
|
||||
pnca_snps_or = data.frame(mutation = c(i, "blank_mut"))
|
||||
#........................................
|
||||
# create an empty dataframe : uncomment as necessary
|
||||
pnca_snps_or = data.frame(mutation = c(i, "blank_mut"))
|
||||
View(pnca_snps_or)
|
||||
# IV: corresponds to each unique snp (extracted using grep)
|
||||
x = as.numeric(grepl(i,raw_data$all_muts_pza))
|
||||
# DV: pyrazinamide 0 or 1
|
||||
y = as.numeric(raw_data$pyrazinamide)
|
||||
table(y,x)
|
||||
# run glm model
|
||||
model = glm(y ~ x, family = binomial)
|
||||
#model = glm(y ~ x, family = binomial(link = "logit"))
|
||||
summary(model)
|
||||
#**********
|
||||
# extract relevant model output
|
||||
#**********
|
||||
# extract log OR i.e the Beta estimate of the logistic model for a given snp
|
||||
my_logor = summary(model)$coefficients[2,1]
|
||||
print(paste0('Beta:', my_logor))
|
||||
# extract SE of the logistic model for a given snp
|
||||
my_se = summary(model)$coefficients[2,2]
|
||||
print(paste0('SE:', my_se))
|
||||
# extract Z of the logistic model for a given snp
|
||||
my_zval = summary(model)$coefficients[2,3]
|
||||
print(paste0('Z-value:', my_zval))
|
||||
# Derive OR i.e. exp(my_logor) from the logistic model for a given snp
|
||||
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
|
||||
my_or = exp(summary(model)$coefficients[2,1])
|
||||
print(paste0('OR:', my_or))
|
||||
# sanity check : should be True
|
||||
log(my_or) == my_logor
|
||||
# extract P-value of the logistic model for a given snp
|
||||
my_pval = summary(model)$coefficients[2,4]
|
||||
print(paste0('P-value:', my_pval))
|
||||
# extract confidence interval of snp (2 steps, since the output is a named number)
|
||||
ci_mod = exp(confint(model))[2,]
|
||||
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
|
||||
print(paste0('CI:', my_ci))
|
||||
#*************
|
||||
# Assign the regression output in the original df
|
||||
# you can use ('=' or '<-/->')
|
||||
#*************
|
||||
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
|
||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
|
||||
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
|
||||
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
|
||||
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
|
||||
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
|
||||
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
|
||||
View(pnca_snps_or)
|
||||
pnca_snps_or = pnca_snps_or_copy #2133, 1
|
||||
for (i in pnca_snps_unique){
|
||||
print(i)
|
||||
#*************
|
||||
# start logistic regression model building
|
||||
#*************
|
||||
# set the IV and DV for the logistic regression model
|
||||
# IV: corresponds to each unique snp (extracted using grep)
|
||||
x = as.numeric(grepl(i,raw_data$all_muts_pza))
|
||||
# DV: pyrazinamide 0 or 1
|
||||
y = as.numeric(raw_data$pyrazinamide)
|
||||
table(y,x)
|
||||
# run glm model
|
||||
model = glm(y ~ x, family = binomial)
|
||||
#model = glm(y ~ x, family = binomial(link = "logit"))
|
||||
summary(model)
|
||||
#**********
|
||||
# extract relevant model output
|
||||
#**********
|
||||
# extract log OR i.e the Beta estimate of the logistic model for a given snp
|
||||
my_logor = summary(model)$coefficients[2,1]
|
||||
print(paste0('Beta:', my_logor))
|
||||
# extract SE of the logistic model for a given snp
|
||||
my_se = summary(model)$coefficients[2,2]
|
||||
print(paste0('SE:', my_se))
|
||||
# extract Z of the logistic model for a given snp
|
||||
my_zval = summary(model)$coefficients[2,3]
|
||||
print(paste0('Z-value:', my_zval))
|
||||
# Derive OR i.e. exp(my_logor) from the logistic model for a given snp
|
||||
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
|
||||
my_or = exp(summary(model)$coefficients[2,1])
|
||||
print(paste0('OR:', my_or))
|
||||
# sanity check : should be True
|
||||
log(my_or) == my_logor
|
||||
# extract P-value of the logistic model for a given snp
|
||||
my_pval = summary(model)$coefficients[2,4]
|
||||
print(paste0('P-value:', my_pval))
|
||||
# extract confidence interval of snp (2 steps, since the output is a named number)
|
||||
ci_mod = exp(confint(model))[2,]
|
||||
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
|
||||
print(paste0('CI:', my_ci))
|
||||
#*************
|
||||
# Assign the regression output in the original df
|
||||
# you can use ('=' or '<-/->')
|
||||
#*************
|
||||
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
|
||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
|
||||
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
|
||||
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
|
||||
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
|
||||
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
|
||||
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
|
||||
}
|
||||
warnings()
|
||||
View(pnca_snps_or)
|
||||
View(pnca_snps_or_copy)
|
||||
#sanity check
|
||||
pnca_snps_or$mutation == i1
|
||||
#sanity check
|
||||
pnca_snps_or[pnca_snps_or$mutation == i1]
|
||||
pnca_snps_or[pnca_snps_or$mutation == i2]
|
||||
pnca_snps_or[pnca_snps_or$mutation == i2,]
|
||||
pnca_snps_or1 = unique(pnca_snps_or)
|
||||
write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv")
|
||||
# you only need it for the unique mutations
|
||||
pnca_snps_or = unique(pnca_snps_or) #2133, 1
|
||||
for (i in pnca_snps_unique){
|
||||
print(i)
|
||||
#*************
|
||||
# start logistic regression model building
|
||||
#*************
|
||||
# set the IV and DV for the logistic regression model
|
||||
# IV: corresponds to each unique snp (extracted using grep)
|
||||
x = as.numeric(grepl(i,raw_data$all_muts_pza))
|
||||
# DV: pyrazinamide 0 or 1
|
||||
y = as.numeric(raw_data$pyrazinamide)
|
||||
table(y,x)
|
||||
# run glm model
|
||||
model = glm(y ~ x, family = binomial)
|
||||
#model = glm(y ~ x, family = binomial(link = "logit"))
|
||||
summary(model)
|
||||
#**********
|
||||
# extract relevant model output
|
||||
#**********
|
||||
# extract log OR i.e the Beta estimate of the logistic model for a given snp
|
||||
my_logor = summary(model)$coefficients[2,1]
|
||||
print(paste0('Beta:', my_logor))
|
||||
# extract SE of the logistic model for a given snp
|
||||
my_se = summary(model)$coefficients[2,2]
|
||||
print(paste0('SE:', my_se))
|
||||
# extract Z of the logistic model for a given snp
|
||||
my_zval = summary(model)$coefficients[2,3]
|
||||
print(paste0('Z-value:', my_zval))
|
||||
# Derive OR i.e. exp(my_logor) from the logistic model for a given snp
|
||||
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
|
||||
my_or = exp(summary(model)$coefficients[2,1])
|
||||
print(paste0('OR:', my_or))
|
||||
# sanity check : should be True
|
||||
log(my_or) == my_logor
|
||||
# extract P-value of the logistic model for a given snp
|
||||
my_pval = summary(model)$coefficients[2,4]
|
||||
print(paste0('P-value:', my_pval))
|
||||
# extract confidence interval of snp (2 steps, since the output is a named number)
|
||||
ci_mod = exp(confint(model))[2,]
|
||||
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
|
||||
print(paste0('CI:', my_ci))
|
||||
#*************
|
||||
# Assign the regression output in the original df
|
||||
# you can use ('=' or '<-/->')
|
||||
#*************
|
||||
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
|
||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
|
||||
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
|
||||
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
|
||||
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
|
||||
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
|
||||
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
|
||||
}
|
||||
View(pnca_snps_or)
|
||||
2.290256e+01
|
||||
1.561132e+06
|
||||
3.242285e-04
|
||||
#sanity check
|
||||
pnca_snps_or[pnca_snps_or$mutation == i1]
|
||||
pnca_snps_or[pnca_snps_or$mutation == i2,]
|
||||
write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv")
|
||||
my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
|
||||
, stringsAsFactors = FALSE) #11374, 19
|
||||
View(my_data)
|
||||
# remove the first column
|
||||
my_data = my_data[-1] #11374, 18
|
||||
# check if first col is 'id': should be TRUE
|
||||
colnames(my_data)[1] == 'id'
|
||||
# sanity check
|
||||
snps_all = unique(my_data$mutation)# 337
|
||||
pnca_snps_or = snps_all
|
||||
pnca_snps_or = as.data.frame(snps_all)
|
||||
View(pnca_snps_or)
|
||||
snps_all[-"true_wt"]
|
||||
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
|
||||
View(pnca_snps_or)
|
||||
snps_all = as.data.frame(snps_all)
|
||||
View(snps_all)
|
||||
#remove true_wt entry
|
||||
w1 = which(rownames(snps_all) == "true_wt")
|
||||
View(snps_all)
|
||||
#remove true_wt entry
|
||||
w1 = which(snps_all$snps_all == "true_wt")
|
||||
rm(pnca_snps_or)
|
||||
pnca_snps_or = snps_all[-w1]
|
||||
pnca_snps_or = snps_all[,-w1]
|
||||
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
|
||||
#remove true_wt entry
|
||||
w1 = which(snps_all) == "true_wt"
|
||||
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
|
||||
my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
|
||||
, stringsAsFactors = FALSE) #11374, 19
|
||||
# remove the first column
|
||||
my_data = my_data[-1] #11374, 18
|
||||
# check if first col is 'id': should be TRUE
|
||||
colnames(my_data)[1] == 'id'
|
||||
# sanity check
|
||||
snps_all = unique(my_data$mutation)# 337
|
||||
snps_all = as.data.frame(snps_all)
|
||||
snps_all[-c(1,1)]
|
||||
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
|
||||
pnca_snps_or = as.data.frame(snps_all[, -c(1,1)])
|
||||
#remove true_wt entry
|
||||
#w1 = which(snps_all) == "true_wt"
|
||||
pnca_snps_or = snps_all
|
||||
pnca_snps_or = pnca_snps_or_copy
|
||||
#remove true_wt entry
|
||||
#w1 = which(snps_all) == "true_wt"
|
||||
pnca_snps_or = snps_all
|
||||
pnca_snps_or -> pnca_snps_or_copy
|
||||
#===============
|
||||
# Step 4: Iterate through this unique list
|
||||
# and calculate OR for each snp
|
||||
# and assign to the pnca_snps_or df that has
|
||||
# each row as a unique snp
|
||||
#===============
|
||||
# reset original df so you don't make a mistake: IMPORTANT
|
||||
pnca_snps_or = pnca_snps_or_copy #2133, 1
|
||||
# you only need it for the unique mutations
|
||||
pnca_snps_or = unique(pnca_snps_or) #337, 1
|
||||
for (i in pnca_snps_unique){
|
||||
print(i)
|
||||
#*************
|
||||
# start logistic regression model building
|
||||
#*************
|
||||
# set the IV and DV for the logistic regression model
|
||||
# IV: corresponds to each unique snp (extracted using grep)
|
||||
x = as.numeric(grepl(i,raw_data$all_muts_pza))
|
||||
# DV: pyrazinamide 0 or 1
|
||||
y = as.numeric(raw_data$pyrazinamide)
|
||||
table(y,x)
|
||||
# run glm model
|
||||
model = glm(y ~ x, family = binomial)
|
||||
#model = glm(y ~ x, family = binomial(link = "logit"))
|
||||
summary(model)
|
||||
#**********
|
||||
# extract relevant model output
|
||||
#**********
|
||||
# extract log OR i.e the Beta estimate of the logistic model for a given snp
|
||||
my_logor = summary(model)$coefficients[2,1]
|
||||
print(paste0('Beta:', my_logor))
|
||||
# extract SE of the logistic model for a given snp
|
||||
my_se = summary(model)$coefficients[2,2]
|
||||
print(paste0('SE:', my_se))
|
||||
# extract Z of the logistic model for a given snp
|
||||
my_zval = summary(model)$coefficients[2,3]
|
||||
print(paste0('Z-value:', my_zval))
|
||||
# Derive OR i.e. exp(my_logor) from the logistic model for a given snp
|
||||
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
|
||||
my_or = exp(summary(model)$coefficients[2,1])
|
||||
print(paste0('OR:', my_or))
|
||||
# sanity check : should be True
|
||||
log(my_or) == my_logor
|
||||
# extract P-value of the logistic model for a given snp
|
||||
my_pval = summary(model)$coefficients[2,4]
|
||||
print(paste0('P-value:', my_pval))
|
||||
# extract confidence interval of snp (2 steps, since the output is a named number)
|
||||
ci_mod = exp(confint(model))[2,]
|
||||
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
|
||||
print(paste0('CI:', my_ci))
|
||||
#*************
|
||||
# Assign the regression output in the original df
|
||||
# you can use ('=' or '<-/->')
|
||||
#*************
|
||||
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
|
||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
|
||||
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
|
||||
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
|
||||
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
|
||||
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
|
||||
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
|
||||
}
|
||||
getwd()
|
||||
#setwd("~/Documents/git/LSHTM_Y1_PNCA/meta_data_analysis") # work
|
||||
setwd("~/git/LSHTM_Y1_PNCA/meta_data_analysis") # thinkpad
|
||||
#setwd("/Users/tanu/git/LSHTM_Y1_PNCA/meta_data_analysis") # mac
|
||||
getwd()
|
||||
#===============
|
||||
# Step 1: read raw data
|
||||
#===============
|
||||
raw_data<-read.csv("../Data_original/original_tanushree_data_v2.csv"
|
||||
,stringsAsFactors = F)[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]#19265, 4
|
||||
raw_data<-raw_data[!is.na(raw_data$pyrazinamide),]#12511, 4
|
||||
# combine the two mutation columns
|
||||
raw_data$all_mutations_pyrazinamide<-paste(raw_data$dr_mutations_pyrazinamide, raw_data$other_mutations_pyrazinamide)#12511, 5
|
||||
head(raw_data$all_mutations_pyrazinamide)
|
||||
# create yet another column that contains all the mutations but in lower case
|
||||
raw_data$all_muts_pza = tolower(raw_data$all_mutations_pyrazinamide) #12511, 6
|
||||
table(grepl("pnca_p",raw_data$all_muts_pza))
|
||||
#FALSE TRUE
|
||||
#10603 1908
|
||||
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
|
||||
, stringsAsFactors = F
|
||||
, header = T) #2133
|
||||
# subset a small section to test
|
||||
#pnca_snps_or_copy = pnca_snps_or
|
||||
#pnca_snps_or = pnca_snps_or_copy
|
||||
pnca_snps_unique = unique(pnca_snps_or$mutation) #293
|
||||
i2 = "pnca_p.trp68gly" # Should exist
|
||||
grep(i2, pnca_snps_unique)
|
||||
my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
|
||||
, stringsAsFactors = FALSE) #11374, 19
|
||||
# remove the first column
|
||||
my_data = my_data[-1] #11374, 18
|
||||
# check if first col is 'id': should be TRUE
|
||||
colnames(my_data)[1] == 'id'
|
||||
# sanity check
|
||||
head(my_data$mutation)
|
||||
my_data = unique(my_data$mutation)
|
||||
my_data[!duplicated(my_data$mutation)]
|
||||
my_data_unique = my_data[!duplicated(my_data$mutation),]
|
||||
my_data[!duplicated('mutation'),]
|
||||
my_data_unique = my_data[!duplicated(my_data[,'mutation']),]
|
||||
my_data_unique = my_data[!duplicated(my_data['mutation']),]
|
||||
getwd()
|
||||
setwd("/git/LSHTM_analysis/meta_data_analysis")
|
||||
getwd()
|
||||
getwd()
|
||||
setwd("/git/github/LSHTM_analysis/meta_data_analysis")
|
||||
getwd()
|
||||
#===============
|
||||
# Step 1: read GWAS raw data stored in Data_original/
|
||||
#===============
|
||||
infile = read.csv("../Data_original", file.choose(), stringsAsFactors = F))
|
||||
c = file.choose()
|
||||
c = file.choose(../Data_original)
|
||||
c = read.csv(file.choose(), stringsAsFactors = F)
|
||||
#===============
|
||||
# Step 1: read GWAS raw data stored in Data_original/
|
||||
#===============
|
||||
infile = read.csv(file.choose(), stringsAsFactors = F))
|
||||
c = read.csv(file.choose(), stringsAsFactors = F)
|
||||
#===============
|
||||
# Step 1: read GWAS raw data stored in Data_original/
|
||||
#===============
|
||||
infile = read.csv(file.choose(), stringsAsFactors = F)
|
||||
#===============
|
||||
# Step 1: read GWAS raw data stored in Data_original/
|
||||
#===============
|
||||
infile = read.csv(file.choose(), stringsAsFactors = F)
|
||||
raw_data = infile[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]
|
||||
outdir = paste0("../mcsm_analysis",drug,"/Data/")
|
||||
# define output variables
|
||||
drug = 'pyrazinamide'
|
||||
outdir = paste0("../mcsm_analysis",drug,"/Data/")
|
||||
outdir = paste0("../mcsm_analysis/",drug,"/Data/")
|
||||
outFile = "meta_data_with_AFandOR.csv"
|
||||
output_filename = paste0(outdir, outFile)
|
||||
output_filename
|
Loading…
Add table
Add a link
Reference in a new issue