updated pnca_extraction and AF_OR calcs
This commit is contained in:
parent
53d19d5dd8
commit
f686563c98
4 changed files with 195 additions and 699 deletions
|
@ -1,512 +0,0 @@
|
||||||
# run glm model
|
|
||||||
model = glm(y ~ x, family = binomial)
|
|
||||||
#model = glm(y ~ x, family = binomial(link = "logit"))
|
|
||||||
summary(model)
|
|
||||||
#**********
|
|
||||||
# extract relevant model output
|
|
||||||
#**********
|
|
||||||
# extract log OR i.e the Beta estimate of the logistic model for a given snp
|
|
||||||
my_logor = summary(model)$coefficients[2,1]
|
|
||||||
print(paste0('Beta:', my_logor))
|
|
||||||
# extract SE of the logistic model for a given snp
|
|
||||||
my_se = summary(model)$coefficients[2,2]
|
|
||||||
print(paste0('SE:', my_se))
|
|
||||||
# extract Z of the logistic model for a given snp
|
|
||||||
my_zval = summary(model)$coefficients[2,3]
|
|
||||||
print(paste0('Z-value:', my_zval))
|
|
||||||
# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
|
|
||||||
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
|
|
||||||
my_or = exp(summary(model)$coefficients[2,1])
|
|
||||||
print(paste0('OR:', my_or))
|
|
||||||
# sanity check : should be True
|
|
||||||
log(my_or) == my_logor
|
|
||||||
# extract P-value of the logistic model for a given snp
|
|
||||||
my_pval = summary(model)$coefficients[2,4]
|
|
||||||
print(paste0('P-value:', my_pval))
|
|
||||||
# extract confint interval of snp (2 steps, since the output is a named number)
|
|
||||||
ci_mod = exp(confint(model))[2,]
|
|
||||||
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
|
|
||||||
print(paste0('CI:', my_ci))
|
|
||||||
#*************
|
|
||||||
# Assign the regression output in the original df
|
|
||||||
# you can use ('=' or '<-/->')
|
|
||||||
#*************
|
|
||||||
#pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i] = my_logor
|
|
||||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i]
|
|
||||||
my_logor
|
|
||||||
pnca_snps_or$Mutationinformation == i
|
|
||||||
View(pnca_snps_or)
|
|
||||||
#===============
|
|
||||||
# Step 4: Calculate for one snp
|
|
||||||
# using i, when you run the loop, it is easy
|
|
||||||
#===============
|
|
||||||
i = "pnca_p.trp68gly"
|
|
||||||
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
|
|
||||||
, stringsAsFactors = F
|
|
||||||
, header = T) #2133
|
|
||||||
# uncomment as necessary
|
|
||||||
pnca_snps_or = pnca_snps_or[1:5,]
|
|
||||||
pnca_snps_or = pnca_snps_or[c(1:5),]
|
|
||||||
#===============
|
|
||||||
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
|
|
||||||
, stringsAsFactors = F
|
|
||||||
, header = T) #2133
|
|
||||||
pnca_snps_or = pnca_snps_or[1:5,]
|
|
||||||
pnca_snps_or = pnca_snps_or[c(1:5),]
|
|
||||||
pnca_snps_or = pnca_snps_or[1:5]
|
|
||||||
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
|
|
||||||
, stringsAsFactors = F
|
|
||||||
, header = T) #2133
|
|
||||||
pnca_snps_or = pnca_snps_or[1:5]
|
|
||||||
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
|
|
||||||
, stringsAsFactors = F
|
|
||||||
, header = T) #2133
|
|
||||||
foo = pnca_snps_or[c(1:5,)]
|
|
||||||
foo = pnca_snps_or[c(1:5),]
|
|
||||||
foo = as.data.frame(pnca_snps_or[c(1:5),])
|
|
||||||
View(foo)
|
|
||||||
# create an empty dataframe
|
|
||||||
pnca_snps_or = as.data.frame(pnca_snps_or[c(1:5),])
|
|
||||||
# IV: corresponds to each unique snp (extracted using grep)
|
|
||||||
x = as.numeric(grepl(i,raw_data$all_muts_pza))
|
|
||||||
# DV: pyrazinamide 0 or 1
|
|
||||||
y = as.numeric(raw_data$pyrazinamide)
|
|
||||||
table(y,x)
|
|
||||||
# run glm model
|
|
||||||
model = glm(y ~ x, family = binomial)
|
|
||||||
#model = glm(y ~ x, family = binomial(link = "logit"))
|
|
||||||
summary(model)
|
|
||||||
my_logor = summary(model)$coefficients[2,1]
|
|
||||||
print(paste0('Beta:', my_logor))
|
|
||||||
# extract SE of the logistic model for a given snp
|
|
||||||
my_se = summary(model)$coefficients[2,2]
|
|
||||||
print(paste0('SE:', my_se))
|
|
||||||
# extract Z of the logistic model for a given snp
|
|
||||||
my_zval = summary(model)$coefficients[2,3]
|
|
||||||
print(paste0('Z-value:', my_zval))
|
|
||||||
# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
|
|
||||||
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
|
|
||||||
my_or = exp(summary(model)$coefficients[2,1])
|
|
||||||
print(paste0('OR:', my_or))
|
|
||||||
# sanity check : should be True
|
|
||||||
log(my_or) == my_logor
|
|
||||||
# extract P-value of the logistic model for a given snp
|
|
||||||
my_pval = summary(model)$coefficients[2,4]
|
|
||||||
print(paste0('P-value:', my_pval))
|
|
||||||
# extract confint interval of snp (2 steps, since the output is a named number)
|
|
||||||
ci_mod = exp(confint(model))[2,]
|
|
||||||
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
|
|
||||||
print(paste0('CI:', my_ci))
|
|
||||||
#*************
|
|
||||||
# Assign the regression output in the original df
|
|
||||||
# you can use ('=' or '<-/->')
|
|
||||||
#*************
|
|
||||||
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
|
|
||||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
|
|
||||||
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
|
|
||||||
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
|
|
||||||
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
|
|
||||||
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
|
|
||||||
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
|
|
||||||
#===============
|
|
||||||
# Step 4: Iterate through this unique list
|
|
||||||
# and calculate OR, but only for one snp
|
|
||||||
# this is test before you apply it all others
|
|
||||||
#===============
|
|
||||||
pnca_snps_or$mutation == i
|
|
||||||
View(pnca_snps_or)
|
|
||||||
# create an empty dataframe
|
|
||||||
pnca_snps_or = data.frame(mutation = i)
|
|
||||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
|
|
||||||
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
|
|
||||||
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
|
|
||||||
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
|
|
||||||
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
|
|
||||||
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
|
|
||||||
View(pnca_snps_or_copy)
|
|
||||||
#===============
|
|
||||||
# Step 4: Iterate through this unique list
|
|
||||||
# and calculate OR, but only for one snp
|
|
||||||
# this is test before you apply it all others
|
|
||||||
#===============
|
|
||||||
#reset original df so you don't make a mistake
|
|
||||||
pnca_snps_or = pnca_snps_or_copy
|
|
||||||
for (i in pnca_snps_unique){
|
|
||||||
print(i)
|
|
||||||
}
|
|
||||||
pnca_snps_or = pnca_snps_or_copy #2133, 1
|
|
||||||
#........................................
|
|
||||||
# create an empty dataframe : uncomment as necessary
|
|
||||||
pnca_snps_or = data.frame(mutation = c(i, "blank_mut")
|
|
||||||
#........................................
|
|
||||||
# create an empty dataframe : uncomment as necessary
|
|
||||||
pnca_snps_or = data.frame(mutation = c(i, "blank_mut"))
|
|
||||||
#........................................
|
|
||||||
# create an empty dataframe : uncomment as necessary
|
|
||||||
pnca_snps_or = data.frame(mutation = c(i, "blank_mut"))
|
|
||||||
View(pnca_snps_or)
|
|
||||||
# IV: corresponds to each unique snp (extracted using grep)
|
|
||||||
x = as.numeric(grepl(i,raw_data$all_muts_pza))
|
|
||||||
# DV: pyrazinamide 0 or 1
|
|
||||||
y = as.numeric(raw_data$pyrazinamide)
|
|
||||||
table(y,x)
|
|
||||||
# run glm model
|
|
||||||
model = glm(y ~ x, family = binomial)
|
|
||||||
#model = glm(y ~ x, family = binomial(link = "logit"))
|
|
||||||
summary(model)
|
|
||||||
#**********
|
|
||||||
# extract relevant model output
|
|
||||||
#**********
|
|
||||||
# extract log OR i.e the Beta estimate of the logistic model for a given snp
|
|
||||||
my_logor = summary(model)$coefficients[2,1]
|
|
||||||
print(paste0('Beta:', my_logor))
|
|
||||||
# extract SE of the logistic model for a given snp
|
|
||||||
my_se = summary(model)$coefficients[2,2]
|
|
||||||
print(paste0('SE:', my_se))
|
|
||||||
# extract Z of the logistic model for a given snp
|
|
||||||
my_zval = summary(model)$coefficients[2,3]
|
|
||||||
print(paste0('Z-value:', my_zval))
|
|
||||||
# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
|
|
||||||
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
|
|
||||||
my_or = exp(summary(model)$coefficients[2,1])
|
|
||||||
print(paste0('OR:', my_or))
|
|
||||||
# sanity check : should be True
|
|
||||||
log(my_or) == my_logor
|
|
||||||
# extract P-value of the logistic model for a given snp
|
|
||||||
my_pval = summary(model)$coefficients[2,4]
|
|
||||||
print(paste0('P-value:', my_pval))
|
|
||||||
# extract confint interval of snp (2 steps, since the output is a named number)
|
|
||||||
ci_mod = exp(confint(model))[2,]
|
|
||||||
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
|
|
||||||
print(paste0('CI:', my_ci))
|
|
||||||
#*************
|
|
||||||
# Assign the regression output in the original df
|
|
||||||
# you can use ('=' or '<-/->')
|
|
||||||
#*************
|
|
||||||
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
|
|
||||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
|
|
||||||
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
|
|
||||||
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
|
|
||||||
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
|
|
||||||
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
|
|
||||||
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
|
|
||||||
View(pnca_snps_or)
|
|
||||||
pnca_snps_or = pnca_snps_or_copy #2133, 1
|
|
||||||
for (i in pnca_snps_unique){
|
|
||||||
print(i)
|
|
||||||
#*************
|
|
||||||
# start logistic regression model building
|
|
||||||
#*************
|
|
||||||
# set the IV and DV for the logistic regression model
|
|
||||||
# IV: corresponds to each unique snp (extracted using grep)
|
|
||||||
x = as.numeric(grepl(i,raw_data$all_muts_pza))
|
|
||||||
# DV: pyrazinamide 0 or 1
|
|
||||||
y = as.numeric(raw_data$pyrazinamide)
|
|
||||||
table(y,x)
|
|
||||||
# run glm model
|
|
||||||
model = glm(y ~ x, family = binomial)
|
|
||||||
#model = glm(y ~ x, family = binomial(link = "logit"))
|
|
||||||
summary(model)
|
|
||||||
#**********
|
|
||||||
# extract relevant model output
|
|
||||||
#**********
|
|
||||||
# extract log OR i.e the Beta estimate of the logistic model for a given snp
|
|
||||||
my_logor = summary(model)$coefficients[2,1]
|
|
||||||
print(paste0('Beta:', my_logor))
|
|
||||||
# extract SE of the logistic model for a given snp
|
|
||||||
my_se = summary(model)$coefficients[2,2]
|
|
||||||
print(paste0('SE:', my_se))
|
|
||||||
# extract Z of the logistic model for a given snp
|
|
||||||
my_zval = summary(model)$coefficients[2,3]
|
|
||||||
print(paste0('Z-value:', my_zval))
|
|
||||||
# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
|
|
||||||
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
|
|
||||||
my_or = exp(summary(model)$coefficients[2,1])
|
|
||||||
print(paste0('OR:', my_or))
|
|
||||||
# sanity check : should be True
|
|
||||||
log(my_or) == my_logor
|
|
||||||
# extract P-value of the logistic model for a given snp
|
|
||||||
my_pval = summary(model)$coefficients[2,4]
|
|
||||||
print(paste0('P-value:', my_pval))
|
|
||||||
# extract confint interval of snp (2 steps, since the output is a named number)
|
|
||||||
ci_mod = exp(confint(model))[2,]
|
|
||||||
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
|
|
||||||
print(paste0('CI:', my_ci))
|
|
||||||
#*************
|
|
||||||
# Assign the regression output in the original df
|
|
||||||
# you can use ('=' or '<-/->')
|
|
||||||
#*************
|
|
||||||
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
|
|
||||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
|
|
||||||
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
|
|
||||||
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
|
|
||||||
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
|
|
||||||
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
|
|
||||||
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
|
|
||||||
}
|
|
||||||
warnings()
|
|
||||||
View(pnca_snps_or)
|
|
||||||
View(pnca_snps_or_copy)
|
|
||||||
#sanity check
|
|
||||||
pnca_snps_or$mutation == i1
|
|
||||||
#sanity check
|
|
||||||
pnca_snps_or[pnca_snps_or$mutation == i1]
|
|
||||||
pnca_snps_or[pnca_snps_or$mutation == i2]
|
|
||||||
pnca_snps_or[pnca_snps_or$mutation == i2,]
|
|
||||||
pnca_snps_or1 = unique(pnca_snps_or)
|
|
||||||
write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv")
|
|
||||||
# you only need it for the unique mutations
|
|
||||||
pnca_snps_or = unique(pnca_snps_or) #2133, 1
|
|
||||||
for (i in pnca_snps_unique){
|
|
||||||
print(i)
|
|
||||||
#*************
|
|
||||||
# start logistic regression model building
|
|
||||||
#*************
|
|
||||||
# set the IV and DV for the logistic regression model
|
|
||||||
# IV: corresponds to each unique snp (extracted using grep)
|
|
||||||
x = as.numeric(grepl(i,raw_data$all_muts_pza))
|
|
||||||
# DV: pyrazinamide 0 or 1
|
|
||||||
y = as.numeric(raw_data$pyrazinamide)
|
|
||||||
table(y,x)
|
|
||||||
# run glm model
|
|
||||||
model = glm(y ~ x, family = binomial)
|
|
||||||
#model = glm(y ~ x, family = binomial(link = "logit"))
|
|
||||||
summary(model)
|
|
||||||
#**********
|
|
||||||
# extract relevant model output
|
|
||||||
#**********
|
|
||||||
# extract log OR i.e the Beta estimate of the logistic model for a given snp
|
|
||||||
my_logor = summary(model)$coefficients[2,1]
|
|
||||||
print(paste0('Beta:', my_logor))
|
|
||||||
# extract SE of the logistic model for a given snp
|
|
||||||
my_se = summary(model)$coefficients[2,2]
|
|
||||||
print(paste0('SE:', my_se))
|
|
||||||
# extract Z of the logistic model for a given snp
|
|
||||||
my_zval = summary(model)$coefficients[2,3]
|
|
||||||
print(paste0('Z-value:', my_zval))
|
|
||||||
# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
|
|
||||||
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
|
|
||||||
my_or = exp(summary(model)$coefficients[2,1])
|
|
||||||
print(paste0('OR:', my_or))
|
|
||||||
# sanity check : should be True
|
|
||||||
log(my_or) == my_logor
|
|
||||||
# extract P-value of the logistic model for a given snp
|
|
||||||
my_pval = summary(model)$coefficients[2,4]
|
|
||||||
print(paste0('P-value:', my_pval))
|
|
||||||
# extract confint interval of snp (2 steps, since the output is a named number)
|
|
||||||
ci_mod = exp(confint(model))[2,]
|
|
||||||
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
|
|
||||||
print(paste0('CI:', my_ci))
|
|
||||||
#*************
|
|
||||||
# Assign the regression output in the original df
|
|
||||||
# you can use ('=' or '<-/->')
|
|
||||||
#*************
|
|
||||||
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
|
|
||||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
|
|
||||||
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
|
|
||||||
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
|
|
||||||
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
|
|
||||||
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
|
|
||||||
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
|
|
||||||
}
|
|
||||||
View(pnca_snps_or)
|
|
||||||
2.290256e+01
|
|
||||||
1.561132e+06
|
|
||||||
3.242285e-04
|
|
||||||
#sanity check
|
|
||||||
pnca_snps_or[pnca_snps_or$mutation == i1]
|
|
||||||
pnca_snps_or[pnca_snps_or$mutation == i2,]
|
|
||||||
write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv")
|
|
||||||
my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
|
|
||||||
, stringsAsFactors = FALSE) #11374, 19
|
|
||||||
View(my_data)
|
|
||||||
# remove the first column
|
|
||||||
my_data = my_data[-1] #11374, 18
|
|
||||||
# check if first col is 'id': should be TRUE
|
|
||||||
colnames(my_data)[1] == 'id'
|
|
||||||
# sanity check
|
|
||||||
snps_all = unique(my_data$mutation)# 337
|
|
||||||
pnca_snps_or = snps_all
|
|
||||||
pnca_snps_or = as.data.frame(snps_all)
|
|
||||||
View(pnca_snps_or)
|
|
||||||
snps_all[-"true_wt"]
|
|
||||||
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
|
|
||||||
View(pnca_snps_or)
|
|
||||||
snps_all = as.data.frame(snps_all)
|
|
||||||
View(snps_all)
|
|
||||||
#remove true_wt entry
|
|
||||||
w1 = which(rownames(snps_all) == "true_wt")
|
|
||||||
View(snps_all)
|
|
||||||
#remove true_wt entry
|
|
||||||
w1 = which(snps_all$snps_all == "true_wt")
|
|
||||||
rm(pnca_snps_or)
|
|
||||||
pnca_snps_or = snps_all[-w1]
|
|
||||||
pnca_snps_or = snps_all[,-w1]
|
|
||||||
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
|
|
||||||
#remove true_wt entry
|
|
||||||
w1 = which(snps_all) == "true_wt"
|
|
||||||
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
|
|
||||||
my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
|
|
||||||
, stringsAsFactors = FALSE) #11374, 19
|
|
||||||
# remove the first column
|
|
||||||
my_data = my_data[-1] #11374, 18
|
|
||||||
# check if first col is 'id': should be TRUE
|
|
||||||
colnames(my_data)[1] == 'id'
|
|
||||||
# sanity check
|
|
||||||
snps_all = unique(my_data$mutation)# 337
|
|
||||||
snps_all = as.data.frame(snps_all)
|
|
||||||
snps_all[-c(1,1)]
|
|
||||||
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
|
|
||||||
pnca_snps_or = as.data.frame(snps_all[, -c(1,1)])
|
|
||||||
#remove true_wt entry
|
|
||||||
#w1 = which(snps_all) == "true_wt"
|
|
||||||
pnca_snps_or = snps_all
|
|
||||||
pnca_snps_or = pnca_snps_or_copy
|
|
||||||
#remove true_wt entry
|
|
||||||
#w1 = which(snps_all) == "true_wt"
|
|
||||||
pnca_snps_or = snps_all
|
|
||||||
pnca_snps_or -> pnca_snps_or_copy
|
|
||||||
#===============
|
|
||||||
# Step 4: Iterate through this unique list
|
|
||||||
# and calculate OR for each snp
|
|
||||||
# and assign to the pnca_snps_or df that has
|
|
||||||
# each row as a unique snp
|
|
||||||
#===============
|
|
||||||
# reset original df so you don't make a mistake: IMPORTANT
|
|
||||||
pnca_snps_or = pnca_snps_or_copy #2133, 1
|
|
||||||
# you only need it for the unique mutations
|
|
||||||
pnca_snps_or = unique(pnca_snps_or) #337, 1
|
|
||||||
for (i in pnca_snps_unique){
|
|
||||||
print(i)
|
|
||||||
#*************
|
|
||||||
# start logistic regression model building
|
|
||||||
#*************
|
|
||||||
# set the IV and DV for the logistic regression model
|
|
||||||
# IV: corresponds to each unique snp (extracted using grep)
|
|
||||||
x = as.numeric(grepl(i,raw_data$all_muts_pza))
|
|
||||||
# DV: pyrazinamide 0 or 1
|
|
||||||
y = as.numeric(raw_data$pyrazinamide)
|
|
||||||
table(y,x)
|
|
||||||
# run glm model
|
|
||||||
model = glm(y ~ x, family = binomial)
|
|
||||||
#model = glm(y ~ x, family = binomial(link = "logit"))
|
|
||||||
summary(model)
|
|
||||||
#**********
|
|
||||||
# extract relevant model output
|
|
||||||
#**********
|
|
||||||
# extract log OR i.e the Beta estimate of the logistic model for a given snp
|
|
||||||
my_logor = summary(model)$coefficients[2,1]
|
|
||||||
print(paste0('Beta:', my_logor))
|
|
||||||
# extract SE of the logistic model for a given snp
|
|
||||||
my_se = summary(model)$coefficients[2,2]
|
|
||||||
print(paste0('SE:', my_se))
|
|
||||||
# extract Z of the logistic model for a given snp
|
|
||||||
my_zval = summary(model)$coefficients[2,3]
|
|
||||||
print(paste0('Z-value:', my_zval))
|
|
||||||
# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
|
|
||||||
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
|
|
||||||
my_or = exp(summary(model)$coefficients[2,1])
|
|
||||||
print(paste0('OR:', my_or))
|
|
||||||
# sanity check : should be True
|
|
||||||
log(my_or) == my_logor
|
|
||||||
# extract P-value of the logistic model for a given snp
|
|
||||||
my_pval = summary(model)$coefficients[2,4]
|
|
||||||
print(paste0('P-value:', my_pval))
|
|
||||||
# extract confint interval of snp (2 steps, since the output is a named number)
|
|
||||||
ci_mod = exp(confint(model))[2,]
|
|
||||||
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
|
|
||||||
print(paste0('CI:', my_ci))
|
|
||||||
#*************
|
|
||||||
# Assign the regression output in the original df
|
|
||||||
# you can use ('=' or '<-/->')
|
|
||||||
#*************
|
|
||||||
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
|
|
||||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
|
|
||||||
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
|
|
||||||
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
|
|
||||||
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
|
|
||||||
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
|
|
||||||
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
|
|
||||||
}
|
|
||||||
getwd()
|
|
||||||
#setwd("~/Documents/git/LSHTM_Y1_PNCA/meta_data_analysis") # work
|
|
||||||
setwd("~/git/LSHTM_Y1_PNCA/meta_data_analysis") # thinkpad
|
|
||||||
#setwd("/Users/tanu/git/LSHTM_Y1_PNCA/meta_data_analysis") # mac
|
|
||||||
getwd()
|
|
||||||
#===============
|
|
||||||
# Step 1: read raw data
|
|
||||||
#===============
|
|
||||||
raw_data<-read.csv("../Data_original/original_tanushree_data_v2.csv"
|
|
||||||
,stringsAsFactors = F)[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]#19265, 4
|
|
||||||
raw_data<-raw_data[!is.na(raw_data$pyrazinamide),]#12511, 4
|
|
||||||
# combine the two mutation columns
|
|
||||||
raw_data$all_mutations_pyrazinamide<-paste(raw_data$dr_mutations_pyrazinamide, raw_data$other_mutations_pyrazinamide)#12511, 5
|
|
||||||
head(raw_data$all_mutations_pyrazinamide)
|
|
||||||
# create yet another column that contains all the mutations but in lower case
|
|
||||||
raw_data$all_muts_pza = tolower(raw_data$all_mutations_pyrazinamide) #12511, 6
|
|
||||||
table(grepl("pnca_p",raw_data$all_muts_pza))
|
|
||||||
#FALSE TRUE
|
|
||||||
#10603 1908
|
|
||||||
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
|
|
||||||
, stringsAsFactors = F
|
|
||||||
, header = T) #2133
|
|
||||||
# subset a snall section to test
|
|
||||||
#pnca_snps_or_copy = pnca_snps_or
|
|
||||||
#pnca_snps_or = pnca_snps_or_copy
|
|
||||||
pnca_snps_unique = unique(pnca_snps_or$mutation) #293
|
|
||||||
i2 = "pnca_p.trp68gly" # Should exist
|
|
||||||
grep(i2, pnca_snps_unique)
|
|
||||||
my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
|
|
||||||
, stringsAsFactors = FALSE) #11374, 19
|
|
||||||
# remove the first column
|
|
||||||
my_data = my_data[-1] #11374, 18
|
|
||||||
# check if first col is 'id': should be TRUE
|
|
||||||
colnames(my_data)[1] == 'id'
|
|
||||||
# sanity check
|
|
||||||
head(my_data$mutation)
|
|
||||||
my_data = unique(my_data$mutation)
|
|
||||||
my_data[!duplicated(my_data$mutation)]
|
|
||||||
my_data_unique = my_data[!duplicated(my_data$mutation),]
|
|
||||||
my_data[!duplicated('mutation'),]
|
|
||||||
my_data_unique = my_data[!duplicated(my_data[,'mutation']),]
|
|
||||||
my_data_unique = my_data[!duplicated(my_data['mutation']),]
|
|
||||||
getwd()
|
|
||||||
setwd("/git/LSHTM_analysis/meta_data_analysis")
|
|
||||||
getwd()
|
|
||||||
getwd()
|
|
||||||
setwd("/git/github/LSHTM_analysis/meta_data_analysis")
|
|
||||||
getwd()
|
|
||||||
#===============
|
|
||||||
# Step 1: read GWAS raw data stored in Data_original/
|
|
||||||
#===============
|
|
||||||
infile = read.csv("../Data_original", file.choose(), stringsAsFactors = F))
|
|
||||||
c = file.choose()
|
|
||||||
c = file.choose(../Data_original)
|
|
||||||
c = read.csv(file.choose(), stringsAsFactors = F)
|
|
||||||
#===============
|
|
||||||
# Step 1: read GWAS raw data stored in Data_original/
|
|
||||||
#===============
|
|
||||||
infile = read.csv(file.choose(), stringsAsFactors = F))
|
|
||||||
c = read.csv(file.choose(), stringsAsFactors = F)
|
|
||||||
#===============
|
|
||||||
# Step 1: read GWAS raw data stored in Data_original/
|
|
||||||
#===============
|
|
||||||
infile = read.csv(file.choose(), stringsAsFactors = F)
|
|
||||||
#===============
|
|
||||||
# Step 1: read GWAS raw data stored in Data_original/
|
|
||||||
#===============
|
|
||||||
infile = read.csv(file.choose(), stringsAsFactors = F)
|
|
||||||
raw_data = infile[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]
|
|
||||||
outdir = paste0("../mcsm_analysis",drug,"/Data/")
|
|
||||||
# define output variables
|
|
||||||
drug = 'pyrazinamide'
|
|
||||||
outdir = paste0("../mcsm_analysis",drug,"/Data/")
|
|
||||||
outdir = paste0("../mcsm_analysis/",drug,"/Data/")
|
|
||||||
outFile = "meta_data_with_AFandOR.csv"
|
|
||||||
output_filename = paste0(outdir, outFile)
|
|
||||||
output_filename
|
|
||||||
pnca_snps_or = read.csv(file.choose()
|
|
||||||
, stringsAsFactors = F
|
|
||||||
, header = T)
|
|
||||||
View(pnca_snps_or)
|
|
||||||
View(pnca_snps_or)
|
|
|
@ -1,16 +1,59 @@
|
||||||
|
#============================================
|
||||||
|
# TASK: to calculate allele frequency and OR
|
||||||
|
# on master data
|
||||||
|
#===========================================
|
||||||
|
homedir = '~'
|
||||||
getwd()
|
getwd()
|
||||||
setwd("/git/github/git/LSHTM_analysis/meta_data_analysis")
|
#setwd('~/git/LSHTM_analysis/meta_data_analysis')
|
||||||
|
setwd(paste0(homedir, '/', 'git/LSHTM_analysis/meta_data_analysis'))
|
||||||
getwd()
|
getwd()
|
||||||
|
|
||||||
#===============
|
#%% variable assignment: input and output paths & filenames
|
||||||
# Step 1: read GWAS raw data stored in Data_original/
|
drug = 'pyrazinamide'
|
||||||
#===============
|
gene = 'pncA'
|
||||||
infile = read.csv(file.choose(), stringsAsFactors = F)
|
gene_match = paste0(gene,'_p.')
|
||||||
|
print(gene_match)
|
||||||
|
|
||||||
raw_data = infile[,c("id"
|
#=======
|
||||||
|
# input dir
|
||||||
|
#=======
|
||||||
|
# file1: Raw data
|
||||||
|
#indir = 'git/Data/pyrazinamide/input/original'
|
||||||
|
indir = paste0('git/Data', '/', drug, '/', 'input/original')
|
||||||
|
in_filename = 'original_tanushree_data_v2.csv'
|
||||||
|
infile = paste0(homedir, '/', indir, '/', in_filename)
|
||||||
|
print(paste0('Reading infile:', ' ', infile) )
|
||||||
|
|
||||||
|
# file2: file to extract valid snps and add calcs to: pnca_metadata.csv {outfile3 from data extraction script}
|
||||||
|
indir_metadata = paste0('git/Data', '/', drug, '/', 'output')
|
||||||
|
in_filename_metadata = 'pnca_metadata.csv'
|
||||||
|
infile_metadata = paste0(homedir, '/', indir_metadata, '/', in_filename_metadata)
|
||||||
|
print(paste0('Reading metadata infile:', infile_metadata))
|
||||||
|
|
||||||
|
#=========
|
||||||
|
# output dir
|
||||||
|
#=========
|
||||||
|
# output filename in respective section at the time of outputting files
|
||||||
|
#outdir = 'git/Data/pyrazinamide/output'
|
||||||
|
outdir = paste0('git/Data', '/', drug, '/', 'output')
|
||||||
|
out_filename = paste0(tolower(gene),'_', 'meta_data_with_AFandOR.csv')
|
||||||
|
outfile = paste0(homedir, '/', outdir, '/', out_filename)
|
||||||
|
print(paste0('Output file with full path:', outfile))
|
||||||
|
|
||||||
|
#%% end of variable assignment for input and output files
|
||||||
|
|
||||||
|
#===============
|
||||||
|
# Step 1: Read master/raw data stored in Data/
|
||||||
|
#===============
|
||||||
|
raw_data_all = read.csv(infile, stringsAsFactors = F)
|
||||||
|
|
||||||
|
raw_data = raw_data_all[,c("id"
|
||||||
, "pyrazinamide"
|
, "pyrazinamide"
|
||||||
, "dr_mutations_pyrazinamide"
|
, "dr_mutations_pyrazinamide"
|
||||||
, "other_mutations_pyrazinamide")]
|
, "other_mutations_pyrazinamide")]
|
||||||
|
rm(raw_data_all)
|
||||||
|
|
||||||
|
rm(indir, in_filename, infile)
|
||||||
|
|
||||||
#####
|
#####
|
||||||
# 1a: exclude na
|
# 1a: exclude na
|
||||||
|
@ -18,7 +61,7 @@ raw_data = infile[,c("id"
|
||||||
raw_data = raw_data[!is.na(raw_data$pyrazinamide),]
|
raw_data = raw_data[!is.na(raw_data$pyrazinamide),]
|
||||||
|
|
||||||
total_samples = length(unique(raw_data$id))
|
total_samples = length(unique(raw_data$id))
|
||||||
print(total_samples)
|
print(paste0('Total samples without NA in', ' ', drug, 'is:', total_samples))
|
||||||
|
|
||||||
# sanity check: should be true
|
# sanity check: should be true
|
||||||
is.numeric(total_samples)
|
is.numeric(total_samples)
|
||||||
|
@ -36,10 +79,15 @@ head(raw_data$all_mutations_pyrazinamide)
|
||||||
raw_data$all_muts_pnca = tolower(raw_data$all_mutations_pyrazinamide)
|
raw_data$all_muts_pnca = tolower(raw_data$all_mutations_pyrazinamide)
|
||||||
|
|
||||||
# sanity checks
|
# sanity checks
|
||||||
table(grepl("pnca_p",raw_data$all_muts_pnca))
|
#table(grepl("pnca_p",raw_data$all_muts_pnca))
|
||||||
|
print(paste0('converting gene match:', gene_match, ' ', 'to lowercase'))
|
||||||
|
gene_match = tolower(gene_match)
|
||||||
|
|
||||||
|
table(grepl(gene_match,raw_data$all_muts_pnca))
|
||||||
|
|
||||||
# sanity check: should be TRUE
|
# sanity check: should be TRUE
|
||||||
sum(table(grepl("pnca_p",raw_data$all_muts_pnca))) == total_samples
|
#sum(table(grepl("pnca_p",raw_data$all_muts_pnca))) == total_samples
|
||||||
|
sum(table(grepl(gene_match,raw_data$all_muts_pnca))) == total_samples
|
||||||
|
|
||||||
# set up variables: can be used for logistic regression as well
|
# set up variables: can be used for logistic regression as well
|
||||||
i = "pnca_p.ala134gly" # has a NA, should NOT exist
|
i = "pnca_p.ala134gly" # has a NA, should NOT exist
|
||||||
|
@ -56,17 +104,40 @@ table(mut, dst)
|
||||||
#fisher.test(table(mut, dst))
|
#fisher.test(table(mut, dst))
|
||||||
#table(mut)
|
#table(mut)
|
||||||
|
|
||||||
###### read list of muts to calculate OR for (fname3 from pnca_data_extraction.py)
|
#===============
|
||||||
pnca_snps_or = read.csv(file.choose()
|
# Step 2: Read valid snps for which OR can be calculated (infile_comp_snps.csv)
|
||||||
|
#===============
|
||||||
|
print(paste0('Reading metadata infile:', infile_metadata))
|
||||||
|
|
||||||
|
pnca_metadata = read.csv(infile_metadata
|
||||||
|
# , file.choose()
|
||||||
, stringsAsFactors = F
|
, stringsAsFactors = F
|
||||||
, header = T)
|
, header = T)
|
||||||
|
|
||||||
# extract unique snps to iterate over for AF and OR calcs
|
|
||||||
# total no of unique snps
|
|
||||||
# AF and OR calculations
|
|
||||||
|
|
||||||
|
# clear variables
|
||||||
|
rm(homedir, in_filename, indir, infile)
|
||||||
|
rm(indir_metadata, infile_metadata, in_filename_metadata)
|
||||||
|
|
||||||
|
# count na in pyrazinamide column
|
||||||
|
tot_pza_na = sum(is.na(pnca_metadata$pyrazinamide))
|
||||||
|
expected_rows = nrow(pnca_metadata) - tot_pza_na
|
||||||
|
|
||||||
|
# drop na from the pyrazinamide colum
|
||||||
|
pnca_snps_or = pnca_metadata[!is.na(pnca_metadata$pyrazinamide),]
|
||||||
|
|
||||||
|
# sanity check
|
||||||
|
if(nrow(pnca_snps_or) == expected_rows){
|
||||||
|
print('PASS: no. of rows match with expected_rows')
|
||||||
|
} else{
|
||||||
|
print('FAIL: nrows mismatch.')
|
||||||
|
}
|
||||||
|
|
||||||
|
# extract unique snps to iterate over for AF and OR calcs
|
||||||
pnca_snps_unique = unique(pnca_snps_or$mutation)
|
pnca_snps_unique = unique(pnca_snps_or$mutation)
|
||||||
|
|
||||||
|
print(paste0('Total no. of distinct comp snps to perform OR calcs: ', length(pnca_snps_unique)))
|
||||||
|
|
||||||
# Define OR function
|
# Define OR function
|
||||||
x = as.numeric(mut)
|
x = as.numeric(mut)
|
||||||
y = dst
|
y = dst
|
||||||
|
@ -111,131 +182,106 @@ afs['pnca_p.trp68gly']
|
||||||
afs['pnca_p.gln10pro']
|
afs['pnca_p.gln10pro']
|
||||||
afs['pnca_p.leu4ser']
|
afs['pnca_p.leu4ser']
|
||||||
|
|
||||||
#plot(density(log(ors)))
|
plot(density(log(ors)))
|
||||||
#plot(-log10(pvals))
|
plot(-log10(pvals))
|
||||||
#hist(log(ors)
|
hist(log(ors)
|
||||||
# ,breaks = 100
|
, breaks = 100
|
||||||
# )
|
)
|
||||||
|
|
||||||
# subset df cols to add to the calc param df
|
# FIXME: could be good to add a sanity check
|
||||||
pnca_snps_cols = pnca_snps_or[c('mutation_info', 'mutation', 'Mutationinformation')]
|
if (table(names(ors) == names(pvals)) & table(names(ors) == names(afs)) & table(names(pvals) == names(afs)) == length(pnca_snps_unique)){
|
||||||
pnca_snps_cols = pnca_snps_cols[!duplicated(pnca_snps_cols$mutation),]
|
print('PASS: names of ors, pvals and afs match: proceed with combining into a single df')
|
||||||
|
} else{
|
||||||
rownames(pnca_snps_cols) = pnca_snps_cols$mutation
|
print('FAIL: names of ors, pvals and afs mismatch')
|
||||||
head(rownames(pnca_snps_cols))
|
}
|
||||||
#snps_with_AF_and_OR
|
|
||||||
|
|
||||||
# combine
|
# combine
|
||||||
comb_AF_and_OR = data.frame(ors, pvals, afs)
|
comb_AF_and_OR = data.frame(ors, pvals, afs)
|
||||||
head(rownames(comb_AF_and_OR))
|
head(rownames(comb_AF_and_OR))
|
||||||
|
|
||||||
# sanity checks: should be the same
|
# add rownames of comb_AF_and_OR as an extra column 'mutation' to allow merging based on this column
|
||||||
dim(comb_AF_and_OR); dim(pnca_snps_cols)
|
comb_AF_and_OR$mutation = rownames(comb_AF_and_OR)
|
||||||
table(rownames(comb_AF_and_OR)%in%rownames(pnca_snps_cols))
|
|
||||||
|
|
||||||
table(rownames(pnca_snps_cols)%in%rownames(comb_AF_and_OR))
|
|
||||||
|
|
||||||
# merge the above two df whose dim you checked
|
|
||||||
snps_with_AF_and_OR = merge(comb_AF_and_OR, pnca_snps_cols
|
|
||||||
, by = "row.names"
|
|
||||||
# , all.x = T
|
|
||||||
)
|
|
||||||
|
|
||||||
#rm(pnca_snps_cols, pnca_snps_or, raw_data)
|
|
||||||
|
|
||||||
#===============
|
|
||||||
# Step 3: Read data file where you will add the calculated OR
|
|
||||||
# Note: this is the big file with one-many relationship between snps and lineages
|
|
||||||
# i.e fname4 from 'pnca_extraction.py'
|
|
||||||
#===============
|
|
||||||
my_data = read.csv(file.choose()
|
|
||||||
, row.names = 1
|
|
||||||
, stringsAsFactors = FALSE)
|
|
||||||
|
|
||||||
head(my_data)
|
|
||||||
length(unique(my_data$id))
|
|
||||||
|
|
||||||
# check if first col is 'id': should be TRUE
|
|
||||||
colnames(my_data)[1] == 'id'
|
|
||||||
|
|
||||||
# sanity check
|
# sanity check
|
||||||
head(my_data$mutation)
|
if (table(rownames(comb_AF_and_OR) == comb_AF_and_OR$mutation)){
|
||||||
|
print('PASS: rownames and mutaion col values match')
|
||||||
|
}else{
|
||||||
|
print('FAIL: rownames and mutation col values mismatch')
|
||||||
|
}
|
||||||
|
|
||||||
# FILES TO MERGE:
|
############
|
||||||
# comb_AF_and_OR: file containing OR
|
# Merge 1:
|
||||||
# my_data = big meta data file
|
###########
|
||||||
# linking column: mutation
|
df1 = pnca_metadata
|
||||||
|
df2 = comb_AF_and_OR
|
||||||
|
|
||||||
head(my_data)
|
head(df1$mutation); head(df2$mutation)
|
||||||
merged_df = merge(my_data # big file
|
|
||||||
, snps_with_AF_and_OR # small (afor file)
|
# FIXME: newlines
|
||||||
|
print(paste0('merging two dfs: '
|
||||||
|
,'\ndf1 (big df i.e. meta data) nrows: ', nrow(df1)
|
||||||
|
,'\ndf2 (small df i.e af, or, pval) nrows: ', nrow(df2)
|
||||||
|
, 'expected rows in merged df: ', nrow(df1), 'expected cols in merged_df: ', (ncol(df1) + ncol(df2) - 1)))
|
||||||
|
|
||||||
|
merged_df = merge(df1 # big file
|
||||||
|
, df2 # small (afor file)
|
||||||
, by = "mutation"
|
, by = "mutation"
|
||||||
, all.x = T) # because you want all the entries of the meta data
|
, all.x = T) # because you want all the entries of the meta data
|
||||||
|
|
||||||
# sanity checks: should be True
|
# sanity check
|
||||||
# FIXME: I have checked this manually, but make it so it is a pass or a fail!
|
if(ncol(merged_df) == (ncol(df1) + ncol(df2) - 1)){
|
||||||
comb_AF_and_OR[rownames(comb_AF_and_OR) == "pnca_p.gln10pro",]$ors
|
print(paste0('PASS: no. of cols is as expected: ', ncol(merged_df)))
|
||||||
merged_df[merged_df$Mutationinformation.x == "Q10P",]$ors
|
} else{
|
||||||
|
print('FAIL: no.of cols mistmatch')
|
||||||
|
}
|
||||||
|
|
||||||
merged_df[merged_df$Mutationinformation.x == "Q10P",]
|
# quick check
|
||||||
|
i = "pnca_p.ala134gly" # has all NAs in pyrazinamide, should be NA in ors, etc.
|
||||||
# sanity check: very important!
|
merged_df[merged_df$mutation == i,]
|
||||||
colnames(merged_df)
|
|
||||||
|
|
||||||
table(merged_df$mutation_info.x == merged_df$mutation_info.y)
|
|
||||||
|
|
||||||
#FIXME: what happened to other 7 and FALSE
|
|
||||||
table(merged_df$Mutationinformation.x == merged_df$Mutationinformation.y)
|
|
||||||
|
|
||||||
# problem
|
|
||||||
identical(merged_df$Mutationinformation.x, merged_df$Mutationinformation.y)
|
|
||||||
|
|
||||||
#merged_df[merged_df$Mutationinformation.x != merged_df$Mutationinformation.y,]
|
|
||||||
|
|
||||||
#throw away the y because that is a smaller df
|
|
||||||
d1 = which(colnames(merged_df) == "mutation_info.y") #21
|
|
||||||
d2 = which(colnames(merged_df) == "Mutationinformation.y") #22
|
|
||||||
|
|
||||||
merged_df2 = merged_df[-c(d1, d2)] #3093 20
|
|
||||||
colnames(merged_df2)
|
|
||||||
|
|
||||||
# rename cols
|
|
||||||
colnames(merged_df2)[colnames(merged_df2)== "mutation_info.x"] <- "mutation_info"
|
|
||||||
colnames(merged_df2)[colnames(merged_df2)== "Mutationinformation.x"] <- "Mutationinformation"
|
|
||||||
|
|
||||||
colnames(merged_df2)
|
|
||||||
|
|
||||||
# should be 0
|
|
||||||
sum(is.na(merged_df2$Mutationinformation))
|
|
||||||
|
|
||||||
# count na in each column
|
# count na in each column
|
||||||
na_count = sapply(merged_df2, function(y) sum(length(which(is.na(y))))); na_count
|
na_count = sapply(merged_df, function(y) sum(length(which(is.na(y))))); na_count
|
||||||
# only some or and Af should be NA
|
# only some or and Af should be NA
|
||||||
#Row.names ors pvals afs
|
#Row.names ors pvals afs
|
||||||
#81 81 81 81
|
#63 63 63 63
|
||||||
|
|
||||||
|
# reassign custom colnames
|
||||||
|
colnames(merged_df)[colnames(merged_df)== "ors"] <- "OR"
|
||||||
|
colnames(merged_df)[colnames(merged_df)== "afs"] <- "AF"
|
||||||
|
colnames(merged_df)[colnames(merged_df)== "pvals"] <- "pvalue"
|
||||||
|
|
||||||
colnames(merged_df2)[colnames(merged_df2)== "ors"] <- "OR"
|
colnames(merged_df)
|
||||||
colnames(merged_df2)[colnames(merged_df2)== "afs"] <- "AF"
|
|
||||||
colnames(merged_df2)[colnames(merged_df2)== "pvals"] <- "pvalue"
|
|
||||||
|
|
||||||
colnames(merged_df2)
|
|
||||||
|
|
||||||
# add log OR and neglog pvalue
|
# add log OR and neglog pvalue
|
||||||
merged_df2$logor = log(merged_df2$OR)
|
merged_df$logor = log(merged_df$OR)
|
||||||
is.numeric(merged_df2$logor)
|
is.numeric(merged_df$logor)
|
||||||
|
|
||||||
merged_df2$neglog10pvalue = -log10(merged_df2$pvalue)
|
merged_df$neglog10pvalue = -log10(merged_df$pvalue)
|
||||||
is.numeric(merged_df2$neglog10pvalue)
|
is.numeric(merged_df$neglog10pvalue)
|
||||||
|
|
||||||
# write file out
|
merged_df$AF_percent = merged_df$AF*100
|
||||||
#write.csv(merged_df, "../Data/meta_data_with_AFandOR_JP_TT.csv")
|
is.numeric(merged_df$AF_percent)
|
||||||
|
|
||||||
# define output variables
|
# check AFs
|
||||||
drug = 'pyrazinamide'
|
#i = 'pnca_p.trp68gly'
|
||||||
out_dir = paste0("../mcsm_analysis/",drug,"/Data/")
|
i = 'pnca_p.gln10pro'
|
||||||
outFile = "meta_data_with_AFandOR.csv"
|
#i = 'pnca_p.leu4ser'
|
||||||
output_filename = paste0(outdir, outFile)
|
merged_df[merged_df$mutation == i,]
|
||||||
|
|
||||||
write.csv(merged_df2, output_filename
|
# FIXME: harcoding (beware!), NOT FATAL though!
|
||||||
|
ncol_added = 3
|
||||||
|
|
||||||
|
print(paste0('Added', ncol_added, ' ', 'more cols to merged_df i.e log10 OR and -log10 P-val: '
|
||||||
|
, 'no. of cols in merged_df now: ', ncol(merged_df)))
|
||||||
|
|
||||||
|
#%% write file out: pnca_meta_data_with_AFandOR
|
||||||
|
print(paste0('writing output file in: '
|
||||||
|
, 'Filename: ', out_filename
|
||||||
|
, 'Path:', outdir))
|
||||||
|
|
||||||
|
write.csv(merged_df, outfile
|
||||||
, row.names = F)
|
, row.names = F)
|
||||||
|
|
||||||
|
print(paste0('Finished writing:', out_filename, '\nExpected no. of cols:', ncol(merged_df)))
|
||||||
|
print('======================================================================')
|
||||||
|
rm(out_filename)
|
||||||
|
|
|
@ -36,9 +36,10 @@ import numpy as np
|
||||||
# 1) pnca_ambiguous_muts.csv
|
# 1) pnca_ambiguous_muts.csv
|
||||||
# 2) pnca_mcsm_snps.csv
|
# 2) pnca_mcsm_snps.csv
|
||||||
# 3) pnca_metadata.csv
|
# 3) pnca_metadata.csv
|
||||||
# 4) pnca_comp_snps.csv
|
# 4) pnca_comp_snps.csv <---deleted>
|
||||||
# 5) pnca_all_muts_msa.csv
|
|
||||||
# 6) pnca_mutational_positons.csv
|
# 4) pnca_all_muts_msa.csv
|
||||||
|
# 5) pnca_mutational_positons.csv
|
||||||
#========================================================
|
#========================================================
|
||||||
#%% specify homedir as python doesn't recognise tilde
|
#%% specify homedir as python doesn't recognise tilde
|
||||||
homedir = os.path.expanduser('~')
|
homedir = os.path.expanduser('~')
|
||||||
|
@ -52,23 +53,25 @@ os.getcwd()
|
||||||
from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
|
from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
|
||||||
#========================================================
|
#========================================================
|
||||||
|
|
||||||
#drug = 'pyrazinamide'
|
#%% variable assignment: input and output paths & filenames
|
||||||
|
|
||||||
|
drug = 'pyrazinamide'
|
||||||
gene = 'pncA'
|
gene = 'pncA'
|
||||||
gene_match = gene + '_p.'
|
gene_match = gene + '_p.'
|
||||||
|
|
||||||
#%% specify variables for input and output paths and filenames
|
|
||||||
|
|
||||||
#=======
|
#=======
|
||||||
# input dir
|
# input dir
|
||||||
#=======
|
#=======
|
||||||
indir = 'git/Data/pyrazinamide/input/original'
|
#indir = 'git/Data/pyrazinamide/input/original'
|
||||||
|
indir = 'git/Data' + '/' + drug + '/' + 'input/original'
|
||||||
#=========
|
#=========
|
||||||
# output dir
|
# output dir
|
||||||
#=========
|
#=========
|
||||||
# several output files
|
# several output files
|
||||||
# output filenames in respective sections at the time of outputting files
|
# output filenames in respective sections at the time of outputting files
|
||||||
outdir = 'git/Data/pyrazinamide/output'
|
#outdir = 'git/Data/pyrazinamide/output'
|
||||||
|
outdir = 'git/Data' + '/' + drug + '/' + 'output'
|
||||||
|
|
||||||
#%%end of variable assignment for input and output files
|
#%%end of variable assignment for input and output files
|
||||||
#==============================================================================
|
#==============================================================================
|
||||||
#%% Read files
|
#%% Read files
|
||||||
|
@ -77,7 +80,7 @@ in_filename = 'original_tanushree_data_v2.csv'
|
||||||
infile = homedir + '/' + indir + '/' + in_filename
|
infile = homedir + '/' + indir + '/' + in_filename
|
||||||
print('Reading input master file:', infile)
|
print('Reading input master file:', infile)
|
||||||
|
|
||||||
master_data = pd.read_csv(infile, sep = ',')
|
master_data = pd.read_csv(infile, sep = ',')
|
||||||
|
|
||||||
# column names
|
# column names
|
||||||
#list(master_data.columns)
|
#list(master_data.columns)
|
||||||
|
@ -334,6 +337,8 @@ print('Writing file: common ids:\n',
|
||||||
|
|
||||||
common_ids.to_csv(outfile0)
|
common_ids.to_csv(outfile0)
|
||||||
print('======================================================================')
|
print('======================================================================')
|
||||||
|
del(out_filename0)
|
||||||
|
|
||||||
|
|
||||||
# clear variables
|
# clear variables
|
||||||
del(dr_id, other_id, meta_data_dr, meta_data_other, common_ids, common_mut_ids, common_ids2)
|
del(dr_id, other_id, meta_data_dr, meta_data_other, common_ids, common_mut_ids, common_ids2)
|
||||||
|
@ -701,21 +706,6 @@ del(c1, c2, col_to_split1, col_to_split2, comp_pnca_samples, dr_WF0, dr_df, dr_m
|
||||||
|
|
||||||
#%% end of data extraction and some files writing. Below are some more files writing.
|
#%% end of data extraction and some files writing. Below are some more files writing.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#%%: write file: ambiguous muts
|
#%%: write file: ambiguous muts
|
||||||
# uncomment as necessary
|
# uncomment as necessary
|
||||||
#print(outdir)
|
#print(outdir)
|
||||||
|
@ -735,6 +725,8 @@ inspect.to_csv(outfile1)
|
||||||
print('Finished writing:', out_filename1, '\nExpected no. of rows (no. of samples with the ambiguous muts present):', dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum())
|
print('Finished writing:', out_filename1, '\nExpected no. of rows (no. of samples with the ambiguous muts present):', dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum())
|
||||||
print('======================================================================')
|
print('======================================================================')
|
||||||
del(out_filename1)
|
del(out_filename1)
|
||||||
|
|
||||||
|
|
||||||
#%%
|
#%%
|
||||||
#===========
|
#===========
|
||||||
# Split 'mutation' column into three: wild_type, position and
|
# Split 'mutation' column into three: wild_type, position and
|
||||||
|
@ -891,6 +883,8 @@ print('Finished writing:', out_filename2,
|
||||||
'\nNo. of rows:', len(snps_only) )
|
'\nNo. of rows:', len(snps_only) )
|
||||||
print('======================================================================')
|
print('======================================================================')
|
||||||
del(out_filename2)
|
del(out_filename2)
|
||||||
|
|
||||||
|
|
||||||
#%% Write file: pnca_metadata (i.e pnca_LF1)
|
#%% Write file: pnca_metadata (i.e pnca_LF1)
|
||||||
# where each row has UNIQUE mutations NOT unique sample ids
|
# where each row has UNIQUE mutations NOT unique sample ids
|
||||||
out_filename3 = gene.lower() + '_' + 'metadata.csv'
|
out_filename3 = gene.lower() + '_' + 'metadata.csv'
|
||||||
|
@ -903,45 +897,10 @@ pnca_LF1.to_csv(outfile3, header = True, index = False)
|
||||||
print('Finished writing:', out_filename3,
|
print('Finished writing:', out_filename3,
|
||||||
'\nNo. of rows:', len(pnca_LF1),
|
'\nNo. of rows:', len(pnca_LF1),
|
||||||
'\nNo. of cols:', len(pnca_LF1.columns) )
|
'\nNo. of cols:', len(pnca_LF1.columns) )
|
||||||
|
|
||||||
print('======================================================================')
|
print('======================================================================')
|
||||||
|
del(out_filename3)
|
||||||
|
|
||||||
#%% Write file: comp SNPs (i.e snps without any corresponding 'NA' in the <drug>
|
|
||||||
# column to allow OR calcs)
|
|
||||||
|
|
||||||
# remove NA from pyrazinamide cols
|
|
||||||
pnca_LF2 = pnca_LF1.dropna(subset=['pyrazinamide'])
|
|
||||||
|
|
||||||
print('extracting OR muts by removing NAs from pyrazinamide cols')
|
|
||||||
if pnca_LF2.pyrazinamide.isna().sum() > 0:
|
|
||||||
print('FAIL: NAs NOT removed successfully')
|
|
||||||
else:
|
|
||||||
print('PASS: NAs removed successfully')
|
|
||||||
|
|
||||||
# extracting comp snps only
|
|
||||||
comp_snps_only = pd.DataFrame(pnca_LF2['mutation'].unique())
|
|
||||||
#print('Total no. of comp snps:', len(comp_snps_only))
|
|
||||||
comp_snps_only.head()
|
|
||||||
|
|
||||||
# assign column name
|
|
||||||
comp_snps_only.columns = ['mutation']
|
|
||||||
|
|
||||||
# count how many positions this corresponds to
|
|
||||||
comp_pos_only = pd.DataFrame(pnca_LF2['position'].unique())
|
|
||||||
#print('Total no. of pos corresponding to comp_snps:', len(comp_pos_only))
|
|
||||||
|
|
||||||
out_filename4 = gene.lower() + '_' + 'comp_snps.csv'
|
|
||||||
outfile4 = homedir + '/' + outdir + '/' + out_filename4
|
|
||||||
print('Writing file: comp snps to allow OR calcs',
|
|
||||||
'\nFilename:', out_filename4,
|
|
||||||
'\nPath:', homedir + '/' + outdir,
|
|
||||||
'\nNo. of comp muts:', len(comp_snps_only),
|
|
||||||
'\nNo. of distinct positions for comp muts:', len(comp_pos_only) )
|
|
||||||
|
|
||||||
comp_snps_only.to_csv(outfile4, header = True, index = False)
|
|
||||||
|
|
||||||
print('Finished writing:', out_filename4,
|
|
||||||
'\nNo. of rows:', len(comp_snps_only) )
|
|
||||||
#%% write file: mCSM style but with repitions for MSA and logo plots
|
#%% write file: mCSM style but with repitions for MSA and logo plots
|
||||||
all_muts_msa = pd.DataFrame(pnca_LF1['Mutationinformation'])
|
all_muts_msa = pd.DataFrame(pnca_LF1['Mutationinformation'])
|
||||||
all_muts_msa.head()
|
all_muts_msa.head()
|
||||||
|
@ -970,21 +929,22 @@ else:
|
||||||
'\nDebug please!')
|
'\nDebug please!')
|
||||||
print('======================================================================')
|
print('======================================================================')
|
||||||
|
|
||||||
out_filename5 = gene.lower() + '_' + 'all_muts_msa.csv'
|
out_filename4 = gene.lower() + '_' + 'all_muts_msa.csv'
|
||||||
outfile5 = homedir + '/' + outdir + '/' + out_filename5
|
outfile4 = homedir + '/' + outdir + '/' + out_filename4
|
||||||
|
|
||||||
print('Writing file: mCSM style muts for msa',
|
print('Writing file: mCSM style muts for msa',
|
||||||
'\nmutation format (SNP): {Wt}<POS>{Mut}',
|
'\nmutation format (SNP): {Wt}<POS>{Mut}',
|
||||||
'\nNo.of lines of msa:', len(all_muts_msa),
|
'\nNo.of lines of msa:', len(all_muts_msa),
|
||||||
'\nFilename:', out_filename5,
|
'\nFilename:', out_filename4,
|
||||||
'\nPath:', homedir +'/'+ outdir)
|
'\nPath:', homedir +'/'+ outdir)
|
||||||
|
|
||||||
all_muts_msa_sorted.to_csv(outfile5, header = False, index = False)
|
all_muts_msa_sorted.to_csv(outfile4, header = False, index = False)
|
||||||
|
|
||||||
print('Finished writing:', out_filename5,
|
print('Finished writing:', out_filename4,
|
||||||
'\nNo. of rows:', len(all_muts_msa) )
|
'\nNo. of rows:', len(all_muts_msa) )
|
||||||
print('======================================================================')
|
print('======================================================================')
|
||||||
del(out_filename5)
|
del(out_filename4)
|
||||||
|
|
||||||
|
|
||||||
#%% write file for mutational positions
|
#%% write file for mutational positions
|
||||||
# count how many positions this corresponds to
|
# count how many positions this corresponds to
|
||||||
|
@ -999,20 +959,22 @@ pos_only.position.dtype
|
||||||
# sort by position value
|
# sort by position value
|
||||||
pos_only_sorted = pos_only.sort_values(by = 'position', ascending = True)
|
pos_only_sorted = pos_only.sort_values(by = 'position', ascending = True)
|
||||||
|
|
||||||
out_filename6 = gene.lower() + '_' + 'mutational_positons.csv'
|
out_filename5 = gene.lower() + '_' + 'mutational_positons.csv'
|
||||||
outfile6 = homedir + '/' + outdir + '/' + out_filename6
|
outfile5 = homedir + '/' + outdir + '/' + out_filename5
|
||||||
|
|
||||||
print('Writing file: mutational positions',
|
print('Writing file: mutational positions',
|
||||||
'\nNo. of distinct positions:', len(pos_only_sorted),
|
'\nNo. of distinct positions:', len(pos_only_sorted),
|
||||||
'\nFilename:', out_filename6,
|
'\nFilename:', out_filename5,
|
||||||
'\nPath:', homedir +'/'+ outdir)
|
'\nPath:', homedir +'/'+ outdir)
|
||||||
|
|
||||||
pos_only_sorted.to_csv(outfile6, header = True, index = False)
|
pos_only_sorted.to_csv(outfile5, header = True, index = False)
|
||||||
|
|
||||||
print('Finished writing:', out_filename6,
|
print('Finished writing:', out_filename5,
|
||||||
'\nNo. of rows:', len(pos_only_sorted) )
|
'\nNo. of rows:', len(pos_only_sorted) )
|
||||||
print('======================================================================')
|
print('======================================================================')
|
||||||
del(out_filename6)
|
del(out_filename5)
|
||||||
|
|
||||||
|
|
||||||
#%% end of script
|
#%% end of script
|
||||||
print('======================================================================')
|
print('======================================================================')
|
||||||
print(u'\u2698' * 50,
|
print(u'\u2698' * 50,
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue