import commit

2020-01-08 16:15:33 +00:00 · 2020-01-08 16:15:33 +00:00 · bccfe68192
commit bccfe68192
39 changed files with 6837 additions and 0 deletions
--- a/meta_data_analysis/.Rhistory
+++ b/meta_data_analysis/.Rhistory
@ -0,0 +1,512 @@
+, stringsAsFactors = F)
+x = as.numeric(grepl(i,raw_data$all_muts_pza))
+# DV: pyrazinamide 0 or 1
+y = as.numeric(raw_data$pyrazinamide)
+table(y,x)
+# run glm model
+model = glm(y ~ x, family = binomial)
+#model = glm(y ~ x, family = binomial(link = "logit"))
+summary(model)
+#**********
+# extract relevant model output
+#**********
+# extract log OR i.e the Beta estimate of the logistic model for a given snp
+my_logor = summary(model)$coefficients[2,1]
+print(paste0('Beta:', my_logor))
+# extract SE of the logistic model for a given snp
+my_se = summary(model)$coefficients[2,2]
+print(paste0('SE:', my_se))
+# extract Z of the logistic model for a given snp
+my_zval = summary(model)$coefficients[2,3]
+print(paste0('Z-value:', my_zval))
+# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
+#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
+my_or = exp(summary(model)$coefficients[2,1])
+print(paste0('OR:', my_or))
+# sanity check : should be True
+log(my_or) == my_logor
+# extract P-value of the logistic model for a given snp
+my_pval = summary(model)$coefficients[2,4]
+print(paste0('P-value:', my_pval))
+# extract confint interval of snp (2 steps, since the output is a named number)
+ci_mod = exp(confint(model))[2,]
+my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
+print(paste0('CI:', my_ci))
+#*************
+# Assign the regression output in the original df
+# you can use ('=' or '<-/->')
+#*************
+#pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i] = my_logor
+my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i]
+my_logor
+pnca_snps_or$Mutationinformation == i
+View(pnca_snps_or)
+#===============
+# Step 4: Calculate for one snp
+# using i, when you run the loop, it is easy
+#===============
+i = "pnca_p.trp68gly"
+pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
+, stringsAsFactors = F
+, header = T) #2133
+# uncomment as necessary
+pnca_snps_or = pnca_snps_or[1:5,]
+pnca_snps_or = pnca_snps_or[c(1:5),]
+#===============
+pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
+, stringsAsFactors = F
+, header = T) #2133
+pnca_snps_or = pnca_snps_or[1:5,]
+pnca_snps_or = pnca_snps_or[c(1:5),]
+pnca_snps_or = pnca_snps_or[1:5]
+pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
+, stringsAsFactors = F
+, header = T) #2133
+pnca_snps_or = pnca_snps_or[1:5]
+pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
+, stringsAsFactors = F
+, header = T) #2133
+foo = pnca_snps_or[c(1:5,)]
+foo = pnca_snps_or[c(1:5),]
+foo = as.data.frame(pnca_snps_or[c(1:5),])
+View(foo)
+# create an empty dataframe
+pnca_snps_or = as.data.frame(pnca_snps_or[c(1:5),])
+# IV: corresponds to each unique snp (extracted using grep)
+x = as.numeric(grepl(i,raw_data$all_muts_pza))
+# DV: pyrazinamide 0 or 1
+y = as.numeric(raw_data$pyrazinamide)
+table(y,x)
+# run glm model
+model = glm(y ~ x, family = binomial)
+#model = glm(y ~ x, family = binomial(link = "logit"))
+summary(model)
+my_logor = summary(model)$coefficients[2,1]
+print(paste0('Beta:', my_logor))
+# extract SE of the logistic model for a given snp
+my_se = summary(model)$coefficients[2,2]
+print(paste0('SE:', my_se))
+# extract Z of the logistic model for a given snp
+my_zval = summary(model)$coefficients[2,3]
+print(paste0('Z-value:', my_zval))
+# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
+#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
+my_or = exp(summary(model)$coefficients[2,1])
+print(paste0('OR:', my_or))
+# sanity check : should be True
+log(my_or) == my_logor
+# extract P-value of the logistic model for a given snp
+my_pval = summary(model)$coefficients[2,4]
+print(paste0('P-value:', my_pval))
+# extract confint interval of snp (2 steps, since the output is a named number)
+ci_mod = exp(confint(model))[2,]
+my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
+print(paste0('CI:', my_ci))
+#*************
+# Assign the regression output in the original df
+# you can use ('=' or '<-/->')
+#*************
+#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
+my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
+my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
+my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
+my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
+my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
+my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
+#===============
+# Step 4: Iterate through this unique list
+# and calculate OR, but only for one snp
+# this is test before you apply it all others
+#===============
+pnca_snps_or$mutation == i
+View(pnca_snps_or)
+# create an empty dataframe
+pnca_snps_or = data.frame(mutation = i)
+my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
+my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
+my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
+my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
+my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
+my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
+View(pnca_snps_or_copy)
+#===============
+# Step 4: Iterate through this unique list
+# and calculate OR, but only for one snp
+# this is test before you apply it all others
+#===============
+#reset original df so you don't make a mistake
+pnca_snps_or = pnca_snps_or_copy
+for (i in pnca_snps_unique){
+print(i)
+}
+pnca_snps_or = pnca_snps_or_copy #2133, 1
+#........................................
+# create an empty dataframe : uncomment as necessary
+pnca_snps_or = data.frame(mutation = c(i, "blank_mut")
+#........................................
+# create an empty dataframe : uncomment as necessary
+pnca_snps_or = data.frame(mutation = c(i, "blank_mut"))
+#........................................
+# create an empty dataframe : uncomment as necessary
+pnca_snps_or = data.frame(mutation = c(i, "blank_mut"))
+View(pnca_snps_or)
+# IV: corresponds to each unique snp (extracted using grep)
+x = as.numeric(grepl(i,raw_data$all_muts_pza))
+# DV: pyrazinamide 0 or 1
+y = as.numeric(raw_data$pyrazinamide)
+table(y,x)
+# run glm model
+model = glm(y ~ x, family = binomial)
+#model = glm(y ~ x, family = binomial(link = "logit"))
+summary(model)
+#**********
+# extract relevant model output
+#**********
+# extract log OR i.e the Beta estimate of the logistic model for a given snp
+my_logor = summary(model)$coefficients[2,1]
+print(paste0('Beta:', my_logor))
+# extract SE of the logistic model for a given snp
+my_se = summary(model)$coefficients[2,2]
+print(paste0('SE:', my_se))
+# extract Z of the logistic model for a given snp
+my_zval = summary(model)$coefficients[2,3]
+print(paste0('Z-value:', my_zval))
+# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
+#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
+my_or = exp(summary(model)$coefficients[2,1])
+print(paste0('OR:', my_or))
+# sanity check : should be True
+log(my_or) == my_logor
+# extract P-value of the logistic model for a given snp
+my_pval = summary(model)$coefficients[2,4]
+print(paste0('P-value:', my_pval))
+# extract confint interval of snp (2 steps, since the output is a named number)
+ci_mod = exp(confint(model))[2,]
+my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
+print(paste0('CI:', my_ci))
+#*************
+# Assign the regression output in the original df
+# you can use ('=' or '<-/->')
+#*************
+#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
+my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
+my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
+my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
+my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
+my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
+my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
+View(pnca_snps_or)
+pnca_snps_or = pnca_snps_or_copy #2133, 1
+for (i in pnca_snps_unique){
+print(i)
+#*************
+# start logistic regression model building
+#*************
+# set the IV and DV for the logistic regression model
+# IV: corresponds to each unique snp (extracted using grep)
+x = as.numeric(grepl(i,raw_data$all_muts_pza))
+# DV: pyrazinamide 0 or 1
+y = as.numeric(raw_data$pyrazinamide)
+table(y,x)
+# run glm model
+model = glm(y ~ x, family = binomial)
+#model = glm(y ~ x, family = binomial(link = "logit"))
+summary(model)
+#**********
+# extract relevant model output
+#**********
+# extract log OR i.e the Beta estimate of the logistic model for a given snp
+my_logor = summary(model)$coefficients[2,1]
+print(paste0('Beta:', my_logor))
+# extract SE of the logistic model for a given snp
+my_se = summary(model)$coefficients[2,2]
+print(paste0('SE:', my_se))
+# extract Z of the logistic model for a given snp
+my_zval = summary(model)$coefficients[2,3]
+print(paste0('Z-value:', my_zval))
+# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
+#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
+my_or = exp(summary(model)$coefficients[2,1])
+print(paste0('OR:', my_or))
+# sanity check : should be True
+log(my_or) == my_logor
+# extract P-value of the logistic model for a given snp
+my_pval = summary(model)$coefficients[2,4]
+print(paste0('P-value:', my_pval))
+# extract confint interval of snp (2 steps, since the output is a named number)
+ci_mod = exp(confint(model))[2,]
+my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
+print(paste0('CI:', my_ci))
+#*************
+# Assign the regression output in the original df
+# you can use ('=' or '<-/->')
+#*************
+#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
+my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
+my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
+my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
+my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
+my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
+my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
+}
+warnings()
+View(pnca_snps_or)
+View(pnca_snps_or_copy)
+#sanity check
+pnca_snps_or$mutation == i1
+#sanity check
+pnca_snps_or[pnca_snps_or$mutation == i1]
+pnca_snps_or[pnca_snps_or$mutation == i2]
+pnca_snps_or[pnca_snps_or$mutation == i2,]
+pnca_snps_or1 = unique(pnca_snps_or)
+write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv")
+# you only need it for the unique mutations
+pnca_snps_or = unique(pnca_snps_or) #2133, 1
+for (i in pnca_snps_unique){
+print(i)
+#*************
+# start logistic regression model building
+#*************
+# set the IV and DV for the logistic regression model
+# IV: corresponds to each unique snp (extracted using grep)
+x = as.numeric(grepl(i,raw_data$all_muts_pza))
+# DV: pyrazinamide 0 or 1
+y = as.numeric(raw_data$pyrazinamide)
+table(y,x)
+# run glm model
+model = glm(y ~ x, family = binomial)
+#model = glm(y ~ x, family = binomial(link = "logit"))
+summary(model)
+#**********
+# extract relevant model output
+#**********
+# extract log OR i.e the Beta estimate of the logistic model for a given snp
+my_logor = summary(model)$coefficients[2,1]
+print(paste0('Beta:', my_logor))
+# extract SE of the logistic model for a given snp
+my_se = summary(model)$coefficients[2,2]
+print(paste0('SE:', my_se))
+# extract Z of the logistic model for a given snp
+my_zval = summary(model)$coefficients[2,3]
+print(paste0('Z-value:', my_zval))
+# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
+#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
+my_or = exp(summary(model)$coefficients[2,1])
+print(paste0('OR:', my_or))
+# sanity check : should be True
+log(my_or) == my_logor
+# extract P-value of the logistic model for a given snp
+my_pval = summary(model)$coefficients[2,4]
+print(paste0('P-value:', my_pval))
+# extract confint interval of snp (2 steps, since the output is a named number)
+ci_mod = exp(confint(model))[2,]
+my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
+print(paste0('CI:', my_ci))
+#*************
+# Assign the regression output in the original df
+# you can use ('=' or '<-/->')
+#*************
+#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
+my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
+my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
+my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
+my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
+my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
+my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
+}
+View(pnca_snps_or)
+2.290256e+01
+1.561132e+06
+3.242285e-04
+#sanity check
+pnca_snps_or[pnca_snps_or$mutation == i1]
+pnca_snps_or[pnca_snps_or$mutation == i2,]
+write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv")
+my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
+, stringsAsFactors = FALSE) #11374, 19
+View(my_data)
+# remove the first column
+my_data = my_data[-1] #11374, 18
+# check if first col is 'id': should be TRUE
+colnames(my_data)[1] == 'id'
+# sanity check
+snps_all = unique(my_data$mutation)# 337
+pnca_snps_or = snps_all
+pnca_snps_or = as.data.frame(snps_all)
+View(pnca_snps_or)
+snps_all[-"true_wt"]
+pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
+View(pnca_snps_or)
+snps_all = as.data.frame(snps_all)
+View(snps_all)
+#remove true_wt entry
+w1 = which(rownames(snps_all) == "true_wt")
+View(snps_all)
+#remove true_wt entry
+w1 = which(snps_all$snps_all == "true_wt")
+rm(pnca_snps_or)
+pnca_snps_or = snps_all[-w1]
+pnca_snps_or = snps_all[,-w1]
+pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
+#remove true_wt entry
+w1 = which(snps_all) == "true_wt"
+pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
+my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
+, stringsAsFactors = FALSE) #11374, 19
+# remove the first column
+my_data = my_data[-1] #11374, 18
+# check if first col is 'id': should be TRUE
+colnames(my_data)[1] == 'id'
+# sanity check
+snps_all = unique(my_data$mutation)# 337
+snps_all = as.data.frame(snps_all)
+snps_all[-c(1,1)]
+pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
+pnca_snps_or = as.data.frame(snps_all[, -c(1,1)])
+#remove true_wt entry
+#w1 = which(snps_all) == "true_wt"
+pnca_snps_or = snps_all
+pnca_snps_or = pnca_snps_or_copy
+#remove true_wt entry
+#w1 = which(snps_all) == "true_wt"
+pnca_snps_or = snps_all
+pnca_snps_or -> pnca_snps_or_copy
+#===============
+# Step 4: Iterate through this unique list
+# and calculate OR for each snp
+# and assign to the pnca_snps_or df that has
+# each row as a unique snp
+#===============
+# reset original df so you don't make a mistake: IMPORTANT
+pnca_snps_or = pnca_snps_or_copy #2133, 1
+# you only need it for the unique mutations
+pnca_snps_or = unique(pnca_snps_or) #337, 1
+for (i in pnca_snps_unique){
+print(i)
+#*************
+# start logistic regression model building
+#*************
+# set the IV and DV for the logistic regression model
+# IV: corresponds to each unique snp (extracted using grep)
+x = as.numeric(grepl(i,raw_data$all_muts_pza))
+# DV: pyrazinamide 0 or 1
+y = as.numeric(raw_data$pyrazinamide)
+table(y,x)
+# run glm model
+model = glm(y ~ x, family = binomial)
+#model = glm(y ~ x, family = binomial(link = "logit"))
+summary(model)
+#**********
+# extract relevant model output
+#**********
+# extract log OR i.e the Beta estimate of the logistic model for a given snp
+my_logor = summary(model)$coefficients[2,1]
+print(paste0('Beta:', my_logor))
+# extract SE of the logistic model for a given snp
+my_se = summary(model)$coefficients[2,2]
+print(paste0('SE:', my_se))
+# extract Z of the logistic model for a given snp
+my_zval = summary(model)$coefficients[2,3]
+print(paste0('Z-value:', my_zval))
+# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
+#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
+my_or = exp(summary(model)$coefficients[2,1])
+print(paste0('OR:', my_or))
+# sanity check : should be True
+log(my_or) == my_logor
+# extract P-value of the logistic model for a given snp
+my_pval = summary(model)$coefficients[2,4]
+print(paste0('P-value:', my_pval))
+# extract confint interval of snp (2 steps, since the output is a named number)
+ci_mod = exp(confint(model))[2,]
+my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
+print(paste0('CI:', my_ci))
+#*************
+# Assign the regression output in the original df
+# you can use ('=' or '<-/->')
+#*************
+#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
+my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
+my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
+my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
+my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
+my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
+my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
+}
+getwd()
+#setwd("~/Documents/git/LSHTM_Y1_PNCA/meta_data_analysis") # work
+setwd("~/git/LSHTM_Y1_PNCA/meta_data_analysis") # thinkpad
+#setwd("/Users/tanu/git/LSHTM_Y1_PNCA/meta_data_analysis") # mac
+getwd()
+#===============
+# Step 1: read raw data
+#===============
+raw_data<-read.csv("../Data_original/original_tanushree_data_v2.csv"
+,stringsAsFactors = F)[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]#19265, 4
+raw_data<-raw_data[!is.na(raw_data$pyrazinamide),]#12511, 4
+# combine the two mutation columns
+raw_data$all_mutations_pyrazinamide<-paste(raw_data$dr_mutations_pyrazinamide, raw_data$other_mutations_pyrazinamide)#12511, 5
+head(raw_data$all_mutations_pyrazinamide)
+# create yet another column that contains all the mutations but in lower case
+raw_data$all_muts_pza = tolower(raw_data$all_mutations_pyrazinamide) #12511, 6
+table(grepl("pnca_p",raw_data$all_muts_pza))
+#FALSE  TRUE
+#10603  1908
+pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
+, stringsAsFactors = F
+, header = T) #2133
+# subset a snall section to test
+#pnca_snps_or_copy = pnca_snps_or
+#pnca_snps_or = pnca_snps_or_copy
+pnca_snps_unique = unique(pnca_snps_or$mutation) #293
+i2 = "pnca_p.trp68gly" # Should exist
+grep(i2, pnca_snps_unique)
+my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
+, stringsAsFactors = FALSE) #11374, 19
+# remove the first column
+my_data = my_data[-1] #11374, 18
+# check if first col is 'id': should be TRUE
+colnames(my_data)[1] == 'id'
+# sanity check
+head(my_data$mutation)
+my_data = unique(my_data$mutation)
+my_data[!duplicated(my_data$mutation)]
+my_data_unique = my_data[!duplicated(my_data$mutation),]
+my_data[!duplicated('mutation'),]
+my_data_unique = my_data[!duplicated(my_data[,'mutation']),]
+my_data_unique = my_data[!duplicated(my_data['mutation']),]
+getwd()
+setwd("/git/LSHTM_analysis/meta_data_analysis")
+getwd()
+getwd()
+setwd("/git/github/LSHTM_analysis/meta_data_analysis")
+getwd()
+#===============
+# Step 1: read GWAS raw data stored in Data_original/
+#===============
+infile = read.csv("../Data_original", file.choose(), stringsAsFactors = F))
+c = file.choose()
+c = file.choose(../Data_original)
+c = read.csv(file.choose(), stringsAsFactors = F)
+#===============
+# Step 1: read GWAS raw data stored in Data_original/
+#===============
+infile = read.csv(file.choose(), stringsAsFactors = F))
+c = read.csv(file.choose(), stringsAsFactors = F)
+#===============
+# Step 1: read GWAS raw data stored in Data_original/
+#===============
+infile = read.csv(file.choose(), stringsAsFactors = F)
+#===============
+# Step 1: read GWAS raw data stored in Data_original/
+#===============
+infile = read.csv(file.choose(), stringsAsFactors = F)
+raw_data = infile[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]
+outdir = paste0("../mcsm_analysis",drug,"/Data/")
+# define output variables
+drug  = 'pyrazinamide'
+outdir = paste0("../mcsm_analysis",drug,"/Data/")
+outdir = paste0("../mcsm_analysis/",drug,"/Data/")
+outFile = "meta_data_with_AFandOR.csv"
+output_filename = paste0(outdir, outFile)
+output_filename
--- a/meta_data_analysis/pycache/reference_dict.cpython-37.pyc
+++ b/meta_data_analysis/pycache/reference_dict.cpython-37.pyc
--- a/meta_data_analysis/init_data_dirs.py
+++ b/meta_data_analysis/init_data_dirs.py
@ -0,0 +1,7 @@
+#!/usr/bin/python3
+# Initialise a blank 'Data' directory and drug subdirs etc.
+# TODO:
+# - Read base dir from config file
+# - Create eg: '~/git/Data/{original,processed}'
+# - Create eg: '~/git/Data/processed/' + drug (for each drug)
+# - Create eg: '~/git/Data/output/' + drug + '{plots, structure}'
--- a/meta_data_analysis/pnca_AF_and_OR_calcs.R
+++ b/meta_data_analysis/pnca_AF_and_OR_calcs.R
@ -0,0 +1,241 @@
+# Calculates, for each pncA snp, the odds ratio (OR), Fisher's exact test
+# p-value and allele frequency (AF) from the GWAS raw data.
+# NOTE: all input files are selected interactively via file.choose().
+getwd()
+# NOTE(review): hard-coded, machine-specific working directory
+setwd("/git/github/git/LSHTM_analysis/meta_data_analysis")
+getwd()

+#===============
+# Step 1: read GWAS raw data stored in Data_original/
+#===============
+infile = read.csv(file.choose(), stringsAsFactors = F)

+# keep only the sample id, the drug susceptibility result and the two mutation columns
+raw_data = infile[,c("id"
+                     , "pyrazinamide"
+                     , "dr_mutations_pyrazinamide"
+                     , "other_mutations_pyrazinamide")]

+#####
+# 1a: exclude na
+# (rows without a pyrazinamide result are dropped)
+#####
+raw_data = raw_data[!is.na(raw_data$pyrazinamide),]

+# number of distinct sample ids left after the NA filter
+total_samples = length(unique(raw_data$id))
+print(total_samples)

+# sanity check: should be TRUE
+is.numeric(total_samples) 

+#####
+# 1b: combine the two mutation columns
+# (paste() joins them with a single space)
+#####
+raw_data$all_mutations_pyrazinamide = paste(raw_data$dr_mutations_pyrazinamide
+                                            , raw_data$other_mutations_pyrazinamide)
+head(raw_data$all_mutations_pyrazinamide)

+#####
+# 1c: create yet another column that contains all the mutations but in lower case
+#####
+raw_data$all_muts_pnca = tolower(raw_data$all_mutations_pyrazinamide) 

+# sanity checks
+# count rows with (TRUE) and without (FALSE) any 'pnca_p' mutation
+table(grepl("pnca_p",raw_data$all_muts_pnca))

+# sanity check: should be TRUE
+# NOTE(review): table() counts rows while total_samples counts unique ids,
+# so this only holds when every id occurs once in raw_data
+sum(table(grepl("pnca_p",raw_data$all_muts_pnca))) == total_samples

+# set up variables: can be used for logistic regression as well
+i  = "pnca_p.ala134gly" # has a NA, should NOT exist
+table(grepl(i,raw_data$all_muts_pnca))

+i = "pnca_p.trp68gly"
+table(grepl(i,raw_data$all_muts_pnca))

+# 2x2 table for the single test snp: mutation present (mut) vs pyrazinamide result (dst)
+mut = grepl(i,raw_data$all_muts_pnca)
+dst = raw_data$pyrazinamide
+table(mut, dst)

+#chisq.test(table(mut,dst))
+#fisher.test(table(mut, dst))
+#table(mut)

+###### read list of muts to calculate OR for (fname3 from pnca_data_extraction.py)
+pnca_snps_or = read.csv(file.choose()
+                        , stringsAsFactors = F
+                        , header = T)

+# extract unique snps to iterate over for AF and OR calcs
+# total no of unique snps
+# AF and OR calculations

+pnca_snps_unique = unique(pnca_snps_or$mutation) 

+# Define OR function
+# x and y are set at top level for the single test snp above;
+# the or() function that follows takes its own x and y arguments
+x = as.numeric(mut)
+y = dst
+# or(x, y): odds ratio of outcome y given exposure x.
+#   x: logical or 0/1 vector (mutation present?)
+#   y: 0/1 vector (drug susceptibility result), same length as x
+# Returns (a/b)/(c/d), where a = x&y, b = x&!y, c = !x&y, d = !x&!y counts.
+# Explicit factor levels keep the table 2x2 even when a level of x or y is
+# never observed (otherwise tab[2,2] fails with 'subscript out of bounds').
+# Any empty cell is replaced by 0.5 so that the ratio stays finite.
+or = function(x,y){
+  tab = table(factor(as.numeric(x), levels = c(0, 1))
+              , factor(as.numeric(y), levels = c(0, 1)))
+  cells = c(a = tab[2,2], b = tab[2,1], c = tab[1,2], d = tab[1,1])
+  cells[cells == 0] = 0.5
+  (cells[["a"]]/cells[["b"]])/(cells[["c"]]/cells[["d"]])
+}
+
+# For every unique snp compute, over the rows of raw_data:
+#   ors   - odds ratio from or() for snp presence vs pyrazinamide result
+#   pvals - Fisher's exact test p-value for snp presence vs pyrazinamide result
+#   afs   - fraction of rows whose mutation string contains the snp
+# Each result is a vector named by snp (sapply over a character vector).
+dst = raw_data$pyrazinamide

+# TRUE for each row whose lower-case mutation string contains snp m.
+# fixed = TRUE: snp names such as 'pnca_p.trp68gly' are literal strings, so
+# the '.' must not be interpreted as a regex wildcard.
+has_mut = function(m){
+  grepl(m, raw_data$all_muts_pnca, fixed = TRUE)
+}

+ors = sapply(pnca_snps_unique, function(m) or(has_mut(m), dst))

+ors

+pvals = sapply(pnca_snps_unique, function(m) fisher.test(has_mut(m), dst)$p.value)

+pvals

+afs = sapply(pnca_snps_unique, function(m) mean(has_mut(m)))

+afs

+# spot-check the AF of a few snps
+afs['pnca_p.trp68gly']
+afs['pnca_p.gln10pro']
+afs['pnca_p.leu4ser']
+
+#plot(density(log(ors)))
+#plot(-log10(pvals))
+#hist(log(ors)
+#     ,breaks = 100
+#     )
+
+# Build one table per unique snp holding OR, p-value, AF and the snp's
+# descriptive columns.
+# subset df cols to add to the calc param df
+pnca_snps_cols = pnca_snps_or[c('mutation_info', 'mutation', 'Mutationinformation')] 
+# keep the first row for each mutation so that there is one row per snp
+pnca_snps_cols = pnca_snps_cols[!duplicated(pnca_snps_cols$mutation),]

+# use the mutation name as row name: it is the key for the merge below
+rownames(pnca_snps_cols) = pnca_snps_cols$mutation
+head(rownames(pnca_snps_cols))
+#snps_with_AF_and_OR

+# combine
+# NOTE(review): row names are presumably taken from the names of ors
+# (i.e. the snp names) - confirm with the head() output
+comb_AF_and_OR = data.frame(ors, pvals, afs)
+head(rownames(comb_AF_and_OR))

+# sanity checks: should be the same
+dim(comb_AF_and_OR); dim(pnca_snps_cols)
+# every row name of one df should be found in the other (all TRUE, both ways)
+table(rownames(comb_AF_and_OR)%in%rownames(pnca_snps_cols))

+table(rownames(pnca_snps_cols)%in%rownames(comb_AF_and_OR))

+# merge the above two df whose dim you checked
+# (joined on row names; all.x is commented out, so only snps present in both are kept)
+snps_with_AF_and_OR = merge(comb_AF_and_OR, pnca_snps_cols
+                            , by = "row.names"
+#                            , all.x = T
+                            )

+#rm(pnca_snps_cols, pnca_snps_or, raw_data)
+
+#===============
+# Step 3: Read data file where you will add the calculated OR
+# Note: this is the big file with one-many relationship between snps and lineages
+# i.e fname4 from 'pnca_extraction.py'
+#===============
+# row.names = 1: the first column of the csv is used as row names, not as data
+my_data = read.csv(file.choose()
+                   , row.names = 1
+                   , stringsAsFactors = FALSE)

+head(my_data)
+length(unique(my_data$id))

+# check if first col is 'id': should be TRUE
+colnames(my_data)[1] == 'id'

+# sanity check
+head(my_data$mutation)

+# FILES TO MERGE:
+# snps_with_AF_and_OR: per-snp table containing OR, p-value and AF
+# my_data = big meta data file
+# linking column: mutation

+head(my_data)
+# left join: every row of my_data is kept; rows whose mutation has no entry
+# in snps_with_AF_and_OR get NA in the columns coming from it
+merged_df = merge(my_data # big file
+                  , snps_with_AF_and_OR # small (afor file)
+                  , by = "mutation"
+                  , all.x = T) # because you want all the entries of the meta data 

+# sanity checks: should be True
+# FIXME: I have checked this manually, but make it so it is a pass or a fail!
+# (the OR of one snp before the merge and after it, to be compared by eye)
+comb_AF_and_OR[rownames(comb_AF_and_OR) == "pnca_p.gln10pro",]$ors  
+merged_df[merged_df$Mutationinformation.x == "Q10P",]$ors

+merged_df[merged_df$Mutationinformation.x == "Q10P",]

+# sanity check: very important!
+# columns present in both inputs come out of merge() with .x (my_data) and
+# .y (snps_with_AF_and_OR) suffixes
+colnames(merged_df)

+table(merged_df$mutation_info.x == merged_df$mutation_info.y)

+#FIXME: what happened to other 7 and FALSE
+# NOTE(review): likely explanation - unmatched rows (all.x = T) have NA in the
+# .y columns, '==' then gives NA, and table() omits NA by default; confirm
+table(merged_df$Mutationinformation.x == merged_df$Mutationinformation.y)

+# problem
+# NOTE(review): identical() is FALSE as soon as one .y value is NA - confirm
+identical(merged_df$Mutationinformation.x, merged_df$Mutationinformation.y)

+#merged_df[merged_df$Mutationinformation.x != merged_df$Mutationinformation.y,]

+#throw away the y because that is a smaller df
+d1 = which(colnames(merged_df) == "mutation_info.y") #21
+d2 = which(colnames(merged_df) == "Mutationinformation.y") #22

+merged_df2 = merged_df[-c(d1, d2)] #3093 20
+colnames(merged_df2)

+# rename cols: drop the .x suffix now that the .y duplicates are gone
+colnames(merged_df2)[colnames(merged_df2)== "mutation_info.x"] <- "mutation_info"
+colnames(merged_df2)[colnames(merged_df2)== "Mutationinformation.x"] <- "Mutationinformation"

+colnames(merged_df2)

+# should be 0
+sum(is.na(merged_df2$Mutationinformation))

+# count na in each column
+na_count = sapply(merged_df2, function(y) sum(length(which(is.na(y))))); na_count
+# only some OR and AF should be NA
+#Row.names           ors               pvals               afs
+#81                  81                81                  81


+# give the calculated columns their final names
+colnames(merged_df2)[colnames(merged_df2)== "ors"] <- "OR"
+colnames(merged_df2)[colnames(merged_df2)== "afs"] <- "AF"
+colnames(merged_df2)[colnames(merged_df2)== "pvals"] <- "pvalue"

+colnames(merged_df2)

+# add log OR and neglog pvalue
+# (natural log of OR; -log10 of the p-value)
+merged_df2$logor = log(merged_df2$OR)
+is.numeric(merged_df2$logor)

+merged_df2$neglog10pvalue = -log10(merged_df2$pvalue) 
+is.numeric(merged_df2$neglog10pvalue)

+# write file out
+#write.csv(merged_df, "../Data/meta_data_with_AFandOR_JP_TT.csv")
+# define output variables
+# The output path is <out_dir><outFile>, relative to the working directory.
+drug  = 'pyrazinamide'
+out_dir = paste0("../mcsm_analysis/",drug,"/Data/")
+outFile = "meta_data_with_AFandOR.csv"
+# use out_dir as defined above ('outdir' is never defined in this script)
+output_filename = paste0(out_dir, outFile)

+# write the merged meta data (with OR, AF, pvalue, logor, neglog10pvalue)
+write.csv(merged_df2, output_filename
+          , row.names = FALSE)
--- a/meta_data_analysis/pnca_data_extraction.py
+++ b/meta_data_analysis/pnca_data_extraction.py
@ -0,0 +1,626 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Aug  6 12:56:03 2019
+
+@author: tanu
+"""
+
+# FIXME: include error checking to ensure you only
+# concentrate on positions that have structural info?
+
+#%% load libraries
+###################
+# load libraries
+import os, sys
+import pandas as pd
+#import numpy as np
+
+#from pandas.api.types import is_string_dtype
+#from pandas.api.types import is_numeric_dtype
+
+# to create dir
+#my_dir = os.path.expanduser('~/some_dir')
+#make sure mcsm_analysis/ exists
+#or specify the output directory
+
+#%%
+#%%
+#%%
+#========================================================
+# TASK: extract ALL pncA mutations from GWAS data
+#========================================================
+#%%
+####################
+# my working dir
+os.getcwd()
+homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
+os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
+os.getcwd()
+#%%
+from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
+#%%
+#NOTE: out_dir MUST exist
+# User defined dir structure
+drug = 'pyrazinamide'
+gene = 'pnca'
+out_dir = homedir + '/git/LSHTM_analysis/mcsm_analysis/'
+# = out_dir + drug
+data_dir = homedir + '/git/Data'
+
+# Both directories must exist before anything else runs: data_dir is where
+# the raw input is read from and out_dir is the analysis directory.
+# Stop with a clear message (non-zero exit status) rather than creating an
+# empty data_dir, which would only hide the problem until the input file is read.
+if not os.path.exists(data_dir):
+    sys.exit('Error! ' + data_dir + ' does not exist. Please ensure it exists and contains the appropriate raw data')
+
+if not os.path.exists(out_dir):
+    sys.exit('Error! ' + out_dir + ' does not exist. Please create it')
+
+#if not os.path.exists(work_dir):
+#    print('creating dir that does not exist', 'dir_name:', work_dir)
+#    os.makedirs(work_dir)
+
+# only reached when both directories are present
+print('Dir exists: Carrying on')
+
+# now create sub dir structure within work_dir
+# pyrazinamide/mcsm_analysis
+
+# we need three dir
+# Data
+# Scripts
+    # Plotting
+# Results
+    # Plots
+    
+# create a list of dir names
+#dir_names = ['Data', 'Scripts', 'Results']
+
+
+#for i in dir_names:
+#    this_dir = (work_dir + '/' + i)
+#    if not os.path.exists(this_dir):
+#        print('creating dir that does not exist:', this_dir)
+#        os.makedirs(this_dir)
+#else:
+#    print('Dir exists: Carrying on')
+      
+# Create sub dirs
+# 1)        
+# Scripts
+    # Plotting
+#subdir_plotting = work_dir + '/Scripts/Plotting'
+#if not os.path.exists(subdir_plotting):
+#      print('creating dir that does not exist:', subdir_plotting)
+#      os.makedirs(subdir_plotting)
+#else:
+#    print('Dir exists: Carrying on')
+ 
+# 2)    
+# Results
+    # Plots
+#subdir_plots = work_dir + '/Results/Plots'        
+#if not os.path.exists(subdir_plots):
+#        print('creating dir that does not exist:', subdir_plots)
+#        os.makedirs(subdir_plots)    
+#else:
+#    print('Dir exists: Carrying on')
+         
+# clear varaibles
+#del(dir_names, drug, i, subdir_plots, subdir_plotting)
+
+#exit()
+#%%
+#==============================================================================
+############
+# STEP 1: Read file original_tanushree_data_v2.csv
+############
+data_file = data_dir + '/input/original/original_tanushree_data_v2.csv'
+meta_data = pd.read_csv(data_file, sep = ',') 
+
+# column names
+list(meta_data.columns)
+
+# extract the relevant columns from meta data related to pyrazinamide
+meta_data = meta_data[['id'
+       ,'country'
+       ,'lineage'
+       ,'sublineage'
+       ,'drtype'
+       , 'pyrazinamide'
+       , 'dr_mutations_pyrazinamide'
+       , 'other_mutations_pyrazinamide'
+        ]] 
+
+# checks
+total_samples = meta_data['id'].nunique() # 19265
+
+# counts NA per column
+meta_data.isna().sum()
+
+# glance
+meta_data.head()
+
+# equivalent of table in R
+# pyrazinamide counts
+meta_data.pyrazinamide.value_counts() 
+
+#%%
+############
+# STEP 2: extract entries containing selected genes: 
+# pyrazinamide: pnca_p.
+# in the dr_mutations and other mutations"
+# as we are interested in the mutations in the protein coding region only 
+# (corresponding to a structure)
+# and drop the entries with NA
+#############
+meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
+meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
+
+del(meta_pza)
+
+##########################
+# pyrazinamide: pnca_p.
+##########################
+meta_data_pnca = meta_data[['id'
+       ,'country'
+       ,'lineage'
+       ,'sublineage'
+       ,'drtype'
+       , 'pyrazinamide'
+       , 'dr_mutations_pyrazinamide'
+       , 'other_mutations_pyrazinamide'
+        ]] 
+
+del(meta_data)
+
+# sanity checks
+
+# dr_mutations only
+meta_pnca_dr = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
+meta_pnca_dr['id'].nunique() 
+del(meta_pnca_dr)
+
+# other mutations
+meta_pnca_other = meta_data_pnca.loc[meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
+meta_pnca_other['id'].nunique() 
+del(meta_pnca_other)
+
+# Now extract "all" mutations: samples with a pncA_p. entry in either column.
+# na = False makes a missing entry count as "no match", the same way the
+# dr-only and other-only filters are written.
+meta_pnca_all = meta_data_pnca.loc[
+    meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)
+    | meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)
+]
+
+meta_pnca_all['id'].nunique() 
+pnca_samples = len(meta_pnca_all)
+pnca_na = meta_pnca_all['pyrazinamide'].isna().sum() 
+comp_pnca_samples = pnca_samples - pnca_na 
+
+#=#=#=#=#=#=#
+# COMMENT: use it later to check number of complete samples from LF data
+#=#=#=#=#=#=#
+
+# sanity checks
+meta_pnca_all.dr_mutations_pyrazinamide.value_counts()
+meta_pnca_all.other_mutations_pyrazinamide.value_counts()
+
+# more sanity checks 
+# !CAUTION!: muts will change depending on your gene
+
+# dr muts : insert
+meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Gln10Pro')] # 
+meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Phe106Leu')] # empty
+meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Val139Leu')]
+
+meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists #  rows
+m = meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists #  rows
+
+# other_muts
+meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains('pncA_p.Gln10Pro*')] # empty
+meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains('pncA_p.Phe106Leu')]
+
+#=#=#=#=#=#=#=#=#=#
+# FIXME
+# COMMENTS: both mutations columns are separated by ; 
+# CHECK if there are mutations that exist both in dr and other_muts!
+# doesn't make any sense for the same mut to exist in both, I would have thought!
+#=#=#=#=#=#=#=#=#=#
+
+# remove not required variables
+del(meta_data_pnca)
+
+############
+# STEP 3: split the columns of 
+# a) dr_mutation_... (;) as 
+# the column has snps related to multiple genes.
+# useful links
+# https://stackoverflow.com/questions/17116814/pandas-how-do-i-split-text-in-a-column-into-multiple-rows
+# this one works beautifully
+# https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows
+############
+
+# sanity check: count NA per column after subsetting the df, i.e. in meta_pnca_all (with pncA_p. extracted mutations)
+meta_pnca_all.isna().sum()
+
+#=#=#=#=#=#=#=#=#=#
+# COMMENT: no NA's in dr_mutations/other_mutations_columns
+#=#=#=#=#=#=#=#=#=#
+# define the split function
+def tidy_split(df, column, sep='|', keep=False):
+    """
+    Split the values of a column and expand so the new DataFrame has one split
+    value per row.
+
+    Values are cast to str before splitting, so rows whose value is missing
+    are NOT dropped: they come through as their string form (e.g. 'nan').
+    Call df.dropna(subset=[column]) beforehand if such rows must be excluded.
+
+    Params
+    ------
+    df : pandas.DataFrame
+        dataframe with the column to split and expand
+    column : str
+        the column to split and expand
+    sep : str
+        the string used to split the column's values
+    keep : bool
+        whether to retain the presplit value as its own row (only done for
+        values that actually split into more than one piece)
+
+    Returns
+    -------
+    pandas.DataFrame
+        Returns a new dataframe with the same columns as `df`; each input row
+        is repeated once per split value and keeps its original index label.
+        Split values are not stripped of surrounding whitespace.
+    """
+    row_positions = []
+    split_values = []
+    for position, presplit in enumerate(df[column].astype(str)):
+        pieces = presplit.split(sep)
+        if keep and len(pieces) > 1:
+            # the unsplit value goes first, followed by its pieces
+            pieces = [presplit] + pieces
+        # one output row per piece, all pointing back at the same input row
+        row_positions.extend([position] * len(pieces))
+        split_values.extend(pieces)
+    # positional selection repeats the source rows; copy so that df is untouched
+    new_df = df.iloc[row_positions, :].copy()
+    new_df[column] = split_values
+    return new_df
+
+########
+# 3a: call tidy_split() on 'dr_mutations_pyrazinamide' column and remove leading white spaces
+#https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas
+########    
+meta_pnca_WF0 = tidy_split(meta_pnca_all, 'dr_mutations_pyrazinamide', sep = ';') 
+
+# remove leading white space else these are counted as distinct mutations as well
+meta_pnca_WF0['dr_mutations_pyrazinamide'] = meta_pnca_WF0['dr_mutations_pyrazinamide'].str.lstrip() 
+
+########
+# 3b: call function on 'other_mutations_pyrazinamide' column and remove leading white spaces
+######## 
+meta_pnca_WF1 = tidy_split(meta_pnca_WF0, 'other_mutations_pyrazinamide', sep = ';') 
+
+# remove the leading white spaces in the column
+meta_pnca_WF1['other_mutations_pyrazinamide'] = meta_pnca_WF1['other_mutations_pyrazinamide'].str.strip() 
+
+##########
+# Step 4: Reshape data so that all mutations are in one column and the 
+# annotations for the mutation reflect the column name
+# LINK: http://www.datasciencemadesimple.com/reshape-wide-long-pandas-python-melt-function/
+
+# data frame “df” is passed to melt() function
+# id_vars is the variable which need to be left unaltered
+# var_name are the column names so we named it as 'mutation_info'
+# value_name are its values so we named it as 'mutation'
+##########
+meta_pnca_WF1.columns
+
+meta_pnca_LF0 = pd.melt(meta_pnca_WF1
+                      , id_vars = ['id', 'country', 'lineage', 'sublineage', 'drtype', 'pyrazinamide']
+                      , var_name = 'mutation_info'
+                      , value_name = 'mutation') 
+
+# sanity check: should be true
+if len(meta_pnca_LF0) == len(meta_pnca_WF1)*2:
+    print('sanity check passed: Long format df has the expected length')
+else:
+    print("Sanity check failed: Debug please!")
+
+###########
+# Step 5: This is still dirty data. Filter LF data so that you only have
+# mutations corresponding to pnca_p. 
+# this will be your list you run OR calcs 
+###########
+meta_pnca_LF1 = meta_pnca_LF0[meta_pnca_LF0['mutation'].str.contains('pncA_p.*')] 
+
+# sanity checks
+# unique samples
+meta_pnca_LF1['id'].nunique()
+if len(meta_pnca_all) == meta_pnca_LF1['id'].nunique():
+    print("Sanity check passed: No of samples with pncA mutations match")
+else:
+    print("Sanity check failed: Debug please!")
+
+# count if all the mutations are indeed in the protein coding region 
+# i.e begin with pnca_p
+meta_pnca_LF1['mutation'].str.count('pncA_p.').sum() # 3093
+
+# should  be true.
+# and check against the length of the df, which should match
+if len(meta_pnca_LF1) == meta_pnca_LF1['mutation'].str.count('pncA_p.').sum():
+    print("Sanity check passed: Long format data containing pnca mutations indeed correspond to pncA_p region")
+else:
+    print("Sanity check failed: Debug please!")
+
+###########
+# Step 6: Filter dataframe with "na" in the drug column
+# This is because for OR, you can't use the snps that have the
+# NA in the specified drug column
+# it creates problems when performing calcs in R inside the loop
+# so best to filter it here
+###########
+# NOT NEEDED FOR all snps, only for extracting valid OR snps
+del (meta_pnca_WF0, meta_pnca_WF1, meta_pnca_LF0, meta_pnca_all)
+
+###########
+# Step 7: count unique pncA_p mutations (all and comp cases)
+###########
+meta_pnca_LF1['mutation'].nunique() 
+meta_pnca_LF1.groupby('mutation_info').nunique()
+
+meta_pnca_LF1['id'].nunique()  
+meta_pnca_LF1['mutation'].nunique() 
+meta_pnca_LF1.groupby('id').nunique()
+
+###########
+# Step 8: convert all snps to LOWERCASE
+# because my integrated mcsm file has lowercase
+###########
+# convert mutation to lower case as it needs to exactly match the dict key
+# meta_pnca_LF1 was produced by filtering meta_pnca_LF0; take an explicit copy
+# first so that adding/overwriting columns does not raise pandas'
+# SettingWithCopyWarning
+meta_pnca_LF1 = meta_pnca_LF1.copy()
+meta_pnca_LF1['mutation'] = meta_pnca_LF1['mutation'].str.lower()
+
+###########
+# Step 9 : Split 'mutation' column into three: wild_type, position and
+# mutant_type separately. Then map the three letter code to the one letter
+# code using the reference_dict imported already. The mutation is converted
+# to lowercase first so that it matches the entries of the dict
+###########
+#=======
+# Step 9a: iterate through the dict, create a lookup dict i.e
+# lookup_dict = {three_letter_code: one_letter_code}.
+# lookup dict should be the key and the value (you want to create a column for)
+# Then use this to perform the mapping separetly for wild type and mutant cols.
+# The three letter code is extracted using a regex match from the dataframe and then converted
+# to 'pandas series'since map only works in pandas series
+#=======
+# lookup dict for three letter code to one letter code
+lookup_dict = {k: v['one_letter_code'] for k, v in my_aa_dict.items()}
+
+# extract the three letter codes once (not once per dict entry) and map them.
+# expand = False returns a Series directly, which is what map works on
+wt = meta_pnca_LF1['mutation'].str.extract(r'pnca_p.(\w{3})', expand = False)
+meta_pnca_LF1['wild_type'] = wt.map(lookup_dict)
+mut = meta_pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$', expand = False)
+meta_pnca_LF1['mutant_type'] = mut.map(lookup_dict)
+
+# extract position info from mutation column separately using regex
+meta_pnca_LF1['position'] = meta_pnca_LF1['mutation'].str.extract(r'(\d+)', expand = False)
+
+# clear variables
+del(wt, mut, lookup_dict)
+
+#=========
+# Step 9b: iterate through the dict, create a lookup dict that i.e
+# lookup_dict =  {three_letter_code: aa_prop_water} 
+# Do this for both wild_type and mutant as above.
+#=========
+# lookup dict for three letter code to aa prop (water)
+lookup_dict = {k: v['aa_prop_water'] for k, v in my_aa_dict.items()}
+
+# extract the three letter codes once (not once per dict entry) and map them.
+# expand = False returns a Series directly, which is what map works on
+wt = meta_pnca_LF1['mutation'].str.extract(r'pnca_p.(\w{3})', expand = False)
+meta_pnca_LF1['wt_prop_water'] = wt.map(lookup_dict)
+mut = meta_pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$', expand = False)
+meta_pnca_LF1['mut_prop_water'] = mut.map(lookup_dict)
+
+# added two more cols
+
+# clear variables
+del(wt, mut, lookup_dict)
+
+#========
+# Step 9c: iterate through the dict, create a lookup dict that i.e
+# lookup_dict =  {three_letter_code: aa_prop_polarity} 
+# Do this for both wild_type and mutant as above.
+#=========
+# lookup dict for three letter code to aa prop (polarity)
+lookup_dict = {k: v['aa_prop_polarity'] for k, v in my_aa_dict.items()}
+
+# extract the three letter codes once (not once per dict entry) and map them.
+# expand = False returns a Series directly, which is what map works on
+wt = meta_pnca_LF1['mutation'].str.extract(r'pnca_p.(\w{3})', expand = False)
+meta_pnca_LF1['wt_prop_polarity'] = wt.map(lookup_dict)
+mut = meta_pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$', expand = False)
+meta_pnca_LF1['mut_prop_polarity'] = mut.map(lookup_dict)
+
+# added two more cols
+
+# clear variables
+del(wt, mut, lookup_dict)
+
+########
+# Step 10: combine the wild_type+position+mutant_type columns to generate 
+# Mutationinformation (matches mCSM output field)
+# Remember to use .map(str) for int col types to allow string concatenation
+#########
+meta_pnca_LF1['Mutationinformation'] = meta_pnca_LF1['wild_type'] + meta_pnca_LF1.position.map(str) + meta_pnca_LF1['mutant_type']
+
+#=#=#=#=#=#=#
+# Step 11:
+# COMMENT: there is more processing in the older version of this script
+# consult if necessary
+# possibly due to the presence of true_wt
+# since this file doesn't contain any true_wt, we won't need it(hopefully!)
+#=#=#=#=#=#=#
+
+#%%
+###########
+# Step 12: Output files for only SNPs to run mCSM
+###########
+
+#=========
+# Step 12a: all SNPs to run mCSM
+#=========
+snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique()) 
+pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique()) 
+
+# assign meaningful colnames 
+#snps_only.rename({0 : 'all_pnca_snps'}, axis = 1, inplace = True)
+#list(snps_only.columns)
+snps_only.isna().sum() # should be 0
+
+# output csv: all SNPS for mCSM analysis
+# specify variable name for output file
+gene = 'pnca'
+#drug = 'pyrazinamide'
+my_fname1 = '_snps_'
+nrows = len(snps_only) 
+
+#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
+#output_file_path = work_dir + '/Data/'
+output_file_path = data_dir + '/input/processed/' + drug + '/'
+
+# create the output directory on first use and carry on; stopping here would
+# end the run before any of the files below are written
+if not os.path.exists(output_file_path):
+    print( output_file_path, 'does not exist. Creating')
+    os.makedirs(output_file_path)
+
+output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
+print(output_filename) #<<<- check
+
+# write to csv: without column or row names
+# Bad practice: numbers at the start of a filename
+snps_only.to_csv(output_filename, header = False, index = False)
+
+#=========
+# Step 12b: all snps with annotation
+#=========
+# all snps, selected cols
+#pnca_snps_ALL = meta_pnca_LF1[['id','country','lineage', 'sublineage'
+#                               , 'drtype', 'pyrazinamide'
+#                               , 'mutation_info', 'mutation', 'Mutationinformation']] 
+
+#len(pnca_snps_ALL) 
+
+# sanity check
+#meta_pnca_LF1['mutation'].nunique() 
+
+# output csv: WITH column but WITHOUT row names(all snps with meta data)
+# specify variable name for output file
+#gene = 'pnca'
+#drug = 'pyrazinamide'
+#my_fname2 = '_snps_with_metadata_'
+#nrows = len(pnca_snps_ALL) 
+
+#output_file_path = work_dir + '/Data/'
+#output_filename = output_file_path + gene + my_fname2 + str(nrows) + '.csv'
+#print(output_filename)  #<<<- check
+
+# write out file
+#pnca_snps_ALL.to_csv(output_filename, header = True, index = False)
+
+#=========
+# Step 12c: comp snps for OR calcs with annotation
+#=========
+# remove all NA's from pyrazinamide column from LF1
+    
+# counts NA per column
+meta_pnca_LF1.isna().sum()
+
+# remove NA
+meta_pnca_LF2 = meta_pnca_LF1.dropna(subset=['pyrazinamide'])
+
+# sanity checks
+# should be True
+len(meta_pnca_LF2) == len(meta_pnca_LF1) - meta_pnca_LF1['pyrazinamide'].isna().sum()
+
+# unique counts
+meta_pnca_LF2['mutation'].nunique() 
+
+meta_pnca_LF2.groupby('mutation_info').nunique() 
+
+# sanity check
+meta_pnca_LF2['id'].nunique() 
+
+# should be True
+if meta_pnca_LF2['id'].nunique() == comp_pnca_samples:
+    print ('sanity check passed: complete numbers match')
+else:
+    print('Error: Please Debug!')
+
+# value counts
+meta_pnca_LF2.mutation.value_counts()
+#meta_pnca_LF2.groupby(['mutation_info', 'mutation']).size()
+
+# valid/comp snps
+# uncomment as necessary
+pnca_snps_COMP  = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
+len(pnca_snps_COMP) 
+
+# output csv: WITH column but WITHOUT row names (COMP snps with meta data)
+# specify variable name for output file
+
+gene = 'pnca'
+#drug = 'pyrazinamide'
+my_fname3 = '_comp_snps_with_metadata_'
+nrows = len(pnca_snps_COMP) 
+
+#output_filename = output_file_path + gene + my_fname3 + str(nrows) + '.csv'
+#print(output_filename) #<<<-check
+
+# write out file
+#pnca_snps_COMP.to_csv(output_filename, header = True, index = False)
+
+
+#=========
+# Step 12d: comp snps only
+#=========
+# output csv: comp SNPS for info (i.e snps for which OR exist)
+# specify variable name for output file
+
+snps_only = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
+
+gene = 'pnca'
+#drug = 'pyrazinamide'
+my_fname1 = '_comp_snps_'
+nrows = len(snps_only) 
+
+output_filename = output_file_path + gene + my_fname1  + str(nrows) + '.csv'
+print(output_filename) #<<<- check
+
+# write to csv: without column or row names
+snps_only.to_csv(output_filename, header = False, index = False)
+
+
+#=#=#=#=#=#=#=#
+# COMMENT: LF1 is the file to extract all unique snps for mcsm 
+# but you have that already in file called pnca_snps...
+# LF2: is the file for extracting snps tested for DS and hence OR calcs
+#=#=#=#=#=#=#=#
+
+###########
+# Step 13 : Output the whole df i.e 
+# file for meta_data which is now formatted with
+# each row as a unique snp rather than the original version where
+# each row is a unique id
+# ***** This is the file you will ADD the AF and OR calculations to *****
+###########
+
+# output csv: the entire DF
+# specify variable name for output file
+gene = 'pnca'
+#drug = 'pyrazinamide'
+my_fname4 = '_metadata'
+#nrows = len(meta_pnca_LF1)
+output_filename = output_file_path + gene + my_fname4  + '.csv'
+print(output_filename) #<<<-check
+
+# write out file
+meta_pnca_LF1.to_csv(output_filename) 
--- a/meta_data_analysis/reference_dict.py
+++ b/meta_data_analysis/reference_dict.py
@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Jun 18 11:32:28 2019
+
+@author: tanushree
+"""
+############################################
+#load libraries
+import pandas as pd
+import os
+#############################################
+
+#!#########################!
+# REQUIREMENTS:
+# Data_original/ must exist
+# containing GWAS data
+#!#########################!
+
+print(os.getcwd()) 
+homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
+os.chdir(homedir + '/git/Data/input/original') 
+print(os.getcwd())
+#==========
+#read file
+#==========
+my_aa = pd.read_csv('aa_codes.csv') #20, 6
+#assign the one_letter code as the row names so that it is easier to create a dict of dicts using index
+#my_aa = pd.read_csv('aa_codes.csv', index_col = 0) #20, 6  #a way to it since it is the first column
+my_aa = my_aa.set_index('three_letter_code_lower') #20, 5
+
+#=========================================================
+#convert file to  dict of dicts
+#=========================================================
+#convert each row into a dict of dicts so that there are 20 aa and 5 keys within
+#with your choice of column name that you have assigned to index as the "primary key". 
+#using 'index' creates a dict of dicts
+#using 'records' creates a list of dicts
+my_aa_dict = my_aa.to_dict('index') #20, with 5 subkeys
+
+#================================================
+#dict of aa with their corresponding properties
+#This is defined twice
+#================================================
+#7 categories: no overlap
+qualities1 = { ('R', 'H', 'K'): 'Basic'
+             , ('D', 'E'): 'Acidic'
+             , ('N', 'Q'): 'Amidic'
+             , ('G', 'A', 'V', 'L', 'I', 'P'): 'Hydrophobic'
+             , ('S', 'T'): 'Hydroxylic'
+             , ('F', 'W', 'Y'): 'Aromatic'
+             , ('C', 'M'): 'Sulphur'
+}
+
+#9 categories: allowing for overlap
+# keys are tuples of one letter codes, values are the property label shared by that group
+qualities2 = { ('R', 'H', 'K'): 'Basic'
+             , ('D', 'E'): 'Acidic'
+             , ('S', 'T', 'N', 'Q'): 'Polar'
+             , ('V', 'I', 'L', 'M', 'F', 'Y', 'W'): 'Hydrophobic'
+             , ('S', 'T', 'H', 'N', 'Q', 'E', 'D', 'K', 'R'): 'Hydrophilic'
+             , ('S', 'G', 'A', 'P'): 'Small'
+             , ('F', 'W', 'Y', 'H'): 'Aromatic'
+             , ('V', 'I', 'L', 'M'): 'Aliphatic'
+             , ('C', 'G', 'P'): 'Special'
+}
+
+# overlapping groups; keys are tuples of one letter codes, values are the property label
+qualities_taylor = { ('R', 'H', 'K'): 'Basic'
+             , ('D', 'E'): 'Acidic'
+             , ('S', 'T', 'N', 'Q', 'C', 'Y', 'W', 'H', 'K', 'R', 'D', 'E'): 'Polar'
+             , ('V', 'I', 'L', 'M', 'F', 'Y', 'W', 'C', 'A', 'G', 'T', 'H'): 'Hydrophobic'
+             #, ('S', 'T', 'H', 'N', 'Q', 'E', 'D', 'K', 'R'): 'Hydrophilic', #C, W, y MISSING FROM POLAR!
+             , ('S', 'G', 'A', 'P', 'C', 'T', 'N', 'D', 'V'): 'Small' 
+             , ('F', 'W', 'Y', 'H'): 'Aromatic'
+             , ('V', 'I', 'L', 'M'): 'Aliphatic' #although M is not strictly in the circle!
+             , ('C', 'G', 'P'): 'Special'
+}
+
+qualities_water = { ('D', 'E', 'N', 'P', 'Q', 'R', 'S'): 'hydrophilic'
+                   , ('A', 'C', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'T', 'V', 'W', 'X', 'Y'): 'hydrophobic'
+}
+
+qualities_polarity = { ('D', 'E'): 'acidic'
+                      , ('H', 'K', 'R'): 'basic'
+                      , ('C', 'G', 'N', 'Q', 'S', 'T', 'Y'): 'neutral'
+                      , ('A', 'F', 'I', 'L', 'M', 'P', 'V', 'W'): 'non-polar'    
+}
+
+#==============================================================================                
+#adding amino acid properties to my dict of dicts                      
+# Each entry: (key added to every amino acid record, lookup table to search,
+# whether the value is kept as a list of labels rather than a joined string).
+# The order here is the order in which the keys are added to each record.
+property_tables = [
+    ('aa_prop1', qualities1, False),
+    ('aa_prop2', qualities2, True),
+    ('aa_taylor', qualities_taylor, True),
+    ('aa_prop_water', qualities_water, False),
+    ('aa_prop_polarity', qualities_polarity, False),
+]
+
+for aa_record in my_aa_dict.values():
+    one_letter = aa_record['one_letter_code']
+    for prop_key, table, as_list in property_tables:
+        # labels of every group in this table that contains the amino acid,
+        # in the order the groups are listed in the table
+        labels = [label for members, label in table.items() if one_letter in members]
+        # list-valued properties keep all labels; the others store them concatenated
+        aa_record[prop_key] = labels if as_list else ''.join(labels)
+             
+#COMMENT:VOILA!!! my_aa_dict is now a dict of dicts containing all associated properties for each aa
+#==============================================================================
+            
+