From f686563c98576d85564dea89a1c6fe62a13eb8cd Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Mon, 23 Mar 2020 17:36:42 +0000
Subject: [PATCH] updated pnca_extraction and AF_OR calcs

---
 .../pyrazinamide/scripts/plotting/.Rhistory   |   0
 meta_data_analysis/.Rhistory                  | 512 ------------------
 meta_data_analysis/pnca_AF_and_OR_calcs.R     | 274 ++++++----
 meta_data_analysis/pnca_data_extraction.py    | 108 ++--
 4 files changed, 195 insertions(+), 699 deletions(-)
 delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/.Rhistory
 delete mode 100644 meta_data_analysis/.Rhistory

diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/.Rhistory b/mcsm_analysis/pyrazinamide/scripts/plotting/.Rhistory
deleted file mode 100644
index e69de29..0000000
diff --git a/meta_data_analysis/.Rhistory b/meta_data_analysis/.Rhistory
deleted file mode 100644
index ec7b60d..0000000
--- a/meta_data_analysis/.Rhistory
+++ /dev/null
@@ -1,512 +0,0 @@
-# run glm model
-model = glm(y ~ x, family = binomial)
-#model = glm(y ~ x, family = binomial(link = "logit"))
-summary(model)
-#**********
-# extract relevant model output
-#**********
-# extract log OR i.e the Beta estimate of the logistic model for a given snp
-my_logor = summary(model)$coefficients[2,1]
-print(paste0('Beta:', my_logor))
-# extract SE of the logistic model for a given snp
-my_se = summary(model)$coefficients[2,2]
-print(paste0('SE:', my_se))
-# extract Z of the logistic model for a given snp
-my_zval = summary(model)$coefficients[2,3]
-print(paste0('Z-value:', my_zval))
-# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
-#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
-my_or = exp(summary(model)$coefficients[2,1])
-print(paste0('OR:', my_or))
-# sanity check : should be True
-log(my_or) == my_logor
-# extract P-value of the logistic model for a given snp
-my_pval = summary(model)$coefficients[2,4]
-print(paste0('P-value:', my_pval))
-# extract confint interval of snp (2 steps, since the output is a named number)
-ci_mod = exp(confint(model))[2,]
-my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
-print(paste0('CI:', my_ci))
-#*************
-# Assign the regression output in the original df
-# you can use ('=' or '<-/->')
-#*************
-#pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i] = my_logor
-my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i]
-my_logor
-pnca_snps_or$Mutationinformation == i
-View(pnca_snps_or)
-#===============
-# Step 4: Calculate for one snp
-# using i, when you run the loop, it is easy
-#===============
-i = "pnca_p.trp68gly"
-pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
-, stringsAsFactors = F
-, header = T) #2133
-# uncomment as necessary
-pnca_snps_or = pnca_snps_or[1:5,]
-pnca_snps_or = pnca_snps_or[c(1:5),]
-#===============
-pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
-, stringsAsFactors = F
-, header = T) #2133
-pnca_snps_or = pnca_snps_or[1:5,]
-pnca_snps_or = pnca_snps_or[c(1:5),]
-pnca_snps_or = pnca_snps_or[1:5]
-pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
-, stringsAsFactors = F
-, header = T) #2133
-pnca_snps_or = pnca_snps_or[1:5]
-pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
-, stringsAsFactors = F
-, header = T) #2133
-foo = pnca_snps_or[c(1:5,)]
-foo = pnca_snps_or[c(1:5),]
-foo = as.data.frame(pnca_snps_or[c(1:5),])
-View(foo)
-# create an empty dataframe
-pnca_snps_or = as.data.frame(pnca_snps_or[c(1:5),])
-# IV: corresponds to each unique snp (extracted using grep)
-x = as.numeric(grepl(i,raw_data$all_muts_pza))
-# DV: pyrazinamide 0 or 1
-y = as.numeric(raw_data$pyrazinamide)
-table(y,x)
-# run glm model
-model = glm(y ~ x, family = binomial)
-#model = glm(y ~ x, family = binomial(link = "logit"))
-summary(model)
-my_logor = summary(model)$coefficients[2,1]
-print(paste0('Beta:', my_logor))
-# extract SE of the logistic model for a given snp
-my_se = summary(model)$coefficients[2,2]
-print(paste0('SE:', my_se))
-# extract Z of the logistic model for a given snp
-my_zval = summary(model)$coefficients[2,3]
-print(paste0('Z-value:', my_zval))
-# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
-#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
-my_or = exp(summary(model)$coefficients[2,1])
-print(paste0('OR:', my_or))
-# sanity check : should be True
-log(my_or) == my_logor
-# extract P-value of the logistic model for a given snp
-my_pval = summary(model)$coefficients[2,4]
-print(paste0('P-value:', my_pval))
-# extract confint interval of snp (2 steps, since the output is a named number)
-ci_mod = exp(confint(model))[2,]
-my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
-print(paste0('CI:', my_ci))
-#*************
-# Assign the regression output in the original df
-# you can use ('=' or '<-/->')
-#*************
-#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
-my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
-my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
-my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
-my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
-my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
-my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
-#===============
-# Step 4: Iterate through this unique list
-# and calculate OR, but only for one snp
-# this is test before you apply it all others
-#===============
-pnca_snps_or$mutation == i
-View(pnca_snps_or)
-# create an empty dataframe
-pnca_snps_or = data.frame(mutation = i)
-my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
-my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
-my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
-my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
-my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
-my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
-View(pnca_snps_or_copy)
-#===============
-# Step 4: Iterate through this unique list
-# and calculate OR, but only for one snp
-# this is test before you apply it all others
-#===============
-#reset original df so you don't make a mistake
-pnca_snps_or = pnca_snps_or_copy
-for (i in pnca_snps_unique){
-print(i)
-}
-pnca_snps_or = pnca_snps_or_copy #2133, 1
-#........................................
-# create an empty dataframe : uncomment as necessary
-pnca_snps_or = data.frame(mutation = c(i, "blank_mut")
-#........................................
-# create an empty dataframe : uncomment as necessary
-pnca_snps_or = data.frame(mutation = c(i, "blank_mut"))
-#........................................
-# create an empty dataframe : uncomment as necessary
-pnca_snps_or = data.frame(mutation = c(i, "blank_mut"))
-View(pnca_snps_or)
-# IV: corresponds to each unique snp (extracted using grep)
-x = as.numeric(grepl(i,raw_data$all_muts_pza))
-# DV: pyrazinamide 0 or 1
-y = as.numeric(raw_data$pyrazinamide)
-table(y,x)
-# run glm model
-model = glm(y ~ x, family = binomial)
-#model = glm(y ~ x, family = binomial(link = "logit"))
-summary(model)
-#**********
-# extract relevant model output
-#**********
-# extract log OR i.e the Beta estimate of the logistic model for a given snp
-my_logor = summary(model)$coefficients[2,1]
-print(paste0('Beta:', my_logor))
-# extract SE of the logistic model for a given snp
-my_se = summary(model)$coefficients[2,2]
-print(paste0('SE:', my_se))
-# extract Z of the logistic model for a given snp
-my_zval = summary(model)$coefficients[2,3]
-print(paste0('Z-value:', my_zval))
-# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
-#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
-my_or = exp(summary(model)$coefficients[2,1])
-print(paste0('OR:', my_or))
-# sanity check : should be True
-log(my_or) == my_logor
-# extract P-value of the logistic model for a given snp
-my_pval = summary(model)$coefficients[2,4]
-print(paste0('P-value:', my_pval))
-# extract confint interval of snp (2 steps, since the output is a named number)
-ci_mod = exp(confint(model))[2,]
-my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
-print(paste0('CI:', my_ci))
-#*************
-# Assign the regression output in the original df
-# you can use ('=' or '<-/->')
-#*************
-#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
-my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
-my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
-my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
-my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
-my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
-my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
-View(pnca_snps_or)
-pnca_snps_or = pnca_snps_or_copy #2133, 1
-for (i in pnca_snps_unique){
-print(i)
-#*************
-# start logistic regression model building
-#*************
-# set the IV and DV for the logistic regression model
-# IV: corresponds to each unique snp (extracted using grep)
-x = as.numeric(grepl(i,raw_data$all_muts_pza))
-# DV: pyrazinamide 0 or 1
-y = as.numeric(raw_data$pyrazinamide)
-table(y,x)
-# run glm model
-model = glm(y ~ x, family = binomial)
-#model = glm(y ~ x, family = binomial(link = "logit"))
-summary(model)
-#**********
-# extract relevant model output
-#**********
-# extract log OR i.e the Beta estimate of the logistic model for a given snp
-my_logor = summary(model)$coefficients[2,1]
-print(paste0('Beta:', my_logor))
-# extract SE of the logistic model for a given snp
-my_se = summary(model)$coefficients[2,2]
-print(paste0('SE:', my_se))
-# extract Z of the logistic model for a given snp
-my_zval = summary(model)$coefficients[2,3]
-print(paste0('Z-value:', my_zval))
-# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
-#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
-my_or = exp(summary(model)$coefficients[2,1])
-print(paste0('OR:', my_or))
-# sanity check : should be True
-log(my_or) == my_logor
-# extract P-value of the logistic model for a given snp
-my_pval = summary(model)$coefficients[2,4]
-print(paste0('P-value:', my_pval))
-# extract confint interval of snp (2 steps, since the output is a named number)
-ci_mod = exp(confint(model))[2,]
-my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
-print(paste0('CI:', my_ci))
-#*************
-# Assign the regression output in the original df
-# you can use ('=' or '<-/->')
-#*************
-#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
-my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
-my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
-my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
-my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
-my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
-my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
-}
-warnings()
-View(pnca_snps_or)
-View(pnca_snps_or_copy)
-#sanity check
-pnca_snps_or$mutation == i1
-#sanity check
-pnca_snps_or[pnca_snps_or$mutation == i1]
-pnca_snps_or[pnca_snps_or$mutation == i2]
-pnca_snps_or[pnca_snps_or$mutation == i2,]
-pnca_snps_or1 = unique(pnca_snps_or)
-write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv")
-# you only need it for the unique mutations
-pnca_snps_or = unique(pnca_snps_or) #2133, 1
-for (i in pnca_snps_unique){
-print(i)
-#*************
-# start logistic regression model building
-#*************
-# set the IV and DV for the logistic regression model
-# IV: corresponds to each unique snp (extracted using grep)
-x = as.numeric(grepl(i,raw_data$all_muts_pza))
-# DV: pyrazinamide 0 or 1
-y = as.numeric(raw_data$pyrazinamide)
-table(y,x)
-# run glm model
-model = glm(y ~ x, family = binomial)
-#model = glm(y ~ x, family = binomial(link = "logit"))
-summary(model)
-#**********
-# extract relevant model output
-#**********
-# extract log OR i.e the Beta estimate of the logistic model for a given snp
-my_logor = summary(model)$coefficients[2,1]
-print(paste0('Beta:', my_logor))
-# extract SE of the logistic model for a given snp
-my_se = summary(model)$coefficients[2,2]
-print(paste0('SE:', my_se))
-# extract Z of the logistic model for a given snp
-my_zval = summary(model)$coefficients[2,3]
-print(paste0('Z-value:', my_zval))
-# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
-#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
-my_or = exp(summary(model)$coefficients[2,1])
-print(paste0('OR:', my_or))
-# sanity check : should be True
-log(my_or) == my_logor
-# extract P-value of the logistic model for a given snp
-my_pval = summary(model)$coefficients[2,4]
-print(paste0('P-value:', my_pval))
-# extract confint interval of snp (2 steps, since the output is a named number)
-ci_mod = exp(confint(model))[2,]
-my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
-print(paste0('CI:', my_ci))
-#*************
-# Assign the regression output in the original df
-# you can use ('=' or '<-/->')
-#*************
-#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
-my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
-my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
-my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
-my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
-my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
-my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
-}
-View(pnca_snps_or)
-2.290256e+01
-1.561132e+06
-3.242285e-04
-#sanity check
-pnca_snps_or[pnca_snps_or$mutation == i1]
-pnca_snps_or[pnca_snps_or$mutation == i2,]
-write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv")
-my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
-, stringsAsFactors = FALSE) #11374, 19
-View(my_data)
-# remove the first column
-my_data = my_data[-1] #11374, 18
-# check if first col is 'id': should be TRUE
-colnames(my_data)[1] == 'id'
-# sanity check
-snps_all = unique(my_data$mutation)# 337
-pnca_snps_or = snps_all
-pnca_snps_or = as.data.frame(snps_all)
-View(pnca_snps_or)
-snps_all[-"true_wt"]
-pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
-View(pnca_snps_or)
-snps_all = as.data.frame(snps_all)
-View(snps_all)
-#remove true_wt entry
-w1 = which(rownames(snps_all) == "true_wt")
-View(snps_all)
-#remove true_wt entry
-w1 = which(snps_all$snps_all == "true_wt")
-rm(pnca_snps_or)
-pnca_snps_or = snps_all[-w1]
-pnca_snps_or = snps_all[,-w1]
-pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
-#remove true_wt entry
-w1 = which(snps_all) == "true_wt"
-pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
-my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
-, stringsAsFactors = FALSE) #11374, 19
-# remove the first column
-my_data = my_data[-1] #11374, 18
-# check if first col is 'id': should be TRUE
-colnames(my_data)[1] == 'id'
-# sanity check
-snps_all = unique(my_data$mutation)# 337
-snps_all = as.data.frame(snps_all)
-snps_all[-c(1,1)]
-pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
-pnca_snps_or = as.data.frame(snps_all[, -c(1,1)])
-#remove true_wt entry
-#w1 = which(snps_all) == "true_wt"
-pnca_snps_or = snps_all
-pnca_snps_or = pnca_snps_or_copy
-#remove true_wt entry
-#w1 = which(snps_all) == "true_wt"
-pnca_snps_or = snps_all
-pnca_snps_or -> pnca_snps_or_copy
-#===============
-# Step 4: Iterate through this unique list
-# and calculate OR for each snp
-# and assign to the pnca_snps_or df that has
-# each row as a unique snp
-#===============
-# reset original df so you don't make a mistake: IMPORTANT
-pnca_snps_or = pnca_snps_or_copy #2133, 1
-# you only need it for the unique mutations
-pnca_snps_or = unique(pnca_snps_or) #337, 1
-for (i in pnca_snps_unique){
-print(i)
-#*************
-# start logistic regression model building
-#*************
-# set the IV and DV for the logistic regression model
-# IV: corresponds to each unique snp (extracted using grep)
-x = as.numeric(grepl(i,raw_data$all_muts_pza))
-# DV: pyrazinamide 0 or 1
-y = as.numeric(raw_data$pyrazinamide)
-table(y,x)
-# run glm model
-model = glm(y ~ x, family = binomial)
-#model = glm(y ~ x, family = binomial(link = "logit"))
-summary(model)
-#**********
-# extract relevant model output
-#**********
-# extract log OR i.e the Beta estimate of the logistic model for a given snp
-my_logor = summary(model)$coefficients[2,1]
-print(paste0('Beta:', my_logor))
-# extract SE of the logistic model for a given snp
-my_se = summary(model)$coefficients[2,2]
-print(paste0('SE:', my_se))
-# extract Z of the logistic model for a given snp
-my_zval = summary(model)$coefficients[2,3]
-print(paste0('Z-value:', my_zval))
-# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
-#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
-my_or = exp(summary(model)$coefficients[2,1])
-print(paste0('OR:', my_or))
-# sanity check : should be True
-log(my_or) == my_logor
-# extract P-value of the logistic model for a given snp
-my_pval = summary(model)$coefficients[2,4]
-print(paste0('P-value:', my_pval))
-# extract confint interval of snp (2 steps, since the output is a named number)
-ci_mod = exp(confint(model))[2,]
-my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
-print(paste0('CI:', my_ci))
-#*************
-# Assign the regression output in the original df
-# you can use ('=' or '<-/->')
-#*************
-#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
-my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
-my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
-my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
-my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
-my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
-my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
-}
-getwd()
-#setwd("~/Documents/git/LSHTM_Y1_PNCA/meta_data_analysis") # work
-setwd("~/git/LSHTM_Y1_PNCA/meta_data_analysis") # thinkpad
-#setwd("/Users/tanu/git/LSHTM_Y1_PNCA/meta_data_analysis") # mac
-getwd()
-#===============
-# Step 1: read raw data
-#===============
-raw_data<-read.csv("../Data_original/original_tanushree_data_v2.csv"
-,stringsAsFactors = F)[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]#19265, 4
-raw_data<-raw_data[!is.na(raw_data$pyrazinamide),]#12511, 4
-# combine the two mutation columns
-raw_data$all_mutations_pyrazinamide<-paste(raw_data$dr_mutations_pyrazinamide, raw_data$other_mutations_pyrazinamide)#12511, 5
-head(raw_data$all_mutations_pyrazinamide)
-# create yet another column that contains all the mutations but in lower case
-raw_data$all_muts_pza = tolower(raw_data$all_mutations_pyrazinamide) #12511, 6
-table(grepl("pnca_p",raw_data$all_muts_pza))
-#FALSE  TRUE
-#10603  1908
-pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
-, stringsAsFactors = F
-, header = T) #2133
-# subset a snall section to test
-#pnca_snps_or_copy = pnca_snps_or
-#pnca_snps_or = pnca_snps_or_copy
-pnca_snps_unique = unique(pnca_snps_or$mutation) #293
-i2 = "pnca_p.trp68gly" # Should exist
-grep(i2, pnca_snps_unique)
-my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
-, stringsAsFactors = FALSE) #11374, 19
-# remove the first column
-my_data = my_data[-1] #11374, 18
-# check if first col is 'id': should be TRUE
-colnames(my_data)[1] == 'id'
-# sanity check
-head(my_data$mutation)
-my_data = unique(my_data$mutation)
-my_data[!duplicated(my_data$mutation)]
-my_data_unique = my_data[!duplicated(my_data$mutation),]
-my_data[!duplicated('mutation'),]
-my_data_unique = my_data[!duplicated(my_data[,'mutation']),]
-my_data_unique = my_data[!duplicated(my_data['mutation']),]
-getwd()
-setwd("/git/LSHTM_analysis/meta_data_analysis")
-getwd()
-getwd()
-setwd("/git/github/LSHTM_analysis/meta_data_analysis")
-getwd()
-#===============
-# Step 1: read GWAS raw data stored in Data_original/
-#===============
-infile = read.csv("../Data_original", file.choose(), stringsAsFactors = F))
-c = file.choose()
-c = file.choose(../Data_original)
-c = read.csv(file.choose(), stringsAsFactors = F)
-#===============
-# Step 1: read GWAS raw data stored in Data_original/
-#===============
-infile = read.csv(file.choose(), stringsAsFactors = F))
-c = read.csv(file.choose(), stringsAsFactors = F)
-#===============
-# Step 1: read GWAS raw data stored in Data_original/
-#===============
-infile = read.csv(file.choose(), stringsAsFactors = F)
-#===============
-# Step 1: read GWAS raw data stored in Data_original/
-#===============
-infile = read.csv(file.choose(), stringsAsFactors = F)
-raw_data = infile[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]
-outdir = paste0("../mcsm_analysis",drug,"/Data/")
-# define output variables
-drug  = 'pyrazinamide'
-outdir = paste0("../mcsm_analysis",drug,"/Data/")
-outdir = paste0("../mcsm_analysis/",drug,"/Data/")
-outFile = "meta_data_with_AFandOR.csv"
-output_filename = paste0(outdir, outFile)
-output_filename
-pnca_snps_or = read.csv(file.choose()
-, stringsAsFactors = F
-, header = T)
-View(pnca_snps_or)
-View(pnca_snps_or)
diff --git a/meta_data_analysis/pnca_AF_and_OR_calcs.R b/meta_data_analysis/pnca_AF_and_OR_calcs.R
index 3c84670..f1b5321 100644
--- a/meta_data_analysis/pnca_AF_and_OR_calcs.R
+++ b/meta_data_analysis/pnca_AF_and_OR_calcs.R
@@ -1,16 +1,59 @@
+#============================================
+# TASK: to calculate allele frequency and OR
+# on master data
+#===========================================
+homedir = '~'
 getwd()
-setwd("/git/github/git/LSHTM_analysis/meta_data_analysis")
+#setwd('~/git/LSHTM_analysis/meta_data_analysis')
+setwd(paste0(homedir, '/', 'git/LSHTM_analysis/meta_data_analysis'))
 getwd()
+  
+#%% variable assignment: input and output paths & filenames
+drug = 'pyrazinamide'
+gene = 'pncA'
+gene_match = paste0(gene,'_p.')
+print(gene_match)
+
+#=======
+# input dir
+#=======
+# file1: Raw data
+#indir = 'git/Data/pyrazinamide/input/original'
+indir = paste0('git/Data', '/', drug, '/', 'input/original')
+in_filename  = 'original_tanushree_data_v2.csv'
+infile = paste0(homedir, '/', indir, '/', in_filename)
+print(paste0('Reading infile:', ' ', infile) )
+
+# file2: file to extract valid snps and add calcs to: pnca_metadata.csv {outfile3 from data extraction script}
+indir_metadata = paste0('git/Data', '/', drug, '/', 'output')
+in_filename_metadata = 'pnca_metadata.csv'
+infile_metadata = paste0(homedir, '/', indir_metadata, '/', in_filename_metadata)
+print(paste0('Reading metadata infile:', infile_metadata))
+
+#=========
+# output dir
+#=========
+# output filename in respective section at the time of outputting files
+#outdir = 'git/Data/pyrazinamide/output'
+outdir = paste0('git/Data', '/', drug, '/', 'output')
+out_filename = paste0(tolower(gene),'_', 'meta_data_with_AFandOR.csv')
+outfile = paste0(homedir, '/', outdir, '/', out_filename)
+print(paste0('Output file with full path:', outfile))
+
+#%% end of variable assignment for input and output files
 
 #===============
-# Step 1: read GWAS raw data stored in Data_original/
-#===============
-infile = read.csv(file.choose(), stringsAsFactors = F)
+# Step 1: Read master/raw data stored in Data/
+#===============    
+raw_data_all = read.csv(infile, stringsAsFactors = F)
 
-raw_data = infile[,c("id"
+raw_data = raw_data_all[,c("id"
                      , "pyrazinamide"
                      , "dr_mutations_pyrazinamide"
                      , "other_mutations_pyrazinamide")]
+rm(raw_data_all)
+
+rm(indir, in_filename, infile)
 
 #####
 # 1a: exclude na
@@ -18,7 +61,7 @@ raw_data = infile[,c("id"
 raw_data = raw_data[!is.na(raw_data$pyrazinamide),]
 
 total_samples = length(unique(raw_data$id))
-print(total_samples)
+print(paste0('Total samples without NA in', ' ', drug, 'is:', total_samples))
 
 # sanity check: should  be true
 is.numeric(total_samples) 
@@ -36,10 +79,15 @@ head(raw_data$all_mutations_pyrazinamide)
 raw_data$all_muts_pnca = tolower(raw_data$all_mutations_pyrazinamide) 
 
 # sanity checks
-table(grepl("pnca_p",raw_data$all_muts_pnca))
+#table(grepl("pnca_p",raw_data$all_muts_pnca))
+print(paste0('converting gene match:', gene_match, ' ', 'to lowercase'))
+gene_match = tolower(gene_match)
+
+table(grepl(gene_match,raw_data$all_muts_pnca))
 
 # sanity check: should be TRUE
-sum(table(grepl("pnca_p",raw_data$all_muts_pnca))) == total_samples
+#sum(table(grepl("pnca_p",raw_data$all_muts_pnca))) == total_samples
+sum(table(grepl(gene_match,raw_data$all_muts_pnca))) == total_samples
 
 # set up variables: can be used for logistic regression as well
 i  = "pnca_p.ala134gly" # has a NA, should NOT exist
@@ -56,17 +104,40 @@ table(mut, dst)
 #fisher.test(table(mut, dst))
 #table(mut)
 
-###### read list of muts to calculate OR for (fname3 from pnca_data_extraction.py)
-pnca_snps_or = read.csv(file.choose()
+#===============
+# Step 2: Read valid snps for which OR can be calculated (infile_comp_snps.csv)
+#=============== 
+print(paste0('Reading metadata infile:', infile_metadata))
+
+pnca_metadata = read.csv(infile_metadata
+#                        , file.choose()
                         , stringsAsFactors = F
                         , header = T)
 
-# extract unique snps to iterate over for AF and OR calcs
-# total no of unique snps
-# AF and OR calculations
 
+# clear variables
+rm(homedir, in_filename, indir, infile)
+rm(indir_metadata, infile_metadata, in_filename_metadata)
+
+# count na in pyrazinamide column
+tot_pza_na = sum(is.na(pnca_metadata$pyrazinamide))
+expected_rows = nrow(pnca_metadata) - tot_pza_na
+
+# drop na from the pyrazinamide colum
+pnca_snps_or = pnca_metadata[!is.na(pnca_metadata$pyrazinamide),]
+
+# sanity check
+if(nrow(pnca_snps_or) == expected_rows){
+  print('PASS: no. of rows match with expected_rows')
+} else{
+  print('FAIL: nrows mismatch.')
+}
+
+# extract unique snps to iterate over for AF and OR calcs
 pnca_snps_unique = unique(pnca_snps_or$mutation) 
 
+print(paste0('Total no. of distinct comp snps to perform OR calcs: ', length(pnca_snps_unique)))
+
 # Define OR function
 x = as.numeric(mut)
 y = dst
@@ -111,131 +182,106 @@ afs['pnca_p.trp68gly']
 afs['pnca_p.gln10pro'] 
 afs['pnca_p.leu4ser'] 
 
-#plot(density(log(ors)))
-#plot(-log10(pvals))
-#hist(log(ors)
-#     ,breaks = 100
-#     )
+plot(density(log(ors)))
+plot(-log10(pvals))
+hist(log(ors)
+     , breaks = 100
+     )
 
-# subset df cols to add to the calc param df
-pnca_snps_cols = pnca_snps_or[c('mutation_info', 'mutation', 'Mutationinformation')] 
-pnca_snps_cols = pnca_snps_cols[!duplicated(pnca_snps_cols$mutation),]
-
-rownames(pnca_snps_cols) = pnca_snps_cols$mutation
-head(rownames(pnca_snps_cols))
-#snps_with_AF_and_OR
+# FIXME: could be good to add a sanity check
+if (table(names(ors) ==  names(pvals)) & table(names(ors) == names(afs)) & table(names(pvals) == names(afs)) == length(pnca_snps_unique)){
+print('PASS: names of ors, pvals and afs match: proceed with combining into a single df')
+} else{
+  print('FAIL: names of ors, pvals and afs mismatch')
+}
 
 # combine
 comb_AF_and_OR = data.frame(ors, pvals, afs)
 head(rownames(comb_AF_and_OR))
 
-# sanity checks: should be the same
-dim(comb_AF_and_OR); dim(pnca_snps_cols)
-table(rownames(comb_AF_and_OR)%in%rownames(pnca_snps_cols))
-
-table(rownames(pnca_snps_cols)%in%rownames(comb_AF_and_OR))
-
-# merge the above two df whose dim you checked
-snps_with_AF_and_OR = merge(comb_AF_and_OR, pnca_snps_cols
-                            , by = "row.names"
-#                            , all.x = T
-                            )
-
-#rm(pnca_snps_cols, pnca_snps_or, raw_data)
-
-#===============
-# Step 3: Read data file where you will add the calculated OR 
-# Note: this is the big file with one-many relationship between snps and lineages
-# i.e fname4 from 'pnca_extraction.py'
-#===============
-my_data = read.csv(file.choose()
-                   , row.names = 1
-                   , stringsAsFactors = FALSE)
-
-head(my_data)
-length(unique(my_data$id))
-
-# check if first col is 'id': should be TRUE
-colnames(my_data)[1] == 'id'
+# add rownames of comb_AF_and_OR as an extra column 'mutation' to allow merging based on this column
+comb_AF_and_OR$mutation = rownames(comb_AF_and_OR)
 
 # sanity check
-head(my_data$mutation)
+if (table(rownames(comb_AF_and_OR) == comb_AF_and_OR$mutation)){
+  print('PASS: rownames and mutaion col values match')
+}else{
+  print('FAIL: rownames and mutation col values mismatch')
+}
 
-# FILES TO MERGE:
-# comb_AF_and_OR: file containing OR
-# my_data = big meta data file 
-# linking column: mutation
+############
+# Merge 1: 
+###########
+df1 = pnca_metadata
+df2 = comb_AF_and_OR
 
-head(my_data)
-merged_df = merge(my_data # big file
-                  , snps_with_AF_and_OR # small (afor file)
+head(df1$mutation); head(df2$mutation)
+
+# FIXME: newlines
+print(paste0('merging two dfs: '
+      ,'\ndf1 (big df i.e. meta data) nrows: ', nrow(df1)
+      ,'\ndf2 (small df i.e af, or, pval) nrows: ', nrow(df2)
+      , 'expected rows in merged df: ', nrow(df1), 'expected cols in merged_df: ', (ncol(df1) + ncol(df2) - 1)))
+
+merged_df = merge(df1 # big file
+                  , df2 # small (afor file)
                   , by = "mutation"
                   , all.x = T) # because you want all the entries of the meta data 
 
-# sanity checks: should be True 
-# FIXME: I have checked this manually, but make it so it is a pass or a fail!
-comb_AF_and_OR[rownames(comb_AF_and_OR) == "pnca_p.gln10pro",]$ors  
-merged_df[merged_df$Mutationinformation.x == "Q10P",]$ors
+# sanity check
+if(ncol(merged_df) == (ncol(df1) + ncol(df2) - 1)){
+  print(paste0('PASS: no. of cols is as expected: ', ncol(merged_df)))
+} else{
+  print('FAIL: no.of cols mistmatch')
+}
 
-merged_df[merged_df$Mutationinformation.x == "Q10P",]
-
-# sanity check: very important!
-colnames(merged_df)
-
-table(merged_df$mutation_info.x == merged_df$mutation_info.y)
-
-#FIXME: what happened to other 7 and FALSE
-table(merged_df$Mutationinformation.x == merged_df$Mutationinformation.y)
-
-# problem
-identical(merged_df$Mutationinformation.x, merged_df$Mutationinformation.y)
-
-#merged_df[merged_df$Mutationinformation.x != merged_df$Mutationinformation.y,]
-
-#throw away the y because that is a smaller df
-d1 = which(colnames(merged_df) == "mutation_info.y") #21
-d2 = which(colnames(merged_df) == "Mutationinformation.y") #22
-
-merged_df2 = merged_df[-c(d1, d2)] #3093 20
-colnames(merged_df2)
-
-# rename cols 
-colnames(merged_df2)[colnames(merged_df2)== "mutation_info.x"] <- "mutation_info"
-colnames(merged_df2)[colnames(merged_df2)== "Mutationinformation.x"] <- "Mutationinformation"
-
-colnames(merged_df2)
-
-# should be 0
-sum(is.na(merged_df2$Mutationinformation))
+# quick check
+i  = "pnca_p.ala134gly" # has all NAs in pyrazinamide, should be NA in ors, etc.
+merged_df[merged_df$mutation == i,]
 
 # count na in each column
-na_count = sapply(merged_df2, function(y) sum(length(which(is.na(y))))); na_count
+na_count = sapply(merged_df, function(y) sum(length(which(is.na(y))))); na_count
 # only some or and Af should be NA
 #Row.names           ors               pvals               afs 
-#81                  81                81                  81 
+#63                  63               63                  63
 
+# reassign custom colnames
+colnames(merged_df)[colnames(merged_df)== "ors"] <- "OR"
+colnames(merged_df)[colnames(merged_df)== "afs"] <- "AF"
+colnames(merged_df)[colnames(merged_df)== "pvals"] <- "pvalue"
 
-colnames(merged_df2)[colnames(merged_df2)== "ors"] <- "OR"
-colnames(merged_df2)[colnames(merged_df2)== "afs"] <- "AF"
-colnames(merged_df2)[colnames(merged_df2)== "pvals"] <- "pvalue"
-
-colnames(merged_df2)
+colnames(merged_df)
 
 # add log OR and neglog pvalue
-merged_df2$logor = log(merged_df2$OR)
-is.numeric(merged_df2$logor)
+merged_df$logor = log(merged_df$OR)
+is.numeric(merged_df$logor)
 
-merged_df2$neglog10pvalue = -log10(merged_df2$pvalue) 
-is.numeric(merged_df2$neglog10pvalue)
+merged_df$neglog10pvalue = -log10(merged_df$pvalue) 
+is.numeric(merged_df$neglog10pvalue)
 
-# write file out
-#write.csv(merged_df, "../Data/meta_data_with_AFandOR_JP_TT.csv")
+merged_df$AF_percent = merged_df$AF*100
+is.numeric(merged_df$AF_percent)
 
-# define output variables
-drug  = 'pyrazinamide'
-out_dir = paste0("../mcsm_analysis/",drug,"/Data/")
-outFile = "meta_data_with_AFandOR.csv"
-output_filename = paste0(outdir, outFile)
+# check AFs
+#i = 'pnca_p.trp68gly'
+i = 'pnca_p.gln10pro' 
+#i = 'pnca_p.leu4ser'
+merged_df[merged_df$mutation == i,]
 
-write.csv(merged_df2, output_filename
+# FIXME: harcoding (beware!), NOT FATAL though!
+ncol_added = 3 
+
+print(paste0('Added', ncol_added, ' ', 'more cols to merged_df i.e log10 OR and -log10 P-val: '
+             , 'no. of cols in merged_df now: ', ncol(merged_df)))
+
+#%% write file out: pnca_meta_data_with_AFandOR
+print(paste0('writing output file in: '
+             , 'Filename: ', out_filename
+             , 'Path:', outdir))
+
+write.csv(merged_df, outfile
           , row.names = F)
+
+print(paste0('Finished writing:', out_filename, '\nExpected no. of cols:', ncol(merged_df)))
+print('======================================================================')
+rm(out_filename)
diff --git a/meta_data_analysis/pnca_data_extraction.py b/meta_data_analysis/pnca_data_extraction.py
index 7588122..72c7860 100755
--- a/meta_data_analysis/pnca_data_extraction.py
+++ b/meta_data_analysis/pnca_data_extraction.py
@@ -36,9 +36,10 @@ import numpy as np
 # 1) pnca_ambiguous_muts.csv
 # 2) pnca_mcsm_snps.csv
 # 3) pnca_metadata.csv
-# 4) pnca_comp_snps.csv
-# 5) pnca_all_muts_msa.csv
-# 6) pnca_mutational_positons.csv
+# 4) pnca_comp_snps.csv <---deleted> 
+
+# 4) pnca_all_muts_msa.csv
+# 5) pnca_mutational_positons.csv
 #========================================================
 #%% specify homedir as python doesn't recognise tilde
 homedir = os.path.expanduser('~') 
@@ -52,23 +53,25 @@ os.getcwd()
 from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
 #========================================================
 
-#drug = 'pyrazinamide'
+#%% variable assignment: input and output paths & filenames
+
+drug = 'pyrazinamide'
 gene = 'pncA'
 gene_match = gene + '_p.'
 
-#%% specify variables for input and output paths and filenames
-
 #=======
 # input dir
 #=======
-indir = 'git/Data/pyrazinamide/input/original'
-
+#indir = 'git/Data/pyrazinamide/input/original'
+indir = 'git/Data' + '/' + drug + '/' + 'input/original'
 #=========
 # output dir
 #=========
 # several output files
 # output filenames in respective sections at the time of outputting files
-outdir = 'git/Data/pyrazinamide/output'
+#outdir = 'git/Data/pyrazinamide/output'
+outdir = 'git/Data' + '/' + drug + '/' + 'output'
+
 #%%end of variable assignment for input and output files
 #==============================================================================
 #%% Read files
@@ -77,7 +80,7 @@ in_filename  = 'original_tanushree_data_v2.csv'
 infile = homedir + '/' + indir + '/' + in_filename
 print('Reading input master file:', infile)
 
-master_data = pd.read_csv(infile, sep = ',')  
+master_data  = pd.read_csv(infile, sep = ',')  
 
 # column names
 #list(master_data.columns)
@@ -334,6 +337,8 @@ print('Writing file: common ids:\n',
 
 common_ids.to_csv(outfile0)
 print('======================================================================')
+del(out_filename0)
+
 
 # clear variables
 del(dr_id, other_id, meta_data_dr, meta_data_other, common_ids, common_mut_ids, common_ids2)
@@ -701,21 +706,6 @@ del(c1, c2, col_to_split1, col_to_split2, comp_pnca_samples, dr_WF0, dr_df, dr_m
  
 #%% end of data extraction and some files writing. Below are some more files writing.
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
 #%%: write file: ambiguous muts
 # uncomment as necessary
 #print(outdir)
@@ -735,6 +725,8 @@ inspect.to_csv(outfile1)
 print('Finished writing:', out_filename1, '\nExpected no. of rows (no. of samples with the ambiguous muts present):', dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum())
 print('======================================================================')
 del(out_filename1)
+
+
 #%%
 #===========
 # Split 'mutation' column into three:  wild_type, position and
@@ -891,6 +883,8 @@ print('Finished writing:', out_filename2,
       '\nNo. of rows:', len(snps_only) )
 print('======================================================================')
 del(out_filename2)
+
+
 #%% Write file: pnca_metadata (i.e pnca_LF1)
 # where each row has UNIQUE mutations NOT unique sample ids
 out_filename3 = gene.lower() + '_' + 'metadata.csv'
@@ -903,45 +897,10 @@ pnca_LF1.to_csv(outfile3, header = True, index = False)
 print('Finished writing:', out_filename3,
       '\nNo. of rows:', len(pnca_LF1),
       '\nNo. of cols:', len(pnca_LF1.columns) )
-
 print('======================================================================')
+del(out_filename3)
 
-#%% Write file: comp SNPs (i.e snps without any corresponding 'NA' in the <drug>
-# column to allow OR calcs)
 
-# remove NA from pyrazinamide cols
-pnca_LF2 = pnca_LF1.dropna(subset=['pyrazinamide'])
-
-print('extracting OR muts by removing NAs from pyrazinamide cols')
-if pnca_LF2.pyrazinamide.isna().sum() > 0:
-    print('FAIL: NAs NOT removed successfully')
-else:
-    print('PASS: NAs removed successfully')
-
-# extracting comp snps only
-comp_snps_only = pd.DataFrame(pnca_LF2['mutation'].unique())
-#print('Total no. of comp snps:', len(comp_snps_only))
-comp_snps_only.head()
-
-# assign column name
-comp_snps_only.columns = ['mutation']
-
-# count how many positions this corresponds to
-comp_pos_only = pd.DataFrame(pnca_LF2['position'].unique())
-#print('Total no. of pos corresponding to comp_snps:', len(comp_pos_only))
- 
-out_filename4 = gene.lower() + '_' + 'comp_snps.csv'
-outfile4 = homedir + '/' + outdir + '/' + out_filename4
-print('Writing file: comp snps to allow OR calcs',
-      '\nFilename:', out_filename4,
-      '\nPath:',  homedir + '/' + outdir,
-      '\nNo. of comp muts:', len(comp_snps_only),
-      '\nNo. of distinct positions for comp muts:', len(comp_pos_only) )
-
-comp_snps_only.to_csv(outfile4, header = True, index = False)
-
-print('Finished writing:', out_filename4,
-      '\nNo. of rows:', len(comp_snps_only) )
 #%% write file: mCSM style but with repitions for MSA and logo plots
 all_muts_msa = pd.DataFrame(pnca_LF1['Mutationinformation']) 
 all_muts_msa.head()
@@ -970,21 +929,22 @@ else:
           '\nDebug please!')
 print('======================================================================')
 
-out_filename5 = gene.lower() + '_' + 'all_muts_msa.csv'
-outfile5 = homedir + '/' + outdir + '/' + out_filename5
+out_filename4 = gene.lower() + '_' + 'all_muts_msa.csv'
+outfile4 = homedir + '/' + outdir + '/' + out_filename4
 
 print('Writing file: mCSM style muts for msa',
       '\nmutation format (SNP): {Wt}<POS>{Mut}',
       '\nNo.of lines of msa:', len(all_muts_msa),  
-      '\nFilename:', out_filename5,
+      '\nFilename:', out_filename4,
       '\nPath:',  homedir +'/'+ outdir)
 
-all_muts_msa_sorted.to_csv(outfile5, header = False, index = False)
+all_muts_msa_sorted.to_csv(outfile4, header = False, index = False)
 
-print('Finished writing:', out_filename5,
+print('Finished writing:', out_filename4,
       '\nNo. of rows:', len(all_muts_msa) )
 print('======================================================================')
-del(out_filename5)
+del(out_filename4)
+
 
 #%% write file for mutational positions
 # count how many positions this corresponds to
@@ -999,20 +959,22 @@ pos_only.position.dtype
 # sort by position value
 pos_only_sorted = pos_only.sort_values(by = 'position', ascending = True)
 
-out_filename6 = gene.lower() + '_' + 'mutational_positons.csv'
-outfile6 = homedir + '/' + outdir + '/' + out_filename6
+out_filename5 = gene.lower() + '_' + 'mutational_positons.csv'
+outfile5 = homedir + '/' + outdir + '/' + out_filename5
 
 print('Writing file: mutational positions',
       '\nNo. of distinct positions:', len(pos_only_sorted),
-      '\nFilename:', out_filename6,
+      '\nFilename:', out_filename5,
       '\nPath:',  homedir +'/'+ outdir)
 
-pos_only_sorted.to_csv(outfile6, header = True, index = False)
+pos_only_sorted.to_csv(outfile5, header = True, index = False)
 
-print('Finished writing:', out_filename6,
+print('Finished writing:', out_filename5,
       '\nNo. of rows:', len(pos_only_sorted) )
 print('======================================================================')
-del(out_filename6)
+del(out_filename5)
+
+
 #%% end of script
 print('======================================================================')
 print(u'\u2698' * 50,