From f686563c98576d85564dea89a1c6fe62a13eb8cd Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Mon, 23 Mar 2020 17:36:42 +0000 Subject: [PATCH] updated pnca_extraction and AF_OR calcs --- .../pyrazinamide/scripts/plotting/.Rhistory | 0 meta_data_analysis/.Rhistory | 512 ------------------ meta_data_analysis/pnca_AF_and_OR_calcs.R | 274 ++++++---- meta_data_analysis/pnca_data_extraction.py | 108 ++-- 4 files changed, 195 insertions(+), 699 deletions(-) delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/.Rhistory delete mode 100644 meta_data_analysis/.Rhistory diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/.Rhistory b/mcsm_analysis/pyrazinamide/scripts/plotting/.Rhistory deleted file mode 100644 index e69de29..0000000 diff --git a/meta_data_analysis/.Rhistory b/meta_data_analysis/.Rhistory deleted file mode 100644 index ec7b60d..0000000 --- a/meta_data_analysis/.Rhistory +++ /dev/null @@ -1,512 +0,0 @@ -# run glm model -model = glm(y ~ x, family = binomial) -#model = glm(y ~ x, family = binomial(link = "logit")) -summary(model) -#********** -# extract relevant model output -#********** -# extract log OR i.e the Beta estimate of the logistic model for a given snp -my_logor = summary(model)$coefficients[2,1] -print(paste0('Beta:', my_logor)) -# extract SE of the logistic model for a given snp -my_se = summary(model)$coefficients[2,2] -print(paste0('SE:', my_se)) -# extract Z of the logistic model for a given snp -my_zval = summary(model)$coefficients[2,3] -print(paste0('Z-value:', my_zval)) -# Dervive OR i.e exp(my_logor) from the logistic model for a given snp -#my_or = round(exp(summary(model)$coefficients[2,1]), roundto) -my_or = exp(summary(model)$coefficients[2,1]) -print(paste0('OR:', my_or)) -# sanity check : should be True -log(my_or) == my_logor -# extract P-value of the logistic model for a given snp -my_pval = summary(model)$coefficients[2,4] -print(paste0('P-value:', my_pval)) -# extract confint interval of snp (2 steps, since the output is a named number) -ci_mod = exp(confint(model))[2,] -my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]]) -print(paste0('CI:', my_ci)) -#************* -# Assign the regression output in the original df -# you can use ('=' or '<-/->') -#************* -#pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i] = my_logor -my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i] -my_logor -pnca_snps_or$Mutationinformation == i -View(pnca_snps_or) -#=============== -# Step 4: Calculate for one snp -# using i, when you run the loop, it is easy -#=============== -i = "pnca_p.trp68gly" -pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv" -, stringsAsFactors = F -, header = T) #2133 -# uncomment as necessary -pnca_snps_or = pnca_snps_or[1:5,] -pnca_snps_or = pnca_snps_or[c(1:5),] -#=============== -pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv" -, stringsAsFactors = F -, header = T) #2133 -pnca_snps_or = pnca_snps_or[1:5,] -pnca_snps_or = pnca_snps_or[c(1:5),] -pnca_snps_or = pnca_snps_or[1:5] -pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv" -, stringsAsFactors = F -, header = T) #2133 -pnca_snps_or = pnca_snps_or[1:5] -pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv" -, stringsAsFactors = F -, header = T) #2133 -foo = pnca_snps_or[c(1:5,)] -foo = pnca_snps_or[c(1:5),] -foo = as.data.frame(pnca_snps_or[c(1:5),]) -View(foo) -# create an empty dataframe -pnca_snps_or = as.data.frame(pnca_snps_or[c(1:5),]) -# IV: corresponds to each unique snp (extracted using grep) -x = as.numeric(grepl(i,raw_data$all_muts_pza)) -# DV: pyrazinamide 0 or 1 -y = as.numeric(raw_data$pyrazinamide) -table(y,x) -# run glm model -model = glm(y ~ x, family = binomial) -#model = glm(y ~ x, family = binomial(link = "logit")) -summary(model) -my_logor = summary(model)$coefficients[2,1] -print(paste0('Beta:', my_logor)) -# extract SE of the logistic model for a given snp -my_se = summary(model)$coefficients[2,2] -print(paste0('SE:', my_se)) -# extract Z of the logistic model for a given snp -my_zval = summary(model)$coefficients[2,3] -print(paste0('Z-value:', my_zval)) -# Dervive OR i.e exp(my_logor) from the logistic model for a given snp -#my_or = round(exp(summary(model)$coefficients[2,1]), roundto) -my_or = exp(summary(model)$coefficients[2,1]) -print(paste0('OR:', my_or)) -# sanity check : should be True -log(my_or) == my_logor -# extract P-value of the logistic model for a given snp -my_pval = summary(model)$coefficients[2,4] -print(paste0('P-value:', my_pval)) -# extract confint interval of snp (2 steps, since the output is a named number) -ci_mod = exp(confint(model))[2,] -my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]]) -print(paste0('CI:', my_ci)) -#************* -# Assign the regression output in the original df -# you can use ('=' or '<-/->') -#************* -#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor -my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] -my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i] -my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i] -my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i] -my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i] -my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i] -#=============== -# Step 4: Iterate through this unique list -# and calculate OR, but only for one snp -# this is test before you apply it all others -#=============== -pnca_snps_or$mutation == i -View(pnca_snps_or) -# create an empty dataframe -pnca_snps_or = data.frame(mutation = i) -my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] -my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i] -my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i] -my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i] -my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i] -my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i] -View(pnca_snps_or_copy) -#=============== -# Step 4: Iterate through this unique list -# and calculate OR, but only for one snp -# this is test before you apply it all others -#=============== -#reset original df so you don't make a mistake -pnca_snps_or = pnca_snps_or_copy -for (i in pnca_snps_unique){ -print(i) -} -pnca_snps_or = pnca_snps_or_copy #2133, 1 -#........................................ -# create an empty dataframe : uncomment as necessary -pnca_snps_or = data.frame(mutation = c(i, "blank_mut") -#........................................ -# create an empty dataframe : uncomment as necessary -pnca_snps_or = data.frame(mutation = c(i, "blank_mut")) -#........................................ -# create an empty dataframe : uncomment as necessary -pnca_snps_or = data.frame(mutation = c(i, "blank_mut")) -View(pnca_snps_or) -# IV: corresponds to each unique snp (extracted using grep) -x = as.numeric(grepl(i,raw_data$all_muts_pza)) -# DV: pyrazinamide 0 or 1 -y = as.numeric(raw_data$pyrazinamide) -table(y,x) -# run glm model -model = glm(y ~ x, family = binomial) -#model = glm(y ~ x, family = binomial(link = "logit")) -summary(model) -#********** -# extract relevant model output -#********** -# extract log OR i.e the Beta estimate of the logistic model for a given snp -my_logor = summary(model)$coefficients[2,1] -print(paste0('Beta:', my_logor)) -# extract SE of the logistic model for a given snp -my_se = summary(model)$coefficients[2,2] -print(paste0('SE:', my_se)) -# extract Z of the logistic model for a given snp -my_zval = summary(model)$coefficients[2,3] -print(paste0('Z-value:', my_zval)) -# Dervive OR i.e exp(my_logor) from the logistic model for a given snp -#my_or = round(exp(summary(model)$coefficients[2,1]), roundto) -my_or = exp(summary(model)$coefficients[2,1]) -print(paste0('OR:', my_or)) -# sanity check : should be True -log(my_or) == my_logor -# extract P-value of the logistic model for a given snp -my_pval = summary(model)$coefficients[2,4] -print(paste0('P-value:', my_pval)) -# extract confint interval of snp (2 steps, since the output is a named number) -ci_mod = exp(confint(model))[2,] -my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]]) -print(paste0('CI:', my_ci)) -#************* -# Assign the regression output in the original df -# you can use ('=' or '<-/->') -#************* -#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor -my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] -my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i] -my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i] -my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i] -my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i] -my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i] -View(pnca_snps_or) -pnca_snps_or = pnca_snps_or_copy #2133, 1 -for (i in pnca_snps_unique){ -print(i) -#************* -# start logistic regression model building -#************* -# set the IV and DV for the logistic regression model -# IV: corresponds to each unique snp (extracted using grep) -x = as.numeric(grepl(i,raw_data$all_muts_pza)) -# DV: pyrazinamide 0 or 1 -y = as.numeric(raw_data$pyrazinamide) -table(y,x) -# run glm model -model = glm(y ~ x, family = binomial) -#model = glm(y ~ x, family = binomial(link = "logit")) -summary(model) -#********** -# extract relevant model output -#********** -# extract log OR i.e the Beta estimate of the logistic model for a given snp -my_logor = summary(model)$coefficients[2,1] -print(paste0('Beta:', my_logor)) -# extract SE of the logistic model for a given snp -my_se = summary(model)$coefficients[2,2] -print(paste0('SE:', my_se)) -# extract Z of the logistic model for a given snp -my_zval = summary(model)$coefficients[2,3] -print(paste0('Z-value:', my_zval)) -# Dervive OR i.e exp(my_logor) from the logistic model for a given snp -#my_or = round(exp(summary(model)$coefficients[2,1]), roundto) -my_or = exp(summary(model)$coefficients[2,1]) -print(paste0('OR:', my_or)) -# sanity check : should be True -log(my_or) == my_logor -# extract P-value of the logistic model for a given snp -my_pval = summary(model)$coefficients[2,4] -print(paste0('P-value:', my_pval)) -# extract confint interval of snp (2 steps, since the output is a named number) -ci_mod = exp(confint(model))[2,] -my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]]) -print(paste0('CI:', my_ci)) -#************* -# Assign the regression output in the original df -# you can use ('=' or '<-/->') -#************* -#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor -my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] -my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i] -my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i] -my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i] -my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i] -my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i] -} -warnings() -View(pnca_snps_or) -View(pnca_snps_or_copy) -#sanity check -pnca_snps_or$mutation == i1 -#sanity check -pnca_snps_or[pnca_snps_or$mutation == i1] -pnca_snps_or[pnca_snps_or$mutation == i2] -pnca_snps_or[pnca_snps_or$mutation == i2,] -pnca_snps_or1 = unique(pnca_snps_or) -write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv") -# you only need it for the unique mutations -pnca_snps_or = unique(pnca_snps_or) #2133, 1 -for (i in pnca_snps_unique){ -print(i) -#************* -# start logistic regression model building -#************* -# set the IV and DV for the logistic regression model -# IV: corresponds to each unique snp (extracted using grep) -x = as.numeric(grepl(i,raw_data$all_muts_pza)) -# DV: pyrazinamide 0 or 1 -y = as.numeric(raw_data$pyrazinamide) -table(y,x) -# run glm model -model = glm(y ~ x, family = binomial) -#model = glm(y ~ x, family = binomial(link = "logit")) -summary(model) -#********** -# extract relevant model output -#********** -# extract log OR i.e the Beta estimate of the logistic model for a given snp -my_logor = summary(model)$coefficients[2,1] -print(paste0('Beta:', my_logor)) -# extract SE of the logistic model for a given snp -my_se = summary(model)$coefficients[2,2] -print(paste0('SE:', my_se)) -# extract Z of the logistic model for a given snp -my_zval = summary(model)$coefficients[2,3] -print(paste0('Z-value:', my_zval)) -# Dervive OR i.e exp(my_logor) from the logistic model for a given snp -#my_or = round(exp(summary(model)$coefficients[2,1]), roundto) -my_or = exp(summary(model)$coefficients[2,1]) -print(paste0('OR:', my_or)) -# sanity check : should be True -log(my_or) == my_logor -# extract P-value of the logistic model for a given snp -my_pval = summary(model)$coefficients[2,4] -print(paste0('P-value:', my_pval)) -# extract confint interval of snp (2 steps, since the output is a named number) -ci_mod = exp(confint(model))[2,] -my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]]) -print(paste0('CI:', my_ci)) -#************* -# Assign the regression output in the original df -# you can use ('=' or '<-/->') -#************* -#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor -my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] -my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i] -my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i] -my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i] -my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i] -my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i] -} -View(pnca_snps_or) -2.290256e+01 -1.561132e+06 -3.242285e-04 -#sanity check -pnca_snps_or[pnca_snps_or$mutation == i1] -pnca_snps_or[pnca_snps_or$mutation == i2,] -write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv") -my_data = read.csv("../Data_original/meta_pza_with_AF.csv" -, stringsAsFactors = FALSE) #11374, 19 -View(my_data) -# remove the first column -my_data = my_data[-1] #11374, 18 -# check if first col is 'id': should be TRUE -colnames(my_data)[1] == 'id' -# sanity check -snps_all = unique(my_data$mutation)# 337 -pnca_snps_or = snps_all -pnca_snps_or = as.data.frame(snps_all) -View(pnca_snps_or) -snps_all[-"true_wt"] -pnca_snps_or = as.data.frame(snps_all[-c(1,1)]) -View(pnca_snps_or) -snps_all = as.data.frame(snps_all) -View(snps_all) -#remove true_wt entry -w1 = which(rownames(snps_all) == "true_wt") -View(snps_all) -#remove true_wt entry -w1 = which(snps_all$snps_all == "true_wt") -rm(pnca_snps_or) -pnca_snps_or = snps_all[-w1] -pnca_snps_or = snps_all[,-w1] -pnca_snps_or = as.data.frame(snps_all[-c(1,1)]) -#remove true_wt entry -w1 = which(snps_all) == "true_wt" -pnca_snps_or = as.data.frame(snps_all[-c(1,1)]) -my_data = read.csv("../Data_original/meta_pza_with_AF.csv" -, stringsAsFactors = FALSE) #11374, 19 -# remove the first column -my_data = my_data[-1] #11374, 18 -# check if first col is 'id': should be TRUE -colnames(my_data)[1] == 'id' -# sanity check -snps_all = unique(my_data$mutation)# 337 -snps_all = as.data.frame(snps_all) -snps_all[-c(1,1)] -pnca_snps_or = as.data.frame(snps_all[-c(1,1)]) -pnca_snps_or = as.data.frame(snps_all[, -c(1,1)]) -#remove true_wt entry -#w1 = which(snps_all) == "true_wt" -pnca_snps_or = snps_all -pnca_snps_or = pnca_snps_or_copy -#remove true_wt entry -#w1 = which(snps_all) == "true_wt" -pnca_snps_or = snps_all -pnca_snps_or -> pnca_snps_or_copy -#=============== -# Step 4: Iterate through this unique list -# and calculate OR for each snp -# and assign to the pnca_snps_or df that has -# each row as a unique snp -#=============== -# reset original df so you don't make a mistake: IMPORTANT -pnca_snps_or = pnca_snps_or_copy #2133, 1 -# you only need it for the unique mutations -pnca_snps_or = unique(pnca_snps_or) #337, 1 -for (i in pnca_snps_unique){ -print(i) -#************* -# start logistic regression model building -#************* -# set the IV and DV for the logistic regression model -# IV: corresponds to each unique snp (extracted using grep) -x = as.numeric(grepl(i,raw_data$all_muts_pza)) -# DV: pyrazinamide 0 or 1 -y = as.numeric(raw_data$pyrazinamide) -table(y,x) -# run glm model -model = glm(y ~ x, family = binomial) -#model = glm(y ~ x, family = binomial(link = "logit")) -summary(model) -#********** -# extract relevant model output -#********** -# extract log OR i.e the Beta estimate of the logistic model for a given snp -my_logor = summary(model)$coefficients[2,1] -print(paste0('Beta:', my_logor)) -# extract SE of the logistic model for a given snp -my_se = summary(model)$coefficients[2,2] -print(paste0('SE:', my_se)) -# extract Z of the logistic model for a given snp -my_zval = summary(model)$coefficients[2,3] -print(paste0('Z-value:', my_zval)) -# Dervive OR i.e exp(my_logor) from the logistic model for a given snp -#my_or = round(exp(summary(model)$coefficients[2,1]), roundto) -my_or = exp(summary(model)$coefficients[2,1]) -print(paste0('OR:', my_or)) -# sanity check : should be True -log(my_or) == my_logor -# extract P-value of the logistic model for a given snp -my_pval = summary(model)$coefficients[2,4] -print(paste0('P-value:', my_pval)) -# extract confint interval of snp (2 steps, since the output is a named number) -ci_mod = exp(confint(model))[2,] -my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]]) -print(paste0('CI:', my_ci)) -#************* -# Assign the regression output in the original df -# you can use ('=' or '<-/->') -#************* -#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor -my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] -my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i] -my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i] -my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i] -my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i] -my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i] -} -getwd() -#setwd("~/Documents/git/LSHTM_Y1_PNCA/meta_data_analysis") # work -setwd("~/git/LSHTM_Y1_PNCA/meta_data_analysis") # thinkpad -#setwd("/Users/tanu/git/LSHTM_Y1_PNCA/meta_data_analysis") # mac -getwd() -#=============== -# Step 1: read raw data -#=============== -raw_data<-read.csv("../Data_original/original_tanushree_data_v2.csv" -,stringsAsFactors = F)[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]#19265, 4 -raw_data<-raw_data[!is.na(raw_data$pyrazinamide),]#12511, 4 -# combine the two mutation columns -raw_data$all_mutations_pyrazinamide<-paste(raw_data$dr_mutations_pyrazinamide, raw_data$other_mutations_pyrazinamide)#12511, 5 -head(raw_data$all_mutations_pyrazinamide) -# create yet another column that contains all the mutations but in lower case -raw_data$all_muts_pza = tolower(raw_data$all_mutations_pyrazinamide) #12511, 6 -table(grepl("pnca_p",raw_data$all_muts_pza)) -#FALSE TRUE -#10603 1908 -pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv" -, stringsAsFactors = F -, header = T) #2133 -# subset a snall section to test -#pnca_snps_or_copy = pnca_snps_or -#pnca_snps_or = pnca_snps_or_copy -pnca_snps_unique = unique(pnca_snps_or$mutation) #293 -i2 = "pnca_p.trp68gly" # Should exist -grep(i2, pnca_snps_unique) -my_data = read.csv("../Data_original/meta_pza_with_AF.csv" -, stringsAsFactors = FALSE) #11374, 19 -# remove the first column -my_data = my_data[-1] #11374, 18 -# check if first col is 'id': should be TRUE -colnames(my_data)[1] == 'id' -# sanity check -head(my_data$mutation) -my_data = unique(my_data$mutation) -my_data[!duplicated(my_data$mutation)] -my_data_unique = my_data[!duplicated(my_data$mutation),] -my_data[!duplicated('mutation'),] -my_data_unique = my_data[!duplicated(my_data[,'mutation']),] -my_data_unique = my_data[!duplicated(my_data['mutation']),] -getwd() -setwd("/git/LSHTM_analysis/meta_data_analysis") -getwd() -getwd() -setwd("/git/github/LSHTM_analysis/meta_data_analysis") -getwd() -#=============== -# Step 1: read GWAS raw data stored in Data_original/ -#=============== -infile = read.csv("../Data_original", file.choose(), stringsAsFactors = F)) -c = file.choose() -c = file.choose(../Data_original) -c = read.csv(file.choose(), stringsAsFactors = F) -#=============== -# Step 1: read GWAS raw data stored in Data_original/ -#=============== -infile = read.csv(file.choose(), stringsAsFactors = F)) -c = read.csv(file.choose(), stringsAsFactors = F) -#=============== -# Step 1: read GWAS raw data stored in Data_original/ -#=============== -infile = read.csv(file.choose(), stringsAsFactors = F) -#=============== -# Step 1: read GWAS raw data stored in Data_original/ -#=============== -infile = read.csv(file.choose(), stringsAsFactors = F) -raw_data = infile[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")] -outdir = paste0("../mcsm_analysis",drug,"/Data/") -# define output variables -drug = 'pyrazinamide' -outdir = paste0("../mcsm_analysis",drug,"/Data/") -outdir = paste0("../mcsm_analysis/",drug,"/Data/") -outFile = "meta_data_with_AFandOR.csv" -output_filename = paste0(outdir, outFile) -output_filename -pnca_snps_or = read.csv(file.choose() -, stringsAsFactors = F -, header = T) -View(pnca_snps_or) -View(pnca_snps_or) diff --git a/meta_data_analysis/pnca_AF_and_OR_calcs.R b/meta_data_analysis/pnca_AF_and_OR_calcs.R index 3c84670..f1b5321 100644 --- a/meta_data_analysis/pnca_AF_and_OR_calcs.R +++ b/meta_data_analysis/pnca_AF_and_OR_calcs.R @@ -1,16 +1,59 @@ +#============================================ +# TASK: to calculate allele frequency and OR +# on master data +#=========================================== +homedir = '~' getwd() -setwd("/git/github/git/LSHTM_analysis/meta_data_analysis") +#setwd('~/git/LSHTM_analysis/meta_data_analysis') +setwd(paste0(homedir, '/', 'git/LSHTM_analysis/meta_data_analysis')) getwd() + +#%% variable assignment: input and output paths & filenames +drug = 'pyrazinamide' +gene = 'pncA' +gene_match = paste0(gene,'_p.') +print(gene_match) + +#======= +# input dir +#======= +# file1: Raw data +#indir = 'git/Data/pyrazinamide/input/original' +indir = paste0('git/Data', '/', drug, '/', 'input/original') +in_filename = 'original_tanushree_data_v2.csv' +infile = paste0(homedir, '/', indir, '/', in_filename) +print(paste0('Reading infile:', ' ', infile) ) + +# file2: file to extract valid snps and add calcs to: pnca_metadata.csv {outfile3 from data extraction script} +indir_metadata = paste0('git/Data', '/', drug, '/', 'output') +in_filename_metadata = 'pnca_metadata.csv' +infile_metadata = paste0(homedir, '/', indir_metadata, '/', in_filename_metadata) +print(paste0('Reading metadata infile:', infile_metadata)) + +#========= +# output dir +#========= +# output filename in respective section at the time of outputting files +#outdir = 'git/Data/pyrazinamide/output' +outdir = paste0('git/Data', '/', drug, '/', 'output') +out_filename = paste0(tolower(gene),'_', 'meta_data_with_AFandOR.csv') +outfile = paste0(homedir, '/', outdir, '/', out_filename) +print(paste0('Output file with full path:', outfile)) + +#%% end of variable assignment for input and output files #=============== -# Step 1: read GWAS raw data stored in Data_original/ -#=============== -infile = read.csv(file.choose(), stringsAsFactors = F) +# Step 1: Read master/raw data stored in Data/ +#=============== +raw_data_all = read.csv(infile, stringsAsFactors = F) -raw_data = infile[,c("id" +raw_data = raw_data_all[,c("id" , "pyrazinamide" , "dr_mutations_pyrazinamide" , "other_mutations_pyrazinamide")] +rm(raw_data_all) + +rm(indir, in_filename, infile) ##### # 1a: exclude na @@ -18,7 +61,7 @@ raw_data = infile[,c("id" raw_data = raw_data[!is.na(raw_data$pyrazinamide),] total_samples = length(unique(raw_data$id)) -print(total_samples) +print(paste0('Total samples without NA in', ' ', drug, 'is:', total_samples)) # sanity check: should be true is.numeric(total_samples) @@ -36,10 +79,15 @@ head(raw_data$all_mutations_pyrazinamide) raw_data$all_muts_pnca = tolower(raw_data$all_mutations_pyrazinamide) # sanity checks -table(grepl("pnca_p",raw_data$all_muts_pnca)) +#table(grepl("pnca_p",raw_data$all_muts_pnca)) +print(paste0('converting gene match:', gene_match, ' ', 'to lowercase')) +gene_match = tolower(gene_match) + +table(grepl(gene_match,raw_data$all_muts_pnca)) # sanity check: should be TRUE -sum(table(grepl("pnca_p",raw_data$all_muts_pnca))) == total_samples +#sum(table(grepl("pnca_p",raw_data$all_muts_pnca))) == total_samples +sum(table(grepl(gene_match,raw_data$all_muts_pnca))) == total_samples # set up variables: can be used for logistic regression as well i = "pnca_p.ala134gly" # has a NA, should NOT exist @@ -56,17 +104,40 @@ table(mut, dst) #fisher.test(table(mut, dst)) #table(mut) -###### read list of muts to calculate OR for (fname3 from pnca_data_extraction.py) -pnca_snps_or = read.csv(file.choose() +#=============== +# Step 2: Read valid snps for which OR can be calculated (infile_comp_snps.csv) +#=============== +print(paste0('Reading metadata infile:', infile_metadata)) + +pnca_metadata = read.csv(infile_metadata +# , file.choose() , stringsAsFactors = F , header = T) -# extract unique snps to iterate over for AF and OR calcs -# total no of unique snps -# AF and OR calculations +# clear variables +rm(homedir, in_filename, indir, infile) +rm(indir_metadata, infile_metadata, in_filename_metadata) + +# count na in pyrazinamide column +tot_pza_na = sum(is.na(pnca_metadata$pyrazinamide)) +expected_rows = nrow(pnca_metadata) - tot_pza_na + +# drop na from the pyrazinamide colum +pnca_snps_or = pnca_metadata[!is.na(pnca_metadata$pyrazinamide),] + +# sanity check +if(nrow(pnca_snps_or) == expected_rows){ + print('PASS: no. of rows match with expected_rows') +} else{ + print('FAIL: nrows mismatch.') +} + +# extract unique snps to iterate over for AF and OR calcs pnca_snps_unique = unique(pnca_snps_or$mutation) +print(paste0('Total no. of distinct comp snps to perform OR calcs: ', length(pnca_snps_unique))) + # Define OR function x = as.numeric(mut) y = dst @@ -111,131 +182,106 @@ afs['pnca_p.trp68gly'] afs['pnca_p.gln10pro'] afs['pnca_p.leu4ser'] -#plot(density(log(ors))) -#plot(-log10(pvals)) -#hist(log(ors) -# ,breaks = 100 -# ) +plot(density(log(ors))) +plot(-log10(pvals)) +hist(log(ors) + , breaks = 100 + ) -# subset df cols to add to the calc param df -pnca_snps_cols = pnca_snps_or[c('mutation_info', 'mutation', 'Mutationinformation')] -pnca_snps_cols = pnca_snps_cols[!duplicated(pnca_snps_cols$mutation),] - -rownames(pnca_snps_cols) = pnca_snps_cols$mutation -head(rownames(pnca_snps_cols)) -#snps_with_AF_and_OR +# FIXME: could be good to add a sanity check +if (table(names(ors) == names(pvals)) & table(names(ors) == names(afs)) & table(names(pvals) == names(afs)) == length(pnca_snps_unique)){ +print('PASS: names of ors, pvals and afs match: proceed with combining into a single df') +} else{ + print('FAIL: names of ors, pvals and afs mismatch') +} # combine comb_AF_and_OR = data.frame(ors, pvals, afs) head(rownames(comb_AF_and_OR)) -# sanity checks: should be the same -dim(comb_AF_and_OR); dim(pnca_snps_cols) -table(rownames(comb_AF_and_OR)%in%rownames(pnca_snps_cols)) - -table(rownames(pnca_snps_cols)%in%rownames(comb_AF_and_OR)) - -# merge the above two df whose dim you checked -snps_with_AF_and_OR = merge(comb_AF_and_OR, pnca_snps_cols - , by = "row.names" -# , all.x = T - ) - -#rm(pnca_snps_cols, pnca_snps_or, raw_data) - -#=============== -# Step 3: Read data file where you will add the calculated OR -# Note: this is the big file with one-many relationship between snps and lineages -# i.e fname4 from 'pnca_extraction.py' -#=============== -my_data = read.csv(file.choose() - , row.names = 1 - , stringsAsFactors = FALSE) - -head(my_data) -length(unique(my_data$id)) - -# check if first col is 'id': should be TRUE -colnames(my_data)[1] == 'id' +# add rownames of comb_AF_and_OR as an extra column 'mutation' to allow merging based on this column +comb_AF_and_OR$mutation = rownames(comb_AF_and_OR) # sanity check -head(my_data$mutation) +if (table(rownames(comb_AF_and_OR) == comb_AF_and_OR$mutation)){ + print('PASS: rownames and mutaion col values match') +}else{ + print('FAIL: rownames and mutation col values mismatch') +} -# FILES TO MERGE: -# comb_AF_and_OR: file containing OR -# my_data = big meta data file -# linking column: mutation +############ +# Merge 1: +########### +df1 = pnca_metadata +df2 = comb_AF_and_OR -head(my_data) -merged_df = merge(my_data # big file - , snps_with_AF_and_OR # small (afor file) +head(df1$mutation); head(df2$mutation) + +# FIXME: newlines +print(paste0('merging two dfs: ' + ,'\ndf1 (big df i.e. meta data) nrows: ', nrow(df1) + ,'\ndf2 (small df i.e af, or, pval) nrows: ', nrow(df2) + , 'expected rows in merged df: ', nrow(df1), 'expected cols in merged_df: ', (ncol(df1) + ncol(df2) - 1))) + +merged_df = merge(df1 # big file + , df2 # small (afor file) , by = "mutation" , all.x = T) # because you want all the entries of the meta data -# sanity checks: should be True -# FIXME: I have checked this manually, but make it so it is a pass or a fail! -comb_AF_and_OR[rownames(comb_AF_and_OR) == "pnca_p.gln10pro",]$ors -merged_df[merged_df$Mutationinformation.x == "Q10P",]$ors +# sanity check +if(ncol(merged_df) == (ncol(df1) + ncol(df2) - 1)){ + print(paste0('PASS: no. of cols is as expected: ', ncol(merged_df))) +} else{ + print('FAIL: no.of cols mistmatch') +} -merged_df[merged_df$Mutationinformation.x == "Q10P",] - -# sanity check: very important! -colnames(merged_df) - -table(merged_df$mutation_info.x == merged_df$mutation_info.y) - -#FIXME: what happened to other 7 and FALSE -table(merged_df$Mutationinformation.x == merged_df$Mutationinformation.y) - -# problem -identical(merged_df$Mutationinformation.x, merged_df$Mutationinformation.y) - -#merged_df[merged_df$Mutationinformation.x != merged_df$Mutationinformation.y,] - -#throw away the y because that is a smaller df -d1 = which(colnames(merged_df) == "mutation_info.y") #21 -d2 = which(colnames(merged_df) == "Mutationinformation.y") #22 - -merged_df2 = merged_df[-c(d1, d2)] #3093 20 -colnames(merged_df2) - -# rename cols -colnames(merged_df2)[colnames(merged_df2)== "mutation_info.x"] <- "mutation_info" -colnames(merged_df2)[colnames(merged_df2)== "Mutationinformation.x"] <- "Mutationinformation" - -colnames(merged_df2) - -# should be 0 -sum(is.na(merged_df2$Mutationinformation)) +# quick check +i = "pnca_p.ala134gly" # has all NAs in pyrazinamide, should be NA in ors, etc. +merged_df[merged_df$mutation == i,] # count na in each column -na_count = sapply(merged_df2, function(y) sum(length(which(is.na(y))))); na_count +na_count = sapply(merged_df, function(y) sum(length(which(is.na(y))))); na_count # only some or and Af should be NA #Row.names ors pvals afs -#81 81 81 81 +#63 63 63 63 +# reassign custom colnames +colnames(merged_df)[colnames(merged_df)== "ors"] <- "OR" +colnames(merged_df)[colnames(merged_df)== "afs"] <- "AF" +colnames(merged_df)[colnames(merged_df)== "pvals"] <- "pvalue" -colnames(merged_df2)[colnames(merged_df2)== "ors"] <- "OR" -colnames(merged_df2)[colnames(merged_df2)== "afs"] <- "AF" -colnames(merged_df2)[colnames(merged_df2)== "pvals"] <- "pvalue" - -colnames(merged_df2) +colnames(merged_df) # add log OR and neglog pvalue -merged_df2$logor = log(merged_df2$OR) -is.numeric(merged_df2$logor) +merged_df$logor = log(merged_df$OR) +is.numeric(merged_df$logor) -merged_df2$neglog10pvalue = -log10(merged_df2$pvalue) -is.numeric(merged_df2$neglog10pvalue) +merged_df$neglog10pvalue = -log10(merged_df$pvalue) +is.numeric(merged_df$neglog10pvalue) -# write file out -#write.csv(merged_df, "../Data/meta_data_with_AFandOR_JP_TT.csv") +merged_df$AF_percent = merged_df$AF*100 +is.numeric(merged_df$AF_percent) -# define output variables -drug = 'pyrazinamide' -out_dir = paste0("../mcsm_analysis/",drug,"/Data/") -outFile = "meta_data_with_AFandOR.csv" -output_filename = paste0(outdir, outFile) +# check AFs +#i = 'pnca_p.trp68gly' +i = 'pnca_p.gln10pro' +#i = 'pnca_p.leu4ser' +merged_df[merged_df$mutation == i,] -write.csv(merged_df2, output_filename +# FIXME: harcoding (beware!), NOT FATAL though! +ncol_added = 3 + +print(paste0('Added', ncol_added, ' ', 'more cols to merged_df i.e log10 OR and -log10 P-val: ' + , 'no. of cols in merged_df now: ', ncol(merged_df))) + +#%% write file out: pnca_meta_data_with_AFandOR +print(paste0('writing output file in: ' + , 'Filename: ', out_filename + , 'Path:', outdir)) + +write.csv(merged_df, outfile , row.names = F) + +print(paste0('Finished writing:', out_filename, '\nExpected no. of cols:', ncol(merged_df))) +print('======================================================================') +rm(out_filename) diff --git a/meta_data_analysis/pnca_data_extraction.py b/meta_data_analysis/pnca_data_extraction.py index 7588122..72c7860 100755 --- a/meta_data_analysis/pnca_data_extraction.py +++ b/meta_data_analysis/pnca_data_extraction.py @@ -36,9 +36,10 @@ import numpy as np # 1) pnca_ambiguous_muts.csv # 2) pnca_mcsm_snps.csv # 3) pnca_metadata.csv -# 4) pnca_comp_snps.csv -# 5) pnca_all_muts_msa.csv -# 6) pnca_mutational_positons.csv +# 4) pnca_comp_snps.csv <---deleted> + +# 4) pnca_all_muts_msa.csv +# 5) pnca_mutational_positons.csv #======================================================== #%% specify homedir as python doesn't recognise tilde homedir = os.path.expanduser('~') @@ -52,23 +53,25 @@ os.getcwd() from reference_dict import my_aa_dict #CHECK DIR STRUC THERE! #======================================================== -#drug = 'pyrazinamide' +#%% variable assignment: input and output paths & filenames + +drug = 'pyrazinamide' gene = 'pncA' gene_match = gene + '_p.' -#%% specify variables for input and output paths and filenames - #======= # input dir #======= -indir = 'git/Data/pyrazinamide/input/original' - +#indir = 'git/Data/pyrazinamide/input/original' +indir = 'git/Data' + '/' + drug + '/' + 'input/original' #========= # output dir #========= # several output files # output filenames in respective sections at the time of outputting files -outdir = 'git/Data/pyrazinamide/output' +#outdir = 'git/Data/pyrazinamide/output' +outdir = 'git/Data' + '/' + drug + '/' + 'output' + #%%end of variable assignment for input and output files #============================================================================== #%% Read files @@ -77,7 +80,7 @@ in_filename = 'original_tanushree_data_v2.csv' infile = homedir + '/' + indir + '/' + in_filename print('Reading input master file:', infile) -master_data = pd.read_csv(infile, sep = ',') +master_data = pd.read_csv(infile, sep = ',') # column names #list(master_data.columns) @@ -334,6 +337,8 @@ print('Writing file: common ids:\n', common_ids.to_csv(outfile0) print('======================================================================') +del(out_filename0) + # clear variables del(dr_id, other_id, meta_data_dr, meta_data_other, common_ids, common_mut_ids, common_ids2) @@ -701,21 +706,6 @@ del(c1, c2, col_to_split1, col_to_split2, comp_pnca_samples, dr_WF0, dr_df, dr_m #%% end of data extraction and some files writing. Below are some more files writing. - - - - - - - - - - - - - - - #%%: write file: ambiguous muts # uncomment as necessary #print(outdir) @@ -735,6 +725,8 @@ inspect.to_csv(outfile1) print('Finished writing:', out_filename1, '\nExpected no. of rows (no. of samples with the ambiguous muts present):', dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum()) print('======================================================================') del(out_filename1) + + #%% #=========== # Split 'mutation' column into three: wild_type, position and @@ -891,6 +883,8 @@ print('Finished writing:', out_filename2, '\nNo. of rows:', len(snps_only) ) print('======================================================================') del(out_filename2) + + #%% Write file: pnca_metadata (i.e pnca_LF1) # where each row has UNIQUE mutations NOT unique sample ids out_filename3 = gene.lower() + '_' + 'metadata.csv' @@ -903,45 +897,10 @@ pnca_LF1.to_csv(outfile3, header = True, index = False) print('Finished writing:', out_filename3, '\nNo. of rows:', len(pnca_LF1), '\nNo. of cols:', len(pnca_LF1.columns) ) - print('======================================================================') +del(out_filename3) -#%% Write file: comp SNPs (i.e snps without any corresponding 'NA' in the -# column to allow OR calcs) -# remove NA from pyrazinamide cols -pnca_LF2 = pnca_LF1.dropna(subset=['pyrazinamide']) - -print('extracting OR muts by removing NAs from pyrazinamide cols') -if pnca_LF2.pyrazinamide.isna().sum() > 0: - print('FAIL: NAs NOT removed successfully') -else: - print('PASS: NAs removed successfully') - -# extracting comp snps only -comp_snps_only = pd.DataFrame(pnca_LF2['mutation'].unique()) -#print('Total no. of comp snps:', len(comp_snps_only)) -comp_snps_only.head() - -# assign column name -comp_snps_only.columns = ['mutation'] - -# count how many positions this corresponds to -comp_pos_only = pd.DataFrame(pnca_LF2['position'].unique()) -#print('Total no. of pos corresponding to comp_snps:', len(comp_pos_only)) - -out_filename4 = gene.lower() + '_' + 'comp_snps.csv' -outfile4 = homedir + '/' + outdir + '/' + out_filename4 -print('Writing file: comp snps to allow OR calcs', - '\nFilename:', out_filename4, - '\nPath:', homedir + '/' + outdir, - '\nNo. of comp muts:', len(comp_snps_only), - '\nNo. of distinct positions for comp muts:', len(comp_pos_only) ) - -comp_snps_only.to_csv(outfile4, header = True, index = False) - -print('Finished writing:', out_filename4, - '\nNo. of rows:', len(comp_snps_only) ) #%% write file: mCSM style but with repitions for MSA and logo plots all_muts_msa = pd.DataFrame(pnca_LF1['Mutationinformation']) all_muts_msa.head() @@ -970,21 +929,22 @@ else: '\nDebug please!') print('======================================================================') -out_filename5 = gene.lower() + '_' + 'all_muts_msa.csv' -outfile5 = homedir + '/' + outdir + '/' + out_filename5 +out_filename4 = gene.lower() + '_' + 'all_muts_msa.csv' +outfile4 = homedir + '/' + outdir + '/' + out_filename4 print('Writing file: mCSM style muts for msa', '\nmutation format (SNP): {Wt}{Mut}', '\nNo.of lines of msa:', len(all_muts_msa), - '\nFilename:', out_filename5, + '\nFilename:', out_filename4, '\nPath:', homedir +'/'+ outdir) -all_muts_msa_sorted.to_csv(outfile5, header = False, index = False) +all_muts_msa_sorted.to_csv(outfile4, header = False, index = False) -print('Finished writing:', out_filename5, +print('Finished writing:', out_filename4, '\nNo. of rows:', len(all_muts_msa) ) print('======================================================================') -del(out_filename5) +del(out_filename4) + #%% write file for mutational positions # count how many positions this corresponds to @@ -999,20 +959,22 @@ pos_only.position.dtype # sort by position value pos_only_sorted = pos_only.sort_values(by = 'position', ascending = True) -out_filename6 = gene.lower() + '_' + 'mutational_positons.csv' -outfile6 = homedir + '/' + outdir + '/' + out_filename6 +out_filename5 = gene.lower() + '_' + 'mutational_positons.csv' +outfile5 = homedir + '/' + outdir + '/' + out_filename5 print('Writing file: mutational positions', '\nNo. of distinct positions:', len(pos_only_sorted), - '\nFilename:', out_filename6, + '\nFilename:', out_filename5, '\nPath:', homedir +'/'+ outdir) -pos_only_sorted.to_csv(outfile6, header = True, index = False) +pos_only_sorted.to_csv(outfile5, header = True, index = False) -print('Finished writing:', out_filename6, +print('Finished writing:', out_filename5, '\nNo. of rows:', len(pos_only_sorted) ) print('======================================================================') -del(out_filename6) +del(out_filename5) + + #%% end of script print('======================================================================') print(u'\u2698' * 50,