, stringsAsFactors = F)

# ---------------------------------------------------------------------------
# Single-snp trial runs of the logistic regression (interactive history).
# One statement per line has been restored: inline '#' comments previously
# swallowed the statements that followed them on the same physical line.
# Relies on objects created elsewhere in the session: raw_data, i,
# pnca_snps_or_copy and pnca_snps_unique.
# ---------------------------------------------------------------------------

# IV: corresponds to each unique snp
# fixed = TRUE matches the snp name literally ('.' is not a regex wildcard)
x = as.numeric(grepl(i, raw_data$all_muts_pza, fixed = TRUE))

# DV: pyrazinamide 0 or 1
y = as.numeric(raw_data$pyrazinamide)

table(y,x)

# run glm model
model = glm(y ~ x, family = binomial)
#model = glm(y ~ x, family = binomial(link = "logit"))
summary(model)

#**********
# extract relevant model output
#**********
# extract log OR i.e the Beta estimate of the logistic model for a given snp
my_logor = summary(model)$coefficients[2,1]
print(paste0('Beta:', my_logor))

# extract SE of the logistic model for a given snp
my_se = summary(model)$coefficients[2,2]
print(paste0('SE:', my_se))

# extract Z of the logistic model for a given snp
my_zval = summary(model)$coefficients[2,3]
print(paste0('Z-value:', my_zval))

# Derive OR i.e exp(my_logor) from the logistic model for a given snp
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
my_or = exp(summary(model)$coefficients[2,1])
print(paste0('OR:', my_or))

# sanity check : should be TRUE
# (tolerance-based comparison; exact '==' on doubles can be FALSE after exp/log)
isTRUE(all.equal(log(my_or), my_logor))

# extract P-value of the logistic model for a given snp
my_pval = summary(model)$coefficients[2,4]
print(paste0('P-value:', my_pval))

# extract confint interval of snp (2 steps, since the output is a named number)
ci_mod = exp(confint(model))[2,]
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
print(paste0('CI:', my_ci))

#*************
# Assign the regression output in the original df
# you can use ('=' or '<-/->')
#*************
#pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i] = my_logor
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i]
my_logor
pnca_snps_or$Mutationinformation == i
View(pnca_snps_or)

#===============
# Step 4: Calculate for one snp
# using i, when you run the loop, it is easy
#===============
i = "pnca_p.trp68gly"

pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
                        , stringsAsFactors = FALSE
                        , header = TRUE) #2133

# uncomment as necessary
pnca_snps_or = pnca_snps_or[1:5,]
pnca_snps_or = pnca_snps_or[c(1:5),]

#===============
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
                        , stringsAsFactors = FALSE
                        , header = TRUE) #2133
pnca_snps_or = pnca_snps_or[1:5,]
pnca_snps_or = pnca_snps_or[c(1:5),]
# NOTE(review): without the comma this selects COLUMNS 1:5, not rows - confirm intent
pnca_snps_or = pnca_snps_or[1:5]

pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
                        , stringsAsFactors = FALSE
                        , header = TRUE) #2133
pnca_snps_or = pnca_snps_or[1:5]

pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
                        , stringsAsFactors = FALSE
                        , header = TRUE) #2133

# invalid attempt kept for reference: c() rejects the empty trailing argument
#foo = pnca_snps_or[c(1:5,)]
foo = pnca_snps_or[c(1:5),]
foo = as.data.frame(pnca_snps_or[c(1:5),])
View(foo)

# create an empty dataframe
pnca_snps_or = as.data.frame(pnca_snps_or[c(1:5),])

# IV: corresponds to each unique snp
# fixed = TRUE matches the snp name literally ('.' is not a regex wildcard)
x = as.numeric(grepl(i, raw_data$all_muts_pza, fixed = TRUE))

# DV: pyrazinamide 0 or 1
y = as.numeric(raw_data$pyrazinamide)

table(y,x)

# run glm model
model = glm(y ~ x, family = binomial)
#model = glm(y ~ x, family = binomial(link = "logit"))
summary(model)

my_logor = summary(model)$coefficients[2,1]
print(paste0('Beta:', my_logor))

# extract SE of the logistic model for a given snp
my_se = summary(model)$coefficients[2,2]
print(paste0('SE:', my_se))

# extract Z of the logistic model for a given snp
my_zval = summary(model)$coefficients[2,3]
print(paste0('Z-value:', my_zval))

# Derive OR i.e exp(my_logor) from the logistic model for a given snp
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
my_or = exp(summary(model)$coefficients[2,1])
print(paste0('OR:', my_or))

# sanity check : should be TRUE
# (tolerance-based comparison; exact '==' on doubles can be FALSE after exp/log)
isTRUE(all.equal(log(my_or), my_logor))

# extract P-value of the logistic model for a given snp
my_pval = summary(model)$coefficients[2,4]
print(paste0('P-value:', my_pval))

# extract confint interval of snp (2 steps, since the output is a named number)
ci_mod = exp(confint(model))[2,]
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
print(paste0('CI:', my_ci))

#*************
# Assign the regression output in the original df
# you can use ('=' or '<-/->')
#*************
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]

#===============
# Step 4: Iterate through this unique list
# and calculate OR, but only for one snp
# this is test before you apply it all others
#===============
pnca_snps_or$mutation == i
View(pnca_snps_or)

# create an empty dataframe
pnca_snps_or = data.frame(mutation = i)
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]

# NOTE(review): pnca_snps_or_copy is not created anywhere above in this
# section; presumably it exists from earlier in the session - confirm
View(pnca_snps_or_copy)

#===============
# Step 4: Iterate through this unique list
# and calculate OR, but only for one snp
# this is test before you apply it all others
#===============
#reset original df so you don't make a mistake
pnca_snps_or = pnca_snps_or_copy

for (i in pnca_snps_unique){
  print(i)
}

pnca_snps_or = pnca_snps_or_copy #2133, 1

#........................................
# create an empty dataframe : uncomment as necessary
# invalid attempt kept for reference: missing closing parenthesis
#pnca_snps_or = data.frame(mutation = c(i, "blank_mut")
#........................................
# create an empty dataframe : uncomment as necessary
pnca_snps_or = data.frame(mutation = c(i, "blank_mut"))
#........................................
# ---------------------------------------------------------------------------
# One more single-snp trial, then two passes of the per-snp loop that fit
# glm(y ~ x, family = binomial) for every entry of pnca_snps_unique and store
# the estimates in pnca_snps_or; finally, attempts at building the snp list
# from meta_pza_with_AF.csv.
# One statement per line has been restored: inline '#' comments previously
# swallowed the statements that followed them on the same physical line.
# Relies on objects created elsewhere in the session: raw_data, i, i1, i2,
# pnca_snps_or_copy and pnca_snps_unique.
# ---------------------------------------------------------------------------

# create an empty dataframe : uncomment as necessary
pnca_snps_or = data.frame(mutation = c(i, "blank_mut"))
View(pnca_snps_or)

# IV: corresponds to each unique snp
# fixed = TRUE matches the snp name literally ('.' is not a regex wildcard)
x = as.numeric(grepl(i, raw_data$all_muts_pza, fixed = TRUE))

# DV: pyrazinamide 0 or 1
y = as.numeric(raw_data$pyrazinamide)

table(y,x)

# run glm model
model = glm(y ~ x, family = binomial)
#model = glm(y ~ x, family = binomial(link = "logit"))
summary(model)

#**********
# extract relevant model output
#**********
# extract log OR i.e the Beta estimate of the logistic model for a given snp
my_logor = summary(model)$coefficients[2,1]
print(paste0('Beta:', my_logor))

# extract SE of the logistic model for a given snp
my_se = summary(model)$coefficients[2,2]
print(paste0('SE:', my_se))

# extract Z of the logistic model for a given snp
my_zval = summary(model)$coefficients[2,3]
print(paste0('Z-value:', my_zval))

# Derive OR i.e exp(my_logor) from the logistic model for a given snp
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
my_or = exp(summary(model)$coefficients[2,1])
print(paste0('OR:', my_or))

# sanity check : should be TRUE
# (tolerance-based comparison; exact '==' on doubles can be FALSE after exp/log)
isTRUE(all.equal(log(my_or), my_logor))

# extract P-value of the logistic model for a given snp
my_pval = summary(model)$coefficients[2,4]
print(paste0('P-value:', my_pval))

# extract confint interval of snp (2 steps, since the output is a named number)
ci_mod = exp(confint(model))[2,]
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
print(paste0('CI:', my_ci))

#*************
# Assign the regression output in the original df
# you can use ('=' or '<-/->')
#*************
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]

View(pnca_snps_or)

pnca_snps_or = pnca_snps_or_copy #2133, 1

for (i in pnca_snps_unique){
  print(i)

  #*************
  # start logistic regression model building
  #*************
  # set the IV and DV for the logistic regression model
  # IV: corresponds to each unique snp
  # fixed = TRUE matches the snp name literally ('.' is not a regex wildcard)
  x = as.numeric(grepl(i, raw_data$all_muts_pza, fixed = TRUE))

  # DV: pyrazinamide 0 or 1
  y = as.numeric(raw_data$pyrazinamide)

  # bare expressions are not auto-printed inside a for loop
  table(y,x)

  # run glm model
  model = glm(y ~ x, family = binomial)
  #model = glm(y ~ x, family = binomial(link = "logit"))
  summary(model)

  #**********
  # extract relevant model output
  #**********
  # extract log OR i.e the Beta estimate of the logistic model for a given snp
  my_logor = summary(model)$coefficients[2,1]
  print(paste0('Beta:', my_logor))

  # extract SE of the logistic model for a given snp
  my_se = summary(model)$coefficients[2,2]
  print(paste0('SE:', my_se))

  # extract Z of the logistic model for a given snp
  my_zval = summary(model)$coefficients[2,3]
  print(paste0('Z-value:', my_zval))

  # Derive OR i.e exp(my_logor) from the logistic model for a given snp
  #my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
  my_or = exp(summary(model)$coefficients[2,1])
  print(paste0('OR:', my_or))

  # sanity check : should be TRUE
  # (tolerance-based comparison; exact '==' on doubles can be FALSE after exp/log)
  isTRUE(all.equal(log(my_or), my_logor))

  # extract P-value of the logistic model for a given snp
  my_pval = summary(model)$coefficients[2,4]
  print(paste0('P-value:', my_pval))

  # extract confint interval of snp (2 steps, since the output is a named number)
  ci_mod = exp(confint(model))[2,]
  my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
  print(paste0('CI:', my_ci))

  #*************
  # Assign the regression output in the original df
  # you can use ('=' or '<-/->')
  #*************
  #pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
  my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
  my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
  my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
  my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
  my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
  my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
}

warnings()
View(pnca_snps_or)
View(pnca_snps_or_copy)

#sanity check
pnca_snps_or$mutation == i1

#sanity check
# NOTE(review): without a trailing comma these two index COLUMNS with a
# row-length logical vector; the ',]' form below is the row lookup
pnca_snps_or[pnca_snps_or$mutation == i1]
pnca_snps_or[pnca_snps_or$mutation == i2]
pnca_snps_or[pnca_snps_or$mutation == i2,]

pnca_snps_or1 = unique(pnca_snps_or)
write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv")

# you only need it for the unique mutations
pnca_snps_or = unique(pnca_snps_or) #2133, 1

for (i in pnca_snps_unique){
  print(i)

  #*************
  # start logistic regression model building
  #*************
  # set the IV and DV for the logistic regression model
  # IV: corresponds to each unique snp
  # fixed = TRUE matches the snp name literally ('.' is not a regex wildcard)
  x = as.numeric(grepl(i, raw_data$all_muts_pza, fixed = TRUE))

  # DV: pyrazinamide 0 or 1
  y = as.numeric(raw_data$pyrazinamide)

  # bare expressions are not auto-printed inside a for loop
  table(y,x)

  # run glm model
  model = glm(y ~ x, family = binomial)
  #model = glm(y ~ x, family = binomial(link = "logit"))
  summary(model)

  #**********
  # extract relevant model output
  #**********
  # extract log OR i.e the Beta estimate of the logistic model for a given snp
  my_logor = summary(model)$coefficients[2,1]
  print(paste0('Beta:', my_logor))

  # extract SE of the logistic model for a given snp
  my_se = summary(model)$coefficients[2,2]
  print(paste0('SE:', my_se))

  # extract Z of the logistic model for a given snp
  my_zval = summary(model)$coefficients[2,3]
  print(paste0('Z-value:', my_zval))

  # Derive OR i.e exp(my_logor) from the logistic model for a given snp
  #my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
  my_or = exp(summary(model)$coefficients[2,1])
  print(paste0('OR:', my_or))

  # sanity check : should be TRUE
  # (tolerance-based comparison; exact '==' on doubles can be FALSE after exp/log)
  isTRUE(all.equal(log(my_or), my_logor))

  # extract P-value of the logistic model for a given snp
  my_pval = summary(model)$coefficients[2,4]
  print(paste0('P-value:', my_pval))

  # extract confint interval of snp (2 steps, since the output is a named number)
  ci_mod = exp(confint(model))[2,]
  my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
  print(paste0('CI:', my_ci))

  #*************
  # Assign the regression output in the original df
  # you can use ('=' or '<-/->')
  #*************
  #pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
  my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
  my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
  my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
  my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
  my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
  my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
}

View(pnca_snps_or)

# pasted console output (not code); which quantities these are is not
# recorded here:
# 2.290256e+01 1.561132e+06 3.242285e-04

#sanity check
pnca_snps_or[pnca_snps_or$mutation == i1]
pnca_snps_or[pnca_snps_or$mutation == i2,]

# NOTE(review): this writes pnca_snps_or1, which was computed BEFORE the loop
# directly above ran - confirm whether unique(pnca_snps_or) should be refreshed
write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv")

my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
                   , stringsAsFactors = FALSE) #11374, 19
View(my_data)

# remove the first column
my_data = my_data[-1] #11374, 18

# check if first col is 'id': should be TRUE
colnames(my_data)[1] == 'id'

# sanity check
snps_all = unique(my_data$mutation)# 337

pnca_snps_or = snps_all
pnca_snps_or = as.data.frame(snps_all)
View(pnca_snps_or)

# invalid attempt kept for reference: unary minus cannot be applied to a string
#snps_all[-"true_wt"]
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
View(pnca_snps_or)

snps_all = as.data.frame(snps_all)
View(snps_all)

#remove true_wt entry
w1 = which(rownames(snps_all) == "true_wt")
View(snps_all)

#remove true_wt entry
w1 = which(snps_all$snps_all == "true_wt")
rm(pnca_snps_or)
# NOTE(review): snps_all is a data frame here, so [-w1] / [,-w1] drop a
# COLUMN, not the true_wt row - confirm intent
pnca_snps_or = snps_all[-w1]
pnca_snps_or = snps_all[,-w1]
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])

#remove true_wt entry
# invalid attempt kept for reference: which() needs a logical argument and the
# comparison sits outside the call
#w1 = which(snps_all) == "true_wt"
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])

my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
                   , stringsAsFactors = FALSE) #11374, 19

# remove the first column
my_data = my_data[-1] #11374, 18

# check if first col is 'id': should be TRUE
colnames(my_data)[1] == 'id'

# ---------------------------------------------------------------------------
# Builds the snp list from my_data, runs the per-snp logistic regression
# loop, then (later in the history) the set-up statements: working directory,
# raw data import and output file naming.
# One statement per line has been restored: inline '#' comments previously
# swallowed the statements that followed them on the same physical line.
# Relies on objects created elsewhere in the session: my_data, raw_data,
# pnca_snps_or_copy and pnca_snps_unique.
# ---------------------------------------------------------------------------

# sanity check
snps_all = unique(my_data$mutation)# 337
snps_all = as.data.frame(snps_all)
snps_all[-c(1,1)]
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
pnca_snps_or = as.data.frame(snps_all[, -c(1,1)])

#remove true_wt entry
#w1 = which(snps_all) == "true_wt"
pnca_snps_or = snps_all
pnca_snps_or = pnca_snps_or_copy

#remove true_wt entry
#w1 = which(snps_all) == "true_wt"
pnca_snps_or = snps_all
pnca_snps_or -> pnca_snps_or_copy

#===============
# Step 4: Iterate through this unique list
# and calculate OR for each snp
# and assign to the pnca_snps_or df that has
# each row as a unique snp
#===============
# reset original df so you don't make a mistake: IMPORTANT
pnca_snps_or = pnca_snps_or_copy #2133, 1

# you only need it for the unique mutations
pnca_snps_or = unique(pnca_snps_or) #337, 1

# NOTE(review): the data frame built above has a single column named
# 'snps_all', while the loop assigns by pnca_snps_or$mutation - confirm the
# column name matches before trusting the stored results
for (i in pnca_snps_unique){
  print(i)

  #*************
  # start logistic regression model building
  #*************
  # set the IV and DV for the logistic regression model
  # IV: corresponds to each unique snp
  # fixed = TRUE matches the snp name literally ('.' is not a regex wildcard)
  x = as.numeric(grepl(i, raw_data$all_muts_pza, fixed = TRUE))

  # DV: pyrazinamide 0 or 1
  y = as.numeric(raw_data$pyrazinamide)

  # bare expressions are not auto-printed inside a for loop
  table(y,x)

  # run glm model
  model = glm(y ~ x, family = binomial)
  #model = glm(y ~ x, family = binomial(link = "logit"))
  summary(model)

  #**********
  # extract relevant model output
  #**********
  # extract log OR i.e the Beta estimate of the logistic model for a given snp
  my_logor = summary(model)$coefficients[2,1]
  print(paste0('Beta:', my_logor))

  # extract SE of the logistic model for a given snp
  my_se = summary(model)$coefficients[2,2]
  print(paste0('SE:', my_se))

  # extract Z of the logistic model for a given snp
  my_zval = summary(model)$coefficients[2,3]
  print(paste0('Z-value:', my_zval))

  # Derive OR i.e exp(my_logor) from the logistic model for a given snp
  #my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
  my_or = exp(summary(model)$coefficients[2,1])
  print(paste0('OR:', my_or))

  # sanity check : should be TRUE
  # (tolerance-based comparison; exact '==' on doubles can be FALSE after exp/log)
  isTRUE(all.equal(log(my_or), my_logor))

  # extract P-value of the logistic model for a given snp
  my_pval = summary(model)$coefficients[2,4]
  print(paste0('P-value:', my_pval))

  # extract confint interval of snp (2 steps, since the output is a named number)
  ci_mod = exp(confint(model))[2,]
  my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
  print(paste0('CI:', my_ci))

  #*************
  # Assign the regression output in the original df
  # you can use ('=' or '<-/->')
  #*************
  #pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
  my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
  my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
  my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
  my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
  my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
  my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
}

getwd()
#setwd("~/Documents/git/LSHTM_Y1_PNCA/meta_data_analysis") # work
setwd("~/git/LSHTM_Y1_PNCA/meta_data_analysis") # thinkpad
#setwd("/Users/tanu/git/LSHTM_Y1_PNCA/meta_data_analysis") # mac
getwd()

#===============
# Step 1: read raw data
#===============
raw_data<-read.csv("../Data_original/original_tanushree_data_v2.csv"
                   ,stringsAsFactors = FALSE)[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]#19265, 4

# keep only the rows where pyrazinamide is not NA
raw_data<-raw_data[!is.na(raw_data$pyrazinamide),]#12511, 4

# combine the two mutation columns
raw_data$all_mutations_pyrazinamide<-paste(raw_data$dr_mutations_pyrazinamide, raw_data$other_mutations_pyrazinamide)#12511, 5
head(raw_data$all_mutations_pyrazinamide)

# create yet another column that contains all the mutations but in lower case
raw_data$all_muts_pza = tolower(raw_data$all_mutations_pyrazinamide) #12511, 6

table(grepl("pnca_p",raw_data$all_muts_pza))
#FALSE TRUE
#10603 1908

pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
                        , stringsAsFactors = FALSE
                        , header = TRUE) #2133

# subset a small section to test
#pnca_snps_or_copy = pnca_snps_or
#pnca_snps_or = pnca_snps_or_copy

pnca_snps_unique = unique(pnca_snps_or$mutation) #293

i2 = "pnca_p.trp68gly" # Should exist
# fixed = TRUE matches the snp name literally ('.' is not a regex wildcard)
grep(i2, pnca_snps_unique, fixed = TRUE)

my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
                   , stringsAsFactors = FALSE) #11374, 19

# remove the first column
my_data = my_data[-1] #11374, 18

# check if first col is 'id': should be TRUE
colnames(my_data)[1] == 'id'

# sanity check
head(my_data$mutation)

# NOTE(review): the next assignment replaces my_data with the result of
# unique() on one column, so the data-frame style subsetting attempts that
# follow can only work if my_data is re-read first - confirm which of these
# de-duplication attempts is the intended one
my_data = unique(my_data$mutation)
my_data[!duplicated(my_data$mutation)]
my_data_unique = my_data[!duplicated(my_data$mutation),]
my_data[!duplicated('mutation'),]
my_data_unique = my_data[!duplicated(my_data[,'mutation']),]
my_data_unique = my_data[!duplicated(my_data['mutation']),]

getwd()
setwd("/git/LSHTM_analysis/meta_data_analysis")
getwd()

getwd()
setwd("/git/github/LSHTM_analysis/meta_data_analysis")
getwd()

#===============
# Step 1: read GWAS raw data stored in Data_original/
#===============
# invalid attempt kept for reference: unbalanced closing parenthesis
#infile = read.csv("../Data_original", file.choose(), stringsAsFactors = F))
# NOTE(review): 'c' shadows the name of base::c() as a variable
c = file.choose()
# invalid attempt kept for reference: unquoted path is not valid R syntax
#c = file.choose(../Data_original)
c = read.csv(file.choose(), stringsAsFactors = FALSE)

#===============
# Step 1: read GWAS raw data stored in Data_original/
#===============
# invalid attempt kept for reference: unbalanced closing parenthesis
#infile = read.csv(file.choose(), stringsAsFactors = F))
c = read.csv(file.choose(), stringsAsFactors = FALSE)

#===============
# Step 1: read GWAS raw data stored in Data_original/
#===============
infile = read.csv(file.choose(), stringsAsFactors = FALSE)

#===============
# Step 1: read GWAS raw data stored in Data_original/
#===============
infile = read.csv(file.choose(), stringsAsFactors = FALSE)

raw_data = infile[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]

# NOTE(review): 'drug' is only assigned two statements below; this first
# attempt needs it to exist already in the session
outdir = paste0("../mcsm_analysis",drug,"/Data/")

# define output variables
drug = 'pyrazinamide'
outdir = paste0("../mcsm_analysis",drug,"/Data/")
# corrected attempt: adds the '/' between the directory name and the drug name
outdir = paste0("../mcsm_analysis/",drug,"/Data/")

outFile = "meta_data_with_AFandOR.csv"
output_filename = paste0(outdir, outFile)
output_filename