From 003b22ce3fcd279af1fab6498c1aefc58ca02bca Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 23 Jun 2020 13:07:29 +0100 Subject: [PATCH] script for calcuating various OR & output csv --- scripts/af_or_calcs.R | 412 +++++++++------------------------- scripts/af_or_calcs_scratch.R | 10 +- 2 files changed, 117 insertions(+), 305 deletions(-) mode change 100755 => 100644 scripts/af_or_calcs.R diff --git a/scripts/af_or_calcs.R b/scripts/af_or_calcs.R old mode 100755 new mode 100644 index da5e784..290cc13 --- a/scripts/af_or_calcs.R +++ b/scripts/af_or_calcs.R @@ -30,8 +30,8 @@ if(is.null(drug)|is.null(gene)) { #options(scipen = 4) #%% variable assignment: input and output paths & filenames -#drug = 'pyrazinamide' -#gene = 'pncA' +drug = 'pyrazinamide' +gene = 'pncA' gene_match = paste0(gene,'_p.') cat(gene_match) @@ -165,51 +165,21 @@ gene_snps_unique = unique(gene_snps_or$mutation) cat(paste0('Total no. of distinct comp snps to perform OR calcs: ', length(gene_snps_unique))) -#===================================== -#OR calcs using the following 4 -#1) chisq.test -#2) fisher -#3) modified chisq.test -#4) logistic -#5) adjusted logistic? -#6) kinship (separate script) - -#====================================== -# TEST FOR ONE -i = "pnca_p.ala134gly" # has a NA, should NOT exist -table(grepl(i,raw_data$all_muts_gene)) - +#=========================================================================================== +######################### +# custom chisq function: +# To calculate OR +######################### i = "pnca_p.trp68gly" -table(grepl(i,raw_data$all_muts_gene)) - -i = "pnca_p.his51asp" -table(grepl(i,raw_data$all_muts_gene)) - -# IV mut = grepl(i,raw_data$all_muts_gene) +mut = as.numeric(mut) -# DV -dst = raw_data[[drug]] #or raw_data[,drug] +dst = raw_data[[drug]] -table(mut, dst) - -#=============================================== -# calculating OR - -#1) chisq : noy accurate for low counts -chisq.test(table(mut,dst)) -chisq.test(table(mut,dst))$p.value -chisq.test(table(mut,dst))$statistic - -t = chisq.test(table(mut,dst))$statistic; print(t) -names(t) -# remove suffix -#names(t2) = gsub(".X-squared", "", names(t)) - -#2) modified chisq OR: custom function #x = as.numeric(mut) #y = dst -my_chisq_or = function(x,y){ + +mychisq_or = function(x,y){ tab = as.matrix(table(x,y)) a = tab[2,2] if (a==0){ a<-0.5} @@ -220,288 +190,130 @@ my_chisq_or = function(x,y){ d = tab[1,1] if (d==0){ d<-0.5} (a/b)/(c/d) + } -my_chisq_or(mut, dst) -#3) fisher -fisher.test(table(mut, dst)) - -or_fisher = fisher.test(table(mut, dst))$estimate; print(or_fisher); cat(names(or_fisher)) -pval_fisher = fisher.test(table(mut, dst))$p.value; print(pval_fisher) # the same one to be used for custom chisq_or -ci_lb_fisher = fisher.test(table(mut, dst))$conf.int[1]; print(ci_lb_fisher) -ci_ub_fisher = fisher.test(table(mut, dst))$conf.int[2]; print(ci_ub_fisher) +or_mychisq = mychisq_or(dst, mut) +print(paste0('mychisq OR:', or_mychisq )) +#===================================== +#OR calcs using the following 4 +#1) chisq.test +#2) fisher +#3) modified chisq.test #4) logistic -summary(model<-glm(dst ~ mut - , family = binomial - #, control = glm.control(maxit = 1) - #, options(warn = 1) - )) -or_logistic = exp(summary(model)$coefficients[2,1]); print(or_logistic) -pval_logistic = summary(model)$coefficients[2,4]; print(pval_logistic) - - -#5) logistic adjusted: sample id (# identical results as unadjusted) -#c = raw_data$id[grepl(i,raw_data$all_muts_gene)] -#sid = grepl(paste(c,collapse="|"), raw_data$id) # else warning that pattern length > 1 -#table(sid) -#table(mut, dst, sid) - -#summary(model2<-glm(dst ~ mut + sid -# , family = binomial - ##, control = glm.control(maxit = 1) - ##, options(warn = 1) -# )) -#or_logistic2 = exp(summary(model2)$coefficients[2,1]); print(or_logistic2) -#pval_logistic2 = summary(model2)$coefficients[2,4]; print(pval_logistic2) +#5) adjusted logistic? +#6) kinship (separate script) +#====================================== +# TEST FOR a few muts: sapply and df #=============================================== +snps <- gene_snps_unique # reassign so you test with subset of muts +#snps <- gene_snps_unique[1:2] +cat(paste0('Running calculations for:', length(snps), ' nssnps\n' +, 'gene: ', gene +, '\ndrug: ', drug )) -###################### -# AF and OR for all muts: sapply -###################### -print(table(dst)); print(table(mut)) # must exist -#dst = raw_data[[drug]] #or raw_data[,drug] +# DV: pyrazinamide 0 or 1 +dst = raw_data[[drug]] -# af -afs = sapply(gene_snps_unique,function(m){ - mut = grepl(m,raw_data$all_muts_gene) - mean(mut) -}) +# initialise an empty df +ors_df = data.frame() -#afs -head(afs) - -#1) chi square: original -statistic_chi = sapply(gene_snps_unique,function(m){ - mut = grepl(m,raw_data$all_muts_gene) - chisq.test(mut,dst)$statistic +x = sapply(snps,function(m){ -}) - -# statistic_chi: has suffix added of '.X-squared' -stat_chi = statistic_chi - -# remove suffix -names(stat_chi) = gsub(".X-squared", "", names(statistic_chi)) - -if (names(stat_chi)!= names(statistic_chi) && stat_chi == statistic_chi){ - cat('Sanity check passed: suffix removed correctly\n\n' - , 'names with suffix:', head(names(statistic_chi)), '\n\n' - , 'names without suffix:', head(names(stat_chi)), '\n\n' - , 'values in var with suffix:', head(statistic_chi),'\n' - , 'values in var without suffix:', head(stat_chi) - ) -}else{ - print('FAIL: suffix removal unsuccessful! Please Debug') -} - -## pval -pvals_chi = sapply(gene_snps_unique,function(m){ mut = grepl(m,raw_data$all_muts_gene) - chisq.test(mut,dst)$p.value -}) + mut = as.numeric(mut) + cat(paste0('Running mutation:', m, '\n')) -#pvals_chi -head(pvals_chi) - -#2) chi square: custom -ors_chi_cus = sapply(gene_snps_unique,function(m){ - mut = grepl(m,raw_data$all_muts_gene) - my_chisq_or(mut,dst) -}) - -#ors_chi_cus -head(ors_chi_cus) - -## pval:fisher (use the same one for custom chi sqaure) -pvals_fisher = sapply(gene_snps_unique,function(m){ - mut = grepl(m,raw_data$all_muts_gene) - fisher.test(mut,dst)$p.value -}) - -#pvals_fisher -head(pvals_fisher) - -#3) fisher -odds_ratio_fisher = sapply(gene_snps_unique,function(m){ - mut = grepl(m,raw_data$all_muts_gene) - fisher.test(mut,dst)$estimate; -}) - -#ors_fisher -head(odds_ratio_fisher) - -# statistic_chi: has suffix added of '.X-squared' -head(odds_ratio_fisher) - -# remove suffix -ors_fisher = odds_ratio_fisher -names(ors_fisher) = gsub(".odds ratio", "", names(odds_ratio_fisher)) - -if (names(ors_fisher)!= names(odds_ratio_fisher) && ors_fisher == odds_ratio_fisher){ - cat('Sanity check passed: suffix removed correctly\n\n' - , 'names with suffix:', head(names(odds_ratio_fisher)), '\n\n' - , 'names without suffix:', head(names(ors_fisher)), '\n\n' - , 'values in var with suffix:', head(odds_ratio_fisher),'\n' - , 'values in var without suffix:', head(ors_fisher) - ) -}else{ - print('FAIL: suffix removal unsuccessful! Please Debug') -} - - -## fisher ci (lower) -ci_lb_fisher = sapply(gene_snps_unique, function(m){ - mut = grepl(m,raw_data$all_muts_gene) - low_ci = fisher.test(table(mut, dst))$conf.int[1] + model<-glm(dst ~ mut, family = binomial) -}) - -#ci_lb_fisher -head(ci_lb_fisher) - -## fisher ci (upper) -ci_ub_fisher = sapply(gene_snps_unique, function(m){ - mut = grepl(m,raw_data$all_muts_gene) - up_ci = fisher.test(table(mut, dst))$conf.int[2] + #------------------- + # allele frequency + #------------------- + afs = mean(mut) + + #------------------- + # logistic model + #------------------- + beta_logistic = summary(model)$coefficients[2,1] -}) - -#ci_ub_fisher -head(ci_ub_fisher) - -#4) logistic or -ors_logistic = sapply(gene_snps_unique,function(m){ - mut = grepl(m,raw_data$all_muts_gene) - #print(table(dst, mut)) - model<-glm(dst ~ mut , family = binomial) or_logistic = exp(summary(model)$coefficients[2,1]) - #pval_logistic = summary(model)$coefficients[2,4] -}) - -#ors_logistic -head(ors_logistic) - -## logistic p-value -pvals_logistic = sapply(gene_snps_unique,function(m){ - mut = grepl(m,raw_data$all_muts_gene) - #print(table(dst, mut)) - model<-glm(dst ~ mut , family = binomial) + #print(paste0('logistic OR:', or_logistic)) + pval_logistic = summary(model)$coefficients[2,4] + #print(paste0('logistic pval:', pval_logistic)) + + se_logistic = summary(model)$coefficients[2,2] + #print(paste0('logistic SE:', se_logistic)) + + zval_logistic = summary(model)$coefficients[2,3] + #print(paste0('logistic zval:', zval_logistic)) + + ci_mod = exp(confint(model))[2,] + #print(paste0('logistic CI:', ci_mod)) + + ci_lower_logistic = ci_mod[["2.5 %"]] + ci_upper_logistic = ci_mod[["97.5 %"]] + + #------------------- + # custom_chisq and fisher: OR p-value and CI + #------------------- + or_mychisq = mychisq_or(dst, mut) + #print(paste0('mychisq OR:', or_mychisq)) + + odds_fisher = fisher.test(table(dst, mut))$estimate + or_fisher = odds_fisher[[1]] + + pval_fisher = fisher.test(table(dst, mut))$p.value + + ci_lower_fisher = fisher.test(table(dst, mut))$conf.int[1] + ci_upper_fisher = fisher.test(table(dst, mut))$conf.int[2] + + #------------------- + # chi sq estimates + #------------------- + estimate_chisq = chisq.test(table(dst, mut))$statistic; estimate_chisq + est_chisq = estimate_chisq[[1]]; print(est_chisq) + + pval_chisq = chisq.test(table(dst, mut))$p.value + + # build a row to append to df + row = data.frame(mutation = m + , af = afs + , beta_logistic = beta_logistic + , or_logistic = or_logistic + , pval_logistic = pval_logistic + , se_logistic = se_logistic + , zval_logistic = zval_logistic + , ci_low_logistic = ci_lower_logistic + , ci_hi_logistic = ci_upper_logistic + , or_mychisq = or_mychisq + , or_fisher = or_fisher + , pval_fisher = pval_fisher + , ci_low_fisher= ci_lower_fisher + , ci_hi_fisher = ci_upper_fisher + , est_chisq = est_chisq + , pval_chisq = pval_chisq + ) + #print(row) + + ors_df <<- rbind(ors_df, row) + }) -#pvals_logistic -head(pvals_logistic) - -#============================================= - -# check ..(hmmm) perhaps separate script) -#afs['pnca_p.trp68gly'] -#afs['pnca_p.gln10pro'] -#afs['pnca_p.leu4ser'] -# -#plot(density(log(ors_logistic))) -#plot(-log10(pvals)) -#hist(log(ors) -# , breaks = 100) - -# sanity check: if names are equal (just for 3 vars) -all(sapply(list(names(afs) - , names(pvals_chi) - , names(statistic_chi) # should return False - , names(ors_chi_cus)), function (x) x == names(ors_logistic))) - -#compare(names(afs) -# , names(pvals_chi) -# , names(statistic_chi) #TEST: should return False, but DOESN'T -# , names(ors_chi_cus) -# , names(stat_chi))$result - -#=============== Now with all vars - -# sanity check: names for all vars -#c = compare( names(afs) -# , names(stat_chi) -# , names(statistic_chi) #TEST: should return False, but DOESN'T -# , names(pvals_chi) -# , names(ors_chi_cus) -# , names(pvals_fisher) -# , names(ors_fisher) -# , names(ci_lb_fisher) -# , names(ci_ub_fisher) -# , names(ors_logistic) -# , names(pvals_logistic))$result; c - -if (all( sapply( list(names(afs) -, names(stat_chi) -#, names(statistic_chi) # TEST should return FALSE if included -, names(pvals_chi) -, names(ors_chi_cus) -, names(pvals_fisher) -, names(ors_fisher) -, names(ci_lb_fisher) -, names(ci_ub_fisher) -, names(pvals_logistic) ), function (x) x == names(ors_logistic))) -){ -cat('PASS: names match: proceed with combining into a single df') -} else { -cat('FAIL: names mismatch') -} - -# combine ors, pvals and afs -cat('Combining calculated params into a df: ors, pvals and afs') - -comb_AF_and_OR = data.frame(afs - , stat_chi - , pvals_chi - , ors_chi_cus - , pvals_fisher - , ors_fisher - , ci_lb_fisher - , ci_ub_fisher - , pvals_logistic - , ors_logistic) - -cat('No. of rows in comb_AF_and_OR: ', nrow(comb_AF_and_OR) - , '\nNo. of cols in comb_AF_and_OR: ', ncol(comb_AF_and_OR)) - -cat('Rownames == mutation: ', head(rownames(comb_AF_and_OR))) - -# add rownames of comb_AF_and_OR as an extra column 'mutation' to allow merging based on this column -comb_AF_and_OR$mutation = rownames(comb_AF_and_OR) - -# sanity check -if (table(rownames(comb_AF_and_OR) == comb_AF_and_OR$mutation)){ - cat('PASS: rownames and mutaion col values match') -}else{ - cat('FAIL: rownames and mutation col values mismatch') -} -######################################################### -# write file out: pnca_AF_OR -######################################################### +#%%====================================================== +# Writing file with calculated ORs and AFs cat(paste0('writing output file: ' - , '\nFilename: ', outfile)) + , '\nFilename: ', out_filename)) -write.csv(comb_AF_and_OR, outfile +write.csv(ors_df, outfile , row.names = F) cat(paste0('Finished writing:' - , out_filename - , '\nNo. of rows: ', nrow(comb_AF_and_OR) - , '\nNo. of cols: ', ncol(comb_AF_and_OR))) + , outfile + , '\nNo. of rows: ', nrow(ors_df) + , '\nNo. of cols: ', ncol(ors_df))) #************************************************ cat('\n======================================================================\n') -rm(out_filename) cat('End of script: calculated AF, OR, pvalues and saved file') - -######################################################### -# 3: Merge meta data file + calculated num params -######################################################### -#df1 = gene_metadata -#df2 = comb_AF_and_OR - - - - -# COMMENT: will do the combining with the other OR and AF (in python) diff --git a/scripts/af_or_calcs_scratch.R b/scripts/af_or_calcs_scratch.R index aadc635..7e18859 100644 --- a/scripts/af_or_calcs_scratch.R +++ b/scripts/af_or_calcs_scratch.R @@ -167,7 +167,7 @@ cat(paste0('Total no. of distinct comp snps to perform OR calcs: ', length(gene_ #x = as.numeric(mut) #y = dst -custom_chisq_or = function(x,y){ +mychisq_or = function(x,y){ tab = as.matrix(table(x,y)) a = tab[2,2] if (a==0){ a<-0.5} @@ -232,7 +232,7 @@ pval_fisher = fisher.test(table(mut, dst))$p.value; print(paste0('pval fisher:', #3) custom chisq -or_mychisq = custom_chisq_or(mut,dst) +or_mychisq = mychisq_or(mut,dst) #4) logistic summary(model<-glm(dst ~ mut, family = binomial)) @@ -277,7 +277,7 @@ snps # custom chisq ors = sapply(snps,function(m){ mut = grepl(m,raw_data$all_muts_gene) - custom_chisq_or(mut,dst) + mychisq_or(mut,dst) }) head(ors) @@ -423,7 +423,7 @@ x = sapply(snps,function(m){ ci_upper_logistic = ci_mod[["97.5 %"]] # custom_chisq and fisher: OR p-value and CI - or_mychisq = custom_chisq_or(dst, mut) + or_mychisq = mychisq_or(dst, mut) or_fisher = fisher.test(dst, mut)$estimate or_fisher = or_fisher[[1]] @@ -523,7 +523,7 @@ for (i in snps){ #===================== # custom chi square #===================== - or_mychisq = custom_chisq_or(mut,dst) + or_mychisq = mychisq_or(mut,dst) #===================== # chi square