From 010ef133dda94f1360faf45590e875872cf93cfc Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 18 Jun 2020 13:55:45 +0100 Subject: [PATCH] foramtting and adding or --- scripts/AF_and_OR_calcs.R | 170 ++++++++++++++++++++++++++------------ 1 file changed, 119 insertions(+), 51 deletions(-) diff --git a/scripts/AF_and_OR_calcs.R b/scripts/AF_and_OR_calcs.R index 3a0160a..2136386 100644 --- a/scripts/AF_and_OR_calcs.R +++ b/scripts/AF_and_OR_calcs.R @@ -8,6 +8,9 @@ getwd() setwd('~/git/LSHTM_analysis/scripts') getwd() +options(scipen = 999) #disabling scientific notation in R. +options(scipen = 4) + #%% variable assignment: input and output paths & filenames drug = 'pyrazinamide' gene = 'pncA' @@ -154,13 +157,15 @@ cat(paste0('Total no. of distinct comp snps to perform OR calcs: ', length(gene_ #====================================== # TEST FOR ONE - i = "pnca_p.ala134gly" # has a NA, should NOT exist table(grepl(i,raw_data$all_muts_gene)) i = "pnca_p.trp68gly" table(grepl(i,raw_data$all_muts_gene)) +i = "pnca_p.his51asp" +table(grepl(i,raw_data$all_muts_gene)) + # IV mut = grepl(i,raw_data$all_muts_gene) @@ -174,8 +179,13 @@ table(mut, dst) #1) chisq : noy accurate for low counts chisq.test(table(mut,dst)) -attributes(chisq.test(table(mut,dst))) chisq.test(table(mut,dst))$p.value +chisq.test(table(mut,dst))$statistic + +t = chisq.test(table(mut,dst))$statistic; print(t) +names(t) +# remove suffix +#names(t2) = gsub(".X-squared", "", names(t)) #2) modified chisq OR: custom function #x = as.numeric(mut) @@ -197,7 +207,7 @@ my_chisq_or(mut, dst) #3) fisher fisher.test(table(mut, dst)) -or_fisher = fisher.test(table(mut, dst))$estimate; print(or_fisher) +or_fisher = fisher.test(table(mut, dst))$estimate; print(or_fisher); cat(names(or_fisher)) pval_fisher = fisher.test(table(mut, dst))$p.value; print(pval_fisher) # the same one to be used for custom chisq_or ci_lb_fisher = fisher.test(table(mut, dst))$conf.int[1]; print(ci_lb_fisher) ci_ub_fisher = fisher.test(table(mut, dst))$conf.int[2]; print(ci_ub_fisher) @@ -231,7 +241,8 @@ pval_logistic = summary(model)$coefficients[2,4]; print(pval_logistic) ###################### # AF and OR for all muts: sapply ###################### -print(table(dst)); print(table(mut)) +print(table(dst)); print(table(mut)) # must exist +#dst = raw_data[[drug]] #or raw_data[,drug] # af afs = sapply(gene_snps_unique,function(m){ @@ -239,17 +250,32 @@ afs = sapply(gene_snps_unique,function(m){ mean(mut) }) -afs +#afs head(afs) #1) chi square: original -stat_chi = sapply(gene_snps_unique,function(m){ +statistic_chi = sapply(gene_snps_unique,function(m){ mut = grepl(m,raw_data$all_muts_gene) chisq.test(mut,dst)$statistic + }) -stat_chi -head(stat_chi) +# statistic_chi: has suffix added of '.X-squared' +head(statistic_chi) + +# remove suffix +names(stat_chi) = gsub(".X-squared", "", names(statistic_chi)) + +if (names(stat_chi)!= names(statistic_chi) && stat_chi == statistic_chi){ + cat('Sanity check passed: suffix removed correctly\n\n' + , 'names with suffix:', head(names(statistic_chi)), '\n\n' + , 'names without suffix:', head(names(stat_chi)), '\n\n' + , 'values in var with suffix:', head(statistic_chi),'\n' + , 'values in var without suffix:', head(stat_chi) + ) +}else{ + print('FAIL: suffix removal unsuccessful! Please Debug') +} ## pval pvals_chi = sapply(gene_snps_unique,function(m){ @@ -257,7 +283,7 @@ pvals_chi = sapply(gene_snps_unique,function(m){ chisq.test(mut,dst)$p.value }) -pvals_chi +#pvals_chi head(pvals_chi) #2) chi square: custom @@ -266,7 +292,7 @@ ors_chi_cus = sapply(gene_snps_unique,function(m){ my_chisq_or(mut,dst) }) -ors_chi_cus +#ors_chi_cus head(ors_chi_cus) ## pval:fisher (use the same one for custom chi sqaure) @@ -275,17 +301,36 @@ pvals_fisher = sapply(gene_snps_unique,function(m){ fisher.test(mut,dst)$p.value }) -pvals_fisher +#pvals_fisher head(pvals_fisher) #3) fisher -ors_fisher = sapply(gene_snps_unique,function(m){ +odds_ratio_fisher = sapply(gene_snps_unique,function(m){ mut = grepl(m,raw_data$all_muts_gene) fisher.test(mut,dst)$estimate; }) -ors_fisher -head(ors_fisher) +#ors_fisher +head(odds_ratio_fisher) + +# statistic_chi: has suffix added of '.X-squared' +head(odds_ratio_fisher) + +# remove suffix +ors_fisher = odds_ratio_fisher +names(ors_fisher) = gsub(".odds ratio", "", names(odds_ratio_fisher)) + +if (names(ors_fisher)!= names(odds_ratio_fisher) && ors_fisher == odds_ratio_fisher){ + cat('Sanity check passed: suffix removed correctly\n\n' + , 'names with suffix:', head(names(odds_ratio_fisher)), '\n\n' + , 'names without suffix:', head(names(ors_fisher)), '\n\n' + , 'values in var with suffix:', head(odds_ratio_fisher),'\n' + , 'values in var without suffix:', head(ors_fisher) + ) +}else{ + print('FAIL: suffix removal unsuccessful! Please Debug') +} + ## fisher ci (lower) ci_lb_fisher = sapply(gene_snps_unique, function(m){ @@ -294,7 +339,7 @@ ci_lb_fisher = sapply(gene_snps_unique, function(m){ }) -ci_lb_fisher +#ci_lb_fisher head(ci_lb_fisher) ## fisher ci (upper) @@ -304,7 +349,7 @@ ci_ub_fisher = sapply(gene_snps_unique, function(m){ }) -ci_ub_fisher +#ci_ub_fisher head(ci_ub_fisher) #4) logistic or @@ -316,7 +361,7 @@ ors_logistic = sapply(gene_snps_unique,function(m){ #pval_logistic = summary(model)$coefficients[2,4] }) -ors_logistic +#ors_logistic head(ors_logistic) ## logistic p-value @@ -328,60 +373,83 @@ pvals_logistic = sapply(gene_snps_unique,function(m){ pval_logistic = summary(model)$coefficients[2,4] }) -pvals_logistic +#pvals_logistic head(pvals_logistic) #============================================= -# check ..hmmm +# check ..(hmmm) perhaps separate script) afs['pnca_p.trp68gly'] afs['pnca_p.gln10pro'] afs['pnca_p.leu4ser'] -plot(density(log(ors))) +plot(density(log(ors_logistic))) plot(-log10(pvals)) hist(log(ors) , breaks = 100) -# sanity check -my_vars = c(afs - , stat_chi - , pvals_chi - , ors_chi_cus - , pvals_fisher - , ors_fisher - , ci_lb_fisher - , ci_ub_fisher - , ors_logistic - , pvals_logistic) +# sanity check: if names are equal (just for 3 vars) +all(sapply(list(names(afs) + , names(pvals_chi) + , names(statistic_chi) # should return False + , names(ors_chi_cus)), function (x) x == names(ors_logistic))) -my_vars = c('afs', 'pvals_chi', 'ors_chi_cus') +compare(names(afs) + , names(pvals_chi) + , names(statistic_chi) #TEST: should return False, but DOESN'T + , names(ors_chi_cus) + , names(stat_chi))$result -names(get('afs')) +#=============== Now with all vars -# check if names are equal - - -if ( all(sapply(list(names(afs), names(pvals_chi), names(ors_chi_cus)), function (x) x == names(ors_logistic))) -& compare(names(afs), names(pvals_chi), names(ors_chi_cus), names(stat_chi)) ){ - cat('PASS: names of ors, pvals and afs match: proceed with combining into a single df') -} else{ - cat('FAIL: names of ors, pvals and afs mismatch') -} - - -# FROM HERE -if (table(names(ors_fisher) == names(pvals_fisher)) & table(names(ors) == names(afs)) & table(names(pvals) == names(afs)) == length(pnca_snps_unique)){ -cat('PASS: names of ors, pvals and afs match: proceed with combining into a single df') -} else{ - cat('FAIL: names of ors, pvals and afs mismatch') -} +# sanity check: names for all vars +#c = compare( names(afs) +# , names(stat_chi) +# , names(statistic_chi) #TEST: should return False, but DOESN'T +# , names(pvals_chi) +# , names(ors_chi_cus) +# , names(pvals_fisher) +# , names(ors_fisher) +# , names(ci_lb_fisher) +# , names(ci_ub_fisher) +# , names(ors_logistic) +# , names(pvals_logistic))$result; c + +if (all( sapply( list(names(afs) +, names(stat_chi) +#, names(statistic_chi) # TEST should return FALSE if included +, names(pvals_chi) +, names(ors_chi_cus) +, names(pvals_fisher) +, names(ors_fisher) +, names(ci_lb_fisher) +, names(ci_ub_fisher) +, names(pvals_logistic) ), function (x) x == names(ors_logistic))) +){ +cat('PASS: names match: proceed with combining into a single df') +} else { +cat('FAIL: names mismatch') +} # combine ors, pvals and afs cat('Combining calculated params into a df: ors, pvals and afs') -comb_AF_and_OR = data.frame(ors, pvals, afs) +comb_AF_and_OR = data.frame(afs + , stat_chi + , pvals_chi + , ors_chi_cus + , pvals_fisher + , ors_fisher + , ci_lb_fisher + , ci_ub_fisher + , pvals_logistic + , ors_logistic) + + + + + cat('No. of rows in comb_AF_and_OR: ', nrow(comb_AF_and_OR) , '\nNo. of cols in comb_AF_and_OR: ', ncol(comb_AF_and_OR))