From 931f8ec2f9e0c6628f1666b2d57dbf2578678db5 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 11 Jun 2021 13:28:07 +0100 Subject: [PATCH] added mychisq_or.R and af_or_calcs.R --- scripts/functions/af_or_calcs.R | 254 ++++++++++++++++++++++++++++++++ scripts/functions/mychisq_or.R | 13 ++ 2 files changed, 267 insertions(+) create mode 100644 scripts/functions/af_or_calcs.R create mode 100644 scripts/functions/mychisq_or.R diff --git a/scripts/functions/af_or_calcs.R b/scripts/functions/af_or_calcs.R new file mode 100644 index 0000000..2da3fc8 --- /dev/null +++ b/scripts/functions/af_or_calcs.R @@ -0,0 +1,254 @@ +my_afor <- function ( infile_master + , infile_metadata + , outfile + , drug + , gene + , idcol = "id" + , dr_muts_col + , other_muts_col){ + + #=========================================== + # 1: Read master/raw data stored in Data/ + #=========================================== + raw_data_all = read.csv(infile_master, stringsAsFactors = F) + + cat("\nExtracting columns based on variables:\n" + , drug + , "\n" + , dr_muts_col + , "\n" + , other_muts_col + , "\n===============================================================") + + raw_data = raw_data_all[,c(idcol + , drug + , dr_muts_col + , other_muts_col)] + #rm(raw_data_all) + #rm(indir, in_filename_master, infile_master) + + #------------------- + # 1a: exclude na + #-------------------- + raw_data = raw_data[!is.na(raw_data[[drug]]),] + + total_samples = length(unique(raw_data[[idcol]])) + cat(paste0("\nTotal samples without NA in", " ", drug, " is:", total_samples)) + + # sanity check: should be true + cat("\nThis should be True:\n" + , is.numeric(total_samples)) + + #---------------------------------------- + # 1b: combine the two mutation columns + #----------------------------------------- + all_muts_colname = paste0("all_mutations_", drug) + cat(paste("\nNew column added:", all_muts_colname)) + raw_data[[all_muts_colname]] = paste(raw_data[[dr_muts_col]], raw_data[[other_muts_col]]) + head(raw_data[[all_muts_colname]]) + + #---------------------------------------------- + # 1c: create yet another column that contains + # all the mutations but in lower case + #----------------------------------------------- + head(raw_data[[all_muts_colname]]) + raw_data$all_muts_gene = tolower(raw_data[[all_muts_colname]]) + head(raw_data$all_muts_gene) + + # sanity checks + gene_match = paste0(gene,"_p.") + cat(paste0("\nconverting gene match: ", gene_match, " ", "to lowercase\n")) + gene_match = tolower(gene_match) + + table(grepl(gene_match,raw_data$all_muts_gene)) + + # sanity check: should be TRUE + if(sum(table(grepl(gene_match, raw_data$all_muts_gene))) == total_samples){ + cat("\nPASS: Total no. of samples match\n") + } else{ + cat("\nFAIL: No. of samples mismatch\n") + exit() + } + + #==================================================== + # 2: Read valid snps for which OR + # can be calculated + #===================================================== + cat(paste0("\nReading gene associated metadata:", infile_metadata)) + + gene_metadata = read.csv(infile_metadata + , stringsAsFactors = F + , header = T) + + cat(paste("\nDim of gene_metadata:\n" + , dim(gene_metadata))) + + # count na in drug column + tot_drug_na = sum(is.na(gene_metadata[[drug]])) + expected_rows = nrow(gene_metadata) - tot_drug_na + + # drop na from the drug column + gene_snps_or = gene_metadata[!is.na(gene_metadata[[drug]]),] + + # sanity check + if(nrow(gene_snps_or) == expected_rows){ + cat("\nPASS: no. of rows match with expected_rows") + } else{ + cat("\nFAIL: nrows mismatch.") + exit() + } + + # extract unique snps to iterate over for AF and OR calcs + gene_snps_unique = unique(gene_snps_or$mutation) + + cat(paste0("\nTotal no. of distinct comp snps to perform OR calcs: ", length(gene_snps_unique))) + + #================================================== + # OR calcs using the following 4 + #1) logistic + #2) custom chisq.test + #3) fisher + #4) chisq.test + + # adjusted logistic (NO good) + # kinship (separate script) + #================================================= + #snps <- gene_snps_unique [1:5] # small test + snps <- gene_snps_unique + cat(paste0("\nRunning calculations for:" + , length(snps), " nssnps\n" + , "\ngene: ", gene + , "\ndrug: ", drug )) + + # DV: 0 or 1 + dst = raw_data[[drug]] + + # initialise an empty df + ors_df = data.frame() + x = sapply(snps,function(m){ + + mut = grepl(m,raw_data$all_muts_gene) + mut = as.numeric(mut) + cat(paste0("\nRunning mutation:", m, "\n")) + + model<-glm(dst ~ mut, family = binomial) + + #------------------- + # allele frequency + #------------------- + afs = mean(mut) + + #------------------- + # logistic model + #------------------- + beta_logistic = summary(model)$coefficients[2,1] + + or_logistic = exp(summary(model)$coefficients[2,1]) + #print(paste0("logistic OR:", or_logistic)) + + pval_logistic = summary(model)$coefficients[2,4] + #print(paste0("logistic pval:", pval_logistic)) + + se_logistic = summary(model)$coefficients[2,2] + #print(paste0("logistic SE:", se_logistic)) + + zval_logistic = summary(model)$coefficients[2,3] + #print(paste0("logistic zval:", zval_logistic)) + + ci_mod = exp(confint(model))[2,] + #print(paste0("logistic CI:", ci_mod)) + + ci_lower_logistic = ci_mod[["2.5 %"]] + ci_upper_logistic = ci_mod[["97.5 %"]] + + #------------------- + # custom_chisq and fisher: OR p-value and CI + #------------------- + or_mychisq = mychisq_or(dst, mut) + #print(paste0("mychisq OR:", or_mychisq)) + + odds_fisher = fisher.test(table(dst, mut))$estimate + or_fisher = odds_fisher[[1]] + + pval_fisher = fisher.test(table(dst, mut))$p.value + + ci_lower_fisher = fisher.test(table(dst, mut))$conf.int[1] + ci_upper_fisher = fisher.test(table(dst, mut))$conf.int[2] + + #------------------- + # chi sq estimates + #------------------- + estimate_chisq = chisq.test(table(dst, mut))$statistic; estimate_chisq + est_chisq = estimate_chisq[[1]]; print(est_chisq) + + pval_chisq = chisq.test(table(dst, mut))$p.value + + # build a row to append to df + row = data.frame(mutation = m + , af = afs + , beta_logistic = beta_logistic + , or_logistic = or_logistic + , pval_logistic = pval_logistic + , se_logistic = se_logistic + , zval_logistic = zval_logistic + , ci_low_logistic = ci_lower_logistic + , ci_hi_logistic = ci_upper_logistic + , or_mychisq = or_mychisq + , or_fisher = or_fisher + , pval_fisher = pval_fisher + , ci_low_fisher= ci_lower_fisher + , ci_hi_fisher = ci_upper_fisher + , est_chisq = est_chisq + , pval_chisq = pval_chisq + ) + + ors_df <<- rbind(ors_df, row) + }) + + #============================================== + # Writing file with calculated ORs and AFs + #============================================== + cat(paste0("\nwriting output file: " + , "\nFile: ", outfile)) + + write.csv(ors_df, outfile + , row.names = F) + + cat(paste0("\nFinished writing:" + , outfile + , "\nNo. of rows: ", nrow(ors_df) + , "\nNo. of cols: ", ncol(ors_df))) + cat("\n======sneak peek into a few muts with prominent or and p-vals=======\n") + + cat(paste0("=======================================" + , "\nmutation with highest logistic OR:" + , "\n=======================================\n")) + print(ors_df[which(ors_df$or_logistic == max(ors_df$or_logistic)),]) + + cat(paste0("=======================================" + , "\nmutation with highest mychisq OR:" + , "\n=======================================\n")) + + print(ors_df[which(ors_df$or_mychisq == max(ors_df$or_mychisq)),]) + + # gives too many due to Inf + #cat(paste0("=======================================" + #, "\nmutation with highest fisher OR:" + #, "\n=======================================\n")) + #print(ors_df[which(ors_df$or_fisher == max(ors_df$or_fisher)),]) + + cat(paste0("=======================================" + , "\nmutation with lowest logistic pval:" + , "\n=======================================\n")) + print(ors_df[which(ors_df$pval_logistic == min(ors_df$pval_logistic)),]) + + cat(paste0("=======================================" + , "\nmutation with lowest fisher pval:" + , "\n=======================================\n")) + print(ors_df[which(ors_df$pval_fisher == min(ors_df$pval_fisher)),]) + + ################################################################ + cat("\nEnd of script: calculated AF, OR, pvalues and saved file\n") + ################################################################# + + } diff --git a/scripts/functions/mychisq_or.R b/scripts/functions/mychisq_or.R new file mode 100644 index 0000000..a7b2ac5 --- /dev/null +++ b/scripts/functions/mychisq_or.R @@ -0,0 +1,13 @@ +mychisq_or = function(dst_numeric, mut_numeric){ + tab = as.matrix(table(dst_numeric, mut_numeric)) + a = tab[2,2] + if (a==0){ a<-0.5} + b = tab[2,1] + if (b==0){ b<-0.5} + c = tab[1,2] + if (c==0){ c<-0.5} + d = tab[1,1] + if (d==0){ d<-0.5} + (a/b)/(c/d) + +}