#!/usr/bin/env Rscript ######################################################################## # TASK:calculate af and or for gwas data ######################################################################## my_afor <- function ( drug , gene , infile_master , infile_metadata , outfile , idcol , dr_muts_col , other_muts_col){ #=========================================== # 1: Read master/raw data stored in Data/ #=========================================== cat(infile_master) raw_data_all = read.csv(infile_master, stringsAsFactors = F) cat("\nExtracting columns based on variables:\n" , drug , "\n" , dr_muts_col , "\n" , other_muts_col , "\n===============================================================") raw_data = raw_data_all[,c(idcol , drug , dr_muts_col , other_muts_col)] #rm(raw_data_all) #rm(indir, in_filename_master, infile_master) #------------------- # 1a: exclude na #-------------------- raw_data = raw_data[!is.na(raw_data[[drug]]),] total_samples = length(unique(raw_data[[idcol]])) cat(paste0("\nTotal samples without NA in", " ", drug, " is:", total_samples)) # sanity check: should be true cat("\nThis should be True:\n" , is.numeric(total_samples)) #---------------------------------------- # 1b: combine the two mutation columns #----------------------------------------- all_muts_colname = paste0("all_mutations_", drug) cat(paste("\nNew column added:", all_muts_colname)) raw_data[[all_muts_colname]] = paste(raw_data[[dr_muts_col]], raw_data[[other_muts_col]]) head(raw_data[[all_muts_colname]]) #---------------------------------------------- # 1c: create yet another column that contains # all the mutations but in lower case #----------------------------------------------- head(raw_data[[all_muts_colname]]) raw_data$all_muts_gene = tolower(raw_data[[all_muts_colname]]) head(raw_data$all_muts_gene) # sanity checks gene_match = paste0(gene,"_p.") cat(paste0("\nconverting gene match: ", gene_match, " ", "to lowercase\n")) gene_match = tolower(gene_match) table(grepl(gene_match,raw_data$all_muts_gene)) # sanity check: should be TRUE if(sum(table(grepl(gene_match, raw_data$all_muts_gene))) == total_samples){ cat("\nPASS: Total no. of samples match\n") } else{ cat("\nFAIL: No. of samples mismatch\n") exit() } #==================================================== # 2: Read valid snps for which OR # can be calculated #===================================================== cat(paste0("\nReading gene associated metadata:", infile_metadata)) gene_metadata = read.csv(infile_metadata , stringsAsFactors = F , header = T) cat(paste("\nDim of gene_metadata:\n" , dim(gene_metadata))) # count na in drug column tot_drug_na = sum(is.na(gene_metadata[[drug]])) expected_rows = nrow(gene_metadata) - tot_drug_na # drop na from the drug column gene_snps_or = gene_metadata[!is.na(gene_metadata[[drug]]),] # sanity check if(nrow(gene_snps_or) == expected_rows){ cat("\nPASS: no. of rows match with expected_rows") } else{ cat("\nFAIL: nrows mismatch.") exit() } # extract unique snps to iterate over for AF and OR calcs gene_snps_unique = unique(gene_snps_or$mutation) cat(paste0("\nTotal no. of distinct comp snps to perform OR calcs: ", length(gene_snps_unique))) #================================================== # OR calcs using the following 4 #1) logistic #2) custom chisq.test #3) fisher #4) chisq.test # adjusted logistic (NO good) # kinship (separate script) #================================================= #snps <- gene_snps_unique [1:5] # small test snps <- gene_snps_unique cat(paste0("\nRunning calculations for:" , length(snps), " nssnps\n" , "\ngene: ", gene , "\ndrug: ", drug )) # DV: 0 or 1 dst = raw_data[[drug]] # initialise an empty df ors_df = data.frame() x = sapply(snps,function(m){ mut = grepl(m,raw_data$all_muts_gene) mut = as.numeric(mut) cat(paste0("\nRunning mutation:", m, "\n")) model<-glm(dst ~ mut, family = binomial) #------------------- # allele frequency #------------------- afs = mean(mut) #------------------- # logistic model #------------------- beta_logistic = summary(model)$coefficients[2,1] or_logistic = exp(summary(model)$coefficients[2,1]) #print(paste0("logistic OR:", or_logistic)) pval_logistic = summary(model)$coefficients[2,4] #print(paste0("logistic pval:", pval_logistic)) se_logistic = summary(model)$coefficients[2,2] #print(paste0("logistic SE:", se_logistic)) zval_logistic = summary(model)$coefficients[2,3] #print(paste0("logistic zval:", zval_logistic)) ci_mod = exp(confint(model))[2,] #print(paste0("logistic CI:", ci_mod)) ci_lower_logistic = ci_mod[["2.5 %"]] ci_upper_logistic = ci_mod[["97.5 %"]] #------------------- # custom_chisq and fisher: OR p-value and CI #------------------- or_mychisq = mychisq_or(dst, mut) #print(paste0("mychisq OR:", or_mychisq)) odds_fisher = fisher.test(table(dst, mut))$estimate or_fisher = odds_fisher[[1]] pval_fisher = fisher.test(table(dst, mut))$p.value ci_lower_fisher = fisher.test(table(dst, mut))$conf.int[1] ci_upper_fisher = fisher.test(table(dst, mut))$conf.int[2] #-------------------- # adding log cols : NEW #-------------------- log10_or_mychisq = log10(or_mychisq) neglog_pval_fisher = -log10(pval_fisher) #------------------- # chi sq estimates #------------------- estimate_chisq = chisq.test(table(dst, mut))$statistic; estimate_chisq est_chisq = estimate_chisq[[1]]; print(est_chisq) pval_chisq = chisq.test(table(dst, mut))$p.value # build a row to append to df row = data.frame(mutation = m , af = afs , beta_logistic = beta_logistic , or_logistic = or_logistic , pval_logistic = pval_logistic , se_logistic = se_logistic , zval_logistic = zval_logistic , ci_low_logistic = ci_lower_logistic , ci_hi_logistic = ci_upper_logistic , or_mychisq = or_mychisq , log10_or_mychisq = log10_or_mychisq # NEW , or_fisher = or_fisher , pval_fisher = pval_fisher , neglog_pval_fisher = neglog_pval_fisher # NEW , ci_low_fisher= ci_lower_fisher , ci_hi_fisher = ci_upper_fisher , est_chisq = est_chisq , pval_chisq = pval_chisq ) ors_df <<- rbind(ors_df, row) }) #============================================== # Writing file with calculated ORs and AFs #============================================== cat(paste0("\nwriting output file: " , "\nFile: ", outfile)) write.csv(ors_df, outfile , row.names = F) cat(paste0("\nFinished writing:" , outfile , "\nNo. of rows: ", nrow(ors_df) , "\nNo. of cols: ", ncol(ors_df))) cat("\n======sneak peek into a few muts with prominent or and p-vals=======\n") cat(paste0("=======================================" , "\nmutation with highest logistic OR:" , "\n=======================================\n")) print(ors_df[which(ors_df$or_logistic == max(ors_df$or_logistic)),]) cat(paste0("=======================================" , "\nmutation with highest mychisq OR:" , "\n=======================================\n")) print(ors_df[which(ors_df$or_mychisq == max(ors_df$or_mychisq)),]) # gives too many due to Inf #cat(paste0("=======================================" #, "\nmutation with highest fisher OR:" #, "\n=======================================\n")) #print(ors_df[which(ors_df$or_fisher == max(ors_df$or_fisher)),]) cat(paste0("=======================================" , "\nmutation with lowest logistic pval:" , "\n=======================================\n")) print(ors_df[which(ors_df$pval_logistic == min(ors_df$pval_logistic)),]) cat(paste0("=======================================" , "\nmutation with lowest fisher pval:" , "\n=======================================\n")) print(ors_df[which(ors_df$pval_fisher == min(ors_df$pval_fisher)),]) ################################################################ cat("\nEnd of script: calculated AF, OR, pvalues and saved file\n") ################################################################# }