#!/usr/bin/env Rscript #require('compare') require('getopt', quietly=TRUE) # We need to be able to parse arguments ######################################################### # TASK: To calculate Allele Frequency and # Odds Ratio from master data # and add the calculated params to meta_data extracted from # data_extraction.py ######################################################### #getwd() setwd('~/git/LSHTM_analysis/scripts') cat(c(getwd(),'\n')) # Command line args spec = matrix(c( "drug" , "d", 1, "character", "gene" , "g", 1, "character" ), byrow = TRUE, ncol = 4) opt = getopt(spec); drug = opt$drug gene = opt$gene if(is.null(drug)|is.null(gene)) { stop('Missing arguments: --drug and --gene must both be specified (case-sensitive)') } #options(scipen = 999) #disabling scientific notation in R. #options(scipen = 4) #%% variable assignment: input and output paths & filenames drug = 'pyrazinamide' gene = 'pncA' gene_match = paste0(gene,'_p.') cat(gene_match) #=========== # input #=========== # infile1: Raw data #indir = 'git/Data/pyrazinamide/input/original' indir = paste0('~/git/Data') in_filename = 'original_tanushree_data_v2.csv' #in_filename = 'mtb_gwas_v3.csv' infile = paste0(indir, '/', in_filename) cat(paste0('Reading infile1: raw data', ' ', infile) ) # infile2: gene associated meta data file to extract valid snps and add calcs to. # This is outfile3 from data_extraction.py indir_metadata = paste0('~/git/Data', '/', drug, '/', 'output') in_filename_metadata = 'pnca_metadata.csv' infile_metadata = paste0(indir_metadata, '/', in_filename_metadata) cat(paste0('Reading infile2: gene associated metadata:', infile_metadata)) #=========== # output #=========== # outdir = 'git/Data/pyrazinamide/output' outdir = paste0('~/git/Data', '/', drug, '/', 'output') #out_filename = paste0(tolower(gene), '_meta_data_with_AF_OR.csv') out_filename = paste0(tolower(gene), '_af_or.csv') outfile = paste0(outdir, '/', out_filename) cat(paste0('Output file with full path:', outfile)) #%% end of variable assignment for input and output files ######################################################### # 1: Read master/raw data stored in Data/ ######################################################### raw_data_all = read.csv(infile, stringsAsFactors = F) # building cols to extract dr_muts_col = paste0('dr_mutations_', drug) other_muts_col = paste0('other_mutations_', drug) cat('Extracting columns based on variables:\n' , drug , '\n' , dr_muts_col , '\n' , other_muts_col , '\n===============================================================') raw_data = raw_data_all[,c("id" , drug , dr_muts_col , other_muts_col)] rm(raw_data_all) rm(indir, in_filename, infile) #=========== # 1a: exclude na #=========== raw_data = raw_data[!is.na(raw_data[[drug]]),] total_samples = length(unique(raw_data$id)) cat(paste0('Total samples without NA in', ' ', drug, 'is:', total_samples)) # sanity check: should be true is.numeric(total_samples) #=========== # 1b: combine the two mutation columns #=========== all_muts_colname = paste0('all_mutations_', drug) #raw_data$all_mutations_pyrazinamide = paste(raw_data$dr_mutations_pyrazinamide, raw_data$other_mutations_pyrazinamide) raw_data[[all_muts_colname]] = paste(raw_data[[dr_muts_col]], raw_data[[other_muts_col]]) head(raw_data[[all_muts_colname]]) #=========== # 1c: create yet another column that contains all the mutations but in lower case #=========== head(raw_data[[all_muts_colname]]) raw_data$all_muts_gene = tolower(raw_data[[all_muts_colname]]) head(raw_data$all_muts_gene) # sanity checks #table(grepl("gene_p",raw_data$all_muts_gene)) cat(paste0('converting gene match:', gene_match, ' ', 'to lowercase')) gene_match = tolower(gene_match) table(grepl(gene_match,raw_data$all_muts_gene)) # sanity check: should be TRUE #sum(table(grepl("gene_p",raw_data$all_muts_gene))) == total_samples # sanity check if(sum(table(grepl(gene_match, raw_data$all_muts_gene))) == total_samples){ cat('PASS: Total no. of samples match') } else{ cat('FAIL: No. of samples mismatch') } ######################################################### # 2: Read valid snps for which OR # can be calculated ######################################################### cat(paste0('Reading metadata infile:', infile_metadata)) gene_metadata = read.csv(infile_metadata #, file.choose() , stringsAsFactors = F , header = T) # clear variables rm(in_filename_metadata, infile_metadata) # count na in pyrazinamide column tot_pza_na = sum(is.na(gene_metadata$pyrazinamide)) expected_rows = nrow(gene_metadata) - tot_pza_na # drop na from the pyrazinamide colum gene_snps_or = gene_metadata[!is.na(gene_metadata[[drug]]),] # sanity check if(nrow(gene_snps_or) == expected_rows){ cat('PASS: no. of rows match with expected_rows') } else{ cat('FAIL: nrows mismatch.') } # extract unique snps to iterate over for AF and OR calcs gene_snps_unique = unique(gene_snps_or$mutation) cat(paste0('Total no. of distinct comp snps to perform OR calcs: ', length(gene_snps_unique))) #=========================================================================================== ######################### # custom chisq function: # To calculate OR ######################### i = "pnca_p.trp68gly" mut = grepl(i,raw_data$all_muts_gene) mut = as.numeric(mut) dst = raw_data[[drug]] #x = as.numeric(mut) #y = dst mychisq_or = function(x,y){ tab = as.matrix(table(x,y)) a = tab[2,2] if (a==0){ a<-0.5} b = tab[2,1] if (b==0){ b<-0.5} c = tab[1,2] if (c==0){ c<-0.5} d = tab[1,1] if (d==0){ d<-0.5} (a/b)/(c/d) } or_mychisq = mychisq_or(dst, mut) print(paste0('mychisq OR:', or_mychisq )) #===================================== #OR calcs using the following 4 #1) chisq.test #2) fisher #3) modified chisq.test #4) logistic #5) adjusted logistic? #6) kinship (separate script) #====================================== # TEST FOR a few muts: sapply and df #=============================================== snps <- gene_snps_unique # reassign so you test with subset of muts #snps <- gene_snps_unique[1:2] cat(paste0('Running calculations for:', length(snps), ' nssnps\n' , 'gene: ', gene , '\ndrug: ', drug )) # DV: pyrazinamide 0 or 1 dst = raw_data[[drug]] # initialise an empty df ors_df = data.frame() x = sapply(snps,function(m){ mut = grepl(m,raw_data$all_muts_gene) mut = as.numeric(mut) cat(paste0('Running mutation:', m, '\n')) model<-glm(dst ~ mut, family = binomial) #------------------- # allele frequency #------------------- afs = mean(mut) #------------------- # logistic model #------------------- beta_logistic = summary(model)$coefficients[2,1] or_logistic = exp(summary(model)$coefficients[2,1]) #print(paste0('logistic OR:', or_logistic)) pval_logistic = summary(model)$coefficients[2,4] #print(paste0('logistic pval:', pval_logistic)) se_logistic = summary(model)$coefficients[2,2] #print(paste0('logistic SE:', se_logistic)) zval_logistic = summary(model)$coefficients[2,3] #print(paste0('logistic zval:', zval_logistic)) ci_mod = exp(confint(model))[2,] #print(paste0('logistic CI:', ci_mod)) ci_lower_logistic = ci_mod[["2.5 %"]] ci_upper_logistic = ci_mod[["97.5 %"]] #------------------- # custom_chisq and fisher: OR p-value and CI #------------------- or_mychisq = mychisq_or(dst, mut) #print(paste0('mychisq OR:', or_mychisq)) odds_fisher = fisher.test(table(dst, mut))$estimate or_fisher = odds_fisher[[1]] pval_fisher = fisher.test(table(dst, mut))$p.value ci_lower_fisher = fisher.test(table(dst, mut))$conf.int[1] ci_upper_fisher = fisher.test(table(dst, mut))$conf.int[2] #------------------- # chi sq estimates #------------------- estimate_chisq = chisq.test(table(dst, mut))$statistic; estimate_chisq est_chisq = estimate_chisq[[1]]; print(est_chisq) pval_chisq = chisq.test(table(dst, mut))$p.value # build a row to append to df row = data.frame(mutation = m , af = afs , beta_logistic = beta_logistic , or_logistic = or_logistic , pval_logistic = pval_logistic , se_logistic = se_logistic , zval_logistic = zval_logistic , ci_low_logistic = ci_lower_logistic , ci_hi_logistic = ci_upper_logistic , or_mychisq = or_mychisq , or_fisher = or_fisher , pval_fisher = pval_fisher , ci_low_fisher= ci_lower_fisher , ci_hi_fisher = ci_upper_fisher , est_chisq = est_chisq , pval_chisq = pval_chisq ) #print(row) ors_df <<- rbind(ors_df, row) }) #%%====================================================== # Writing file with calculated ORs and AFs cat(paste0('writing output file: ' , '\nFilename: ', out_filename)) write.csv(ors_df, outfile , row.names = F) cat(paste0('Finished writing:' , outfile , '\nNo. of rows: ', nrow(ors_df) , '\nNo. of cols: ', ncol(ors_df))) #************************************************ cat('\n======================================================================\n') cat('End of script: calculated AF, OR, pvalues and saved file')