#!/usr/bin/Rscript #!/usr/bin/env Rscript ######################################################### # TASK: To calculate Allele Frequency and # Odds Ratio from master data ######################################################### # working dir and loading libraries getwd() setwd('~/git/LSHTM_analysis/scripts') cat(c(getwd(),'\n')) # cmd parse arguments require('getopt', quietly = TRUE) #======================================================== # command line args spec = matrix(c( "drug" , "d", 1, "character", "gene" , "g", 1, "character" ), byrow = TRUE, ncol = 4) opt = getopt(spec) drug = opt$drug gene = opt$gene if(is.null(drug)|is.null(gene)) { stop('Missing arguments: --drug and --gene must both be specified (case-sensitive)') } #options(scipen = 999) #disabling scientific notation in R. #======================================================== #%% variable assignment: input and output paths & filenames gene_match = paste0(gene,'_p.') cat(gene_match) #============= # directories #============= datadir = paste0('~/git/Data') indir = paste0(datadir, '/', drug, '/', 'input') outdir = paste0(datadir, '/', drug, '/', 'output') #=========== # input #=========== # input file 1: master data #in_filename_master = 'original_tanushree_data_v2.csv' #19K in_filename_master = 'mtb_gwas_meta_v6.csv' #35k infile_master = paste0(datadir, '/', in_filename_master) cat(paste0('Reading infile1: raw data', ' ', infile_master) ) # input file 2: gene associated meta data file to extract valid snps and add calcs to. # This is outfile_metadata from data_extraction.py in_filename_metadata = paste0(tolower(gene), '_metadata.csv') infile_metadata = paste0(outdir, '/', in_filename_metadata) cat(paste0('Reading input file 2 i.e gene associated metadata:', infile_metadata)) #=========== # output #=========== #out_filename_af_or = paste0(tolower(gene), '_meta_data_with_AF_OR.csv') out_filename_af_or = paste0(tolower(gene), '_af_or.csv') outfile_af_or = paste0(outdir, '/', out_filename_af_or) cat(paste0('Output file with full path:', outfile_af_or)) #%% end of variable assignment for input and output files ######################################################### # 1: Read master/raw data stored in Data/ ######################################################### raw_data_all = read.csv(infile_master, stringsAsFactors = F) # building cols to extract dr_muts_col = paste0('dr_mutations_', drug) other_muts_col = paste0('other_mutations_', drug) cat('Extracting columns based on variables:\n' , drug , '\n' , dr_muts_col , '\n' , other_muts_col , '\n===============================================================') raw_data = raw_data_all[,c("id" , drug , dr_muts_col , other_muts_col)] rm(raw_data_all) rm(indir, in_filename_master, infile_master) #=========== # 1a: exclude na #=========== raw_data = raw_data[!is.na(raw_data[[drug]]),] total_samples = length(unique(raw_data$id)) cat(paste0('Total samples without NA in', ' ', drug, 'is:', total_samples)) # sanity check: should be true is.numeric(total_samples) #=========== # 1b: combine the two mutation columns #=========== all_muts_colname = paste0('all_mutations_', drug) print(paste('New column added:', all_muts_colname)) #raw_data$all_mutations_pyrazinamide = paste(raw_data$dr_mutations_pyrazinamide, raw_data$other_mutations_pyrazinamide) raw_data[[all_muts_colname]] = paste(raw_data[[dr_muts_col]], raw_data[[other_muts_col]]) head(raw_data[[all_muts_colname]]) #=========== # 1c: create yet another column that contains all the mutations but in lower case #=========== head(raw_data[[all_muts_colname]]) raw_data$all_muts_gene = tolower(raw_data[[all_muts_colname]]) head(raw_data$all_muts_gene) # sanity checks #table(grepl("gene_p",raw_data$all_muts_gene)) cat(paste0('converting gene match:', gene_match, ' ', 'to lowercase')) gene_match = tolower(gene_match) table(grepl(gene_match,raw_data$all_muts_gene)) # sanity check: should be TRUE #sum(table(grepl("gene_p",raw_data$all_muts_gene))) == total_samples # sanity check if(sum(table(grepl(gene_match, raw_data$all_muts_gene))) == total_samples){ cat('PASS: Total no. of samples match') } else{ cat('FAIL: No. of samples mismatch') } ######################################################### # 2: Read valid snps for which OR # can be calculated ######################################################### cat(paste0('Reading gene associated metadata:', infile_metadata)) gene_metadata = read.csv(infile_metadata #, file.choose() , stringsAsFactors = F , header = T) # get just the variable name from variable #deparse(substitute(myvar) print(paste('Dim of', deparse(substitute(gene_metadata)), ':')); print(dim(gene_metadata)) # clear variables rm(in_filename_metadata, infile_metadata) # count na in drug column tot_pza_na = sum(is.na(gene_metadata[[drug]])) expected_rows = nrow(gene_metadata) - tot_pza_na # drop na from the drug column gene_snps_or = gene_metadata[!is.na(gene_metadata[[drug]]),] # sanity check if(nrow(gene_snps_or) == expected_rows){ cat('PASS: no. of rows match with expected_rows') } else{ cat('FAIL: nrows mismatch.') } # extract unique snps to iterate over for AF and OR calcs gene_snps_unique = unique(gene_snps_or$mutation) cat(paste0('Total no. of distinct comp snps to perform OR calcs: ', length(gene_snps_unique))) #=========================================================================================== ######################### # custom chisq function: # To calculate OR ######################### #i = "pnca_p.trp68gly" i = "pnca_p.gly162asp" mut = grepl(i,raw_data$all_muts_gene) mut = as.numeric(mut) dst = raw_data[[drug]] #x = as.numeric(mut) #y = dst mychisq_or = function(x,y){ tab = as.matrix(table(x,y)) a = tab[2,2] if (a==0){ a<-0.5} b = tab[2,1] if (b==0){ b<-0.5} c = tab[1,2] if (c==0){ c<-0.5} d = tab[1,1] if (d==0){ d<-0.5} (a/b)/(c/d) } or_mychisq = mychisq_or(dst, mut) print(paste0('mychisq OR:', or_mychisq )) odds_fisher = fisher.test(table(dst, mut))$estimate pval_fisher = fisher.test(table(dst, mut))$p.value print(paste0('fisher OR:', odds_fisher)) print(paste0('fisher p-value:', pval_fisher)) model<-glm(dst ~ mut, family = binomial) or_logistic = exp(summary(model)$coefficients[2,1]) print(paste0('logistic OR:', or_logistic)) pval_logistic = summary(model)$coefficients[2,4] print(paste0('logistic pval:', pval_logistic)) #===================================== #OR calcs using the following 4 #1) chisq.test #2) fisher #3) modified chisq.test #4) logistic #5) adjusted logistic? #6) kinship (separate script) #====================================== # TEST FOR a few muts: sapply and df #=============================================== #snps <- gene_snps_unique[1:2]# reassign so you test with subset of muts snps <- gene_snps_unique cat(paste0('Running calculations for:', length(snps), ' nssnps\n' , 'gene: ', gene , '\ndrug: ', drug )) # DV: 0 or 1 dst = raw_data[[drug]] # initialise an empty df ors_df = data.frame() x = sapply(snps,function(m){ mut = grepl(m,raw_data$all_muts_gene) mut = as.numeric(mut) cat(paste0('Running mutation:', m, '\n')) model<-glm(dst ~ mut, family = binomial) #------------------- # allele frequency #------------------- afs = mean(mut) #------------------- # logistic model #------------------- beta_logistic = summary(model)$coefficients[2,1] or_logistic = exp(summary(model)$coefficients[2,1]) #print(paste0('logistic OR:', or_logistic)) pval_logistic = summary(model)$coefficients[2,4] #print(paste0('logistic pval:', pval_logistic)) se_logistic = summary(model)$coefficients[2,2] #print(paste0('logistic SE:', se_logistic)) zval_logistic = summary(model)$coefficients[2,3] #print(paste0('logistic zval:', zval_logistic)) ci_mod = exp(confint(model))[2,] #print(paste0('logistic CI:', ci_mod)) ci_lower_logistic = ci_mod[["2.5 %"]] ci_upper_logistic = ci_mod[["97.5 %"]] #------------------- # custom_chisq and fisher: OR p-value and CI #------------------- or_mychisq = mychisq_or(dst, mut) #print(paste0('mychisq OR:', or_mychisq)) odds_fisher = fisher.test(table(dst, mut))$estimate or_fisher = odds_fisher[[1]] pval_fisher = fisher.test(table(dst, mut))$p.value ci_lower_fisher = fisher.test(table(dst, mut))$conf.int[1] ci_upper_fisher = fisher.test(table(dst, mut))$conf.int[2] #------------------- # chi sq estimates #------------------- estimate_chisq = chisq.test(table(dst, mut))$statistic; estimate_chisq est_chisq = estimate_chisq[[1]]; print(est_chisq) pval_chisq = chisq.test(table(dst, mut))$p.value # build a row to append to df row = data.frame(mutation = m , af = afs , beta_logistic = beta_logistic , or_logistic = or_logistic , pval_logistic = pval_logistic , se_logistic = se_logistic , zval_logistic = zval_logistic , ci_low_logistic = ci_lower_logistic , ci_hi_logistic = ci_upper_logistic , or_mychisq = or_mychisq , or_fisher = or_fisher , pval_fisher = pval_fisher , ci_low_fisher= ci_lower_fisher , ci_hi_fisher = ci_upper_fisher , est_chisq = est_chisq , pval_chisq = pval_chisq ) #print(row) ors_df <<- rbind(ors_df, row) }) #%%====================================================== # Writing file with calculated ORs and AFs cat(paste0('writing output file: ' , '\nFile: ', outfile_af_or)) write.csv(ors_df, outfile_af_or , row.names = F) cat(paste0('Finished writing:' , outfile_af_or , '\nNo. of rows: ', nrow(ors_df) , '\nNo. of cols: ', ncol(ors_df))) #%%====================================================== cat('\n======sneak peek into a few muts with prominent or and p-vals=======\n') cat(paste0('=======================================' , '\nmutation with highest logistic OR:' , '\n=======================================\n')) print(ors_df[which(ors_df$or_logistic == max(ors_df$or_logistic)),]) cat(paste0('=======================================' , '\nmutation with highest mychisq OR:' , '\n=======================================\n')) print(ors_df[which(ors_df$or_mychisq == max(ors_df$or_mychisq)),]) # gives too many due to Inf #cat(paste0('=======================================' #, '\nmutation with highest fisher OR:' #, '\n=======================================\n')) #print(ors_df[which(ors_df$or_fisher == max(ors_df$or_fisher)),]) cat(paste0('=======================================' , '\nmutation with lowest logistic pval:' , '\n=======================================\n')) print(ors_df[which(ors_df$pval_logistic == min(ors_df$pval_logistic)),]) cat(paste0('=======================================' , '\nmutation with lowest fisher pval:' , '\n=======================================\n')) print(ors_df[which(ors_df$pval_fisher == min(ors_df$pval_fisher)),]) #********************************************************** cat('End of script: calculated AF, OR, pvalues and saved file') #**********************************************************