#!/usr/bin/env Rscript #require('compare') require('getopt', quietly=TRUE) # We need to be able to parse arguments ######################################################### # TASK: To calculate Allele Frequency and # Odds Ratio from master data # and add the calculated params to meta_data extracted from # data_extraction.py ######################################################### #getwd() setwd('~/git/LSHTM_analysis/scripts') cat(c(getwd(),'\n')) # Command line args spec = matrix(c( "drug" , "d", 1, "character", "gene" , "g", 1, "character" ), byrow = TRUE, ncol = 4) opt = getopt(spec); drug = opt$drug gene = opt$gene if(is.null(drug)|is.null(gene)) { stop('Missing arguments: --drug and --gene must both be specified (case-sensitive)') } #options(scipen = 999) #disabling scientific notation in R. #options(scipen = 4) #%% variable assignment: input and output paths & filenames #drug = 'pyrazinamide' #gene = 'pncA' gene_match = paste0(gene,'_p.') cat(gene_match) #=========== # input #=========== # infile1: Raw data #indir = 'git/Data/pyrazinamide/input/original' indir = paste0('~/git/Data') in_filename = 'original_tanushree_data_v2.csv' #in_filename = 'mtb_gwas_v3.csv' infile = paste0(indir, '/', in_filename) cat(paste0('Reading infile1: raw data', ' ', infile) ) # infile2: gene associated meta data file to extract valid snps and add calcs to. # This is outfile3 from data_extraction.py indir_metadata = paste0('~/git/Data', '/', drug, '/', 'output') in_filename_metadata = 'pnca_metadata.csv' infile_metadata = paste0(indir_metadata, '/', in_filename_metadata) cat(paste0('Reading infile2: gene associated metadata:', infile_metadata)) #=========== # output #=========== # outdir = 'git/Data/pyrazinamide/output' outdir = paste0('~/git/Data', '/', drug, '/', 'output') #out_filename = paste0(tolower(gene), '_meta_data_with_AF_OR.csv') out_filename = paste0(tolower(gene), '_af_or.csv') outfile = paste0(outdir, '/', out_filename) cat(paste0('Output file with full path:', outfile)) #%% end of variable assignment for input and output files ######################################################### # 1: Read master/raw data stored in Data/ ######################################################### raw_data_all = read.csv(infile, stringsAsFactors = F) # building cols to extract dr_muts_col = paste0('dr_mutations_', drug) other_muts_col = paste0('other_mutations_', drug) cat('Extracting columns based on variables:\n' , drug , '\n' , dr_muts_col , '\n' , other_muts_col , '\n===============================================================') raw_data = raw_data_all[,c("id" , drug , dr_muts_col , other_muts_col)] rm(raw_data_all) rm(indir, in_filename, infile) #=========== # 1a: exclude na #=========== raw_data = raw_data[!is.na(raw_data[[drug]]),] total_samples = length(unique(raw_data$id)) cat(paste0('Total samples without NA in', ' ', drug, 'is:', total_samples)) # sanity check: should be true is.numeric(total_samples) #=========== # 1b: combine the two mutation columns #=========== all_muts_colname = paste0('all_mutations_', drug) #raw_data$all_mutations_pyrazinamide = paste(raw_data$dr_mutations_pyrazinamide, raw_data$other_mutations_pyrazinamide) raw_data[[all_muts_colname]] = paste(raw_data[[dr_muts_col]], raw_data[[other_muts_col]]) head(raw_data[[all_muts_colname]]) #=========== # 1c: create yet another column that contains all the mutations but in lower case #=========== head(raw_data[[all_muts_colname]]) raw_data$all_muts_gene = tolower(raw_data[[all_muts_colname]]) head(raw_data$all_muts_gene) # sanity checks #table(grepl("gene_p",raw_data$all_muts_gene)) cat(paste0('converting gene match:', gene_match, ' ', 'to lowercase')) gene_match = tolower(gene_match) table(grepl(gene_match,raw_data$all_muts_gene)) # sanity check: should be TRUE #sum(table(grepl("gene_p",raw_data$all_muts_gene))) == total_samples # sanity check if(sum(table(grepl(gene_match, raw_data$all_muts_gene))) == total_samples){ cat('PASS: Total no. of samples match') } else{ cat('FAIL: No. of samples mismatch') } ######################################################### # 2: Read valid snps for which OR # can be calculated ######################################################### cat(paste0('Reading metadata infile:', infile_metadata)) gene_metadata = read.csv(infile_metadata #, file.choose() , stringsAsFactors = F , header = T) # clear variables rm(in_filename_metadata, infile_metadata) # count na in pyrazinamide column tot_pza_na = sum(is.na(gene_metadata$pyrazinamide)) expected_rows = nrow(gene_metadata) - tot_pza_na # drop na from the pyrazinamide colum gene_snps_or = gene_metadata[!is.na(gene_metadata[[drug]]),] # sanity check if(nrow(gene_snps_or) == expected_rows){ cat('PASS: no. of rows match with expected_rows') } else{ cat('FAIL: nrows mismatch.') } # extract unique snps to iterate over for AF and OR calcs gene_snps_unique = unique(gene_snps_or$mutation) cat(paste0('Total no. of distinct comp snps to perform OR calcs: ', length(gene_snps_unique))) #===================================== #OR calcs using the following 4 #1) chisq.test #2) fisher #3) modified chisq.test #4) logistic #5) adjusted logistic? #6) kinship (separate script) #====================================== # TEST FOR ONE i = "pnca_p.ala134gly" # has a NA, should NOT exist table(grepl(i,raw_data$all_muts_gene)) i = "pnca_p.trp68gly" table(grepl(i,raw_data$all_muts_gene)) i = "pnca_p.his51asp" table(grepl(i,raw_data$all_muts_gene)) # IV mut = grepl(i,raw_data$all_muts_gene) # DV dst = raw_data[[drug]] #or raw_data[,drug] table(mut, dst) #=============================================== # calculating OR #1) chisq : noy accurate for low counts chisq.test(table(mut,dst)) chisq.test(table(mut,dst))$p.value chisq.test(table(mut,dst))$statistic t = chisq.test(table(mut,dst))$statistic; print(t) names(t) # remove suffix #names(t2) = gsub(".X-squared", "", names(t)) #2) modified chisq OR: custom function #x = as.numeric(mut) #y = dst my_chisq_or = function(x,y){ tab = as.matrix(table(x,y)) a = tab[2,2] if (a==0){ a<-0.5} b = tab[2,1] if (b==0){ b<-0.5} c = tab[1,2] if (c==0){ c<-0.5} d = tab[1,1] if (d==0){ d<-0.5} (a/b)/(c/d) } my_chisq_or(mut, dst) #3) fisher fisher.test(table(mut, dst)) or_fisher = fisher.test(table(mut, dst))$estimate; print(or_fisher); cat(names(or_fisher)) pval_fisher = fisher.test(table(mut, dst))$p.value; print(pval_fisher) # the same one to be used for custom chisq_or ci_lb_fisher = fisher.test(table(mut, dst))$conf.int[1]; print(ci_lb_fisher) ci_ub_fisher = fisher.test(table(mut, dst))$conf.int[2]; print(ci_ub_fisher) #4) logistic summary(model<-glm(dst ~ mut , family = binomial #, control = glm.control(maxit = 1) #, options(warn = 1) )) or_logistic = exp(summary(model)$coefficients[2,1]); print(or_logistic) pval_logistic = summary(model)$coefficients[2,4]; print(pval_logistic) #5) logistic adjusted: sample id (# identical results as unadjusted) #c = raw_data$id[grepl(i,raw_data$all_muts_gene)] #sid = grepl(paste(c,collapse="|"), raw_data$id) # else warning that pattern length > 1 #table(sid) #table(mut, dst, sid) #summary(model2<-glm(dst ~ mut + sid # , family = binomial ##, control = glm.control(maxit = 1) ##, options(warn = 1) # )) #or_logistic2 = exp(summary(model2)$coefficients[2,1]); print(or_logistic2) #pval_logistic2 = summary(model2)$coefficients[2,4]; print(pval_logistic2) #=============================================== ###################### # AF and OR for all muts: sapply ###################### print(table(dst)); print(table(mut)) # must exist #dst = raw_data[[drug]] #or raw_data[,drug] # af afs = sapply(gene_snps_unique,function(m){ mut = grepl(m,raw_data$all_muts_gene) mean(mut) }) #afs head(afs) #1) chi square: original statistic_chi = sapply(gene_snps_unique,function(m){ mut = grepl(m,raw_data$all_muts_gene) chisq.test(mut,dst)$statistic }) # statistic_chi: has suffix added of '.X-squared' stat_chi = statistic_chi # remove suffix names(stat_chi) = gsub(".X-squared", "", names(statistic_chi)) if (names(stat_chi)!= names(statistic_chi) && stat_chi == statistic_chi){ cat('Sanity check passed: suffix removed correctly\n\n' , 'names with suffix:', head(names(statistic_chi)), '\n\n' , 'names without suffix:', head(names(stat_chi)), '\n\n' , 'values in var with suffix:', head(statistic_chi),'\n' , 'values in var without suffix:', head(stat_chi) ) }else{ print('FAIL: suffix removal unsuccessful! Please Debug') } ## pval pvals_chi = sapply(gene_snps_unique,function(m){ mut = grepl(m,raw_data$all_muts_gene) chisq.test(mut,dst)$p.value }) #pvals_chi head(pvals_chi) #2) chi square: custom ors_chi_cus = sapply(gene_snps_unique,function(m){ mut = grepl(m,raw_data$all_muts_gene) my_chisq_or(mut,dst) }) #ors_chi_cus head(ors_chi_cus) ## pval:fisher (use the same one for custom chi sqaure) pvals_fisher = sapply(gene_snps_unique,function(m){ mut = grepl(m,raw_data$all_muts_gene) fisher.test(mut,dst)$p.value }) #pvals_fisher head(pvals_fisher) #3) fisher odds_ratio_fisher = sapply(gene_snps_unique,function(m){ mut = grepl(m,raw_data$all_muts_gene) fisher.test(mut,dst)$estimate; }) #ors_fisher head(odds_ratio_fisher) # statistic_chi: has suffix added of '.X-squared' head(odds_ratio_fisher) # remove suffix ors_fisher = odds_ratio_fisher names(ors_fisher) = gsub(".odds ratio", "", names(odds_ratio_fisher)) if (names(ors_fisher)!= names(odds_ratio_fisher) && ors_fisher == odds_ratio_fisher){ cat('Sanity check passed: suffix removed correctly\n\n' , 'names with suffix:', head(names(odds_ratio_fisher)), '\n\n' , 'names without suffix:', head(names(ors_fisher)), '\n\n' , 'values in var with suffix:', head(odds_ratio_fisher),'\n' , 'values in var without suffix:', head(ors_fisher) ) }else{ print('FAIL: suffix removal unsuccessful! Please Debug') } ## fisher ci (lower) ci_lb_fisher = sapply(gene_snps_unique, function(m){ mut = grepl(m,raw_data$all_muts_gene) low_ci = fisher.test(table(mut, dst))$conf.int[1] }) #ci_lb_fisher head(ci_lb_fisher) ## fisher ci (upper) ci_ub_fisher = sapply(gene_snps_unique, function(m){ mut = grepl(m,raw_data$all_muts_gene) up_ci = fisher.test(table(mut, dst))$conf.int[2] }) #ci_ub_fisher head(ci_ub_fisher) #4) logistic or ors_logistic = sapply(gene_snps_unique,function(m){ mut = grepl(m,raw_data$all_muts_gene) #print(table(dst, mut)) model<-glm(dst ~ mut , family = binomial) or_logistic = exp(summary(model)$coefficients[2,1]) #pval_logistic = summary(model)$coefficients[2,4] }) #ors_logistic head(ors_logistic) ## logistic p-value pvals_logistic = sapply(gene_snps_unique,function(m){ mut = grepl(m,raw_data$all_muts_gene) #print(table(dst, mut)) model<-glm(dst ~ mut , family = binomial) pval_logistic = summary(model)$coefficients[2,4] }) #pvals_logistic head(pvals_logistic) #============================================= # check ..(hmmm) perhaps separate script) #afs['pnca_p.trp68gly'] #afs['pnca_p.gln10pro'] #afs['pnca_p.leu4ser'] # #plot(density(log(ors_logistic))) #plot(-log10(pvals)) #hist(log(ors) # , breaks = 100) # sanity check: if names are equal (just for 3 vars) all(sapply(list(names(afs) , names(pvals_chi) , names(statistic_chi) # should return False , names(ors_chi_cus)), function (x) x == names(ors_logistic))) #compare(names(afs) # , names(pvals_chi) # , names(statistic_chi) #TEST: should return False, but DOESN'T # , names(ors_chi_cus) # , names(stat_chi))$result #=============== Now with all vars # sanity check: names for all vars #c = compare( names(afs) # , names(stat_chi) # , names(statistic_chi) #TEST: should return False, but DOESN'T # , names(pvals_chi) # , names(ors_chi_cus) # , names(pvals_fisher) # , names(ors_fisher) # , names(ci_lb_fisher) # , names(ci_ub_fisher) # , names(ors_logistic) # , names(pvals_logistic))$result; c if (all( sapply( list(names(afs) , names(stat_chi) #, names(statistic_chi) # TEST should return FALSE if included , names(pvals_chi) , names(ors_chi_cus) , names(pvals_fisher) , names(ors_fisher) , names(ci_lb_fisher) , names(ci_ub_fisher) , names(pvals_logistic) ), function (x) x == names(ors_logistic))) ){ cat('PASS: names match: proceed with combining into a single df') } else { cat('FAIL: names mismatch') } # combine ors, pvals and afs cat('Combining calculated params into a df: ors, pvals and afs') comb_AF_and_OR = data.frame(afs , stat_chi , pvals_chi , ors_chi_cus , pvals_fisher , ors_fisher , ci_lb_fisher , ci_ub_fisher , pvals_logistic , ors_logistic) cat('No. of rows in comb_AF_and_OR: ', nrow(comb_AF_and_OR) , '\nNo. of cols in comb_AF_and_OR: ', ncol(comb_AF_and_OR)) cat('Rownames == mutation: ', head(rownames(comb_AF_and_OR))) # add rownames of comb_AF_and_OR as an extra column 'mutation' to allow merging based on this column comb_AF_and_OR$mutation = rownames(comb_AF_and_OR) # sanity check if (table(rownames(comb_AF_and_OR) == comb_AF_and_OR$mutation)){ cat('PASS: rownames and mutaion col values match') }else{ cat('FAIL: rownames and mutation col values mismatch') } ######################################################### # write file out: pnca_AF_OR ######################################################### cat(paste0('writing output file: ' , '\nFilename: ', outfile)) write.csv(comb_AF_and_OR, outfile , row.names = F) cat(paste0('Finished writing:' , out_filename , '\nNo. of rows: ', nrow(comb_AF_and_OR) , '\nNo. of cols: ', ncol(comb_AF_and_OR))) #************************************************ cat('\n======================================================================\n') rm(out_filename) cat('End of script: calculated AF, OR, pvalues and saved file') ######################################################### # 3: Merge meta data file + calculated num params ######################################################### #df1 = gene_metadata #df2 = comb_AF_and_OR # COMMENT: will do the combining with the other OR and AF (in python)