diff --git a/scripts/af_or_calcs.R b/scripts/af_or_calcs.R new file mode 100755 index 0000000..a1165e8 --- /dev/null +++ b/scripts/af_or_calcs.R @@ -0,0 +1,365 @@ +#!/usr/bin/env Rscript +#require('compare') +require('getopt', quietly=TRUE) # We need to be able to parse arguments +######################################################### +# TASK: To calculate Allele Frequency and +# Odds Ratio from master data +# and add the calculated params to meta_data extracted from +# data_extraction.py +######################################################### +#getwd() +setwd('~/git/LSHTM_analysis/scripts') +cat(c(getwd(),'\n')) + +# Command line args +spec = matrix(c( + "drug" , "d", 1, "character", + "gene" , "g", 1, "character" +), byrow = TRUE, ncol = 4) + +opt = getopt(spec); + +drug = opt$drug +gene = opt$gene + +if(is.null(drug)|is.null(gene)) { + stop('Missing arguments: --drug and --gene must both be specified (case-sensitive)') + } + +#options(scipen = 999) #disabling scientific notation in R. + +#%% variable assignment: input and output paths & filenames +gene_match = paste0(gene,'_p.') +cat(gene_match) + +#============= +# directories +#============= +datadir = paste0('~/git/Data') +indir = paste0(datadir, '/', drug, '/', 'input') +outdir = paste0(datadir, '/', drug, '/', 'output') + +#=========== +# input +#=========== +# input file 1: master data +#in_filename_master = 'original_tanushree_data_v2.csv' #19K +in_filename_master = 'mtb_gwas_meta_v3.csv' #33k +infile_master = paste0(datadir, '/', in_filename_master) +cat(paste0('Reading infile1: raw data', ' ', infile_master) ) + +# input file 2: gene associated meta data file to extract valid snps and add calcs to. +# This is outfile_metadata from data_extraction.py +in_filename_metadata = paste0(tolower(gene), '_metadata.csv') +infile_metadata = paste0(outdir, '/', in_filename_metadata) +cat(paste0('Reading input file 2 i.e gene associated metadata:', infile_metadata)) + +#=========== +# output +#=========== +#out_filename_af_or = paste0(tolower(gene), '_meta_data_with_AF_OR.csv') +out_filename_af_or = paste0(tolower(gene), '_af_or.csv') +outfile_af_or = paste0(outdir, '/', out_filename_af_or) +cat(paste0('Output file with full path:', outfile_af_or)) +#%% end of variable assignment for input and output files + +######################################################### +# 1: Read master/raw data stored in Data/ +######################################################### +raw_data_all = read.csv(infile_master, stringsAsFactors = F) + +# building cols to extract +dr_muts_col = paste0('dr_mutations_', drug) +other_muts_col = paste0('other_mutations_', drug) + +cat('Extracting columns based on variables:\n' + , drug + , '\n' + , dr_muts_col + , '\n' + , other_muts_col + , '\n===============================================================') + +raw_data = raw_data_all[,c("id" + , drug + , dr_muts_col + , other_muts_col)] +rm(raw_data_all) + +rm(indir, in_filename_master, infile_master) + +#=========== +# 1a: exclude na +#=========== +raw_data = raw_data[!is.na(raw_data[[drug]]),] + +total_samples = length(unique(raw_data$id)) +cat(paste0('Total samples without NA in', ' ', drug, 'is:', total_samples)) + +# sanity check: should be true +is.numeric(total_samples) + +#=========== +# 1b: combine the two mutation columns +#=========== +all_muts_colname = paste0('all_mutations_', drug) +print(paste('New column added:', all_muts_colname)) +#raw_data$all_mutations_pyrazinamide = paste(raw_data$dr_mutations_pyrazinamide, raw_data$other_mutations_pyrazinamide) +raw_data[[all_muts_colname]] = paste(raw_data[[dr_muts_col]], raw_data[[other_muts_col]]) +head(raw_data[[all_muts_colname]]) + +#=========== +# 1c: create yet another column that contains all the mutations but in lower case +#=========== +head(raw_data[[all_muts_colname]]) +raw_data$all_muts_gene = tolower(raw_data[[all_muts_colname]]) +head(raw_data$all_muts_gene) + +# sanity checks +#table(grepl("gene_p",raw_data$all_muts_gene)) +cat(paste0('converting gene match:', gene_match, ' ', 'to lowercase')) +gene_match = tolower(gene_match) + +table(grepl(gene_match,raw_data$all_muts_gene)) + +# sanity check: should be TRUE +#sum(table(grepl("gene_p",raw_data$all_muts_gene))) == total_samples +# sanity check +if(sum(table(grepl(gene_match, raw_data$all_muts_gene))) == total_samples){ + cat('PASS: Total no. of samples match') +} else{ + cat('FAIL: No. of samples mismatch') +} + +######################################################### +# 2: Read valid snps for which OR +# can be calculated +######################################################### +cat(paste0('Reading gene associated metadata:', infile_metadata)) + +gene_metadata = read.csv(infile_metadata + #, file.choose() + , stringsAsFactors = F + , header = T) + +# get just the variable name from variable +#deparse(substitute(myvar) +print(paste('Dim of', deparse(substitute(gene_metadata)), ':')); print(dim(gene_metadata)) + +# clear variables +rm(in_filename_metadata, infile_metadata) + +# count na in drug column +tot_pza_na = sum(is.na(gene_metadata[[drug]])) +expected_rows = nrow(gene_metadata) - tot_pza_na + +# drop na from the drug column +gene_snps_or = gene_metadata[!is.na(gene_metadata[[drug]]),] + +# sanity check +if(nrow(gene_snps_or) == expected_rows){ + cat('PASS: no. of rows match with expected_rows') +} else{ + cat('FAIL: nrows mismatch.') +} + +# extract unique snps to iterate over for AF and OR calcs +gene_snps_unique = unique(gene_snps_or$mutation) + +cat(paste0('Total no. of distinct comp snps to perform OR calcs: ', length(gene_snps_unique))) + +#=========================================================================================== +######################### +# custom chisq function: +# To calculate OR +######################### +#i = "pnca_p.trp68gly" +i = "pnca_p.gly162asp" + +mut = grepl(i,raw_data$all_muts_gene) +mut = as.numeric(mut) + +dst = raw_data[[drug]] + +#x = as.numeric(mut) +#y = dst + +mychisq_or = function(x,y){ + tab = as.matrix(table(x,y)) + a = tab[2,2] + if (a==0){ a<-0.5} + b = tab[2,1] + if (b==0){ b<-0.5} + c = tab[1,2] + if (c==0){ c<-0.5} + d = tab[1,1] + if (d==0){ d<-0.5} + (a/b)/(c/d) + +} + +or_mychisq = mychisq_or(dst, mut) +print(paste0('mychisq OR:', or_mychisq )) + +odds_fisher = fisher.test(table(dst, mut))$estimate +pval_fisher = fisher.test(table(dst, mut))$p.value +print(paste0('fisher OR:', odds_fisher)) +print(paste0('fisher p-value:', pval_fisher)) + +model<-glm(dst ~ mut, family = binomial) +or_logistic = exp(summary(model)$coefficients[2,1]) +print(paste0('logistic OR:', or_logistic)) + +pval_logistic = summary(model)$coefficients[2,4] +print(paste0('logistic pval:', pval_logistic)) + +#===================================== +#OR calcs using the following 4 +#1) chisq.test +#2) fisher +#3) modified chisq.test +#4) logistic +#5) adjusted logistic? +#6) kinship (separate script) + +#====================================== +# TEST FOR a few muts: sapply and df +#=============================================== +#snps <- gene_snps_unique[1:2]# reassign so you test with subset of muts +snps <- gene_snps_unique +cat(paste0('Running calculations for:', length(snps), ' nssnps\n' +, 'gene: ', gene +, '\ndrug: ', drug )) + +# DV: 0 or 1 +dst = raw_data[[drug]] + +# initialise an empty df +ors_df = data.frame() + +x = sapply(snps,function(m){ + + mut = grepl(m,raw_data$all_muts_gene) + mut = as.numeric(mut) + cat(paste0('Running mutation:', m, '\n')) + + model<-glm(dst ~ mut, family = binomial) + + #------------------- + # allele frequency + #------------------- + afs = mean(mut) + + #------------------- + # logistic model + #------------------- + beta_logistic = summary(model)$coefficients[2,1] + + or_logistic = exp(summary(model)$coefficients[2,1]) + #print(paste0('logistic OR:', or_logistic)) + + pval_logistic = summary(model)$coefficients[2,4] + #print(paste0('logistic pval:', pval_logistic)) + + se_logistic = summary(model)$coefficients[2,2] + #print(paste0('logistic SE:', se_logistic)) + + zval_logistic = summary(model)$coefficients[2,3] + #print(paste0('logistic zval:', zval_logistic)) + + ci_mod = exp(confint(model))[2,] + #print(paste0('logistic CI:', ci_mod)) + + ci_lower_logistic = ci_mod[["2.5 %"]] + ci_upper_logistic = ci_mod[["97.5 %"]] + + #------------------- + # custom_chisq and fisher: OR p-value and CI + #------------------- + or_mychisq = mychisq_or(dst, mut) + #print(paste0('mychisq OR:', or_mychisq)) + + odds_fisher = fisher.test(table(dst, mut))$estimate + or_fisher = odds_fisher[[1]] + + pval_fisher = fisher.test(table(dst, mut))$p.value + + ci_lower_fisher = fisher.test(table(dst, mut))$conf.int[1] + ci_upper_fisher = fisher.test(table(dst, mut))$conf.int[2] + + #------------------- + # chi sq estimates + #------------------- + estimate_chisq = chisq.test(table(dst, mut))$statistic; estimate_chisq + est_chisq = estimate_chisq[[1]]; print(est_chisq) + + pval_chisq = chisq.test(table(dst, mut))$p.value + + # build a row to append to df + row = data.frame(mutation = m + , af = afs + , beta_logistic = beta_logistic + , or_logistic = or_logistic + , pval_logistic = pval_logistic + , se_logistic = se_logistic + , zval_logistic = zval_logistic + , ci_low_logistic = ci_lower_logistic + , ci_hi_logistic = ci_upper_logistic + , or_mychisq = or_mychisq + , or_fisher = or_fisher + , pval_fisher = pval_fisher + , ci_low_fisher= ci_lower_fisher + , ci_hi_fisher = ci_upper_fisher + , est_chisq = est_chisq + , pval_chisq = pval_chisq + ) + #print(row) + + ors_df <<- rbind(ors_df, row) + +}) + +#%%====================================================== +# Writing file with calculated ORs and AFs +cat(paste0('writing output file: ' + , '\nFilen: ', outfile_af_or)) + +write.csv(ors_df, outfile_af_or + , row.names = F) + +cat(paste0('Finished writing:' + , outfile_af_or + , '\nNo. of rows: ', nrow(ors_df) + , '\nNo. of cols: ', ncol(ors_df))) +#%%====================================================== +cat('\n======sneak peek into a few muts with prominent or and p-vals=======\n') + +cat(paste0('=======================================' +, '\nmutation with highest logistic OR:' +, '\n=======================================\n')) +print(ors_df[which(ors_df$or_logistic == max(ors_df$or_logistic)),]) + +cat(paste0('=======================================' +, '\nmutation with highest mychisq OR:' +, '\n=======================================\n')) +print(ors_df[which(ors_df$or_mychisq == max(ors_df$or_mychisq)),]) + +# gives too many due to Inf +#cat(paste0('=======================================' +#, '\nmutation with highest fisher OR:' +#, '\n=======================================\n')) +#print(ors_df[which(ors_df$or_fisher == max(ors_df$or_fisher)),]) + +cat(paste0('=======================================' +, '\nmutation with lowest logistic pval:' +, '\n=======================================\n')) +print(ors_df[which(ors_df$pval_logistic == min(ors_df$pval_logistic)),]) + +cat(paste0('=======================================' +, '\nmutation with lowest fisher pval:' +, '\n=======================================\n')) +print(ors_df[which(ors_df$pval_fisher == min(ors_df$pval_fisher)),]) + +#********************************************************** +cat('End of script: calculated AF, OR, pvalues and saved file') +#**********************************************************