diff --git a/scripts/AF_and_OR_calcs.R b/scripts/AF_and_OR_calcs.R new file mode 100644 index 0000000..3a0160a --- /dev/null +++ b/scripts/AF_and_OR_calcs.R @@ -0,0 +1,581 @@ +######################################################### +# TASK: To calculate Allele Frequency and +# Odds Ratio from master data +# and add the calculated params to meta_data extracted from +# data_extraction.py +######################################################### +getwd() +setwd('~/git/LSHTM_analysis/scripts') +getwd() + +#%% variable assignment: input and output paths & filenames +drug = 'pyrazinamide' +gene = 'pncA' +gene_match = paste0(gene,'_p.') +cat(gene_match) + +#=========== +# input +#=========== +# infile1: Raw data +#indir = 'git/Data/pyrazinamide/input/original' +indir = paste0('~/git/Data') +in_filename = 'original_tanushree_data_v2.csv' +#in_filename = 'mtb_gwas_v3.csv' +infile = paste0(indir, '/', in_filename) +cat(paste0('Reading infile1: raw data', ' ', infile) ) + +# infile2: gene associated meta data file to extract valid snps and add calcs to. +# This is outfile3 from data_extraction.py +indir_metadata = paste0('~/git/Data', '/', drug, '/', 'output') +in_filename_metadata = 'pnca_metadata.csv' +infile_metadata = paste0(indir_metadata, '/', in_filename_metadata) +cat(paste0('Reading infile2: gene associated metadata:', infile_metadata)) + +#=========== +# output +#=========== +# outdir = 'git/Data/pyrazinamide/output' +outdir = paste0('~/git/Data', '/', drug, '/', 'output') +out_filename = paste0(tolower(gene),'_', 'meta_data_with_AF_OR.csv') +outfile = paste0(outdir, '/', out_filename) +cat(paste0('Output file with full path:', outfile)) +#%% end of variable assignment for input and output files + +######################################################### +# 1: Read master/raw data stored in Data/ +######################################################### +raw_data_all = read.csv(infile, stringsAsFactors = F) + +# building cols to extract +dr_muts_col = paste0('dr_mutations_', drug) +other_muts_col = paste0('other_mutations_', drug) + +cat('Extracting columns based on variables:\n' + , drug + , '\n' + , dr_muts_col + , '\n' + , other_muts_col + , '\n===============================================================') + +raw_data = raw_data_all[,c("id" + , drug + , dr_muts_col + , other_muts_col)] +rm(raw_data_all) + +rm(indir, in_filename, infile) + +#=========== +# 1a: exclude na +#=========== +raw_data = raw_data[!is.na(raw_data[[drug]]),] + +total_samples = length(unique(raw_data$id)) +cat(paste0('Total samples without NA in', ' ', drug, 'is:', total_samples)) + +# sanity check: should be true +is.numeric(total_samples) + +#=========== +# 1b: combine the two mutation columns +#=========== +all_muts_colname = paste0('all_mutations_', drug) +#raw_data$all_mutations_pyrazinamide = paste(raw_data$dr_mutations_pyrazinamide, raw_data$other_mutations_pyrazinamide) +raw_data[[all_muts_colname]] = paste(raw_data[[dr_muts_col]], raw_data[[other_muts_col]]) +head(raw_data[[all_muts_colname]]) + +#=========== +# 1c: create yet another column that contains all the mutations but in lower case +#=========== +head(raw_data[[all_muts_colname]]) +raw_data$all_muts_gene = tolower(raw_data[[all_muts_colname]]) +head(raw_data$all_muts_gene) + +# sanity checks +#table(grepl("gene_p",raw_data$all_muts_gene)) +cat(paste0('converting gene match:', gene_match, ' ', 'to lowercase')) +gene_match = tolower(gene_match) + +table(grepl(gene_match,raw_data$all_muts_gene)) + +# sanity check: should be TRUE +#sum(table(grepl("gene_p",raw_data$all_muts_gene))) == total_samples +# sanity check +if(sum(table(grepl(gene_match, raw_data$all_muts_gene))) == total_samples){ + cat('PASS: Total no. of samples match') +} else{ + cat('FAIL: No. of samples mismatch') +} + +######################################################### +# 2: Read valid snps for which OR +# can be calculated +######################################################### +cat(paste0('Reading metadata infile:', infile_metadata)) + +gene_metadata = read.csv(infile_metadata + #, file.choose() + , stringsAsFactors = F + , header = T) + + +# clear variables +rm(in_filename_metadata, infile_metadata) + +# count na in pyrazinamide column +tot_pza_na = sum(is.na(gene_metadata$pyrazinamide)) +expected_rows = nrow(gene_metadata) - tot_pza_na + +# drop na from the pyrazinamide colum +gene_snps_or = gene_metadata[!is.na(gene_metadata[[drug]]),] + +# sanity check +if(nrow(gene_snps_or) == expected_rows){ + cat('PASS: no. of rows match with expected_rows') +} else{ + cat('FAIL: nrows mismatch.') +} + +# extract unique snps to iterate over for AF and OR calcs +gene_snps_unique = unique(gene_snps_or$mutation) + +cat(paste0('Total no. of distinct comp snps to perform OR calcs: ', length(gene_snps_unique))) + +#===================================== +#OR calcs using the following 4 +#1) chisq.test +#2) fisher +#3) modified chisq.test +#4) logistic +#5) adjusted logistic? +#6) kinship (separate script) + +#====================================== +# TEST FOR ONE + +i = "pnca_p.ala134gly" # has a NA, should NOT exist +table(grepl(i,raw_data$all_muts_gene)) + +i = "pnca_p.trp68gly" +table(grepl(i,raw_data$all_muts_gene)) + +# IV +mut = grepl(i,raw_data$all_muts_gene) + +# DV +dst = raw_data[[drug]] #or raw_data[,drug] + +table(mut, dst) + +#=============================================== +# calculating OR + +#1) chisq : noy accurate for low counts +chisq.test(table(mut,dst)) +attributes(chisq.test(table(mut,dst))) +chisq.test(table(mut,dst))$p.value + +#2) modified chisq OR: custom function +#x = as.numeric(mut) +#y = dst +my_chisq_or = function(x,y){ + tab = as.matrix(table(x,y)) + a = tab[2,2] + if (a==0){ a<-0.5} + b = tab[2,1] + if (b==0){ b<-0.5} + c = tab[1,2] + if (c==0){ c<-0.5} + d = tab[1,1] + if (d==0){ d<-0.5} + (a/b)/(c/d) +} +my_chisq_or(mut, dst) + +#3) fisher +fisher.test(table(mut, dst)) + +or_fisher = fisher.test(table(mut, dst))$estimate; print(or_fisher) +pval_fisher = fisher.test(table(mut, dst))$p.value; print(pval_fisher) # the same one to be used for custom chisq_or +ci_lb_fisher = fisher.test(table(mut, dst))$conf.int[1]; print(ci_lb_fisher) +ci_ub_fisher = fisher.test(table(mut, dst))$conf.int[2]; print(ci_ub_fisher) + +#4) logistic +summary(model<-glm(dst ~ mut + , family = binomial + #, control = glm.control(maxit = 1) + #, options(warn = 1) + )) +or_logistic = exp(summary(model)$coefficients[2,1]); print(or_logistic) +pval_logistic = summary(model)$coefficients[2,4]; print(pval_logistic) + + +#5) logistic adjusted: sample id (# identical results as unadjusted) +#c = raw_data$id[grepl(i,raw_data$all_muts_gene)] +#sid = grepl(paste(c,collapse="|"), raw_data$id) # else warning that pattern length > 1 +#table(sid) +#table(mut, dst, sid) + +#summary(model2<-glm(dst ~ mut + sid +# , family = binomial + ##, control = glm.control(maxit = 1) + ##, options(warn = 1) +# )) +#or_logistic2 = exp(summary(model2)$coefficients[2,1]); print(or_logistic2) +#pval_logistic2 = summary(model2)$coefficients[2,4]; print(pval_logistic2) + +#=============================================== + +###################### +# AF and OR for all muts: sapply +###################### +print(table(dst)); print(table(mut)) + +# af +afs = sapply(gene_snps_unique,function(m){ + mut = grepl(m,raw_data$all_muts_gene) + mean(mut) +}) + +afs +head(afs) + +#1) chi square: original +stat_chi = sapply(gene_snps_unique,function(m){ + mut = grepl(m,raw_data$all_muts_gene) + chisq.test(mut,dst)$statistic +}) + +stat_chi +head(stat_chi) + +## pval +pvals_chi = sapply(gene_snps_unique,function(m){ + mut = grepl(m,raw_data$all_muts_gene) + chisq.test(mut,dst)$p.value +}) + +pvals_chi +head(pvals_chi) + +#2) chi square: custom +ors_chi_cus = sapply(gene_snps_unique,function(m){ + mut = grepl(m,raw_data$all_muts_gene) + my_chisq_or(mut,dst) +}) + +ors_chi_cus +head(ors_chi_cus) + +## pval:fisher (use the same one for custom chi sqaure) +pvals_fisher = sapply(gene_snps_unique,function(m){ + mut = grepl(m,raw_data$all_muts_gene) + fisher.test(mut,dst)$p.value +}) + +pvals_fisher +head(pvals_fisher) + +#3) fisher +ors_fisher = sapply(gene_snps_unique,function(m){ + mut = grepl(m,raw_data$all_muts_gene) + fisher.test(mut,dst)$estimate; +}) + +ors_fisher +head(ors_fisher) + +## fisher ci (lower) +ci_lb_fisher = sapply(gene_snps_unique, function(m){ + mut = grepl(m,raw_data$all_muts_gene) + low_ci = fisher.test(table(mut, dst))$conf.int[1] + +}) + +ci_lb_fisher +head(ci_lb_fisher) + +## fisher ci (upper) +ci_ub_fisher = sapply(gene_snps_unique, function(m){ + mut = grepl(m,raw_data$all_muts_gene) + up_ci = fisher.test(table(mut, dst))$conf.int[2] + +}) + +ci_ub_fisher +head(ci_ub_fisher) + +#4) logistic or +ors_logistic = sapply(gene_snps_unique,function(m){ + mut = grepl(m,raw_data$all_muts_gene) + #print(table(dst, mut)) + model<-glm(dst ~ mut , family = binomial) + or_logistic = exp(summary(model)$coefficients[2,1]) + #pval_logistic = summary(model)$coefficients[2,4] +}) + +ors_logistic +head(ors_logistic) + +## logistic p-value +pvals_logistic = sapply(gene_snps_unique,function(m){ + mut = grepl(m,raw_data$all_muts_gene) + #print(table(dst, mut)) + model<-glm(dst ~ mut , family = binomial) + #or_logistic = exp(summary(model)$coefficients[2,1]) + pval_logistic = summary(model)$coefficients[2,4] +}) + +pvals_logistic +head(pvals_logistic) + +#============================================= + +# check ..hmmm +afs['pnca_p.trp68gly'] +afs['pnca_p.gln10pro'] +afs['pnca_p.leu4ser'] + +plot(density(log(ors))) +plot(-log10(pvals)) +hist(log(ors) + , breaks = 100) + +# sanity check +my_vars = c(afs + , stat_chi + , pvals_chi + , ors_chi_cus + , pvals_fisher + , ors_fisher + , ci_lb_fisher + , ci_ub_fisher + , ors_logistic + , pvals_logistic) + + +my_vars = c('afs', 'pvals_chi', 'ors_chi_cus') + +names(get('afs')) + +# check if names are equal + + +if ( all(sapply(list(names(afs), names(pvals_chi), names(ors_chi_cus)), function (x) x == names(ors_logistic))) +& compare(names(afs), names(pvals_chi), names(ors_chi_cus), names(stat_chi)) ){ + cat('PASS: names of ors, pvals and afs match: proceed with combining into a single df') +} else{ + cat('FAIL: names of ors, pvals and afs mismatch') +} + + +# FROM HERE +if (table(names(ors_fisher) == names(pvals_fisher)) & table(names(ors) == names(afs)) & table(names(pvals) == names(afs)) == length(pnca_snps_unique)){ +cat('PASS: names of ors, pvals and afs match: proceed with combining into a single df') +} else{ + cat('FAIL: names of ors, pvals and afs mismatch') +} + +# combine ors, pvals and afs +cat('Combining calculated params into a df: ors, pvals and afs') + +comb_AF_and_OR = data.frame(ors, pvals, afs) +cat('No. of rows in comb_AF_and_OR: ', nrow(comb_AF_and_OR) + , '\nNo. of cols in comb_AF_and_OR: ', ncol(comb_AF_and_OR)) + +cat('Rownames == mutation: ', head(rownames(comb_AF_and_OR))) + +# add rownames of comb_AF_and_OR as an extra column 'mutation' to allow merging based on this column +comb_AF_and_OR$mutation = rownames(comb_AF_and_OR) + +# sanity check +if (table(rownames(comb_AF_and_OR) == comb_AF_and_OR$mutation)){ + cat('PASS: rownames and mutaion col values match') +}else{ + cat('FAIL: rownames and mutation col values mismatch') +} + +######################################################### +# 3: Merge meta data file + calculated num params +######################################################### +df1 = pnca_metadata +df2 = comb_AF_and_OR + +cat('checking commom col of the two dfs before merging:' + ,'\ndf1:', head(df1$mutation) + , '\ndf2:', head(df2$mutation)) + +cat(paste0('merging two dfs: ' + ,'\ndf1 (big df i.e. meta data) nrows: ', nrow(df1) + ,'\ndf2 (small df i.e af, or, pval) nrows: ', nrow(df2) + ,'\nexpected rows in merged df: ', nrow(df1) + ,'\nexpected cols in merged_df: ', (ncol(df1) + ncol(df2) - 1))) + +merged_df = merge(df1 # big file + , df2 # small (afor file) + , by = "mutation" + , all.x = T) # because you want all the entries of the meta data + +# sanity check +if(ncol(merged_df) == (ncol(df1) + ncol(df2) - 1)){ + cat(paste0('PASS: no. of cols is as expected: ', ncol(merged_df))) +} else{ + cat('FAIL: no.of cols mistmatch') +} + +# quick check +i = "pnca_p.ala134gly" # has all NAs in pyrazinamide, should be NA in ors, etc. +merged_df[merged_df$mutation == i,] + +# count na in each column +na_count = sapply(merged_df, function(y) sum(length(which(is.na(y))))); na_count + +# check last three cols: should be NA +if ( identical(na_count[[length(na_count)]], na_count[[length(na_count)-1]], na_count[[length(na_count)-2]])){ + cat('PASS: No. of NAs for OR, AF and Pvals are equal as expected', + '\nNo. of NA: ', na_count[[length(na_count)]]) +} else { + cat('FAIL: No. of NAs for OR, AF and Pvals mismatch') +} + +# reassign custom colnames +cat('Assigning custom colnames for the calculated params...') +colnames(merged_df)[colnames(merged_df)== "ors"] <- "OR" +colnames(merged_df)[colnames(merged_df)== "pvals"] <- "pvalue" +colnames(merged_df)[colnames(merged_df)== "afs"] <- "AF" + +colnames(merged_df) + +# add 3 more cols: log OR, neglog pvalue and AF_percent cols +merged_df$logor = log(merged_df$OR) +is.numeric(merged_df$logor) + +merged_df$neglog10pvalue = -log10(merged_df$pvalue) +is.numeric(merged_df$neglog10pvalue) + +merged_df$AF_percent = merged_df$AF*100 +is.numeric(merged_df$AF_percent) + +# check AFs +#i = 'pnca_p.trp68gly' +i = 'pnca_p.gln10pro' +#i = 'pnca_p.leu4ser' +merged_df[merged_df$mutation == i,] + +# FIXME: harcoding (beware!), NOT FATAL though! +ncol_added = 3 + +cat(paste0('Added', ' ', ncol_added, ' more cols to merged_df:' + , '\ncols added: logor, neglog10pvalue and AF_percent:' + , '\nno. of cols in merged_df now: ', ncol(merged_df))) + +#%% write file out: pnca_meta_data_with_AF_OR +#********************************************* +cat(paste0('writing output file: ' + , '\nFilename: ', out_filename + , '\nPath:', outdir)) + +write.csv(merged_df, outfile + , row.names = F) + +cat(paste0('Finished writing:' + , out_filename + , '\nNo. of rows: ', nrow(merged_df) + , '\nNo. of cols: ', ncol(merged_df))) +#************************************************ +cat('======================================================================') +rm(out_filename) +cat('End of script: calculated AF, OR, pvalues and saved file') +# End of script +#%% +# sanity check: Count NA in these four cols. +# However these need to be numeric else these +# will be misleading when counting NAs (i.e retrun 0) +#is.numeric(meta_with_afor$OR) +na_var = c('AF', 'OR', 'pvalue', 'logor', 'neglog10pvalue') + +# loop through these vars and check if these are numeric. +# if not, then convert to numeric +check_all = NULL + +for (i in na_var){ + # cat(i) + check0 = is.numeric(meta_with_afor[,i]) + if (check0) { + check_all = c(check0, check_all) + cat('These are all numeric cols') + } else{ + cat('First converting to numeric') + check0 = as.numeric(meta_with_afor[,i]) + check_all = c(check0, check_all) + } +} + +# count na now that the respective cols are numeric +na_count = sapply(meta_with_afor, function(y) sum(length(which(is.na(y))))); na_count +str(na_count) + +# extract how many NAs: +# should be all TRUE +# should be a single number since +# all the cols should have 'equal' and 'same' no. of NAs +# compare if the No of 'NA' are the same for all these cols +na_len = NULL +for (i in na_var){ + temp = na_count[[i]] + na_len = c(na_len, temp) +} + +cat('Checking how many NAs and if these are identical for the selected cols:') +my_nrows = NULL +for ( i in 1: (length(na_len)-1) ){ +# cat(compare(na_len[i]), na_len[i+1]) + c = compare(na_len[i], na_len[i+1]) + if ( c$result ) { + cat('PASS: No. of NAa in selected cols are identical') + my_nrows = na_len[i] } + else { + cat('FAIL: No. of NAa in selected cols mismatch') + } +} + +cat('No. of NAs in each of the selected cols: ', my_nrows) + +# yet more sanity checks: +cat('Check whether the ', my_nrows, 'indices are indeed the same') + +#which(is.na(meta_with_afor$OR)) + +# initialise an empty df with nrows as extracted above +na_count_df = data.frame(matrix(vector(mode = 'numeric' +# , length = length(na_var) + ) + , nrow = my_nrows +# , ncol = length(na_var) + )) + +# populate the df with the indices of the cols that are NA +for (i in na_var){ + cat(i) + na_i = which(is.na(meta_with_afor[i])) + na_count_df = cbind(na_count_df, na_i) + colnames(na_count_df)[which(na_var == i)] <- i +} + +# Now compare these indices to ensure these are the same +check2 = NULL +for ( i in 1: ( length(na_count_df)-1 ) ) { +# cat(na_count_df[i] == na_count_df[i+1]) + check_all = identical(na_count_df[[i]], na_count_df[[i+1]]) + check2 = c(check_all, check2) + if ( all(check2) ) { + cat('PASS: The indices for AF, OR, etc are all the same\n') + } else { + cat ('FAIL: Please check indices which are NA') + } +} + + +