added mychisq_or.R and af_or_calcs.R

2021-06-11 13:28:07 +01:00 · 2021-06-11 13:28:07 +01:00 · 96af746726
commit 96af746726
parent ad0b814f9c
2 changed files with 267 additions and 0 deletions
--- a/scripts/functions/af_or_calcs.R
+++ b/scripts/functions/af_or_calcs.R
@ -0,0 +1,254 @@
+my_afor <- function (  infile_master
+                     , infile_metadata
+                     , outfile
+                     , drug
+                     , gene
+                     , idcol = "id"
+                     , dr_muts_col
+                     , other_muts_col){
+
+  #===========================================
+  # 1: Read master/raw data stored in Data/
+  #===========================================
+  raw_data_all = read.csv(infile_master, stringsAsFactors = F)
+  
+  cat("\nExtracting columns based on variables:\n"
+      , drug
+      , "\n"
+      , dr_muts_col
+      , "\n"
+      , other_muts_col
+      , "\n===============================================================")
+  
+  raw_data = raw_data_all[,c(idcol
+                             , drug
+                             , dr_muts_col
+                             , other_muts_col)]
+  #rm(raw_data_all)
+  #rm(indir, in_filename_master, infile_master)
+  
+  #-------------------
+  # 1a: exclude na
+  #--------------------
+  raw_data = raw_data[!is.na(raw_data[[drug]]),]
+  
+  total_samples = length(unique(raw_data[[idcol]]))
+  cat(paste0("\nTotal samples without NA in", " ", drug, " is:", total_samples))
+ 
+  # sanity check: should  be true
+  cat("\nThis should be True:\n"
+      , is.numeric(total_samples))
+  
+  #----------------------------------------
+  # 1b: combine the two mutation columns
+  #-----------------------------------------
+  all_muts_colname = paste0("all_mutations_", drug)
+  cat(paste("\nNew column added:", all_muts_colname))
+  raw_data[[all_muts_colname]] = paste(raw_data[[dr_muts_col]], raw_data[[other_muts_col]])
+  head(raw_data[[all_muts_colname]])
+
+  #----------------------------------------------
+  # 1c: create yet another column that contains
+  # all the mutations but in lower case
+  #-----------------------------------------------
+  head(raw_data[[all_muts_colname]])
+  raw_data$all_muts_gene = tolower(raw_data[[all_muts_colname]]) 
+  head(raw_data$all_muts_gene)
+  
+  # sanity checks
+  gene_match = paste0(gene,"_p.")
+  cat(paste0("\nconverting gene match: ", gene_match, " ", "to lowercase\n"))
+  gene_match = tolower(gene_match)
+  
+  table(grepl(gene_match,raw_data$all_muts_gene))
+  
+  # sanity check: should be TRUE
+  if(sum(table(grepl(gene_match, raw_data$all_muts_gene))) == total_samples){
+    cat("\nPASS: Total no. of samples match\n")
+  } else{
+    cat("\nFAIL: No. of samples mismatch\n")
+    exit()
+  }
+  
+  #====================================================
+  # 2: Read valid snps for which OR 
+  # can be calculated
+  #=====================================================
+  cat(paste0("\nReading gene associated metadata:", infile_metadata))
+  
+  gene_metadata = read.csv(infile_metadata
+                           , stringsAsFactors = F
+                           , header = T)
+  
+  cat(paste("\nDim of gene_metadata:\n"
+            , dim(gene_metadata)))
+      
+  # count na in drug column
+  tot_drug_na = sum(is.na(gene_metadata[[drug]]))
+  expected_rows = nrow(gene_metadata) - tot_drug_na
+  
+  # drop na from the drug column
+  gene_snps_or = gene_metadata[!is.na(gene_metadata[[drug]]),]
+  
+  # sanity check
+  if(nrow(gene_snps_or) == expected_rows){
+    cat("\nPASS: no. of rows match with expected_rows")
+  } else{
+    cat("\nFAIL: nrows mismatch.")
+    exit()
+  }
+  
+  # extract unique snps to iterate over for AF and OR calcs
+  gene_snps_unique = unique(gene_snps_or$mutation) 
+  
+  cat(paste0("\nTotal no. of distinct comp snps to perform OR calcs: ", length(gene_snps_unique)))
+  
+  #==================================================
+  # OR calcs using the following 4
+  #1) logistic
+  #2) custom chisq.test
+  #3) fisher
+  #4) chisq.test
+  
+  # adjusted logistic (NO good)
+  # kinship (separate script)
+  #=================================================
+  #snps <- gene_snps_unique [1:5] # small test
+  snps <- gene_snps_unique
+  cat(paste0("\nRunning calculations for:"
+             , length(snps), " nssnps\n"
+             , "\ngene: ", gene
+             , "\ndrug: ", drug ))
+  
+  # DV: <drug> 0 or 1
+  dst = raw_data[[drug]]
+  
+  # initialise an empty df
+  ors_df = data.frame()
+  x = sapply(snps,function(m){
+    
+    mut = grepl(m,raw_data$all_muts_gene)
+    mut = as.numeric(mut)
+    cat(paste0("\nRunning mutation:", m, "\n"))
+    
+    model<-glm(dst ~ mut, family = binomial)
+    
+    #-------------------
+    # allele frequency
+    #-------------------
+    afs = mean(mut)
+    
+    #-------------------
+    # logistic model
+    #-------------------
+    beta_logistic = summary(model)$coefficients[2,1]
+    
+    or_logistic = exp(summary(model)$coefficients[2,1])
+    #print(paste0("logistic OR:", or_logistic))
+    
+    pval_logistic = summary(model)$coefficients[2,4]
+    #print(paste0("logistic pval:", pval_logistic))
+    
+    se_logistic = summary(model)$coefficients[2,2]
+    #print(paste0("logistic SE:", se_logistic))
+    
+    zval_logistic = summary(model)$coefficients[2,3]
+    #print(paste0("logistic zval:", zval_logistic))
+    
+    ci_mod = exp(confint(model))[2,]
+    #print(paste0("logistic CI:", ci_mod))
+    
+    ci_lower_logistic = ci_mod[["2.5 %"]]
+    ci_upper_logistic = ci_mod[["97.5 %"]]
+    
+    #-------------------
+    # custom_chisq and fisher: OR p-value and CI
+    #-------------------
+    or_mychisq = mychisq_or(dst, mut)
+    #print(paste0("mychisq OR:", or_mychisq))
+    
+    odds_fisher = fisher.test(table(dst, mut))$estimate
+    or_fisher = odds_fisher[[1]]
+    
+    pval_fisher = fisher.test(table(dst, mut))$p.value
+    
+    ci_lower_fisher = fisher.test(table(dst, mut))$conf.int[1]
+    ci_upper_fisher = fisher.test(table(dst, mut))$conf.int[2]
+    
+    #-------------------
+    # chi sq estimates 
+    #-------------------
+    estimate_chisq = chisq.test(table(dst, mut))$statistic; estimate_chisq
+    est_chisq = estimate_chisq[[1]]; print(est_chisq)
+    
+    pval_chisq = chisq.test(table(dst, mut))$p.value                                                        
+    
+    # build a row to append to df
+    row = data.frame(mutation = m
+                     , af = afs
+                     , beta_logistic = beta_logistic
+                     , or_logistic = or_logistic
+                     , pval_logistic = pval_logistic
+                     , se_logistic = se_logistic
+                     , zval_logistic = zval_logistic
+                     , ci_low_logistic = ci_lower_logistic
+                     , ci_hi_logistic = ci_upper_logistic
+                     , or_mychisq = or_mychisq 
+                     , or_fisher = or_fisher
+                     , pval_fisher = pval_fisher
+                     , ci_low_fisher= ci_lower_fisher
+                     , ci_hi_fisher = ci_upper_fisher
+                     , est_chisq = est_chisq
+                     , pval_chisq = pval_chisq
+    )
+    
+    ors_df <<- rbind(ors_df, row)
+  })
+  
+  #==============================================
+  # Writing file with calculated ORs and AFs
+  #==============================================
+  cat(paste0("\nwriting output file: "
+             , "\nFile: ", outfile))
+  
+  write.csv(ors_df, outfile
+            , row.names = F)
+  
+  cat(paste0("\nFinished writing:"
+             , outfile
+             , "\nNo. of rows: ", nrow(ors_df)
+             , "\nNo. of cols: ", ncol(ors_df)))
+  cat("\n======sneak peek into a few muts with prominent or and p-vals=======\n")
+  
+  cat(paste0("======================================="
+             , "\nmutation with highest logistic OR:"
+             , "\n=======================================\n"))
+  print(ors_df[which(ors_df$or_logistic == max(ors_df$or_logistic)),])
+  
+  cat(paste0("======================================="
+             , "\nmutation with highest mychisq OR:"
+             , "\n=======================================\n"))
+  
+  print(ors_df[which(ors_df$or_mychisq == max(ors_df$or_mychisq)),])
+  
+  # gives too many due to Inf
+  #cat(paste0("======================================="
+  #, "\nmutation with highest fisher OR:"
+  #, "\n=======================================\n"))
+  #print(ors_df[which(ors_df$or_fisher == max(ors_df$or_fisher)),])
+  
+  cat(paste0("======================================="
+             , "\nmutation with lowest logistic pval:"
+             , "\n=======================================\n"))
+  print(ors_df[which(ors_df$pval_logistic == min(ors_df$pval_logistic)),])
+  
+  cat(paste0("======================================="
+             , "\nmutation with lowest fisher pval:"
+             , "\n=======================================\n"))
+  print(ors_df[which(ors_df$pval_fisher == min(ors_df$pval_fisher)),])
+  
+  ################################################################
+  cat("\nEnd of script: calculated AF, OR, pvalues and saved file\n")
+  #################################################################
+  
+  }
--- a/scripts/functions/mychisq_or.R
+++ b/scripts/functions/mychisq_or.R
@ -0,0 +1,13 @@
+mychisq_or = function(dst_numeric, mut_numeric){
+  tab = as.matrix(table(dst_numeric, mut_numeric))
+  a = tab[2,2]
+  if (a==0){ a<-0.5}
+  b = tab[2,1]
+  if (b==0){ b<-0.5}
+  c = tab[1,2]
+  if (c==0){ c<-0.5}
+  d = tab[1,1]
+  if (d==0){ d<-0.5}
+  (a/b)/(c/d)
+  
+}