added script to test af_or_calcs

2021-06-11 13:33:25 +01:00 · 2021-06-11 13:33:25 +01:00 · 7686aa39b4
commit 7686aa39b4
parent 931f8ec2f9
3 changed files with 55 additions and 0 deletions
--- a/scripts/functions/af_or_calcs.R
+++ b/scripts/functions/af_or_calcs.R
@ -1,254 +0,0 @@
-my_afor <- function (  infile_master
-                     , infile_metadata
-                     , outfile
-                     , drug
-                     , gene
-                     , idcol = "id"
-                     , dr_muts_col
-                     , other_muts_col){
-
-  #===========================================
-  # 1: Read master/raw data stored in Data/
-  #===========================================
-  raw_data_all = read.csv(infile_master, stringsAsFactors = F)
-  
-  cat("\nExtracting columns based on variables:\n"
-      , drug
-      , "\n"
-      , dr_muts_col
-      , "\n"
-      , other_muts_col
-      , "\n===============================================================")
-  
-  raw_data = raw_data_all[,c(idcol
-                             , drug
-                             , dr_muts_col
-                             , other_muts_col)]
-  #rm(raw_data_all)
-  #rm(indir, in_filename_master, infile_master)
-  
-  #-------------------
-  # 1a: exclude na
-  #--------------------
-  raw_data = raw_data[!is.na(raw_data[[drug]]),]
-  
-  total_samples = length(unique(raw_data[[idcol]]))
-  cat(paste0("\nTotal samples without NA in", " ", drug, " is:", total_samples))
- 
-  # sanity check: should  be true
-  cat("\nThis should be True:\n"
-      , is.numeric(total_samples))
-  
-  #----------------------------------------
-  # 1b: combine the two mutation columns
-  #-----------------------------------------
-  all_muts_colname = paste0("all_mutations_", drug)
-  cat(paste("\nNew column added:", all_muts_colname))
-  raw_data[[all_muts_colname]] = paste(raw_data[[dr_muts_col]], raw_data[[other_muts_col]])
-  head(raw_data[[all_muts_colname]])
-
-  #----------------------------------------------
-  # 1c: create yet another column that contains
-  # all the mutations but in lower case
-  #-----------------------------------------------
-  head(raw_data[[all_muts_colname]])
-  raw_data$all_muts_gene = tolower(raw_data[[all_muts_colname]]) 
-  head(raw_data$all_muts_gene)
-  
-  # sanity checks
-  gene_match = paste0(gene,"_p.")
-  cat(paste0("\nconverting gene match: ", gene_match, " ", "to lowercase\n"))
-  gene_match = tolower(gene_match)
-  
-  table(grepl(gene_match,raw_data$all_muts_gene))
-  
-  # sanity check: should be TRUE
-  if(sum(table(grepl(gene_match, raw_data$all_muts_gene))) == total_samples){
-    cat("\nPASS: Total no. of samples match\n")
-  } else{
-    cat("\nFAIL: No. of samples mismatch\n")
-    exit()
-  }
-  
-  #====================================================
-  # 2: Read valid snps for which OR 
-  # can be calculated
-  #=====================================================
-  cat(paste0("\nReading gene associated metadata:", infile_metadata))
-  
-  gene_metadata = read.csv(infile_metadata
-                           , stringsAsFactors = F
-                           , header = T)
-  
-  cat(paste("\nDim of gene_metadata:\n"
-            , dim(gene_metadata)))
-      
-  # count na in drug column
-  tot_drug_na = sum(is.na(gene_metadata[[drug]]))
-  expected_rows = nrow(gene_metadata) - tot_drug_na
-  
-  # drop na from the drug column
-  gene_snps_or = gene_metadata[!is.na(gene_metadata[[drug]]),]
-  
-  # sanity check
-  if(nrow(gene_snps_or) == expected_rows){
-    cat("\nPASS: no. of rows match with expected_rows")
-  } else{
-    cat("\nFAIL: nrows mismatch.")
-    exit()
-  }
-  
-  # extract unique snps to iterate over for AF and OR calcs
-  gene_snps_unique = unique(gene_snps_or$mutation) 
-  
-  cat(paste0("\nTotal no. of distinct comp snps to perform OR calcs: ", length(gene_snps_unique)))
-  
-  #==================================================
-  # OR calcs using the following 4
-  #1) logistic
-  #2) custom chisq.test
-  #3) fisher
-  #4) chisq.test
-  
-  # adjusted logistic (NO good)
-  # kinship (separate script)
-  #=================================================
-  #snps <- gene_snps_unique [1:5] # small test
-  snps <- gene_snps_unique
-  cat(paste0("\nRunning calculations for:"
-             , length(snps), " nssnps\n"
-             , "\ngene: ", gene
-             , "\ndrug: ", drug ))
-  
-  # DV: <drug> 0 or 1
-  dst = raw_data[[drug]]
-  
-  # initialise an empty df
-  ors_df = data.frame()
-  x = sapply(snps,function(m){
-    
-    mut = grepl(m,raw_data$all_muts_gene)
-    mut = as.numeric(mut)
-    cat(paste0("\nRunning mutation:", m, "\n"))
-    
-    model<-glm(dst ~ mut, family = binomial)
-    
-    #-------------------
-    # allele frequency
-    #-------------------
-    afs = mean(mut)
-    
-    #-------------------
-    # logistic model
-    #-------------------
-    beta_logistic = summary(model)$coefficients[2,1]
-    
-    or_logistic = exp(summary(model)$coefficients[2,1])
-    #print(paste0("logistic OR:", or_logistic))
-    
-    pval_logistic = summary(model)$coefficients[2,4]
-    #print(paste0("logistic pval:", pval_logistic))
-    
-    se_logistic = summary(model)$coefficients[2,2]
-    #print(paste0("logistic SE:", se_logistic))
-    
-    zval_logistic = summary(model)$coefficients[2,3]
-    #print(paste0("logistic zval:", zval_logistic))
-    
-    ci_mod = exp(confint(model))[2,]
-    #print(paste0("logistic CI:", ci_mod))
-    
-    ci_lower_logistic = ci_mod[["2.5 %"]]
-    ci_upper_logistic = ci_mod[["97.5 %"]]
-    
-    #-------------------
-    # custom_chisq and fisher: OR p-value and CI
-    #-------------------
-    or_mychisq = mychisq_or(dst, mut)
-    #print(paste0("mychisq OR:", or_mychisq))
-    
-    odds_fisher = fisher.test(table(dst, mut))$estimate
-    or_fisher = odds_fisher[[1]]
-    
-    pval_fisher = fisher.test(table(dst, mut))$p.value
-    
-    ci_lower_fisher = fisher.test(table(dst, mut))$conf.int[1]
-    ci_upper_fisher = fisher.test(table(dst, mut))$conf.int[2]
-    
-    #-------------------
-    # chi sq estimates 
-    #-------------------
-    estimate_chisq = chisq.test(table(dst, mut))$statistic; estimate_chisq
-    est_chisq = estimate_chisq[[1]]; print(est_chisq)
-    
-    pval_chisq = chisq.test(table(dst, mut))$p.value                                                        
-    
-    # build a row to append to df
-    row = data.frame(mutation = m
-                     , af = afs
-                     , beta_logistic = beta_logistic
-                     , or_logistic = or_logistic
-                     , pval_logistic = pval_logistic
-                     , se_logistic = se_logistic
-                     , zval_logistic = zval_logistic
-                     , ci_low_logistic = ci_lower_logistic
-                     , ci_hi_logistic = ci_upper_logistic
-                     , or_mychisq = or_mychisq 
-                     , or_fisher = or_fisher
-                     , pval_fisher = pval_fisher
-                     , ci_low_fisher= ci_lower_fisher
-                     , ci_hi_fisher = ci_upper_fisher
-                     , est_chisq = est_chisq
-                     , pval_chisq = pval_chisq
-    )
-    
-    ors_df <<- rbind(ors_df, row)
-  })
-  
-  #==============================================
-  # Writing file with calculated ORs and AFs
-  #==============================================
-  cat(paste0("\nwriting output file: "
-             , "\nFile: ", outfile))
-  
-  write.csv(ors_df, outfile
-            , row.names = F)
-  
-  cat(paste0("\nFinished writing:"
-             , outfile
-             , "\nNo. of rows: ", nrow(ors_df)
-             , "\nNo. of cols: ", ncol(ors_df)))
-  cat("\n======sneak peek into a few muts with prominent or and p-vals=======\n")
-  
-  cat(paste0("======================================="
-             , "\nmutation with highest logistic OR:"
-             , "\n=======================================\n"))
-  print(ors_df[which(ors_df$or_logistic == max(ors_df$or_logistic)),])
-  
-  cat(paste0("======================================="
-             , "\nmutation with highest mychisq OR:"
-             , "\n=======================================\n"))
-  
-  print(ors_df[which(ors_df$or_mychisq == max(ors_df$or_mychisq)),])
-  
-  # gives too many due to Inf
-  #cat(paste0("======================================="
-  #, "\nmutation with highest fisher OR:"
-  #, "\n=======================================\n"))
-  #print(ors_df[which(ors_df$or_fisher == max(ors_df$or_fisher)),])
-  
-  cat(paste0("======================================="
-             , "\nmutation with lowest logistic pval:"
-             , "\n=======================================\n"))
-  print(ors_df[which(ors_df$pval_logistic == min(ors_df$pval_logistic)),])
-  
-  cat(paste0("======================================="
-             , "\nmutation with lowest fisher pval:"
-             , "\n=======================================\n"))
-  print(ors_df[which(ors_df$pval_fisher == min(ors_df$pval_fisher)),])
-  
-  ################################################################
-  cat("\nEnd of script: calculated AF, OR, pvalues and saved file\n")
-  #################################################################
-  
-  }