From e2cdee2d08269ec9bb2d7d3fc1cb29237a7740ac Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 18 Jan 2022 17:36:54 +0000
Subject: [PATCH] added additional check in combining_df_plotting.R to account
 for check when generating merged_df2 as muts NOT present in mcsm can create
 trouble, so fixed that and ran it successfully for alr and katg

---
 scripts/functions/combining_dfs_plotting.R    | 72 +++++++++++++------
 scripts/functions/plotting_data.R             |  4 +-
 .../tests/test_combining_dfs_plotting.R       | 14 ++--
 scripts/plotting/Header_TT.R                  |  2 -
 4 files changed, 62 insertions(+), 30 deletions(-)

diff --git a/scripts/functions/combining_dfs_plotting.R b/scripts/functions/combining_dfs_plotting.R
index 60731ad..e7faaa2 100644
--- a/scripts/functions/combining_dfs_plotting.R
+++ b/scripts/functions/combining_dfs_plotting.R
@@ -53,20 +53,20 @@ combining_dfs_plotting <- function(  my_df_u
         , "\nNA in pvalue: ", sum(is.na(my_df_u$pval_fisher))
         , "\nNA in AF:", sum(is.na(my_df_u$af)))
   }
-  
-  # or kin
-  if (identical(sum(is.na(my_df_u$or_kin))
-                , sum(is.na(my_df_u$pwald_kin))
-                , sum(is.na(my_df_u$af_kin)))){
-    cat("\nPASS: NA count match for OR, pvalue and AF\n from Kinship matrix calculations")
-    na_count = sum(is.na(my_df_u$af_kin))
-    cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_kin)))
-  } else{
-    cat("\nFAIL: NA count mismatch"
-        , "\nNA in OR: ", sum(is.na(my_df_u$or_kin))
-        , "\nNA in pvalue: ", sum(is.na(my_df_u$pwald_kin))
-        , "\nNA in AF:", sum(is.na(my_df_u$af_kin)))
-  }
+  # 
+  # # or kin
+  # if (identical(sum(is.na(my_df_u$or_kin))
+  #               , sum(is.na(my_df_u$pwald_kin))
+  #               , sum(is.na(my_df_u$af_kin)))){
+  #   cat("\nPASS: NA count match for OR, pvalue and AF\n from Kinship matrix calculations")
+  #   na_count = sum(is.na(my_df_u$af_kin))
+  #   cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_kin)))
+  # } else{
+  #   cat("\nFAIL: NA count mismatch"
+  #       , "\nNA in OR: ", sum(is.na(my_df_u$or_kin))
+  #       , "\nNA in pvalue: ", sum(is.na(my_df_u$pwald_kin))
+  #       , "\nNA in AF:", sum(is.na(my_df_u$af_kin)))
+  # }
   
   str(gene_metadata)
   
@@ -98,7 +98,7 @@ combining_dfs_plotting <- function(  my_df_u
   # merging_cols = merging_cols[[1]]
   merging_cols = 'mutationinformation'
   
-  cat("\nLinking column being used: mutationinformation")
+  cat("\nLinking column being used:", merging_cols)
   
   # important checks!
   table(nchar(my_df_u$mutationinformation))
@@ -111,6 +111,7 @@ combining_dfs_plotting <- function(  my_df_u
                      , y = my_df_u
                      , by = merging_cols
                      , all.y = T)
+                     #, all.x = T)
   
   cat("\nDim of merged_df2: ", dim(merged_df2))
   
@@ -138,6 +139,17 @@ combining_dfs_plotting <- function(  my_df_u
   
   head(merged_df2$position)
   
+  merged_muts_u = unique(merged_df2$mutationinformation)
+  meta_muts_u = unique(gene_metadata$mutationinformation)
+  # find the index where it differs
+  cat("\nLength of unique mcsm_muts:", length(merged_muts_u)
+      , "\nLength of unique meta muts:",length(meta_muts_u) )
+  
+  meta_muts_all    = gene_metadata$mutationinformation
+  merged_muts      = merged_df2$mutationinformation
+  discrepancy_uniq = unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
+  discrepancy      = meta_muts_all[! meta_muts_all %in% merged_muts]
+  
   # sanity check 
   cat("\nChecking nrows in merged_df2")
   if(nrow(gene_metadata) == nrow(merged_df2)){
@@ -145,16 +157,36 @@ combining_dfs_plotting <- function(  my_df_u
         ,"\nExpected no. of rows: ",nrow(gene_metadata) 
         ,"\nGot no. of rows: ", nrow(merged_df2))
   } else{
-    cat("\nFAIL: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
+    cat("\nWARNING: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
         , "\nExpected no. of rows after merge: ", nrow(gene_metadata)
         , "\nGot no. of rows: ", nrow(merged_df2)
         , "\nFinding discrepancy")
-    merged_muts_u = unique(merged_df2$mutationinformation)
-    meta_muts_u = unique(gene_metadata$mutationinformation)
     # find the index where it differs
-    unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
-    quit()
+    cat("\nLength of unique mcsm_muts:", length(merged_muts_u)
+        , "\nLength of unique meta muts:",length(meta_muts_u)
+        , "\nLength of unique muts in meta muts NOT in mcsm muts:", length(discrepancy_uniq)
+        , "These correspond to:", discrepancy, "entries"
+        , "\nThese problematic muts are:\n"
+        , discrepancy_uniq)
+    #quit()
+    cat("\nChecking again...")
+    expected_nrows_df2 = nrow(gene_metadata) - length(discrepancy)
+    if (nrow(merged_df2) == expected_nrows_df2){
+      cat("\nPASS: nrow(merged_df2) is as expected after accounting for discrepancy"
+          ,"\nExpected no. of rows: ", expected_nrows_df2
+          ,"\nGot no. of rows: ", nrow(merged_df2))
+    }else{ cat("\nFAIL: nrow(merged_df2) is NOT as expected even after accounting for discrepancy"
+            , "\nExpected no. of rows after merge: ", expected_nrows_df2
+            , "\nGot no. of rows: ", nrow(merged_df2)
+            , "\nQuitting!")
+      quit()
+      
+      }
+  
   }
+  
+  
+  
 
   # Quick formatting: ordering df and pretty labels
   
diff --git a/scripts/functions/plotting_data.R b/scripts/functions/plotting_data.R
index e32813b..67a3f2c 100755
--- a/scripts/functions/plotting_data.R
+++ b/scripts/functions/plotting_data.R
@@ -20,8 +20,8 @@ library(dplyr)
 #lig_dist_cutoff  =  10 or global var LigDist_cutoff
 
 plotting_data <- function(df
-                          , lig_dist_colname = '' 
-                          , lig_dist_cutoff = '') {
+                          , lig_dist_colname 
+                          , lig_dist_cutoff) {
 my_df       = data.frame()
 my_df_u     = data.frame()
 my_df_u_lig = data.frame()
diff --git a/scripts/functions/tests/test_combining_dfs_plotting.R b/scripts/functions/tests/test_combining_dfs_plotting.R
index 87a1929..7b591e2 100644
--- a/scripts/functions/tests/test_combining_dfs_plotting.R
+++ b/scripts/functions/tests/test_combining_dfs_plotting.R
@@ -36,8 +36,9 @@ source("combining_dfs_plotting.R")
 #---------------------
 # call: import_dirs()
 #---------------------
-gene = 'gid'
-drug = 'streptomycin'
+#gene = 'gid'
+#drug = 'streptomycin'
+source("/home/tanu/git/LSHTM_analysis/config/alr.R")
 
 import_dirs(drug_name = drug, gene_name = gene)
 
@@ -59,8 +60,9 @@ mcsm_comb_data = read.csv(infile_params, header = T)
 # call function: plotting_data()
 #-------------------------------
 pd_df = plotting_data(df = mcsm_comb_data
-                      , ligand_dist_colname = 'ligand_distance'
-                      , lig_dist_cutoff = 10
+                      , lig_dist_colname = LigDist_colname
+                      , lig_dist_cutoff = LigDist_cutoff)
+
 my_df_u = pd_df[[2]] 
 
 #======================================
@@ -84,8 +86,8 @@ gene_metadata <- read.csv(infile_metadata
 #-----------------------------------------
 all_plot_dfs = combining_dfs_plotting(my_df_u
                        , gene_metadata
-                       , lig_dist_colname = 'ligand_distance'
-                       , lig_dist_cutoff = 10)
+                       , lig_dist_colname = LigDist_colname
+                       , lig_dist_cutoff = LigDist_cutoff)
 
 merged_df2          = all_plot_dfs[[1]]
 merged_df3          = all_plot_dfs[[2]]
diff --git a/scripts/plotting/Header_TT.R b/scripts/plotting/Header_TT.R
index 0b78307..b574c44 100755
--- a/scripts/plotting/Header_TT.R
+++ b/scripts/plotting/Header_TT.R
@@ -188,5 +188,3 @@ map(paste0(func_path, source_files), source)  # source all your R scripts!
 
 # set plot script dir
 plot_script_path = "~/git/LSHTM_analysis/scripts/plotting/"
-
-