added additional check in combining_df_plotting.R to account for check when generating merged_df2 as muts NOT present in mcsm can create trouble, so fixed that and ran it successfully for alr and katg

2022-01-18 17:36:54 +00:00 · 2022-01-18 17:36:54 +00:00 · e2cdee2d08
commit e2cdee2d08
parent 8f8a9db92c
4 changed files with 62 additions and 30 deletions
--- a/scripts/functions/combining_dfs_plotting.R
+++ b/scripts/functions/combining_dfs_plotting.R
@ -53,20 +53,20 @@ combining_dfs_plotting <- function(  my_df_u
        , "\nNA in pvalue: ", sum(is.na(my_df_u$pval_fisher))
        , "\nNA in AF:", sum(is.na(my_df_u$af)))
  }
-  
-  # or kin
-  if (identical(sum(is.na(my_df_u$or_kin))
-                , sum(is.na(my_df_u$pwald_kin))
-                , sum(is.na(my_df_u$af_kin)))){
-    cat("\nPASS: NA count match for OR, pvalue and AF\n from Kinship matrix calculations")
-    na_count = sum(is.na(my_df_u$af_kin))
-    cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_kin)))
-  } else{
-    cat("\nFAIL: NA count mismatch"
-        , "\nNA in OR: ", sum(is.na(my_df_u$or_kin))
-        , "\nNA in pvalue: ", sum(is.na(my_df_u$pwald_kin))
-        , "\nNA in AF:", sum(is.na(my_df_u$af_kin)))
-  }
+  # 
+  # # or kin
+  # if (identical(sum(is.na(my_df_u$or_kin))
+  #               , sum(is.na(my_df_u$pwald_kin))
+  #               , sum(is.na(my_df_u$af_kin)))){
+  #   cat("\nPASS: NA count match for OR, pvalue and AF\n from Kinship matrix calculations")
+  #   na_count = sum(is.na(my_df_u$af_kin))
+  #   cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_kin)))
+  # } else{
+  #   cat("\nFAIL: NA count mismatch"
+  #       , "\nNA in OR: ", sum(is.na(my_df_u$or_kin))
+  #       , "\nNA in pvalue: ", sum(is.na(my_df_u$pwald_kin))
+  #       , "\nNA in AF:", sum(is.na(my_df_u$af_kin)))
+  # }
  
  str(gene_metadata)
  
@ -98,7 +98,7 @@ combining_dfs_plotting <- function(  my_df_u
  # merging_cols = merging_cols[[1]]
  merging_cols = 'mutationinformation'
  
-  cat("\nLinking column being used: mutationinformation")
+  cat("\nLinking column being used:", merging_cols)
  
  # important checks!
  table(nchar(my_df_u$mutationinformation))
@ -111,6 +111,7 @@ combining_dfs_plotting <- function(  my_df_u
                     , y = my_df_u
                     , by = merging_cols
                     , all.y = T)
+                     #, all.x = T)
  
  cat("\nDim of merged_df2: ", dim(merged_df2))
  
@ -138,6 +139,17 @@ combining_dfs_plotting <- function(  my_df_u
  
  head(merged_df2$position)
  
+  merged_muts_u = unique(merged_df2$mutationinformation)
+  meta_muts_u = unique(gene_metadata$mutationinformation)
+  # find the index where it differs
+  cat("\nLength of unique mcsm_muts:", length(merged_muts_u)
+      , "\nLength of unique meta muts:",length(meta_muts_u) )
+  
+  meta_muts_all    = gene_metadata$mutationinformation
+  merged_muts      = merged_df2$mutationinformation
+  discrepancy_uniq = unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
+  discrepancy      = meta_muts_all[! meta_muts_all %in% merged_muts]
+  
  # sanity check 
  cat("\nChecking nrows in merged_df2")
  if(nrow(gene_metadata) == nrow(merged_df2)){
@ -145,16 +157,36 @@ combining_dfs_plotting <- function(  my_df_u
        ,"\nExpected no. of rows: ",nrow(gene_metadata) 
        ,"\nGot no. of rows: ", nrow(merged_df2))
  } else{
-    cat("\nFAIL: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
+    cat("\nWARNING: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
        , "\nExpected no. of rows after merge: ", nrow(gene_metadata)
        , "\nGot no. of rows: ", nrow(merged_df2)
        , "\nFinding discrepancy")
-    merged_muts_u = unique(merged_df2$mutationinformation)
-    meta_muts_u = unique(gene_metadata$mutationinformation)
    # find the index where it differs
-    unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
-    quit()
+    cat("\nLength of unique mcsm_muts:", length(merged_muts_u)
+        , "\nLength of unique meta muts:",length(meta_muts_u)
+        , "\nLength of unique muts in meta muts NOT in mcsm muts:", length(discrepancy_uniq)
+        , "These correspond to:", discrepancy, "entries"
+        , "\nThese problematic muts are:\n"
+        , discrepancy_uniq)
+    #quit()
+    cat("\nChecking again...")
+    expected_nrows_df2 = nrow(gene_metadata) - length(discrepancy)
+    if (nrow(merged_df2) == expected_nrows_df2){
+      cat("\nPASS: nrow(merged_df2) is as expected after accounting for discrepancy"
+          ,"\nExpected no. of rows: ", expected_nrows_df2
+          ,"\nGot no. of rows: ", nrow(merged_df2))
+    }else{ cat("\nFAIL: nrow(merged_df2) is NOT as expected even after accounting for discrepancy"
+            , "\nExpected no. of rows after merge: ", expected_nrows_df2
+            , "\nGot no. of rows: ", nrow(merged_df2)
+            , "\nQuitting!")
+      quit()
+      
+      }
+  
  }
+  
+  
+  

  # Quick formatting: ordering df and pretty labels
  
--- a/scripts/functions/plotting_data.R
+++ b/scripts/functions/plotting_data.R
@ -20,8 +20,8 @@ library(dplyr)
 #lig_dist_cutoff  =  10 or global var LigDist_cutoff

 plotting_data <- function(df
-                          , lig_dist_colname = '' 
-                          , lig_dist_cutoff = '') {
+                          , lig_dist_colname 
+                          , lig_dist_cutoff) {
 my_df       = data.frame()
 my_df_u     = data.frame()
 my_df_u_lig = data.frame()
--- a/scripts/functions/tests/test_combining_dfs_plotting.R
+++ b/scripts/functions/tests/test_combining_dfs_plotting.R
@ -36,8 +36,9 @@ source("combining_dfs_plotting.R")
 #---------------------
 # call: import_dirs()
 #---------------------
-gene = 'gid'
-drug = 'streptomycin'
+#gene = 'gid'
+#drug = 'streptomycin'
+source("/home/tanu/git/LSHTM_analysis/config/alr.R")

 import_dirs(drug_name = drug, gene_name = gene)

@ -59,8 +60,9 @@ mcsm_comb_data = read.csv(infile_params, header = T)
 # call function: plotting_data()
 #-------------------------------
 pd_df = plotting_data(df = mcsm_comb_data
-                      , ligand_dist_colname = 'ligand_distance'
-                      , lig_dist_cutoff = 10
+                      , lig_dist_colname = LigDist_colname
+                      , lig_dist_cutoff = LigDist_cutoff)
+
 my_df_u = pd_df[[2]] 

 #======================================
@ -84,8 +86,8 @@ gene_metadata <- read.csv(infile_metadata
 #-----------------------------------------
 all_plot_dfs = combining_dfs_plotting(my_df_u
                       , gene_metadata
-                       , lig_dist_colname = 'ligand_distance'
-                       , lig_dist_cutoff = 10)
+                       , lig_dist_colname = LigDist_colname
+                       , lig_dist_cutoff = LigDist_cutoff)

 merged_df2          = all_plot_dfs[[1]]
 merged_df3          = all_plot_dfs[[2]]
--- a/scripts/plotting/Header_TT.R
+++ b/scripts/plotting/Header_TT.R
@ -188,5 +188,3 @@ map(paste0(func_path, source_files), source)  # source all your R scripts!

 # set plot script dir
 plot_script_path = "~/git/LSHTM_analysis/scripts/plotting/"
-
-