Added an additional check in combining_dfs_plotting.R for when merged_df2 is generated: muts NOT present in mcsm are missing from merged_df2 and cause a row-count mismatch against the metadata, so the check now accounts for that discrepancy. Ran it successfully for alr and katg.

2022-01-18 17:36:54 +00:00 · 2022-01-18 17:36:54 +00:00 · e2cdee2d08
commit e2cdee2d08
parent 8f8a9db92c
4 changed files with 62 additions and 30 deletions
--- a/scripts/functions/combining_dfs_plotting.R
+++ b/scripts/functions/combining_dfs_plotting.R
@ -53,20 +53,20 @@ combining_dfs_plotting <- function(  my_df_u
        , "\nNA in pvalue: ", sum(is.na(my_df_u$pval_fisher))
        , "\nNA in AF:", sum(is.na(my_df_u$af)))
  }
-  
-  # or kin
-  if (identical(sum(is.na(my_df_u$or_kin))
-                , sum(is.na(my_df_u$pwald_kin))
-                , sum(is.na(my_df_u$af_kin)))){
-    cat("\nPASS: NA count match for OR, pvalue and AF\n from Kinship matrix calculations")
-    na_count = sum(is.na(my_df_u$af_kin))
-    cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_kin)))
-  } else{
-    cat("\nFAIL: NA count mismatch"
-        , "\nNA in OR: ", sum(is.na(my_df_u$or_kin))
-        , "\nNA in pvalue: ", sum(is.na(my_df_u$pwald_kin))
-        , "\nNA in AF:", sum(is.na(my_df_u$af_kin)))
-  }
+  # 
+  # # or kin
+  # if (identical(sum(is.na(my_df_u$or_kin))
+  #               , sum(is.na(my_df_u$pwald_kin))
+  #               , sum(is.na(my_df_u$af_kin)))){
+  #   cat("\nPASS: NA count match for OR, pvalue and AF\n from Kinship matrix calculations")
+  #   na_count = sum(is.na(my_df_u$af_kin))
+  #   cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_kin)))
+  # } else{
+  #   cat("\nFAIL: NA count mismatch"
+  #       , "\nNA in OR: ", sum(is.na(my_df_u$or_kin))
+  #       , "\nNA in pvalue: ", sum(is.na(my_df_u$pwald_kin))
+  #       , "\nNA in AF:", sum(is.na(my_df_u$af_kin)))
+  # }
  
  str(gene_metadata)
  
@ -98,7 +98,7 @@ combining_dfs_plotting <- function(  my_df_u
  # merging_cols = merging_cols[[1]]
  merging_cols = 'mutationinformation'
  
-  cat("\nLinking column being used: mutationinformation")
+  cat("\nLinking column being used:", merging_cols)
  
  # important checks!
  table(nchar(my_df_u$mutationinformation))
@ -111,6 +111,7 @@ combining_dfs_plotting <- function(  my_df_u
                     , y = my_df_u
                     , by = merging_cols
                     , all.y = T)
+                     #, all.x = T)
  
  cat("\nDim of merged_df2: ", dim(merged_df2))
  
@ -138,6 +139,17 @@ combining_dfs_plotting <- function(  my_df_u
  
  head(merged_df2$position)
  
+  merged_muts_u = unique(merged_df2$mutationinformation)
+  meta_muts_u = unique(gene_metadata$mutationinformation)
+  # find the index where it differs
+  cat("\nLength of unique mcsm_muts:", length(merged_muts_u)
+      , "\nLength of unique meta muts:",length(meta_muts_u) )
+  
+  meta_muts_all    = gene_metadata$mutationinformation
+  merged_muts      = merged_df2$mutationinformation
+  discrepancy_uniq = unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
+  discrepancy      = meta_muts_all[! meta_muts_all %in% merged_muts]
+  
  # sanity check 
  cat("\nChecking nrows in merged_df2")
  if(nrow(gene_metadata) == nrow(merged_df2)){
@ -145,16 +157,36 @@ combining_dfs_plotting <- function(  my_df_u
        ,"\nExpected no. of rows: ",nrow(gene_metadata) 
        ,"\nGot no. of rows: ", nrow(merged_df2))
  } else{
-    cat("\nFAIL: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
+    cat("\nWARNING: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
        , "\nExpected no. of rows after merge: ", nrow(gene_metadata)
        , "\nGot no. of rows: ", nrow(merged_df2)
        , "\nFinding discrepancy")
-    merged_muts_u = unique(merged_df2$mutationinformation)
-    meta_muts_u = unique(gene_metadata$mutationinformation)
    # find the index where it differs
-    unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
-    quit()
+    cat("\nLength of unique mcsm_muts:", length(merged_muts_u)
+        , "\nLength of unique meta muts:",length(meta_muts_u)
+        , "\nLength of unique muts in meta muts NOT in mcsm muts:", length(discrepancy_uniq)
+        , "These correspond to:", discrepancy, "entries"
+        , "\nThese problematic muts are:\n"
+        , discrepancy_uniq)
+    #quit()
+    cat("\nChecking again...")
+    expected_nrows_df2 = nrow(gene_metadata) - length(discrepancy)
+    if (nrow(merged_df2) == expected_nrows_df2){
+      cat("\nPASS: nrow(merged_df2) is as expected after accounting for discrepancy"
+          ,"\nExpected no. of rows: ", expected_nrows_df2
+          ,"\nGot no. of rows: ", nrow(merged_df2))
+    }else{ cat("\nFAIL: nrow(merged_df2) is NOT as expected even after accounting for discrepancy"
+            , "\nExpected no. of rows after merge: ", expected_nrows_df2
+            , "\nGot no. of rows: ", nrow(merged_df2)
+            , "\nQuitting!")
+      quit()
+      
+      }
+  
  }
+  
+  
+  

  # Quick formatting: ordering df and pretty labels