added additional check in combining_df_plotting.R to account for check when generating merged_df2 as muts NOT present in mcsm can create trouble, so fixed that and ran it successfully for alr and katg

2022-01-18 17:36:54 +00:00 · 2022-01-18 17:36:54 +00:00 · e2cdee2d08
commit e2cdee2d08
parent 8f8a9db92c
4 changed files with 62 additions and 30 deletions
--- a/scripts/functions/combining_dfs_plotting.R
+++ b/scripts/functions/combining_dfs_plotting.R
@ -53,20 +53,20 @@ combining_dfs_plotting <- function(  my_df_u
        , "\nNA in pvalue: ", sum(is.na(my_df_u$pval_fisher))
        , "\nNA in AF:", sum(is.na(my_df_u$af)))
  }
-  
+  # 
-  # or kin
+  # # or kin
-  if (identical(sum(is.na(my_df_u$or_kin))
+  # if (identical(sum(is.na(my_df_u$or_kin))
-                , sum(is.na(my_df_u$pwald_kin))
+  #               , sum(is.na(my_df_u$pwald_kin))
-                , sum(is.na(my_df_u$af_kin)))){
+  #               , sum(is.na(my_df_u$af_kin)))){
-    cat("\nPASS: NA count match for OR, pvalue and AF\n from Kinship matrix calculations")
+  #   cat("\nPASS: NA count match for OR, pvalue and AF\n from Kinship matrix calculations")
-    na_count = sum(is.na(my_df_u$af_kin))
+  #   na_count = sum(is.na(my_df_u$af_kin))
-    cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_kin)))
+  #   cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_kin)))
-  } else{
+  # } else{
-    cat("\nFAIL: NA count mismatch"
+  #   cat("\nFAIL: NA count mismatch"
-        , "\nNA in OR: ", sum(is.na(my_df_u$or_kin))
+  #       , "\nNA in OR: ", sum(is.na(my_df_u$or_kin))
-        , "\nNA in pvalue: ", sum(is.na(my_df_u$pwald_kin))
+  #       , "\nNA in pvalue: ", sum(is.na(my_df_u$pwald_kin))
-        , "\nNA in AF:", sum(is.na(my_df_u$af_kin)))
+  #       , "\nNA in AF:", sum(is.na(my_df_u$af_kin)))
-  }
+  # }
  str(gene_metadata)
@ -98,7 +98,7 @@ combining_dfs_plotting <- function(  my_df_u
  # merging_cols = merging_cols[[1]]
  merging_cols = 'mutationinformation'
-  cat("\nLinking column being used: mutationinformation")
+  cat("\nLinking column being used:", merging_cols)
  # important checks!
  table(nchar(my_df_u$mutationinformation))
@ -111,6 +111,7 @@ combining_dfs_plotting <- function(  my_df_u
                     , y = my_df_u
                     , by = merging_cols
                     , all.y = T)
                     #, all.x = T)
  cat("\nDim of merged_df2: ", dim(merged_df2))
@ -138,6 +139,17 @@ combining_dfs_plotting <- function(  my_df_u
  head(merged_df2$position)
  merged_muts_u = unique(merged_df2$mutationinformation)
  meta_muts_u = unique(gene_metadata$mutationinformation)
  # find the index where it differs
  cat("\nLength of unique mcsm_muts:", length(merged_muts_u)
      , "\nLength of unique meta muts:",length(meta_muts_u) )
  meta_muts_all    = gene_metadata$mutationinformation
  merged_muts      = merged_df2$mutationinformation
  discrepancy_uniq = unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
  discrepancy      = meta_muts_all[! meta_muts_all %in% merged_muts]
  # sanity check 
  cat("\nChecking nrows in merged_df2")
  if(nrow(gene_metadata) == nrow(merged_df2)){
@ -145,17 +157,37 @@ combining_dfs_plotting <- function(  my_df_u
        ,"\nExpected no. of rows: ",nrow(gene_metadata) 
        ,"\nGot no. of rows: ", nrow(merged_df2))
  } else{
-    cat("\nFAIL: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
+    cat("\nWARNING: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
        , "\nExpected no. of rows after merge: ", nrow(gene_metadata)
        , "\nGot no. of rows: ", nrow(merged_df2)
        , "\nFinding discrepancy")
    merged_muts_u = unique(merged_df2$mutationinformation)
    meta_muts_u = unique(gene_metadata$mutationinformation)
    # find the index where it differs
-    unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
+    cat("\nLength of unique mcsm_muts:", length(merged_muts_u)
        , "\nLength of unique meta muts:",length(meta_muts_u)
        , "\nLength of unique muts in meta muts NOT in mcsm muts:", length(discrepancy_uniq)
        , "These correspond to:", discrepancy, "entries"
        , "\nThese problematic muts are:\n"
        , discrepancy_uniq)
    #quit()
    cat("\nChecking again...")
    expected_nrows_df2 = nrow(gene_metadata) - length(discrepancy)
    if (nrow(merged_df2) == expected_nrows_df2){
      cat("\nPASS: nrow(merged_df2) is as expected after accounting for discrepancy"
          ,"\nExpected no. of rows: ", expected_nrows_df2
          ,"\nGot no. of rows: ", nrow(merged_df2))
    }else{ cat("\nFAIL: nrow(merged_df2) is NOT as expected even after accounting for discrepancy"
            , "\nExpected no. of rows after merge: ", expected_nrows_df2
            , "\nGot no. of rows: ", nrow(merged_df2)
            , "\nQuitting!")
      quit()
      }
  }
  # Quick formatting: ordering df and pretty labels
  #------------------------------
--- a/scripts/functions/plotting_data.R
+++ b/scripts/functions/plotting_data.R
@ -20,8 +20,8 @@ library(dplyr)
 #lig_dist_cutoff  =  10 or global var LigDist_cutoff
 plotting_data <- function(df
-                          , lig_dist_colname = '' 
+                          , lig_dist_colname 
-                          , lig_dist_cutoff = '') {
+                          , lig_dist_cutoff) {
 my_df       = data.frame()
 my_df_u     = data.frame()
 my_df_u_lig = data.frame()
--- a/scripts/functions/tests/test_combining_dfs_plotting.R
+++ b/scripts/functions/tests/test_combining_dfs_plotting.R
@ -36,8 +36,9 @@ source("combining_dfs_plotting.R")
 #---------------------
 # call: import_dirs()
 #---------------------
-gene = 'gid'
+#gene = 'gid'
-drug = 'streptomycin'
+#drug = 'streptomycin'
 source("/home/tanu/git/LSHTM_analysis/config/alr.R")
 import_dirs(drug_name = drug, gene_name = gene)
@ -59,8 +60,9 @@ mcsm_comb_data = read.csv(infile_params, header = T)
 # call function: plotting_data()
 #-------------------------------
 pd_df = plotting_data(df = mcsm_comb_data
-                      , ligand_dist_colname = 'ligand_distance'
+                      , lig_dist_colname = LigDist_colname
-                      , lig_dist_cutoff = 10
+                      , lig_dist_cutoff = LigDist_cutoff)
 my_df_u = pd_df[[2]] 
 #======================================
@ -84,8 +86,8 @@ gene_metadata <- read.csv(infile_metadata
 #-----------------------------------------
 all_plot_dfs = combining_dfs_plotting(my_df_u
                       , gene_metadata
-                       , lig_dist_colname = 'ligand_distance'
+                       , lig_dist_colname = LigDist_colname
-                       , lig_dist_cutoff = 10)
+                       , lig_dist_cutoff = LigDist_cutoff)
 merged_df2          = all_plot_dfs[[1]]
 merged_df3          = all_plot_dfs[[2]]
--- a/scripts/plotting/Header_TT.R
+++ b/scripts/plotting/Header_TT.R
@ -188,5 +188,3 @@ map(paste0(func_path, source_files), source)  # source all your R scripts!
 # set plot script dir
 plot_script_path = "~/git/LSHTM_analysis/scripts/plotting/"
`@ -188,5 +188,3 @@ map(paste0(func_path, source_files), source) # source all your R scripts!`

	`# set plot script dir`	`# set plot script dir`
	`plot_script_path = "~/git/LSHTM_analysis/scripts/plotting/"`	`plot_script_path = "~/git/LSHTM_analysis/scripts/plotting/"`