From 3d45780c1afff3f98a1988fdb53d9cf06b25ea56 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 1 Feb 2022 16:23:03 +0000
Subject: [PATCH] updated docs for dm_om_data.R

---
 scripts/functions/corr_plot_df.R            | 128 --------------------
 scripts/functions/dm_om_data.R              |  48 +++++---
 scripts/functions/tests/test_corr_plot_df.R |   7 --
 scripts/plotting/get_plotting_dfs.R         |  50 ++++----
 4 files changed, 54 insertions(+), 179 deletions(-)
 delete mode 100644 scripts/functions/corr_plot_df.R
 delete mode 100644 scripts/functions/tests/test_corr_plot_df.R

diff --git a/scripts/functions/corr_plot_df.R b/scripts/functions/corr_plot_df.R
deleted file mode 100644
index 0493156..0000000
--- a/scripts/functions/corr_plot_df.R
+++ /dev/null
@@ -1,128 +0,0 @@
-#!/usr/bin/env Rscript  
-#########################################################
-# TASK: Script to format data for Correlation plots: 
-# corr_data_extract()
-# Input:
-    # corr_plot_df: data with all parameters (my_use case)
-        # merged_df3 or merged_df2!?
-    # gene: [sanity check]
-    # drug: relates to a column name that will need to extracted
-    # ligand_dist_colname = LigDist_colname (variable from plotting_globals()
-   
-#colnames_to_extract = c("mutationinformation"
- #                         , "duet_affinity_change")
-#display_colnames_key = c(mutationinformation = "MUT"
-#                           , duet_affinity_change = "DUET")
-# extract_scaled_cols = T or F, so that parameters with the _scaled suffix can be extracted.
-# No formatting applied to these cols i.e display name
-
-# TO DO: SHINY
-    #1) Corr type?
-    #2)
-##################################################################
-corr_data_extract <- function(corr_plot_df
-                              #, gene_name = gene
-                              , drug_name = drug
-                              , ligand_dist_colname = LigDist_colname
-                              , colnames_to_extract
-                              , colnames_display_key
-                              , extract_scaled_cols = F){
-  
-  if ( missing(colnames_to_extract) || missing(colnames_display_key) ){
-  #if ( missing(colnames_to_extract) ){
-      
-    cat("\n=========================================="
-        , "\nCORR PLOTS data: ALL params"
-        , "\n=========================================")
-    
-    cat("\nExtracting default columns for"
-        #, "\nGene name:", gene
-        , "\nDrug name:", drug)
-    
-    colnames_to_extract =  c(drug
-                             #, "mutationinformation"
-                             , "mutation_info_labels"  
-                             , "duet_stability_change" 
-                             , "ligand_affinity_change"
-                             #, "ligand_distance"
-                             , ligand_dist_colname
-                             , "ddg_foldx"
-                             , "deepddg"
-                             , "asa"
-                             , "rsa"
-                             , "kd_values"
-                             , "rd_values"
-                             , "af"
-                             , "log10_or_mychisq"
-                             , "neglog_pval_fisher"
-                             , "ddg_dynamut2"
-                             , "consurf_score"
-                             , "snap2_score"
-                             , "ddg_dynamut", "ddg_encom", "dds_encom", "ddg_mcsm", "ddg_sdm", "ddg_duet"
-                             , "mcsm_na_affinity"
-                             , "mcsm_ppi2_affinity"
-    )
-    
-    # [optional] arg: extract_scaled_cols
-    if (extract_scaled_cols){
-      cat("\nExtracting scaled columns as well...\n")
-      all_scaled_cols = colnames(merged_df3)[grep(".*scaled", colnames(merged_df3))]
-      colnames_to_extract = c(colnames_to_extract, all_scaled_cols)
-      
-    }else{
-      colnames_to_extract = colnames_to_extract
-    }
-    
-    corr_df = corr_plot_df[, colnames(corr_plot_df)%in%colnames_to_extract]
-
-    # arg: colnames_display_key
-    colnames_display_key =  c(duet_stability_change  = "DUET"
-                            , ligand_affinity_change = "mCSM-lig"
-                            #, ligand_distance        = "ligand_distance"
-                            #, ligand_dist_colname        = "ligand_distance"
-                            , ddg_foldx              = "FoldX"
-                            , deepddg                = "DeepDDG"
-                            , asa                    = "ASA"
-                            , rsa                    = "RSA"
-                            , kd_values              = "KD"
-                            , rd_values              = "RD"
-                            , af                     = "MAF"
-                            , log10_or_mychisq       = "Log (OR)"
-                            , neglog_pval_fisher     = "-Log (P)"
-                            , ddg_dynamut2           = "Dynamut2"
-                            , consurf_score          = "Consurf"
-                            , snap2_score            = "SNAP2"
-                            , ddg_dynamut            = "Dynamut"
-                            , ddg_encom              = "ENCoM-DDG"
-                            , ddg_mcsm               = "mCSM"
-                            , ddg_sdm                = "SDM"
-                            , ddg_duet               = "DUET-d"
-                            , dds_encom              = "ENCoM-DDS"
-                            , mcsm_na_affinity       = "mCSM-NA"
-                            , mcsm_ppi2_affinity     = "mCSM-PPI2")
-
-  # COMMENT: This only works when all the columns are in the namekey vector.
-  # If one is missing, there is no error, but it also renamed as "NA.        
-  #names(corr_df) <- colnames_display_key[names(corr_df)]
-  
-  # Solution: to use plyr::rename() 
-  # Consider using requireNamespace() instead of library() so its function names doesn't collide with dplyr's.
-  corr_df = plyr::rename(corr_df
-                         , replace = colnames_display_key
-                         , warn_missing = T
-                         , warn_duplicated = T)
-  
-  cat("\nExtracted ncols:", ncol(corr_df)
-      ,"\nRenaming successful")
-  
-  cat("\nSneak peak...")
-  print(head(corr_df))
-  
-  # Move drug column to the end
-  last_col = colnames(corr_df[ncol(corr_df)])
-  corr_df_f = corr_df %>% dplyr::relocate(all_of(drug), .after = last_col)
-  
-  return(corr_df_f)
-  }
-  
-}
diff --git a/scripts/functions/dm_om_data.R b/scripts/functions/dm_om_data.R
index 221e0fc..7f4a119 100644
--- a/scripts/functions/dm_om_data.R
+++ b/scripts/functions/dm_om_data.R
@@ -1,28 +1,40 @@
 #!/usr/bin/env Rscript  
 #########################################################
 # TASK: Script to format data for dm om plots: 
-  # generating WF and LF data for each of the parameters
-   # duet, mcsm-lig, foldx, deepddg, dynamut2, mcsm-na, mcsm-ppi2, encom, dynamut..etc
+  # generating WF and LF data for each of the parameters:
+    # duet, mcsm-lig, foldx, deepddg, dynamut2, mcsm-na, mcsm-ppi2, encom, dynamut..etc
   # Called by get_plotting_dfs.R
 
 # dm_om_wf_lf_data()
-# Input: data with all parameters (merged_df3, my_use case)
-# gene: [conditional generation of dfs like mcsm-NA, mcsm-ppi2 as not all genes have all these values]
-# colnames_to_extract = c("mutationinformation"
-#                         , "duet_affinity_change...")
-# ligand_dist_colname     = LigDist_colname # from globals
-# dr_muts                 = dr_muts_col # from globals ...dr_mutations_<drug>
-# other_muts              = other_muts_col # from globals ...other_mutations_<drug>
-# snp_colname             = "mutationinformation"
-# aa_pos_colname          = "position" # to sort df by
-# mut_colname             = "mutation"
-# mut_info_colname        = "mutation_info"
-# mut_info_label_colname  = "mutation_info_labels" # if empty, below used
-# dr_other_muts_labels    = c("DM", "OM") # only used if ^^ = ""
-# categ_cols_to_factor: converts the cols with '_outcome'and 'info' to factor
+# INPUT: 
+    # df: merged_df3 (data with all parameters)
+      # NOTE*: merged_df2 will not be appropriate as it brings up most params as significant!?, at least for gid
+    # gene: [conditional generation of dfs like mcsm-NA, mcsm-ppi2 as not all genes have all these values]
+    # colnames_to_extract     : columns to extract (may be user-specified).
+      # By default it is c("mutationinformation" , "duet_affinity_change...")
+    # ligand_dist_colname     : column name containing ligand distance. By default, it is LigDist_colname (imported from globals)
+    # dr_muts                 : dr_muts_col (imported from globals; dr_mutations_<drug>)
+    # other_muts              : other_muts_col (imported from globals ...other_mutations_<drug>)
+    # snp_colname             : SNP column name. By default it is "mutationinformation"
+    # aa_pos_colname          : Column name containing the aa position. This is used to sort the df by.
+    # mut_colname             : Column name containing snp info in format "<abc_pXXdef>". By default, it is "mutation"
+    # mut_info_colname        : Column name containing mutation info whether it is DM or OM. By default, it is "mutation_info"
+    # mut_info_label_colname  : Column containing pre-formatted labels for mutation info. 
+      # For my use case, this is called "mutation_info_labels"
+      # This column has short labels like DM and OM corresponding to dr_muts and other_muts.
+      # NOTE*: if this is left empty, then the arg ('dr_other_muts_labels') will be used
+    # dr_other_muts_labels    : User specified labels, must correspond to dr_muts and other_muts. 
+      # NOTE*: Only used if the arg (mut_info_label_colname) is empty!
+    # categ_cols_to_factor    : Column names to convert to factors. These mainly correspond to the outcome columns associated with the
+      # arg ('colnames_to_extract'). These have the suffix "_outcome" in their colnames. Additionally column 'mutation_info' is also 
+      # converted to factor. By default, it converts the cols with '_outcome' and 'info' to factor.
+      # Users are able to provide a vector of their corresponding column names
 
+# RETURNS: List
+    # WF and LF data grouped by mutation_info i.e DM (drug mutations) and OM (other mutations)
+    
 # TO DO: SHINY
-#1) 
+#1) df to choose (merged_df3 or merged_df2)
 #2)
 ##################################################################
 dm_om_wf_lf_data <- function(df
@@ -48,7 +60,7 @@ dm_om_wf_lf_data <- function(df
   
   # common_dfs
   common_dfsL     = list(
-    wf_duet       = data.frame()
+      wf_duet       = data.frame()
     , lf_duet     = data.frame()
     , wf_mcsm_lig = data.frame()
     , lf_mcsm_lig = data.frame()
diff --git a/scripts/functions/tests/test_corr_plot_df.R b/scripts/functions/tests/test_corr_plot_df.R
deleted file mode 100644
index 4376a58..0000000
--- a/scripts/functions/tests/test_corr_plot_df.R
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/usr/bin/env Rscript       
-source("~/git/LSHTM_analysis/config/gid.R")
-source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
-
-m3 = corr_data_extract(merged_df3); head(m3)
-m2 = corr_data_extract(meregd_df2); head(m2)
-m3S = corr_data_extract(merged_df3, extract_scaled_cols = T); head(m3S)
\ No newline at end of file
diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R
index 5864ebe..18b7a2f 100644
--- a/scripts/plotting/get_plotting_dfs.R
+++ b/scripts/plotting/get_plotting_dfs.R
@@ -88,13 +88,6 @@ merged_df3      = all_plot_dfs[[2]]
 merged_df2_comp = all_plot_dfs[[3]]
 merged_df3_comp = all_plot_dfs[[4]]
 #======================================================================
-####################################################################
-#                        Data for combining other dfs
-####################################################################
-
-#source("other_dfs_data.R")
-# Fixed this at source i.e python script
-# Moved: "other_dfs_data.R" to redundant/
 
 ####################################################################
 #                        Data for subcols barplot (~heatmap)
@@ -109,35 +102,39 @@ merged_df3_comp = all_plot_dfs[[4]]
 #                        Data for logoplots
 ####################################################################
 
-#source(paste0(plot_script_path, "logo_data.R"))
-#s1 = c("\nSuccessfully sourced logo_data.R")
+#source(paste0(plot_script_path, "logo_data_msa.R"))
+#s1 = c("\nSuccessfully sourced logo_data_msa.R")
 #cat(s1)
-# input data is merged_df3
-# so repurposed it into a function so params can be passed instead to generate
-# data required for plotting.
-# Moved "logo_data.R" to redundant/
-
-source(paste0(plot_script_path, "logo_data_msa.R"))
-s1 = c("\nSuccessfully sourced logo_data_msa.R")
-cat(s1)
 
 ####################################################################
 #                     Data for DM OM Plots: WF and LF dfs
-#                       My function: dm_om_wf_lf_data() 
-####################################################################
+#                   My function: dm_om_wf_lf_data()
+#                 location: scripts/functions/dm_om_data.R
 #source("other_plots_data.R")
-# converted to a function
-# moved old one to redundant.
-source(paste0(plot_script_path, "dm_om_data.R"))
- 
-s2 = c("\nSuccessfully sourced other_plots_data.R")
-cat(s2)
+####################################################################
+
+#source(paste0(plot_script_path, "dm_om_data.R"))
+#s2 = c("\nSuccessfully sourced other_plots_data.R")
+#cat(s2)
 
 ####################################################################
 #                  Data for Lineage barplots: WF and LF dfs
+#               My function: lineage_plot_data()
+#           location: scripts/functions/lineage_plot_data.R
 ####################################################################
  
-source(paste0(plot_script_path, "lineage_data.R"))
+#source(paste0(plot_script_path, "lineage_data.R"))
+# lineage_data.R was converted to the function lineage_plot_data(); old script moved to redundant/
+lineage_dfL = lineage_plot_data(df = merged_df2
+                                , lineage_column_name = "lineage"
+                                , remove_empty_lineage = FALSE
+                                , lineage_label_col_name = "lineage_labels"
+                                , id_colname = "id"
+                                , snp_colname = "mutationinformation"
+                                )
+# lineage_plot_data() returns a list; extract its 'lin_wf' and 'lin_lf' elements by name
+lin_wf = lineage_dfL[['lin_wf']]
+lin_lf = lineage_dfL[['lin_lf']]
 
 s3 = c("\nSuccessfully sourced lineage_data.R")
 cat(s3)
@@ -145,6 +142,7 @@ cat(s3)
 ####################################################################
 #                  Data for corr plots:
 #               My function: corr_data_extract()
+#          location: scripts/functions/corr_plot_data.R
 ####################################################################
 # make sure the above script works because merged_df2_combined is needed