updated docs for dm_om_data.R

2022-02-01 16:23:03 +00:00 · 2022-02-01 16:23:03 +00:00 · 3d45780c1a
commit 3d45780c1a
parent e795c00831
4 changed files with 54 additions and 179 deletions
--- a/scripts/functions/corr_plot_df.R
+++ b/scripts/functions/corr_plot_df.R
@ -1,128 +0,0 @@
 #!/usr/bin/env Rscript  
 #########################################################
 # TASK: Script to format data for Correlation plots: 
 # corr_data_extract()
 # Input:
    # corr_plot_df: data with all parameters (my_use case)
        # merged_df3 or merged_df2!?
    # gene: [sanity check]
    # drug: relates to a column name that will need to extracted
    # ligand_dist_colname = LigDist_colname (variable from plotting_globals())
 #colnames_to_extract = c("mutationinformation"
 #                         , "duet_affinity_change")
 #display_colnames_key = c(mutationinformation = "MUT"
 #                           , duet_affinity_change = "DUET")
 # extract_scaled_cols = T or F, so that parameters with the _scaled suffix can be extracted.
 # No formatting applied to these cols i.e display name
 # TO DO: SHINY
    #1) Corr type?
    #2)
 ##################################################################
 # corr_data_extract(): subset a data frame to the columns used for correlation
 # plots, give them display names and place the drug column last.
 #
 # Args:
 #   corr_plot_df         : data frame holding all parameters.
 #   drug_name            : name of the drug column. It must survive the column
 #                          extraction and is moved to the last position.
 #   ligand_dist_colname  : name of the ligand distance column; part of the
 #                          default extraction set (it is not renamed).
 #   colnames_to_extract  : [optional] character vector of columns to keep.
 #                          When missing, the default parameter set below is used.
 #   colnames_display_key : [optional] named character vector
 #                          (names = column names, values = display names).
 #                          When missing, the default key below is used.
 #   extract_scaled_cols  : if TRUE, additionally keep every column of
 #                          corr_plot_df whose name contains "scaled".
 #                          No display name is applied to these columns.
 # Returns:
 #   data frame with the extracted (and renamed) columns, drug column last.
 # Side effects:
 #   prints progress information and head() of the result to stdout.
 corr_data_extract <- function(corr_plot_df
                               #, gene_name = gene
                               , drug_name = drug
                               , ligand_dist_colname = LigDist_colname
                               , colnames_to_extract
                               , colnames_display_key
                               , extract_scaled_cols = FALSE){

   # Each optional argument falls back to its default independently, so a
   # user-supplied value is never overwritten by the defaults.
   if (missing(colnames_to_extract)){
     cat("\n=========================================="
         , "\nCORR PLOTS data: ALL params"
         , "\n=========================================")
     cat("\nExtracting default columns for"
         #, "\nGene name:", gene
         , "\nDrug name:", drug_name)

     colnames_to_extract <- c(drug_name
                              #, "mutationinformation"
                              , "mutation_info_labels"
                              , "duet_stability_change"
                              , "ligand_affinity_change"
                              #, "ligand_distance"
                              , ligand_dist_colname
                              , "ddg_foldx"
                              , "deepddg"
                              , "asa"
                              , "rsa"
                              , "kd_values"
                              , "rd_values"
                              , "af"
                              , "log10_or_mychisq"
                              , "neglog_pval_fisher"
                              , "ddg_dynamut2"
                              , "consurf_score"
                              , "snap2_score"
                              , "ddg_dynamut", "ddg_encom", "dds_encom", "ddg_mcsm", "ddg_sdm", "ddg_duet"
                              , "mcsm_na_affinity"
                              , "mcsm_ppi2_affinity")
   }

   if (missing(colnames_display_key)){
     colnames_display_key <- c(duet_stability_change  = "DUET"
                               , ligand_affinity_change = "mCSM-lig"
                               #, ligand_distance        = "ligand_distance"
                               #, ligand_dist_colname    = "ligand_distance"
                               , ddg_foldx              = "FoldX"
                               , deepddg                = "DeepDDG"
                               , asa                    = "ASA"
                               , rsa                    = "RSA"
                               , kd_values              = "KD"
                               , rd_values              = "RD"
                               , af                     = "MAF"
                               , log10_or_mychisq       = "Log (OR)"
                               , neglog_pval_fisher     = "-Log (P)"
                               , ddg_dynamut2           = "Dynamut2"
                               , consurf_score          = "Consurf"
                               , snap2_score            = "SNAP2"
                               , ddg_dynamut            = "Dynamut"
                               , ddg_encom              = "ENCoM-DDG"
                               , ddg_mcsm               = "mCSM"
                               , ddg_sdm                = "SDM"
                               , ddg_duet               = "DUET-d"
                               , dds_encom              = "ENCoM-DDS"
                               , mcsm_na_affinity       = "mCSM-NA"
                               , mcsm_ppi2_affinity     = "mCSM-PPI2")
   }

   # [optional] arg: extract_scaled_cols
   # Scaled columns are looked up in the data frame that was passed in,
   # not in a global object.
   if (isTRUE(extract_scaled_cols)){
     cat("\nExtracting scaled columns as well...\n")
     all_scaled_cols <- grep(".*scaled", colnames(corr_plot_df), value = TRUE)
     colnames_to_extract <- c(colnames_to_extract, all_scaled_cols)
   }

   # drop = FALSE keeps a data frame even when only one column matches
   corr_df <- corr_plot_df[, colnames(corr_plot_df) %in% colnames_to_extract, drop = FALSE]

   # The drug column is located by position BEFORE renaming, so the final
   # reordering still works if the display key happens to rename it.
   drug_idx <- match(drug_name, colnames(corr_df))
   if (is.na(drug_idx)){
     stop("Drug column '", drug_name, "' is not among the extracted columns"
          , call. = FALSE)
   }

   # COMMENT: names(corr_df) <- colnames_display_key[names(corr_df)] only works
   # when all the columns are in the key vector. If one is missing, there is
   # no error, but the column is renamed to NA.
   # Solution: plyr::rename(), which leaves columns absent from the key untouched.
   # Called via plyr:: so that its function names do not collide with dplyr's.
   corr_df <- plyr::rename(corr_df
                           , replace = colnames_display_key
                           , warn_missing = TRUE
                           , warn_duplicated = TRUE)

   cat("\nExtracted ncols:", ncol(corr_df)
       ,"\nRenaming successful")

   cat("\nSneak peak...")
   print(head(corr_df))

   # Move drug column to the end
   other_idx <- setdiff(seq_len(ncol(corr_df)), drug_idx)
   corr_df_f <- corr_df[, c(other_idx, drug_idx), drop = FALSE]

   return(corr_df_f)
 }
--- a/scripts/functions/dm_om_data.R
+++ b/scripts/functions/dm_om_data.R
@ -1,28 +1,40 @@
 #!/usr/bin/env Rscript  
 #########################################################
 # TASK: Script to format data for dm om plots: 
-  # generating WF and LF data for each of the parameters
+  # generating WF and LF data for each of the parameters:
    # duet, mcsm-lig, foldx, deepddg, dynamut2, mcsm-na, mcsm-ppi2, encom, dynamut..etc
  # Called by get_plotting_dfs.R
 # dm_om_wf_lf_data()
-# Input: data with all parameters (merged_df3, my_use case)
+# INPUT: 
    # df: merged_df3 (data with all parameters)
      # NOTE*: merged_df2 will not be appropriate as it brings up most params as significant!? (at least for gid)
    # gene: [conditional generation of dfs like mcsm-NA, mcsm-ppi2 as not all genes have all these values]
-# colnames_to_extract = c("mutationinformation"
+    # colnames_to_extract     : columns to extract; either user-specified or the default set.
-#                         , "duet_affinity_change...")
+      #By default it is c("mutationinformation" , "duet_affinity_change...")
-# ligand_dist_colname     = LigDist_colname # from globals
+    # ligand_dist_colname     : column name containing ligand distance. By default, it is LigDist_colname (imported from globals)
-# dr_muts                 = dr_muts_col # from globals ...dr_mutations_<drug>
+    # dr_muts                 : dr_muts_col (imported from globals; dr_mutations_<drug>)
-# other_muts              = other_muts_col # from globals ...other_mutations_<drug>
+    # other_muts              : other_muts_col (imported from globals ...other_mutations_<drug>)
-# snp_colname             = "mutationinformation"
+    # snp_colname             : SNP column name. By default it is "mutationinformation"
-# aa_pos_colname          = "position" # to sort df by
+    # aa_pos_colname          : Column name containing the aa position. This is used to sort the df by.
-# mut_colname             = "mutation"
+    # mut_colname             : Column name containing snp info in format "<abc_pXXdef>". By default, it is "mutation"
-# mut_info_colname        = "mutation_info"
+    # mut_info_colname        : Column name containing mutation info whether it is DM or OM. By default, it is "mutation_info"
-# mut_info_label_colname  = "mutation_info_labels" # if empty, below used
+    # mut_info_label_colname  : Column containing pre-formatted labels for mutation info. 
-# dr_other_muts_labels    = c("DM", "OM") # only used if ^^ = ""
+      # For my use case, this is called "mutation_info_labels"
-# categ_cols_to_factor: converts the cols with '_outcome'and 'info' to factor
+      # This column has short labels like DM and OM corresponding to dr_muts and other_muts.
      # NOTE*: if this is left empty, then the arg ('dr_other_muts_labels') will be used
    # dr_other_muts_labels    : User specified labels, must correspond to dr_muts and other_muts. 
      # NOTE*: Only used if the arg (mut_info_label_colname) is empty!
    # categ_cols_to_factor    : Column names to convert to factors. These mainly correspond to the outcome columns associated with the
      # arg ('colnames_to_extract'). These have the suffix "_outcome" in their colnames. Additionally column 'mutation_info' is also 
      # converted to factor. By default, it converts the cols with '_outcome'and 'info' to factor.
      # Users are able to provide a vector of their corresponding column names
 # RETURNS: List
    # WF and LF data grouped by mutation_info i.e DM (drug mutations) and OM (other mutations)
 # TO DO: SHINY
-#1) 
+#1) df to choose (merged_df3 or merged_df2)
 #2)
 ##################################################################
 dm_om_wf_lf_data <- function(df
--- a/scripts/functions/tests/test_corr_plot_df.R
+++ b/scripts/functions/tests/test_corr_plot_df.R
@ -1,7 +0,0 @@
 #!/usr/bin/env Rscript       
 # Manual smoke test for corr_data_extract().
 # Loads the gene/drug configuration and the plotting data frames first.
 # NOTE(review): presumably one of these scripts also brings corr_data_extract()
 # into scope - confirm.
 source("~/git/LSHTM_analysis/config/gid.R")
 source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")

 # default column set on merged_df3
 m3 = corr_data_extract(merged_df3); head(m3)

 # default column set on merged_df2 (fixed the misspelt object name 'meregd_df2')
 m2 = corr_data_extract(merged_df2); head(m2)

 # default column set plus all the "scaled" columns
 m3S = corr_data_extract(merged_df3, extract_scaled_cols = TRUE); head(m3S)
--- a/scripts/plotting/get_plotting_dfs.R
+++ b/scripts/plotting/get_plotting_dfs.R
@ -88,13 +88,6 @@ merged_df3      = all_plot_dfs[[2]]
 # Unpack the 3rd and 4th elements of all_plot_dfs into their named objects.
 merged_df2_comp = all_plot_dfs[[3]]
 merged_df3_comp = all_plot_dfs[[4]]
 #======================================================================
 ####################################################################
 #                        Data for combining other dfs
 ####################################################################
 #source("other_dfs_data.R")
 # Fixed this at source i.e python script
 # Moved: "other_dfs_data.R" to redundant/
 ####################################################################
 #                        Data for subcols barplot (~heatmap)
@ -109,35 +102,39 @@ merged_df3_comp = all_plot_dfs[[4]]
 #                        Data for logoplots
 ####################################################################
-#source(paste0(plot_script_path, "logo_data.R"))
+#source(paste0(plot_script_path, "logo_data_msa.R"))
-#s1 = c("\nSuccessfully sourced logo_data.R")
+#s1 = c("\nSuccessfully sourced logo_data_msa.R")
 #cat(s1)
 # input data is merged_df3
 # so repurposed it into a function so params can be passed instead to generate
 # data required for plotting.
 # Moved "logo_data.R" to redundant/
 # Source logo_data_msa.R from plot_script_path, then print a confirmation message.
 source(paste0(plot_script_path, "logo_data_msa.R"))
 s1 = c("\nSuccessfully sourced logo_data_msa.R")
 cat(s1)
 ####################################################################
 #                     Data for DM OM Plots: WF and LF dfs
 #                   My function: dm_om_wf_lf_data()
-####################################################################
+#                 location: scripts/functions/dm_om_data.R
 #source("other_plots_data.R")
-# converted to a function
+####################################################################
 # moved old one to redundant.
 # Source dm_om_data.R from plot_script_path.
 # NOTE(review): the section header gives the location as scripts/functions/dm_om_data.R -
 # confirm that plot_script_path resolves to that directory.
 source(paste0(plot_script_path, "dm_om_data.R"))
-s2 = c("\nSuccessfully sourced other_plots_data.R")
+#source(paste0(plot_script_path, "dm_om_data.R"))
-cat(s2)
+#s2 = c("\nSuccessfully sourced other_plots_data.R")
 #cat(s2)
 ####################################################################
 #                  Data for Lineage barplots: WF and LF dfs
 #               My function: lineage_plot_data()
 #           location: scripts/functions/lineage_plot_data.R
 ####################################################################
-source(paste0(plot_script_path, "lineage_data.R"))
+#source(paste0(plot_script_path, "lineage_data.R"))
 # converted to a function. Moved lineage_data.R to redundant/
 # Generate the lineage plotting data from merged_df2 with lineage_plot_data().
 # The result is indexed below by the names 'lin_wf' and 'lin_lf'.
 lineage_dfL = lineage_plot_data(df = merged_df2
                                 , lineage_column_name = "lineage"
                                 , remove_empty_lineage = F
                                 , lineage_label_col_name = "lineage_labels"
                                 , id_colname = "id"
                                 , snp_colname = "mutationinformation"
                                 )
 # Pull the two entries out of the returned object by name.
 lin_wf = lineage_dfL[['lin_wf']]
 lin_lf = lineage_dfL[['lin_lf']]
 # NOTE(review): the message below still refers to lineage_data.R, whose source()
 # call appears to be commented out above - confirm whether the text is stale.
 s3 = c("\nSuccessfully sourced lineage_data.R")
 cat(s3)
@ -145,6 +142,7 @@ cat(s3)
 ####################################################################
 #                  Data for corr plots:
 #               My function: corr_data_extract()
 #          location: scripts/functions/corr_plot_data.R
 ####################################################################
 # make sure the above script works because merged_df2_combined is needed