From d13484e8f5b17f78c9594e34256a7a10c5af6be5 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 1 Feb 2022 16:25:58 +0000
Subject: [PATCH] added function lineage_plot_data.R and corresponding test
 script, also  renamed corr_plot_df.R to corr_plot_data and corresponing test
 script

---
 scripts/functions/corr_plot_data.R            | 130 +++++++++++
 scripts/functions/lineage_plot_data.R         | 208 ++++++++++++++++++
 scripts/functions/tests/test_corr_plot_data.R |   7 +
 .../functions/tests/test_lineage_plot_data.R  |  19 ++
 4 files changed, 364 insertions(+)
 create mode 100644 scripts/functions/corr_plot_data.R
 create mode 100644 scripts/functions/lineage_plot_data.R
 create mode 100644 scripts/functions/tests/test_corr_plot_data.R
 create mode 100644 scripts/functions/tests/test_lineage_plot_data.R

diff --git a/scripts/functions/corr_plot_data.R b/scripts/functions/corr_plot_data.R
new file mode 100644
index 0000000..12945fc
--- /dev/null
+++ b/scripts/functions/corr_plot_data.R
@@ -0,0 +1,130 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: Script to format data for Correlation plots: 
+
+# corr_data_extract()
+# INPUT:
+    # df: data with all parameters (my_use case)
+        # merged_df3 or merged_df2!?
+    # gene: [sanity check]
+    # drug: relates to a column name that will need to extracted
+    # ligand_dist_colname = LigDist_colname (variable from plotting_globals()
+   
+# colnames_to_extract = c("mutationinformation", "duet_affinity_change")
+# display_colnames_key = c(mutationinformation = "MUT" , duet_affinity_change = "DUET")
+# extract_scaled_cols = T or F, so that parameters with the _scaled suffix can be extracted.
+    # NOTE*: No formatting applied to these cols i.e display name
+
+# RETURNS: DF
+    # containing all the columns required for generating downstream correlation plots
+
+# TO DO: SHINY
+    #1) Corr type?
+    #2)
+##################################################################
+corr_data_extract <- function(df
+                              #, gene_name = gene
+                              , drug_name = drug
+                              , ligand_dist_colname = LigDist_colname
+                              , colnames_to_extract
+                              , colnames_display_key
+                              , extract_scaled_cols = F){
+  
+  if ( missing(colnames_to_extract) || missing(colnames_display_key) ){
+  #if ( missing(colnames_to_extract) ){
+      
+    cat("\n=========================================="
+        , "\nCORR PLOTS data: ALL params"
+        , "\n=========================================")
+    
+    cat("\nExtracting default columns for"
+        #, "\nGene name:", gene
+        , "\nDrug name:", drug)
+    
+    colnames_to_extract =  c(drug
+                             #, "mutationinformation"
+                             , "mutation_info_labels"  
+                             , "duet_stability_change" 
+                             , "ligand_affinity_change"
+                             #, "ligand_distance"
+                             , ligand_dist_colname
+                             , "ddg_foldx"
+                             , "deepddg"
+                             , "asa"
+                             , "rsa"
+                             , "kd_values"
+                             , "rd_values"
+                             , "af"
+                             , "log10_or_mychisq"
+                             , "neglog_pval_fisher"
+                             , "ddg_dynamut2"
+                             , "consurf_score"
+                             , "snap2_score"
+                             , "ddg_dynamut", "ddg_encom", "dds_encom", "ddg_mcsm", "ddg_sdm", "ddg_duet"
+                             , "mcsm_na_affinity"
+                             , "mcsm_ppi2_affinity"
+    )
+    
+    # [optional] arg: extract_scaled_cols
+    if (extract_scaled_cols){
+      cat("\nExtracting scaled columns as well...\n")
+      all_scaled_cols = colnames(merged_df3)[grep(".*scaled", colnames(merged_df3))]
+      colnames_to_extract = c(colnames_to_extract, all_scaled_cols)
+      
+    }else{
+      colnames_to_extract = colnames_to_extract
+    }
+    
+    corr_df = df[, colnames(df)%in%colnames_to_extract]
+
+    # arg: colnames_display_key
+    colnames_display_key =  c(duet_stability_change  = "DUET"
+                            , ligand_affinity_change = "mCSM-lig"
+                            #, ligand_distance        = "ligand_distance"
+                            #, ligand_dist_colname        = "ligand_distance"
+                            , ddg_foldx              = "FoldX"
+                            , deepddg                = "DeepDDG"
+                            , asa                    = "ASA"
+                            , rsa                    = "RSA"
+                            , kd_values              = "KD"
+                            , rd_values              = "RD"
+                            , af                     = "MAF"
+                            , log10_or_mychisq       = "Log (OR)"
+                            , neglog_pval_fisher     = "-Log (P)"
+                            , ddg_dynamut2           = "Dynamut2"
+                            , consurf_score          = "Consurf"
+                            , snap2_score            = "SNAP2"
+                            , ddg_dynamut            = "Dynamut"
+                            , ddg_encom              = "ENCoM-DDG"
+                            , ddg_mcsm               = "mCSM"
+                            , ddg_sdm                = "SDM"
+                            , ddg_duet               = "DUET-d"
+                            , dds_encom              = "ENCoM-DDS"
+                            , mcsm_na_affinity       = "mCSM-NA"
+                            , mcsm_ppi2_affinity     = "mCSM-PPI2")
+
+  # COMMENT: This only works when all the columns are in the namekey vector.
+  # If one is missing, there is no error, but it also renamed as "NA.        
+  #names(corr_df) <- colnames_display_key[names(corr_df)]
+  
+  # Solution: to use plyr::rename() 
+  # Consider using requireNamespace() instead of library() so its function names doesn't collide with dplyr's.
+  corr_df = plyr::rename(corr_df
+                         , replace = colnames_display_key
+                         , warn_missing = T
+                         , warn_duplicated = T)
+  
+  cat("\nExtracted ncols:", ncol(corr_df)
+      ,"\nRenaming successful")
+  
+  cat("\nSneak peak...")
+  print(head(corr_df))
+  
+  # Move drug column to the end
+  last_col = colnames(corr_df[ncol(corr_df)])
+  corr_df_f = corr_df %>% dplyr::relocate(all_of(drug), .after = last_col)
+  
+  return(corr_df_f)
+  }
+  
+}
diff --git a/scripts/functions/lineage_plot_data.R b/scripts/functions/lineage_plot_data.R
new file mode 100644
index 0000000..116f156
--- /dev/null
+++ b/scripts/functions/lineage_plot_data.R
@@ -0,0 +1,208 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: Script to format data for lineage plots
+# Called by get_plotting_dfs.R
+
+# lineage_plot_data()
+# INPUT: 
+  # df                    : merged_df2 (data with 1:many relationship b/w snp and lineage)
+    # NOTE*: DO NOT use merged_df3 as it loses the 1:many relationship)
+  # lineage_column_name   : Column name that contains lineage info
+  # remove_empty_lineage  : where lineage info is missing, whether to omit those or not
+  # lineage_label_col_name: Column containing pre-formatted lineage labels. 
+    # For my case, this is called "lineage_labels"
+    # This column has short labels like L1, L2, L3, etc. 
+    # if this is left empty, then the lineage_column_name will be used
+  # id_colname            : sample-id column. Used to calculate SNP count
+  # snp_colname           : SNP column. Used to calculate SNP diversity
+
+# RETURNS: List
+  # WF and LF data for lineage-wise snp count and snp diversity
+
+# TO DO: SHINY
+#1) remove empty positions
+#2) select lineages to display?
+#########################################################
+
+lineage_plot_data <- function(df
+                            , lineage_column_name = "lineage"
+                            , remove_empty_lineage = T
+                            , lineage_label_col_name = "lineage_labels"
+                            , id_colname = "id"
+                            , snp_colname = "mutationinformation"){
+
+    ################################################################
+    # Get WF and LF data with lineage count, and snp diversity
+    ################################################################
+    # Initialise output list
+    lineage_dataL = list(
+        lin_wf = data.frame()
+      , lin_lf = data.frame())
+    
+    table(df[[lineage_column_name]])
+  
+    #------------------------
+    # Check lineage counts
+    # Including missing 
+    #------------------------
+    if (missing(remove_empty_lineage)){
+      
+        miss_ll = table(df[[lineage_column_name]] == "")[[2]]    
+        rm_ll   = which(df[[lineage_column_name]] == "")
+      
+        if (length(rm_ll) == miss_ll){
+        cat("\nNo. of samples with missing lineage classification:"
+            , miss_ll
+            , "Removing these...")
+        df = df[-rm_ll,]
+        df = droplevels(df)
+        }else{
+          cat("\nSomething went wrong...numbers mismatch"
+            , "samples with missing lineages:", mis_all
+            , "No. of corresponding indices to remove:", rm_ll)
+            }
+        }else{
+      df = df
+      df = droplevels(df)
+      }
+
+    #------------------------
+    # Lineage labels column
+    #------------------------
+    if (lineage_label_col_name == ""){
+      cat("\nLineage label column missing..."
+         , "\nUsing the column:" , lineage_column_name, "as labels as well")
+      lin_labels = lineage_column_name
+      
+        #------------------------------------------
+        if ( !is.factor((df[[lin_labels]])) ){
+          df[lin_labels] = as.factor(df[lin_labels])
+          df[lin_labels] = factor()
+          }else{
+           cat("\nLineage label column already factor")
+           }
+        #------------------------------------------
+      }else{
+      #lin_labels = "lineage_labels"
+      lin_labels = lineage_label_col_name
+      cat("\nLineage label column present"
+          , "\nUsing it, column name:", lin_labels)
+       #------------------------------------------
+       if ( !is.factor((df[[lin_labels]])) ){
+          df[lin_labels] = as.factor(df[lin_labels])
+          }else{
+          cat("\nLineage label already factor")  
+          }
+       #------------------------------------------
+      }
+    
+    # This is how lineage labels will appear
+    cat("\nLineage labels will appear as below\n")
+    print( table(df[[lin_labels]]) )
+    cat("\n")
+    cat( "Class of", lin_labels, ":", class(df[[lin_labels]]) )
+    cat("\n")
+    print( "No. of levels:", nlevels(df[[lin_labels]]) )
+
+    #==========================================
+    # WF data: lineages with 
+    # snp count
+    # total_samples
+    # snp diversity (perc)
+    #==========================================
+    cat("\nCreating WF Lineage data...")
+    
+    sel_lineages = levels(df[[lin_labels]])
+
+    lin_wf = data.frame(sel_lineages) #4, 1
+    total_snps_u = NULL
+    total_samples = NULL
+
+    for (i in sel_lineages){
+      #print(i)
+      curr_total = length(unique(df[[id_colname]])[df[[lin_labels]]==i])
+      #print(curr_total)
+      total_samples = c(total_samples, curr_total)
+      print(total_samples)
+
+      foo = df[df[[lin_labels]]==i,]
+      print(paste0(i, "=======\n"))
+      print(length(unique(foo[[snp_colname]])))
+      curr_count = length(unique(foo[[snp_colname]]))
+      
+      total_snps_u = c(total_snps_u, curr_count)
+    }
+    
+    lin_wf
+
+    # Add these counts as columns to the df
+    lin_wf$num_snps_u = total_snps_u
+    lin_wf$total_samples = total_samples
+    lin_wf
+    
+    #----------------------
+    # Add SNP diversity
+    #----------------------
+    lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples
+    lin_wf
+
+    #----------------------
+    # Add some formatting
+    #----------------------
+    # SNP diversity 
+    lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0)
+    lin_wf$snp_diversity_f = paste0(lin_wf$snp_diversity_f, "%")
+
+    # should be as you like it to appear
+    lin_wf$sel_lineages
+
+    # Important: Relevel factors so that x-axis categ appear as you want
+    #lin_lf$sel_lineages =  factor(lin_lf$sel_lineages, c())
+    #levels(lin_lf$sel_lineages)
+    
+    lineage_dataL[['lin_wf']] = lin_wf
+    
+    cat("\nCOMPLETED: Successfully created WF lineage data")
+
+    #=================================
+    # LF data: lineages with 
+    # snp count
+    # total_samples
+    # snp diversity (perc)
+    #=================================
+    cat("\nCreating LF Lineage data...")
+    
+    names(lin_wf)
+    tot_cols = ncol(lin_wf)
+    pivot_cols = c("sel_lineages", "snp_diversity", "snp_diversity_f")
+    pivot_cols_n = length(pivot_cols)
+    
+    expected_rows =  nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n )
+    
+    lin_lf <- gather(lin_wf
+                     , count_categ
+                     , p_count
+                     , num_snps_u:total_samples
+                     , factor_key = TRUE)
+    lin_lf
+    
+    # quick checks
+    if ( nrow(lin_lf )  ==  expected_rows ){
+      cat("\nPASS: Lineage LF data created"
+          , "\nnrow: ", nrow(lin_lf)
+          , "\nncol: ", ncol(lin_lf))
+    } else {
+      cat("\nFAIL: numbers mismatch"
+          , "\nExpected nrow: ", expected_rows)
+    }
+    
+    # Important: Relevel factors so that x-axis categ appear as you want
+    #lin_lf$sel_lineages =  factor(lin_lf$sel_lineages, c())
+    #levels(lin_lf$sel_lineages)
+    
+    lineage_dataL[['lin_lf']] = lin_lf
+    
+    cat("\nCOMPLETED: Successfully created LF lineage data")
+    return(lineage_dataL)
+# end bracket
+}
diff --git a/scripts/functions/tests/test_corr_plot_data.R b/scripts/functions/tests/test_corr_plot_data.R
new file mode 100644
index 0000000..4376a58
--- /dev/null
+++ b/scripts/functions/tests/test_corr_plot_data.R
@@ -0,0 +1,7 @@
+#!/usr/bin/env Rscript       
+source("~/git/LSHTM_analysis/config/gid.R")
+source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
+
+m3 = corr_data_extract(merged_df3); head(m3)
+m2 = corr_data_extract(meregd_df2); head(m2)
+m3S = corr_data_extract(merged_df3, extract_scaled_cols = T); head(m3S)
\ No newline at end of file
diff --git a/scripts/functions/tests/test_lineage_plot_data.R b/scripts/functions/tests/test_lineage_plot_data.R
new file mode 100644
index 0000000..4a3c7cf
--- /dev/null
+++ b/scripts/functions/tests/test_lineage_plot_data.R
@@ -0,0 +1,19 @@
+#!/usr/bin/env Rscript       
+#source("~/git/LSHTM_analysis/config/alr.R")
+#source("~/git/LSHTM_analysis/config/embb.R")
+source("~/git/LSHTM_analysis/config/gid.R")
+#source("~/git/LSHTM_analysis/config/katg.R")
+#source("~/git/LSHTM_analysis/config/pnca.R")
+#source("~/git/LSHTM_analysis/config/rpob.R")
+
+source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
+#####################################################################
+lineage_dfL = lineage_plot_data(df = merged_df2
+                                , lineage_column_name = "lineage"
+                                , remove_empty_lineage = F
+                                , lineage_label_col_name = "lineage_labels"
+                                , id_colname = "id"
+                                , snp_colname = "mutationinformation"
+                                )
+lin_wf = lineage_dfL[['lin_wf']]
+lin_lf = lineage_dfL[['lin_lf']]
\ No newline at end of file