From 0d8979dfcb773d566475b68b76ddebd7616ab2ce Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Mon, 1 Aug 2022 14:09:46 +0100
Subject: [PATCH] separated cols

---
 .../plotting/mcsm_mean_affinity_ensemble.R    | 190 ++++++------------
 .../plotting/mcsm_mean_stability_ensemble.R   |  95 ++++++---
 2 files changed, 129 insertions(+), 156 deletions(-)

diff --git a/scripts/plotting/mcsm_mean_affinity_ensemble.R b/scripts/plotting/mcsm_mean_affinity_ensemble.R
index 35b67b4..ef6efcc 100644
--- a/scripts/plotting/mcsm_mean_affinity_ensemble.R
+++ b/scripts/plotting/mcsm_mean_affinity_ensemble.R
@@ -60,145 +60,87 @@ all_colnames = as.data.frame(colnames(df3))
 common_cols  = c("mutationinformation"
                  , "position"
                  , "dst_mode"
-                 #, "mutation_info_labels"
+                 , "mutation_info_labels"
                  , "sensitivity"
                  , "ligand_distance")
 
 all_colnames$`colnames(df3)`[grep("scaled", all_colnames$`colnames(df3)`)]
-scaled_cols = c("duet_scaled"          , "duet_stability_change"
-                , "deepddg_scaled"      , "deepddg"   
-                , "ddg_dynamut2_scaled" , "ddg_dynamut2"
-                , "foldx_scaled"        , "ddg_foldx"
-                , "affinity_scaled"     , "ligand_affinity_change"
-                , "mmcsm_lig_scaled"    , "mmcsm_lig"                
-                , "mcsm_ppi2_scaled"   , "mcsm_ppi2_affinity"
-                , "mcsm_na_scaled"     , "mcsm_na_affinity"
-                #, "consurf_scaled"      , "consurf_score"
-                #, "snap2_scaled"        , "snap2_score"
-                #, "provean_scaled"      , "provean_score"
-                )
 all_colnames$`colnames(df3)`[grep("outcome", all_colnames$`colnames(df3)`)]
-outcome_cols_aff = c("duet_outcome"
-                 , "deepddg_outcome"
-                 , "ddg_dynamut2_outcome"
-                 , "foldx_outcome"
-                 #, "ddg_foldx", "foldx_scaled"
-                 , "ligand_outcome"
-                 , "mmcsm_lig_outcome"
-                 , "mcsm_ppi2_outcome"
-                 , "mcsm_na_outcome"
-                 # consurf outcome doesn't exist
-                 #,"provean_outcome"
-                 #,"snap2_outcome"
-                 )
+
+#===================
+# stability cols
+#===================
+raw_cols_stability =  c("duet_stability_change"
+                        , "deepddg"
+                        , "ddg_dynamut2"
+                        , "ddg_foldx")
+
+scaled_cols_stability = c("duet_scaled"       
+                          , "deepddg_scaled"   
+                          , "ddg_dynamut2_scaled"
+                          , "foldx_scaled")
+
+outcome_cols_stability = c("duet_outcome"
+                           , "deepddg_outcome"
+                           , "ddg_dynamut2_outcome"
+                           , "foldx_outcome")
+
+#===================
+# affinity cols
+#===================
+raw_cols_affinity =  c("ligand_affinity_change"
+                       , "mmcsm_lig"
+                       , "mcsm_ppi2_affinity"
+                       , "mcsm_na_affinity")
+
+scaled_cols_affinity = c("affinity_scaled" 
+                         , "mmcsm_lig_scaled" 
+                         , "mcsm_ppi2_scaled" 
+                         , "mcsm_na_scaled" )
+
+outcome_cols_affinity  = c( "ligand_outcome"
+                            , "mmcsm_lig_outcome"
+                            , "mcsm_ppi2_outcome"
+                            , "mcsm_na_outcome")
+
+#===================
+# conservation cols
+#===================
+# raw_cols_conservation =  c("consurf_score"
+#                            , "snap2_score"
+#                            , "provean_score")
+# 
+# scaled_cols_conservation = c("consurf_scaled"
+#                              , "snap2_scaled"
+#                              , "provean_scaled")
+# 
+# # CANNOT strictly be used, as categories are not identical, with consurf missing altogether
+# outcome_cols_conservation = c("provean_outcome"
+#                               , "snap2_outcome"
+#                               #consurf outcome doesn't exist
+# )
+
+######################################################################
 cols_to_consider = colnames(df3)[colnames(df3)%in%c(common_cols
+                                                    , raw_cols_affinity
-                                                    , scaled_cols
-                                                    , outcome_cols_aff)]
-
-cols_to_extract  = cols_to_consider[cols_to_consider%in%c(common_cols
-                                                          , outcome_cols_aff)]
+                                                    , scaled_cols_affinity
+                                                    , outcome_cols_affinity)]
 
+# cols_to_extract  = cols_to_consider[cols_to_consider%in%c(common_cols
+#                                                           , outcome_cols_affinity)]
 ##############################################################
 #####################
-# Ensemble affinity
+# Ensemble affinity: affinity_cols
 #####################
 # extract outcome cols and map numeric values to the categories
 # Destabilising == 0, and stabilising == 1 so rescaling can let -1 be destabilising
-df3_plot = df3[, cols_to_extract]
 
-df3_plot[, outcome_cols_aff] <- sapply(df3_plot[, outcome_cols_aff]
-                             , function(x){ifelse(x == "Destabilising", 0, 1)})
-
-#=====================================
-# Stability (4 cols): average the scores
-# across predictors ==> average by
-# position ==> scale b/w -1 and 1
-
-# column to average: ens_stability
-#=====================================
-cols_to_average = which(colnames(df3_plot)%in%outcome_cols_aff)
-
-# ensemble average across predictors
-df3_plot$ens_stability = rowMeans(df3_plot[,cols_to_average])
-
-head(df3_plot$position); head(df3_plot$mutationinformation)
-head(df3_plot$ens_stability)
-table(df3_plot$ens_stability)
-
-# ensemble average of predictors by position
-mean_ens_stability_by_position <- df3_plot %>%
-  dplyr::group_by(position) %>%
-  dplyr::summarize(avg_ens_stability = mean(ens_stability))
-
-# REscale b/w -1 and 1
-#en_stab_min = min(mean_ens_stability_by_position['avg_ens_stability'])
-#en_stab_max = max(mean_ens_stability_by_position['avg_ens_stability']) 
-
-# scale the average stability value between -1 and 1
-# mean_ens_by_position['averaged_stability3_scaled'] = lapply(mean_ens_by_position['averaged_stability3']
-#                                                        , function(x) ifelse(x < 0, x/abs(en3_min), x/en3_max))
-
-mean_ens_stability_by_position['avg_ens_stability_scaled'] = lapply(mean_ens_stability_by_position['avg_ens_stability']
-                                                                  , function(x) {
-                                                                    scales::rescale(x, to  = c(-1,1)
-                                                                                    #, from = c(en_stab_min,en_stab_max))
-                                                                                    , from = c(0,1))
-                                                                  })
-cat(paste0('Average stability scores:\n'
-           , head(mean_ens_stability_by_position['avg_ens_stability'])
-           , '\n---------------------------------------------------------------'
-           , '\nAverage stability scaled scores:\n'
-           , head(mean_ens_stability_by_position['avg_ens_stability_scaled'])))
-
-# convert to a data frame
-mean_ens_stability_by_position = as.data.frame(mean_ens_stability_by_position)
-
-#FIXME: sanity checks
-# TODO: predetermine the bounds
-# l_bound_ens = min(mean_ens_stability_by_position['avg_ens_stability_scaled'])
-# u_bound_ens = max(mean_ens_stability_by_position['avg_ens_stability_scaled'])
+# df3_plot = df3[, cols_to_extract]
 # 
-# if ( (l_bound_ens == -1) && (u_bound_ens == 1) ){
-#   cat(paste0("PASS: ensemble stability scores averaged by position and then scaled"
-#         , "\nmin ensemble averaged stability: ", l_bound_ens
-#         , "\nmax ensemble averaged stability: ", u_bound_ens))
-# }else{
-#   cat(paste0("FAIL: avergaed duet scores could not be scaled b/w -1 and 1"
-#         , "\nmin ensemble averaged stability: ", l_bound_ens
-#         , "\nmax ensemble averaged stability: ", u_bound_ens))
-#   quit()
-# } 
-##################################################################
-#%%
-affinity_outcome_colnames = c("ligand_outcome", "mmcsm_lig_outcome"
-                              , "mcsm_ppi2_outcome"
-                              , "mcsm_na_outcome")
-
-outcome_cols_affinity = colnames(df3)[colnames(df3)%in%affinity_outcome_colnames]
-outcome_cols_affinity =  c("ligand_outcome"
-                           ,"mmcsm_lig_outcome")
-
-cols_to_consider = colnames(df3)[colnames(df3)%in%c(common_cols, scaled_cols, outcome_cols_aff, outcome_cols_affinity)]
-cols_to_extract = cols_to_consider[cols_to_consider%in%c(common_cols, outcome_cols_aff)]
-
-foo = df3[, cols_to_consider]
-df3_plot_orig = df3[, cols_to_extract]
-
-############################
-# Ensemble affinity: ligand
-############################
-# extract ligand affinity outcome cols and map numeric values to the categories
-# Destabilising == 1, and stabilising == 0
-cols_to_extract_affinity = cols_to_consider[cols_to_consider%in%c(common_cols
-                                                                  , outcome_cols_affinity)]
-
-
-df3_plot_affinity = df3[, cols_to_extract_affinity]
-names(df3_plot_affinity)
-
-df3_plot_affinity[, outcome_cols_affinity] <- sapply(df3_plot_affinity[, outcome_cols_affinity]
-                                                     , function(x){ifelse(x == "Destabilising", 1, 0)})
+# df3_plot[, outcome_cols_affinity] <- sapply(df3_plot[, outcome_cols_affinity]
+#                              , function(x){ifelse(x == "Destabilising", 0, 1)})
 
+df3_plot = df3[, c(common_cols, scaled_cols_affinity)]
 #=====================================
 # Affintiy (2 cols): average the scores
 # across predictors ==> average by
@@ -206,7 +148,7 @@ df3_plot_affinity[, outcome_cols_affinity] <- sapply(df3_plot_affinity[, outcome
 
 # column to average: ens_affinity
 #=====================================
-cols_to_average_affinity = which(colnames(df3_plot_affinity)%in%outcome_cols_affinity)
+cols_to_average_affinity = which(colnames(df3_plot)%in%scaled_cols_affinity)
 cols_to_average_affinity
 
 # ensemble average across predictors
diff --git a/scripts/plotting/mcsm_mean_stability_ensemble.R b/scripts/plotting/mcsm_mean_stability_ensemble.R
index 6d63764..b6ce86f 100644
--- a/scripts/plotting/mcsm_mean_stability_ensemble.R
+++ b/scripts/plotting/mcsm_mean_stability_ensemble.R
@@ -53,54 +53,85 @@ all_colnames = as.data.frame(colnames(df3))
 common_cols  = c("mutationinformation"
                  , "position"
                  , "dst_mode"
-                 #, "mutation_info_labels"
+                 , "mutation_info_labels"
                  , "sensitivity"
                  , "ligand_distance")
 
 all_colnames$`colnames(df3)`[grep("scaled", all_colnames$`colnames(df3)`)]
-scaled_cols = c("duet_scaled"          , "duet_stability_change"
-                , "deepddg_scaled"      , "deepddg"   
-                , "ddg_dynamut2_scaled" , "ddg_dynamut2"
-                , "foldx_scaled"        , "ddg_foldx"
-                , "affinity_scaled"     , "ligand_affinity_change"
-                , "mmcsm_lig_scaled"    , "mmcsm_lig"                
-                , "mcsm_ppi2_scaled"   , "mcsm_ppi2_affinity"
-                , "mcsm_na_scaled"     , "mcsm_na_affinity"
-                #, "consurf_scaled"      , "consurf_score"
-                #, "snap2_scaled"        , "snap2_score"
-                #, "provean_scaled"      , "provean_score"
-)
 all_colnames$`colnames(df3)`[grep("outcome", all_colnames$`colnames(df3)`)]
-outcome_cols_aff = c("duet_outcome"
-                     , "deepddg_outcome"
-                     , "ddg_dynamut2_outcome"
-                     , "foldx_outcome"
-                     #, "ddg_foldx", "foldx_scaled"
-                     , "ligand_outcome"
-                     , "mmcsm_lig_outcome"
-                     , "mcsm_ppi2_outcome"
-                     , "mcsm_na_outcome"
-                     # consurf outcome doesn't exist
-                     #,"provean_outcome"
-                     #,"snap2_outcome"
-)
+
+#===================
+# stability cols
+#===================
+raw_cols_stability =  c("duet_stability_change"
+                        , "deepddg"
+                        , "ddg_dynamut2"
+                        , "ddg_foldx")
+
+scaled_cols_stability = c("duet_scaled"       
+                          , "deepddg_scaled"   
+                          , "ddg_dynamut2_scaled"
+                          , "foldx_scaled")
+
+outcome_cols_stability = c("duet_outcome"
+                           , "deepddg_outcome"
+                           , "ddg_dynamut2_outcome"
+                           , "foldx_outcome")
+
+#===================
+# affinity cols
+#===================
+raw_cols_affinity =  c("ligand_affinity_change"
+                       , "mmcsm_lig"
+                       , "mcsm_ppi2_affinity"
+                       , "mcsm_na_affinity")
+
+scaled_cols_affinity = c("affinity_scaled" 
+                         , "mmcsm_lig_scaled" 
+                         , "mcsm_ppi2_scaled" 
+                         , "mcsm_na_scaled" )
+
+outcome_cols_affinity  = c( "ligand_outcome"
+                            , "mmcsm_lig_outcome"
+                            , "mcsm_ppi2_outcome"
+                            , "mcsm_na_outcome")
+
+#===================
+# conservation cols
+#===================
+# raw_cols_conservation =  c("consurf_score"
+#                            , "snap2_score"
+#                            , "provean_score")
+# 
+# scaled_cols_conservation = c("consurf_scaled"
+#                              , "snap2_scaled"
+#                              , "provean_scaled")
+# 
+# # CANNOT strictly be used, as categories are not identical, with consurf missing altogether
+# outcome_cols_conservation = c("provean_outcome"
+#                               , "snap2_outcome"
+#                               #consurf outcome doesn't exist
+# )
+
+###########################################################
 cols_to_consider = colnames(df3)[colnames(df3)%in%c(common_cols
-                                                    , scaled_cols
-                                                    , outcome_cols)]
+                                                    , raw_cols_stability
+                                                    , scaled_cols_stability
+                                                    , outcome_cols_stability)]
 
 cols_to_extract  = cols_to_consider[cols_to_consider%in%c(common_cols
-                                                          , outcome_cols)]
+                                                          , outcome_cols_stability)]
 
 ##############################################################
 #####################
-# Ensemble stability
+# Ensemble stability: outcome_cols_stability
 #####################
 # extract outcome cols and map numeric values to the categories
 # Destabilising == 0, and stabilising == 1, so rescaling can let -1 be destabilising
 df3_plot = df3[, cols_to_extract]
 
 # assign numeric values to outcome
-df3_plot[, outcome_cols] <- sapply(df3_plot[, outcome_cols]
+df3_plot[, outcome_cols_stability] <- sapply(df3_plot[, outcome_cols_stability]
                              , function(x){ifelse(x == "Destabilising", 0, 1)})
 table(df3$duet_outcome)
 table(df3_plot$duet_outcome)
@@ -111,7 +142,7 @@ table(df3_plot$duet_outcome)
 
 # column to average: ens_stability
 #=====================================
-cols_to_average = which(colnames(df3_plot)%in%outcome_cols)
+cols_to_average = which(colnames(df3_plot)%in%outcome_cols_stability)
 
 # ensemble average across predictors
 df3_plot$ens_stability = rowMeans(df3_plot[,cols_to_average])