From 4147a6b90ffaed58104a536c4065d2a404e51328 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Mon, 22 Aug 2022 13:05:53 +0100
Subject: [PATCH] a massive waste of time

---
 scripts/functions/dm_om_data.R      | 1119 ++++++++++++++-------------
 scripts/functions/plotting_data.R   |  198 +++--
 scripts/plotting/get_plotting_dfs.R |   29 +-
 3 files changed, 726 insertions(+), 620 deletions(-)

diff --git a/scripts/functions/dm_om_data.R b/scripts/functions/dm_om_data.R
index a7867ce..2217f5c 100644
--- a/scripts/functions/dm_om_data.R
+++ b/scripts/functions/dm_om_data.R
@@ -1,40 +1,46 @@
 #!/usr/bin/env Rscript  
 #########################################################
 # TASK: Script to format data for dm om plots: 
-  # generating WF and LF data for each of the parameters:
-    # duet, mcsm-lig, foldx, deepddg, dynamut2, mcsm-na, mcsm-ppi2, encom, dynamut..etc
-  # Called by get_plotting_dfs.R
+# generates wide-format (WF) and long-format (LF) data for each of the parameters:
+# duet, mcsm-lig, foldx, deepddg, dynamut2, mcsm-na, mcsm-ppi2, encom, dynamut, etc.
+# Called by get_plotting_dfs.R
 
 ##################################################################
 # from plotting_globals.R
 # DistCutOff, LigDist_colname, ppi2Dist_colname, naDist_colname 
-gene
+#gene
 
 dm_om_wf_lf_data <- function(df
-                          , gene              # from globals
-                          , colnames_to_extract
-                          #, LigDist_colname # from globals used
-                          #, ppi2Dist_colname #from globals used 
-                          #, naDist_colname #from globals used
-                          , snp_colname             = "mutationinformation"
-                          , aa_pos_colname          = "position"
-                          , mut_colname             = "mutation"
-                          , mut_info_colname        = "dst_mode"
-                          , mut_info_label_colname  = "mutation_info_labels"
-                          , categ_cols_to_factor){
+                             , gene              # from globals
+                             , colnames_to_extract
+                             #, LigDist_colname # from globals used
+                             #, ppi2Dist_colname #from globals used 
+                             #, naDist_colname #from globals used
+                             , snp_colname             = "mutationinformation"
+                             , aa_pos_colname          = "position"
+                             , mut_colname             = "mutation"
+                             , mut_info_colname        = "dst_mode"
+                             , mut_info_label_colname  = "mutation_info_labels"
+                             , categ_cols_to_factor){
   
   df = as.data.frame(df)
   df$maf2 = log10(df$maf) # can't see otherwise
   sum(is.na(df$maf2))
   
   # Initialise the required dfs based on gene name
+  #geneL_normal  = c("pnca")
+  #geneL_na      = c("gid", "rpob")
+  #geneL_ppi2    = c("alr", "embb", "katg", "rpob")
+  
+  #ADDED: IMPORTANT: rpob is kept in its own list (geneL_both) so that both mcsm-ppi2 and mcsm-na data are returned
   geneL_normal  = c("pnca")
-  geneL_na      = c("gid", "rpob")
-  geneL_ppi2    = c("alr", "embb", "katg", "rpob")
+  geneL_both    = c("rpob")
+  geneL_ppi2    = c("alr", "embb", "katg")
+  geneL_na      = c("gid")
   
   # common_dfs
   common_dfsL     = list(
-      wf_duet     = data.frame()
+    wf_duet     = data.frame()
     , lf_duet     = data.frame()
     , wf_mcsm_lig = data.frame()
     , lf_mcsm_lig = data.frame()
@@ -58,15 +64,7 @@ dm_om_wf_lf_data <- function(df
   if (tolower(gene)%in%geneL_normal){
     wf_lf_dataL   = common_dfsL
   }
-
- if (tolower(gene)%in%geneL_na){
-    additional_dfL = list(
-      wf_mcsm_na   = data.frame()
-      , lf_mcsm_na = data.frame()
-    )
-    wf_lf_dataL    = c(common_dfsL, additional_dfL)
-  }
-
+  
   if (tolower(gene)%in%geneL_ppi2){
     additional_dfL   = list(
       wf_mcsm_ppi2   = data.frame()
@@ -74,6 +72,25 @@ dm_om_wf_lf_data <- function(df
     )
     wf_lf_dataL      = c(common_dfsL, additional_dfL)
   }
+  
+  if (tolower(gene)%in%geneL_na){
+    additional_dfL = list(
+      wf_mcsm_na   = data.frame()
+      , lf_mcsm_na = data.frame()
+    )
+    wf_lf_dataL    = c(common_dfsL, additional_dfL)
+  }
+  
+  if (tolower(gene)%in%geneL_both){
+    additional_dfL = list(
+      wf_mcsm_ppi2   = data.frame(),
+      lf_mcsm_ppi2 = data.frame(),
+      wf_mcsm_na   = data.frame(),
+      lf_mcsm_na = data.frame()
+    )
+    wf_lf_dataL      = c(common_dfsL, additional_dfL)
+  }
+  
   cat("\nInitializing an empty list of length:"
       , length(wf_lf_dataL))
   
@@ -109,7 +126,7 @@ dm_om_wf_lf_data <- function(df
   
   ppi2_dist_dn = paste0("PPI Dist(", angstroms_symbol, ")"); ppi2_dist_dn
   mcsm_ppi2_dn = paste0("mCSM-PPI2 ", stability_suffix); mcsm_ppi2_dn
-
+  
   #=======================================================================
   if(missing(categ_cols_to_factor)){
     categ_cols_to_factor = grep( "_outcome|_info", colnames(df) )
@@ -130,7 +147,7 @@ dm_om_wf_lf_data <- function(df
   }
   
   cat("\ncols changed to factor are:\n", colnames(df)[categ_cols_to_factor] )
-
+  
   #=======================================================================
   if (missing(colnames_to_extract)){
     # NOTE: these vars are from globals
@@ -155,7 +172,7 @@ dm_om_wf_lf_data <- function(df
                         , "mmcsm_lig"             , "mmcsm_lig_scaled"      , "mmcsm_lig_outcome"
                         , "ligand_affinity_change", "affinity_scaled"       , "ligand_outcome"     , LigDist_colname
     )
-
+    
     display_common_colnames = c(snp_colname
                                 , mut_colname            , "dst_mode"          , mut_info_label_colname
                                 , aa_pos_colname
@@ -180,7 +197,7 @@ dm_om_wf_lf_data <- function(df
     }else{
       stop("Abort: Length mismatch: b/w ncols to extract and disply name")
     }
-  
+    
     # ordering is important!
     # static_cols_end = c(lig_dist_dn
     #                     , "ASA"
@@ -201,10 +218,10 @@ dm_om_wf_lf_data <- function(df
       # Rename cols: display names
       colnames(comb_df_sl) = display_colnames
       #colnames(comb_df)[colnames(comb_df)%in%colnames_to_extract] <- display_colnames
-
+      
       static_cols_end =  static_cols_end_common
       cat("\nend colnames for gene:", static_cols_end)
-      }
+    }
     
     if (tolower(gene)%in%geneL_ppi2){
       colnames_to_extract = c(common_colnames, "mcsm_ppi2_affinity"       ,"mcsm_ppi2_scaled" , "mcsm_ppi2_outcome"  , ppi2Dist_colname)
@@ -219,7 +236,7 @@ dm_om_wf_lf_data <- function(df
       # ordering is important!
       static_cols_end = c(ppi2_dist_dn, static_cols_end_common)
       cat("\nend colnames for gene:", static_cols_end)
-      }
+    }
     
     if (tolower(gene)%in%geneL_na){
       colnames_to_extract = c(common_colnames     ,"mcsm_na_affinity"     , "mcsm_na_scaled"  , "mcsm_na_outcome"   , naDist_colname)
@@ -237,543 +254,575 @@ dm_om_wf_lf_data <- function(df
       
     }
     
+    if (tolower(gene)%in%geneL_both){
+      colnames_to_extract = c(
+        common_colnames, 
+        "mcsm_ppi2_affinity" ,
+        "mcsm_ppi2_scaled" , 
+        "mcsm_ppi2_outcome"  , 
+        ppi2Dist_colname,
+        "mcsm_na_affinity"   , 
+        "mcsm_na_scaled"  , 
+        "mcsm_na_outcome"   , 
+        naDist_colname
+      )
+      display_colnames    = c(
+        display_common_colnames,
+        "mcsm_ppi2_affinity", 
+        mcsm_ppi2_dn, 
+        "mcsm_ppi2_outcome",
+        ppi2_dist_dn,
+        "mcsm_na_affinity",
+        mcsm_na_dn,
+        "mcsm_na_outcome",
+        na_dist_dn
+      )
+      comb_df_sl      = df[, colnames_to_extract]
+      colnames(comb_df_sl)   = display_colnames
+      comb_df_sl_ppi2 = comb_df_sl[comb_df_sl[[ppi2_dist_dn]]<DistCutOff,]
+      comb_df_sl_na   = comb_df_sl[comb_df_sl[[na_dist_dn]]<DistCutOff,]
+      static_cols_end = c(na_dist_dn, static_cols_end_common)
+      
+    }
+    
+    
     # Affinity filtered data: mcsm-lig: COMMON for all genes, mcsm-lig --> LigDist_colname
     comb_df_sl_lig = comb_df_sl[comb_df_sl[[lig_dist_dn]]<DistCutOff,]
     
   }
   
-#======================
-# Selecting dfs
-# with appropriate cols
-#=======================
-static_cols_start =  c(snp_colname
-                       , aa_pos_colname
-                       , mut_colname
-                       , mut_info_label_colname)
-
-# static_cols_end
-cat("\nEnd colnames for gene:", static_cols_end)
-
-#########################################################################
-#==============
-# Distance and genomics
-#==============
-# WF data: dist + genomics
-cols_to_select_dist_gen = c(static_cols_start, c("duet_outcome", duet_dn), static_cols_end)
-wf_dist_gen = comb_df_sl[, cols_to_select_dist_gen]; head(wf_dist_gen)
-
-#pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps
-pivot_cols_dist_gen = cols_to_select_dist_gen[1: (length(static_cols_start) + 1)]; pivot_cols_dist_gen
-expected_rows_lf = nrow(wf_dist_gen) * (length(wf_dist_gen) - length(pivot_cols_dist_gen))
-expected_rows_lf
-
-# LF dist and genomics
-lf_dist_gen = tidyr::gather(wf_dist_gen
-                        , key = param_type
-                        , value = param_value
-                        , all_of(duet_dn):tail(static_cols_end,1)
-                        , factor_key = TRUE)
-
-if (nrow(lf_dist_gen) == expected_rows_lf){
-  cat("\nPASS: long format data created for Distance and Genomics")
-}else{
-  cat("\nFAIL: long format data could not be created for Distance and Genomics")
-  quit()
-}
-
-# DROP duet cols
-drop_cols = c(duet_dn, "duet_outcome"); drop_cols
-table(lf_dist_gen$param_type)
-lf_dist_gen = lf_dist_gen[!lf_dist_gen$param_type%in%drop_cols,]
-lf_dist_gen$param_type = factor(lf_dist_gen$param_type)
-table(lf_dist_gen$param_type)
-
-# NEW columns [outcome and outcome colname]
-lf_dist_gen$outcome_colname = mut_info_colname
-lf_dist_gen$outcome         = lf_dist_gen[[mut_info_label_colname]]
-head(lf_dist_gen)
-
-wf_dist_gen = subset(wf_dist_gen, select = !(names(wf_dist_gen) %in% drop_cols))
-
-colnames(wf_dist_gen)
-colnames(lf_dist_gen)
-
-
-# Assign them to the output list
-wf_lf_dataL[['wf_dist_gen']] = wf_dist_gen
-wf_lf_dataL[['lf_dist_gen']] = lf_dist_gen
-##########################################################
-
-#==============
-# DUET
-#==============
-# WF data: duet
-cols_to_select_duet = c(static_cols_start,  c("duet_outcome", duet_dn), static_cols_end)
-wf_duet = comb_df_sl[, cols_to_select_duet]
-
-#pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps
-pivot_cols_duet = cols_to_select_duet[1: (length(static_cols_start) + 1)]; pivot_cols_duet
-expected_rows_lf = nrow(wf_duet) * (length(wf_duet) - length(pivot_cols_duet))
-expected_rows_lf
-
-# LF data: duet
-lf_duet = tidyr::gather(wf_duet
-                  , key = param_type
-                  , value = param_value
-                  , all_of(duet_dn):tail(static_cols_end,1)
-                  , factor_key = TRUE)
-
-if (nrow(lf_duet) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", duet_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-table(lf_duet$param_type)
-
-# NEW columns [outcome and outcome colname]
-lf_duet$outcome_colname = "duet_outcome"
-lf_duet$outcome         = lf_duet$duet_outcome
-
-# DROP static cols
-lf_duet  = lf_duet[!lf_duet$param_type%in%c(static_cols_end),]
-lf_duet$param_type = factor(lf_duet$param_type)
-table(lf_duet$param_type); colnames(lf_duet)
-
-# Assign them to the output list
-wf_lf_dataL[['wf_duet']] = wf_duet
-wf_lf_dataL[['lf_duet']] = lf_duet
-
-############################################################################
-#==============
-# FoldX
-#==============
-# WF data: Foldx
-cols_to_select_foldx= c(static_cols_start, c("foldx_outcome", foldx_dn), static_cols_end)
-wf_foldx = comb_df_sl[, cols_to_select_foldx]
-
-pivot_cols_foldx = cols_to_select_foldx[1: (length(static_cols_start) + 1)]; pivot_cols_foldx
-expected_rows_lf = nrow(wf_foldx) * (length(wf_foldx) - length(pivot_cols_foldx))
-expected_rows_lf
-
-# LF data: Foldx
-lf_foldx = gather(wf_foldx
-                 , key = param_type
-                 , value = param_value
-                 , all_of(foldx_dn):tail(static_cols_end,1)
-                 , factor_key = TRUE)
-
-if (nrow(lf_foldx) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", foldx_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-# NEW column
-lf_foldx$outcome_colname = "foldx_outcome"
-lf_foldx$outcome         = lf_foldx$foldx_outcome
-
-# DROP static cols
-lf_foldx  = lf_foldx[!lf_foldx$param_type%in%c(static_cols_end),]
-lf_foldx$param_type = factor(lf_foldx$param_type)
-table(lf_foldx$param_type); colnames(lf_foldx)
-
-# Assign them to the output list
-wf_lf_dataL[['wf_foldx']] = wf_foldx
-wf_lf_dataL[['lf_foldx']] = lf_foldx
-
-############################################################################
-#==============
-# Deepddg
-#==============
-# WF data: deepddg
-cols_to_select_deepddg  = c(static_cols_start, c("deepddg_outcome", deepddg_dn), static_cols_end)
-wf_deepddg = comb_df_sl[, cols_to_select_deepddg]
-
-pivot_cols_deepddg = cols_to_select_deepddg[1: (length(static_cols_start) + 1)]; pivot_cols_deepddg
-expected_rows_lf = nrow(wf_deepddg) * (length(wf_deepddg) - length(pivot_cols_deepddg))
-expected_rows_lf
-
-# LF data: Deepddg
-lf_deepddg = gather(wf_deepddg
-                  , key = param_type
-                  , value = param_value
-                  , all_of(deepddg_dn):tail(static_cols_end,1)
-                  , factor_key = TRUE)
-
-if (nrow(lf_deepddg) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", deepddg_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-# NEW columns [outcome and outcome colname]
-lf_deepddg$outcome_colname = "deepddg_outcome"
-lf_deepddg$outcome         = lf_deepddg$deepddg_outcome
-
-# DROP static cols
-lf_deepddg  = lf_deepddg[!lf_deepddg$param_type%in%c(static_cols_end),]
-lf_deepddg$param_type = factor(lf_deepddg$param_type)
-table(lf_deepddg$param_type); colnames(lf_deepddg)
-
-# Assign them to the output list
-wf_lf_dataL[['wf_deepddg']] = wf_deepddg
-wf_lf_dataL[['lf_deepddg']] = lf_deepddg
-############################################################################
-#==============
-# Dynamut2: LF
-#==============
-# WF data: dynamut2
-cols_to_select_dynamut2 = c(static_cols_start, c("ddg_dynamut2_outcome", dynamut2_dn), static_cols_end)
-wf_dynamut2 = comb_df_sl[, cols_to_select_dynamut2]
-
-pivot_cols_dynamut2 = cols_to_select_dynamut2[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut2
-expected_rows_lf = nrow(wf_dynamut2) * (length(wf_dynamut2) - length(pivot_cols_dynamut2))
-expected_rows_lf
-
-# LF data: dynamut2
-lf_dynamut2 = gather(wf_dynamut2
-                     , key = param_type
-                     , value = param_value
-                     , all_of(dynamut2_dn):tail(static_cols_end,1)
-                     , factor_key = TRUE)
-
-if (nrow(lf_dynamut2) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", dynamut2_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-# NEW columns [outcome and outcome colname]
-lf_dynamut2$outcome_colname = "ddg_dynamut2_outcome"
-lf_dynamut2$outcome         = lf_dynamut2$ddg_dynamut2_outcome
-
-# DROP static cols
-lf_dynamut2  = lf_dynamut2[!lf_dynamut2$param_type%in%c(static_cols_end),]
-lf_dynamut2$param_type = factor(lf_dynamut2$param_type)
-table(lf_dynamut2$param_type); colnames(lf_dynamut2)
-
-# Assign them to the output list
-wf_lf_dataL[['wf_dynamut2']] = wf_dynamut2
-wf_lf_dataL[['lf_dynamut2']] = lf_dynamut2
-
-######################################################################################
-#==================
-# Consurf: LF
-#https://consurf.tau.ac.il/overview.php
-# consurf_score:
-# <0 (below average): slowly evolving i.e CONSERVED
-# >0 (above average): rapidly evolving, i.e VARIABLE 
-#table(df$consurf_colour_rev)
-# TODO
-#1--> "most_variable", 2--> "", 3-->"",  4-->""
-#5-->"", 6-->"", 7-->"", 8-->"", 9-->"most_conserved"
-#====================
-# WF data: consurf
-cols_to_select_consurf = c(static_cols_start, c("consurf_outcome", consurf_dn), static_cols_end)
-wf_consurf = comb_df_sl[, cols_to_select_consurf]
-
-pivot_cols_consurf = cols_to_select_consurf[1: (length(static_cols_start) + 1)]; pivot_cols_consurf
-expected_rows_lf = nrow(wf_consurf) * (length(wf_consurf) - length(pivot_cols_consurf))
-expected_rows_lf
-
-# when outcome didn't exist
-#cols_to_select_consurf = c(static_cols_start, c(consurf_dn), static_cols_end)
-#wf_consurf = comb_df_sl[, cols_to_select_consurf]
-# 
-# pivot_cols_consurf = cols_to_select_consurf[1: (length(static_cols_start))]; pivot_cols_consurf
-# expected_rows_lf = nrow(wf_consurf) * (length(wf_consurf) - length(pivot_cols_consurf))
-# expected_rows_lf
-
-# LF data: consurf
-lf_consurf = gather(wf_consurf
-                    , key = param_type
-                    , value = param_value
-                    , all_of(consurf_dn):tail(static_cols_end,1)
-                    , factor_key = TRUE)
-
-if (nrow(lf_consurf) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", consurf_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-# NEW columns [outcome and outcome colname]
-lf_consurf$outcome_colname = "consurf_outcome"
-lf_consurf$outcome         = lf_consurf$consurf_outcome
-
-# DROP static cols
-lf_consurf  = lf_consurf[!lf_consurf$param_type%in%c(static_cols_end),]
-lf_consurf$param_type = factor(lf_consurf$param_type)
-table(lf_consurf$param_type); colnames(lf_consurf)
-
-# Assign them to the output list
-wf_lf_dataL[['wf_consurf']] = wf_consurf
-wf_lf_dataL[['lf_consurf']] = lf_consurf
-###########################################################################
-#==============
-# SNAP2: LF
-#==============
-# WF data: snap2
-cols_to_select_snap2 = c(static_cols_start, c("snap2_outcome", snap2_dn), static_cols_end)
-wf_snap2 = comb_df_sl[, cols_to_select_snap2]
-
-pivot_cols_snap2 = cols_to_select_snap2[1: (length(static_cols_start) + 1)]; pivot_cols_snap2
-expected_rows_lf = nrow(wf_snap2) * (length(wf_snap2) - length(pivot_cols_snap2))
-expected_rows_lf
-
-# LF data: snap2
-lf_snap2 = gather(wf_snap2
-                  , key = param_type
-                  , value = param_value
-                  , all_of(snap2_dn):tail(static_cols_end,1)
-                  , factor_key = TRUE)
-
-if (nrow(lf_snap2) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", snap2_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-# NEW columns [outcome and outcome colname]
-lf_snap2$outcome_colname = "snap2_outcome"
-lf_snap2$outcome         = lf_snap2$snap2_outcome
-
-# DROP static cols
-lf_snap2  = lf_snap2[!lf_snap2$param_type%in%c(static_cols_end),]
-lf_snap2$param_type = factor(lf_snap2$param_type)
-table(lf_snap2$param_type); colnames(lf_snap2)
-
-# Assign them to the output list
-wf_lf_dataL[['wf_snap2']] = wf_snap2
-wf_lf_dataL[['lf_snap2']] = lf_snap2
-
-#==============
-# Provean2: LF
-#==============
-# WF data: provean
-cols_to_select_provean = c(static_cols_start, c("provean_outcome", provean_dn), static_cols_end)
-wf_provean = comb_df_sl[, cols_to_select_provean]
-
-pivot_cols_provean = cols_to_select_provean[1: (length(static_cols_start) + 1)]; pivot_cols_provean
-expected_rows_lf = nrow(wf_provean) * (length(wf_provean) - length(pivot_cols_provean))
-expected_rows_lf
-
-# LF data: provean
-lf_provean = gather(wf_provean
-                    , key = param_type
-                    , value = param_value
-                    , all_of(provean_dn):tail(static_cols_end,1)
-                    , factor_key = TRUE)
-
-if (nrow(lf_provean) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", provean_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-# NEW columns [outcome and outcome colname]
-lf_provean$outcome_colname = "provean_outcome"
-lf_provean$outcome         = lf_provean$provean_outcome
-
-# DROP static cols
-lf_provean  = lf_provean[!lf_provean$param_type%in%c(static_cols_end),]
-lf_provean$param_type = factor(lf_provean$param_type)
-table(lf_provean$param_type); colnames(lf_provean)
-
-# Assign them to the output list
-wf_lf_dataL[['wf_provean']] = wf_provean
-wf_lf_dataL[['lf_provean']] = lf_provean
-
-
-###########################################################################
-# AFFINITY cols
-###########################################################################
-#=========================
-# mCSM-lig:
-# data filtered by cut off
-#=========================
-#---------------------
-# mCSM-lig: WF and lF
-#----------------------
-# WF data: mcsm_lig
-cols_to_select_mcsm_lig = c(static_cols_start,  c("ligand_outcome", mcsm_lig_dn), static_cols_end)
-wf_mcsm_lig = comb_df_sl_lig[, cols_to_select_mcsm_lig] # filtered df
-
-pivot_cols_mcsm_lig = cols_to_select_mcsm_lig[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_lig
-expected_rows_lf = nrow(wf_mcsm_lig) * (length(wf_mcsm_lig) - length(pivot_cols_mcsm_lig))
-expected_rows_lf
-
-# LF data: mcsm_lig
-lf_mcsm_lig = gather(wf_mcsm_lig
-                     , key = param_type
-                     , value = param_value
-                     , all_of(mcsm_lig_dn):tail(static_cols_end,1)
-                     , factor_key = TRUE)
-
-if (nrow(lf_mcsm_lig) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", mcsm_lig_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for mcsm_lig")
-  quit()
-}
-
-# NEW columns [outcome and outcome colname]
-lf_mcsm_lig$outcome_colname = "ligand_outcome"
-lf_mcsm_lig$outcome         = lf_mcsm_lig$ligand_outcome
-
-# DROP static cols
-lf_mcsm_lig  = lf_mcsm_lig[!lf_mcsm_lig$param_type%in%c(static_cols_end),]
-lf_mcsm_lig$param_type = factor(lf_mcsm_lig$param_type)
-table(lf_mcsm_lig$param_type); colnames(lf_mcsm_lig)
-
-# Assign them to the output list
-wf_lf_dataL[['wf_mcsm_lig']] = wf_mcsm_lig
-wf_lf_dataL[['lf_mcsm_lig']] = lf_mcsm_lig
-
-#=========================
-# mmCSM-lig2:
-# data filtered by cut off
-#=========================
-#---------------------
-# mmCSM-lig2: WF and lF
-#----------------------
-# WF data: mmcsm_lig2
-cols_to_select_mmcsm_lig2 = c(static_cols_start,  c("mmcsm_lig_outcome", mmcsm_lig_dn2), static_cols_end)
-wf_mmcsm_lig2 = comb_df_sl_lig[, cols_to_select_mmcsm_lig2] # filtered df
-
-pivot_cols_mmcsm_lig2 = cols_to_select_mmcsm_lig2[1: (length(static_cols_start) + 1)]; pivot_cols_mmcsm_lig2
-expected_rows_lf = nrow(wf_mmcsm_lig2) * (length(wf_mmcsm_lig2) - length(pivot_cols_mmcsm_lig2))
-expected_rows_lf
-
-# LF data: mmcsm_lig2
-lf_mmcsm_lig2 = gather(wf_mmcsm_lig2
-                       , key = param_type
-                       , value = param_value
-                       , all_of(mmcsm_lig_dn2):tail(static_cols_end,1)
-                       , factor_key = TRUE)
-
-if (nrow(lf_mmcsm_lig2) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", mmcsm_lig_dn2)
-}else{
-  cat("\nFAIL: long format data could not be created for mmcsm_lig2")
-  quit()
-}
-
-# NEW columns [outcome and outcome colname]
-lf_mmcsm_lig2$outcome_colname = "mmcsm_lig_outcome"
-lf_mmcsm_lig2$outcome         = lf_mmcsm_lig2$mmcsm_lig_outcome
-
-# DROP static cols
-lf_mmcsm_lig2  = lf_mmcsm_lig2[!lf_mmcsm_lig2$param_type%in%c(static_cols_end),]
-lf_mmcsm_lig2$param_type = factor(lf_mmcsm_lig2$param_type)
-table(lf_mmcsm_lig2$param_type); colnames(lf_mmcsm_lig2)
-
-# Assign them to the output list
-wf_lf_dataL[['wf_mmcsm_lig2']] = wf_mmcsm_lig2
-wf_lf_dataL[['lf_mmcsm_lig2']] = lf_mmcsm_lig2
-
-#=========================
-# mcsm-ppi2 affinity
-# data filtered by cut off
-#========================
-if (tolower(gene)%in%geneL_ppi2){
-  #-----------------
-  # mCSM-PPI2: WF and lF
-  #-----------------
-  # WF data: mcsm-ppi2
-  cols_to_select_mcsm_ppi2 = c(static_cols_start, c("mcsm_ppi2_outcome", mcsm_ppi2_dn), static_cols_end)
-  #wf_mcsm_ppi2 = comb_df_sl[, cols_to_select_mcsm_ppi2]
-  wf_mcsm_ppi2 = comb_df_sl_ppi2[, cols_to_select_mcsm_ppi2]
+  #======================
+  # Selecting dfs
+  # with appropriate cols
+  #=======================
+  static_cols_start =  c(snp_colname
+                         , aa_pos_colname
+                         , mut_colname
+                         , mut_info_label_colname)
   
-  pivot_cols_mcsm_ppi2 = cols_to_select_mcsm_ppi2[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_ppi2
-  expected_rows_lf = nrow(wf_mcsm_ppi2) * (length(wf_mcsm_ppi2) - length(pivot_cols_mcsm_ppi2))
+  # static_cols_end
+  cat("\nEnd colnames for gene:", static_cols_end)
+  
+  #########################################################################
+  #==============
+  # Distance and genomics
+  #==============
+  # WF data: dist + genomics
+  cols_to_select_dist_gen = c(static_cols_start, c("duet_outcome", duet_dn), static_cols_end)
+  wf_dist_gen = comb_df_sl[, cols_to_select_dist_gen]; head(wf_dist_gen)
+  
+  #pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps
+  pivot_cols_dist_gen = cols_to_select_dist_gen[1: (length(static_cols_start) + 1)]; pivot_cols_dist_gen
+  expected_rows_lf = nrow(wf_dist_gen) * (length(wf_dist_gen) - length(pivot_cols_dist_gen))
   expected_rows_lf
   
-  # LF data: mcsm-ppi2
-  lf_mcsm_ppi2 = gather(wf_mcsm_ppi2
-                        , key = param_type
-                        , value = param_value
-                        , all_of(mcsm_ppi2_dn):tail(static_cols_end,1)
-                        , factor_key = TRUE)
+  # LF dist and genomics
+  lf_dist_gen = tidyr::gather(wf_dist_gen
+                              , key = param_type
+                              , value = param_value
+                              , all_of(duet_dn):tail(static_cols_end,1)
+                              , factor_key = TRUE)
   
-  if (nrow(lf_mcsm_ppi2) == expected_rows_lf){
-    cat("\nPASS: long format data created for ", mcsm_ppi2_dn)
+  if (nrow(lf_dist_gen) == expected_rows_lf){
+    cat("\nPASS: long format data created for Distance and Genomics")
+  }else{
+    cat("\nFAIL: long format data could not be created for Distance and Genomics")
+    quit()
+  }
+  
+  # DROP duet cols
+  drop_cols = c(duet_dn, "duet_outcome"); drop_cols
+  table(lf_dist_gen$param_type)
+  lf_dist_gen = lf_dist_gen[!lf_dist_gen$param_type%in%drop_cols,]
+  lf_dist_gen$param_type = factor(lf_dist_gen$param_type)
+  table(lf_dist_gen$param_type)
+  
+  # NEW columns [outcome and outcome colname]
+  lf_dist_gen$outcome_colname = mut_info_colname
+  lf_dist_gen$outcome         = lf_dist_gen[[mut_info_label_colname]]
+  head(lf_dist_gen)
+  
+  wf_dist_gen = subset(wf_dist_gen, select = !(names(wf_dist_gen) %in% drop_cols))
+  
+  colnames(wf_dist_gen)
+  colnames(lf_dist_gen)
+  
+  
+  # Assign them to the output list
+  wf_lf_dataL[['wf_dist_gen']] = wf_dist_gen
+  wf_lf_dataL[['lf_dist_gen']] = lf_dist_gen
+  ##########################################################
+  
+  #==============
+  # DUET
+  #==============
+  # WF data: duet
+  cols_to_select_duet = c(static_cols_start,  c("duet_outcome", duet_dn), static_cols_end)
+  wf_duet = comb_df_sl[, cols_to_select_duet]
+  
+  #pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps
+  pivot_cols_duet = cols_to_select_duet[1: (length(static_cols_start) + 1)]; pivot_cols_duet
+  expected_rows_lf = nrow(wf_duet) * (length(wf_duet) - length(pivot_cols_duet))
+  expected_rows_lf
+  
+  # LF data: duet
+  lf_duet = tidyr::gather(wf_duet
+                          , key = param_type
+                          , value = param_value
+                          , all_of(duet_dn):tail(static_cols_end,1)
+                          , factor_key = TRUE)
+  
+  if (nrow(lf_duet) == expected_rows_lf){
+    cat("\nPASS: long format data created for ", duet_dn)
   }else{
     cat("\nFAIL: long format data could not be created for duet")
     quit()
   }
   
+  table(lf_duet$param_type)
+  
   # NEW columns [outcome and outcome colname]
-  lf_mcsm_ppi2$outcome_colname = "mcsm_ppi2_outcome"
-  lf_mcsm_ppi2$outcome         = lf_mcsm_ppi2$mcsm_ppi2_outcome
+  lf_duet$outcome_colname = "duet_outcome"
+  lf_duet$outcome         = lf_duet$duet_outcome
   
   # DROP static cols
-  lf_mcsm_ppi2  = lf_mcsm_ppi2[!lf_mcsm_ppi2$param_type%in%c(static_cols_end),]
-  lf_mcsm_ppi2$param_type = factor(lf_mcsm_ppi2$param_type)
-  table(lf_mcsm_ppi2$param_type); colnames(lf_mcsm_ppi2)
+  lf_duet  = lf_duet[!lf_duet$param_type%in%c(static_cols_end),]
+  lf_duet$param_type = factor(lf_duet$param_type)
+  table(lf_duet$param_type); colnames(lf_duet)
   
   # Assign them to the output list
-  wf_lf_dataL[['wf_mcsm_ppi2']] = wf_mcsm_ppi2
-  wf_lf_dataL[['lf_mcsm_ppi2']] = lf_mcsm_ppi2
+  wf_lf_dataL[['wf_duet']] = wf_duet
+  wf_lf_dataL[['lf_duet']] = lf_duet
   
-}
-
-
-
-#====================
-# mcsm-NA affinity
-# data filtered by cut off
-#====================
-if (tolower(gene)%in%geneL_na){
-  #---------------
-  # mCSM-NA: WF and lF
-  #-----------------
-  # WF data: mcsm-na
-  cols_to_select_mcsm_na = c(static_cols_start, c("mcsm_na_outcome", mcsm_na_dn), static_cols_end)
-  #wf_mcsm_na = comb_df_sl[, cols_to_select_mcsm_na]
-  wf_mcsm_na = comb_df_sl_na[, cols_to_select_mcsm_na]
+  ############################################################################
+  #==============
+  # FoldX
+  #==============
+  # WF data: Foldx
+  cols_to_select_foldx= c(static_cols_start, c("foldx_outcome", foldx_dn), static_cols_end)
+  wf_foldx = comb_df_sl[, cols_to_select_foldx]
   
-  pivot_cols_mcsm_na = cols_to_select_mcsm_na[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_na
-  expected_rows_lf = nrow(wf_mcsm_na) * (length(wf_mcsm_na) - length(pivot_cols_mcsm_na))
+  pivot_cols_foldx = cols_to_select_foldx[1: (length(static_cols_start) + 1)]; pivot_cols_foldx
+  expected_rows_lf = nrow(wf_foldx) * (length(wf_foldx) - length(pivot_cols_foldx))
   expected_rows_lf
   
-  # LF data: mcsm-na
-  lf_mcsm_na = gather(wf_mcsm_na
+  # LF data: Foldx
+  lf_foldx = gather(wf_foldx
+                    , key = param_type
+                    , value = param_value
+                    , all_of(foldx_dn):tail(static_cols_end,1)
+                    , factor_key = TRUE)
+  
+  if (nrow(lf_foldx) == expected_rows_lf){
+    cat("\nPASS: long format data created for ", foldx_dn)
+  }else{
+    cat("\nFAIL: long format data could not be created for duet")
+    quit()
+  }
+  
+  # NEW column
+  lf_foldx$outcome_colname = "foldx_outcome"
+  lf_foldx$outcome         = lf_foldx$foldx_outcome
+  
+  # DROP static cols
+  lf_foldx  = lf_foldx[!lf_foldx$param_type%in%c(static_cols_end),]
+  lf_foldx$param_type = factor(lf_foldx$param_type)
+  table(lf_foldx$param_type); colnames(lf_foldx)
+  
+  # Assign them to the output list
+  wf_lf_dataL[['wf_foldx']] = wf_foldx
+  wf_lf_dataL[['lf_foldx']] = lf_foldx
+  
+  ############################################################################
+  #==============
+  # Deepddg
+  #==============
+  # WF data: deepddg
+  cols_to_select_deepddg  = c(static_cols_start, c("deepddg_outcome", deepddg_dn), static_cols_end)
+  wf_deepddg = comb_df_sl[, cols_to_select_deepddg]
+  
+  pivot_cols_deepddg = cols_to_select_deepddg[1: (length(static_cols_start) + 1)]; pivot_cols_deepddg
+  expected_rows_lf = nrow(wf_deepddg) * (length(wf_deepddg) - length(pivot_cols_deepddg))
+  expected_rows_lf
+  
+  # LF data: Deepddg
+  lf_deepddg = gather(wf_deepddg
                       , key = param_type
                       , value = param_value
-                      , all_of(mcsm_na_dn):tail(static_cols_end,1)
+                      , all_of(deepddg_dn):tail(static_cols_end,1)
                       , factor_key = TRUE)
   
-  if (nrow(lf_mcsm_na) == expected_rows_lf){
-    cat("\nPASS: long format data created for ", mcsm_na_dn)
+  if (nrow(lf_deepddg) == expected_rows_lf){
+    cat("\nPASS: long format data created for ", deepddg_dn)
   }else{
     cat("\nFAIL: long format data could not be created for duet")
     quit()
   }
   
   # NEW columns [outcome and outcome colname]
-  lf_mcsm_na$outcome_colname = "mcsm_na_outcome"
-  lf_mcsm_na$outcome         = lf_mcsm_na$mcsm_na_outcome
+  lf_deepddg$outcome_colname = "deepddg_outcome"
+  lf_deepddg$outcome         = lf_deepddg$deepddg_outcome
   
   # DROP static cols
-  lf_mcsm_na  = lf_mcsm_na[!lf_mcsm_na$param_type%in%c(static_cols_end),]
-  lf_mcsm_na$param_type = factor(lf_mcsm_na$param_type)
-  table(lf_mcsm_na$param_type); colnames(lf_mcsm_na)
+  lf_deepddg  = lf_deepddg[!lf_deepddg$param_type%in%c(static_cols_end),]
+  lf_deepddg$param_type = factor(lf_deepddg$param_type)
+  table(lf_deepddg$param_type); colnames(lf_deepddg)
   
   # Assign them to the output list
-  wf_lf_dataL[['wf_mcsm_na']] = wf_mcsm_na
-  wf_lf_dataL[['lf_mcsm_na']] = lf_mcsm_na
-
-}
-
-return(wf_lf_dataL)
+  wf_lf_dataL[['wf_deepddg']] = wf_deepddg
+  wf_lf_dataL[['lf_deepddg']] = lf_deepddg
+  ############################################################################
+  #==============
+  # Dynamut2: LF
+  #==============
+  # WF data: dynamut2
+  cols_to_select_dynamut2 = c(static_cols_start, c("ddg_dynamut2_outcome", dynamut2_dn), static_cols_end)
+  wf_dynamut2 = comb_df_sl[, cols_to_select_dynamut2]
+  
+  pivot_cols_dynamut2 = cols_to_select_dynamut2[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut2
+  expected_rows_lf = nrow(wf_dynamut2) * (length(wf_dynamut2) - length(pivot_cols_dynamut2))
+  expected_rows_lf
+  
+  # LF data: dynamut2
+  lf_dynamut2 = gather(wf_dynamut2
+                       , key = param_type
+                       , value = param_value
+                       , all_of(dynamut2_dn):tail(static_cols_end,1)
+                       , factor_key = TRUE)
+  
+  if (nrow(lf_dynamut2) == expected_rows_lf){ # row-count sanity check on the gather() result
+    cat("\nPASS: long format data created for ", dynamut2_dn)
+  }else{
+    cat("\nFAIL: long format data could not be created for dynamut2")
+    stop("long format row count mismatch for dynamut2", call. = FALSE) # quit() would exit with status 0
+  }
+  
+  # NEW columns [outcome and outcome colname]
+  lf_dynamut2$outcome_colname = "ddg_dynamut2_outcome"
+  lf_dynamut2$outcome         = lf_dynamut2$ddg_dynamut2_outcome
+  
+  # DROP static cols
+  lf_dynamut2  = lf_dynamut2[!lf_dynamut2$param_type%in%c(static_cols_end),]
+  lf_dynamut2$param_type = factor(lf_dynamut2$param_type)
+  table(lf_dynamut2$param_type); colnames(lf_dynamut2)
+  
+  # Assign them to the output list
+  wf_lf_dataL[['wf_dynamut2']] = wf_dynamut2
+  wf_lf_dataL[['lf_dynamut2']] = lf_dynamut2
+  
+  ######################################################################################
+  #==================
+  # Consurf: LF
+  #https://consurf.tau.ac.il/overview.php
+  # consurf_score:
+  # <0 (below average): slowly evolving i.e CONSERVED
+  # >0 (above average): rapidly evolving, i.e VARIABLE 
+  #table(df$consurf_colour_rev)
+  # TODO
+  #1--> "most_variable", 2--> "", 3-->"",  4-->""
+  #5-->"", 6-->"", 7-->"", 8-->"", 9-->"most_conserved"
+  #====================
+  # WF data: consurf
+  cols_to_select_consurf = c(static_cols_start, c("consurf_outcome", consurf_dn), static_cols_end)
+  wf_consurf = comb_df_sl[, cols_to_select_consurf]
+  
+  pivot_cols_consurf = cols_to_select_consurf[1: (length(static_cols_start) + 1)]; pivot_cols_consurf
+  expected_rows_lf = nrow(wf_consurf) * (length(wf_consurf) - length(pivot_cols_consurf))
+  expected_rows_lf
+  
+  # when outcome didn't exist
+  #cols_to_select_consurf = c(static_cols_start, c(consurf_dn), static_cols_end)
+  #wf_consurf = comb_df_sl[, cols_to_select_consurf]
+  # 
+  # pivot_cols_consurf = cols_to_select_consurf[1: (length(static_cols_start))]; pivot_cols_consurf
+  # expected_rows_lf = nrow(wf_consurf) * (length(wf_consurf) - length(pivot_cols_consurf))
+  # expected_rows_lf
+  
+  # LF data: consurf
+  lf_consurf = gather(wf_consurf
+                      , key = param_type
+                      , value = param_value
+                      , all_of(consurf_dn):tail(static_cols_end,1)
+                      , factor_key = TRUE)
+  
+  if (nrow(lf_consurf) == expected_rows_lf){ # row-count sanity check on the gather() result
+    cat("\nPASS: long format data created for ", consurf_dn)
+  }else{
+    cat("\nFAIL: long format data could not be created for consurf")
+    stop("long format row count mismatch for consurf", call. = FALSE) # quit() would exit with status 0
+  }
+  
+  # NEW columns [outcome and outcome colname]
+  lf_consurf$outcome_colname = "consurf_outcome"
+  lf_consurf$outcome         = lf_consurf$consurf_outcome
+  
+  # DROP static cols
+  lf_consurf  = lf_consurf[!lf_consurf$param_type%in%c(static_cols_end),]
+  lf_consurf$param_type = factor(lf_consurf$param_type)
+  table(lf_consurf$param_type); colnames(lf_consurf)
+  
+  # Assign them to the output list
+  wf_lf_dataL[['wf_consurf']] = wf_consurf
+  wf_lf_dataL[['lf_consurf']] = lf_consurf
+  ###########################################################################
+  #==============
+  # SNAP2: LF
+  #==============
+  # WF data: snap2
+  cols_to_select_snap2 = c(static_cols_start, c("snap2_outcome", snap2_dn), static_cols_end)
+  wf_snap2 = comb_df_sl[, cols_to_select_snap2]
+  
+  pivot_cols_snap2 = cols_to_select_snap2[1: (length(static_cols_start) + 1)]; pivot_cols_snap2
+  expected_rows_lf = nrow(wf_snap2) * (length(wf_snap2) - length(pivot_cols_snap2))
+  expected_rows_lf
+  
+  # LF data: snap2
+  lf_snap2 = gather(wf_snap2
+                    , key = param_type
+                    , value = param_value
+                    , all_of(snap2_dn):tail(static_cols_end,1)
+                    , factor_key = TRUE)
+  
+  if (nrow(lf_snap2) == expected_rows_lf){ # row-count sanity check on the gather() result
+    cat("\nPASS: long format data created for ", snap2_dn)
+  }else{
+    cat("\nFAIL: long format data could not be created for snap2")
+    stop("long format row count mismatch for snap2", call. = FALSE) # quit() would exit with status 0
+  }
+  
+  # NEW columns [outcome and outcome colname]
+  lf_snap2$outcome_colname = "snap2_outcome"
+  lf_snap2$outcome         = lf_snap2$snap2_outcome
+  
+  # DROP static cols
+  lf_snap2  = lf_snap2[!lf_snap2$param_type%in%c(static_cols_end),]
+  lf_snap2$param_type = factor(lf_snap2$param_type)
+  table(lf_snap2$param_type); colnames(lf_snap2)
+  
+  # Assign them to the output list
+  wf_lf_dataL[['wf_snap2']] = wf_snap2
+  wf_lf_dataL[['lf_snap2']] = lf_snap2
+  
+  #==============
+  # Provean2: LF
+  #==============
+  # WF data: provean
+  cols_to_select_provean = c(static_cols_start, c("provean_outcome", provean_dn), static_cols_end)
+  wf_provean = comb_df_sl[, cols_to_select_provean]
+  
+  pivot_cols_provean = cols_to_select_provean[1: (length(static_cols_start) + 1)]; pivot_cols_provean
+  expected_rows_lf = nrow(wf_provean) * (length(wf_provean) - length(pivot_cols_provean))
+  expected_rows_lf
+  
+  # LF data: provean
+  lf_provean = gather(wf_provean
+                      , key = param_type
+                      , value = param_value
+                      , all_of(provean_dn):tail(static_cols_end,1)
+                      , factor_key = TRUE)
+  
+  if (nrow(lf_provean) == expected_rows_lf){ # row-count sanity check on the gather() result
+    cat("\nPASS: long format data created for ", provean_dn)
+  }else{
+    cat("\nFAIL: long format data could not be created for provean")
+    stop("long format row count mismatch for provean", call. = FALSE) # quit() would exit with status 0
+  }
+  
+  # NEW columns [outcome and outcome colname]
+  lf_provean$outcome_colname = "provean_outcome"
+  lf_provean$outcome         = lf_provean$provean_outcome
+  
+  # DROP static cols
+  lf_provean  = lf_provean[!lf_provean$param_type%in%c(static_cols_end),]
+  lf_provean$param_type = factor(lf_provean$param_type)
+  table(lf_provean$param_type); colnames(lf_provean)
+  
+  # Assign them to the output list
+  wf_lf_dataL[['wf_provean']] = wf_provean
+  wf_lf_dataL[['lf_provean']] = lf_provean
+  
+  
+  ###########################################################################
+  # AFFINITY cols
+  ###########################################################################
+  #=========================
+  # mCSM-lig:
+  # data filtered by cut off
+  #=========================
+  #---------------------
+  # mCSM-lig: WF and lF
+  #----------------------
+  # WF data: mcsm_lig
+  cols_to_select_mcsm_lig = c(static_cols_start,  c("ligand_outcome", mcsm_lig_dn), static_cols_end)
+  wf_mcsm_lig = comb_df_sl_lig[, cols_to_select_mcsm_lig] # filtered df
+  
+  pivot_cols_mcsm_lig = cols_to_select_mcsm_lig[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_lig
+  expected_rows_lf = nrow(wf_mcsm_lig) * (length(wf_mcsm_lig) - length(pivot_cols_mcsm_lig))
+  expected_rows_lf
+  
+  # LF data: mcsm_lig
+  lf_mcsm_lig = gather(wf_mcsm_lig
+                       , key = param_type
+                       , value = param_value
+                       , all_of(mcsm_lig_dn):tail(static_cols_end,1)
+                       , factor_key = TRUE)
+  
+  if (nrow(lf_mcsm_lig) == expected_rows_lf){ # row-count sanity check on the gather() result
+    cat("\nPASS: long format data created for ", mcsm_lig_dn)
+  }else{
+    cat("\nFAIL: long format data could not be created for mcsm_lig")
+    stop("long format row count mismatch for mcsm_lig", call. = FALSE) # quit() would exit with status 0
+  }
+  
+  # NEW columns [outcome and outcome colname]
+  lf_mcsm_lig$outcome_colname = "ligand_outcome"
+  lf_mcsm_lig$outcome         = lf_mcsm_lig$ligand_outcome
+  
+  # DROP static cols
+  lf_mcsm_lig  = lf_mcsm_lig[!lf_mcsm_lig$param_type%in%c(static_cols_end),]
+  lf_mcsm_lig$param_type = factor(lf_mcsm_lig$param_type)
+  table(lf_mcsm_lig$param_type); colnames(lf_mcsm_lig)
+  
+  # Assign them to the output list
+  wf_lf_dataL[['wf_mcsm_lig']] = wf_mcsm_lig
+  wf_lf_dataL[['lf_mcsm_lig']] = lf_mcsm_lig
+  
+  #=========================
+  # mmCSM-lig2:
+  # data filtered by cut off
+  #=========================
+  #---------------------
+  # mmCSM-lig2: WF and lF
+  #----------------------
+  # WF data: mmcsm_lig2
+  cols_to_select_mmcsm_lig2 = c(static_cols_start,  c("mmcsm_lig_outcome", mmcsm_lig_dn2), static_cols_end)
+  wf_mmcsm_lig2 = comb_df_sl_lig[, cols_to_select_mmcsm_lig2] # filtered df
+  
+  pivot_cols_mmcsm_lig2 = cols_to_select_mmcsm_lig2[1: (length(static_cols_start) + 1)]; pivot_cols_mmcsm_lig2
+  expected_rows_lf = nrow(wf_mmcsm_lig2) * (length(wf_mmcsm_lig2) - length(pivot_cols_mmcsm_lig2))
+  expected_rows_lf
+  
+  # LF data: mmcsm_lig2
+  lf_mmcsm_lig2 = gather(wf_mmcsm_lig2
+                         , key = param_type
+                         , value = param_value
+                         , all_of(mmcsm_lig_dn2):tail(static_cols_end,1)
+                         , factor_key = TRUE)
+  
+  if (nrow(lf_mmcsm_lig2) == expected_rows_lf){ # row-count sanity check on the gather() result
+    cat("\nPASS: long format data created for ", mmcsm_lig_dn2)
+  }else{
+    cat("\nFAIL: long format data could not be created for mmcsm_lig2")
+    stop("long format row count mismatch for mmcsm_lig2", call. = FALSE) # quit() would exit with status 0
+  }
+  
+  # NEW columns [outcome and outcome colname]
+  lf_mmcsm_lig2$outcome_colname = "mmcsm_lig_outcome"
+  lf_mmcsm_lig2$outcome         = lf_mmcsm_lig2$mmcsm_lig_outcome
+  
+  # DROP static cols
+  lf_mmcsm_lig2  = lf_mmcsm_lig2[!lf_mmcsm_lig2$param_type%in%c(static_cols_end),]
+  lf_mmcsm_lig2$param_type = factor(lf_mmcsm_lig2$param_type)
+  table(lf_mmcsm_lig2$param_type); colnames(lf_mmcsm_lig2)
+  
+  # Assign them to the output list
+  wf_lf_dataL[['wf_mmcsm_lig2']] = wf_mmcsm_lig2
+  wf_lf_dataL[['lf_mmcsm_lig2']] = lf_mmcsm_lig2
+  
+  #=========================
+  # mcsm-ppi2 affinity
+  # data filtered by cut off
+  #========================
+  if (tolower(gene)%in%geneL_ppi2){
+    #-----------------
+    # mCSM-PPI2: WF and lF
+    #-----------------
+    # WF data: mcsm-ppi2
+    cols_to_select_mcsm_ppi2 = c(static_cols_start, c("mcsm_ppi2_outcome", mcsm_ppi2_dn), static_cols_end)
+    #wf_mcsm_ppi2 = comb_df_sl[, cols_to_select_mcsm_ppi2]
+    wf_mcsm_ppi2 = comb_df_sl_ppi2[, cols_to_select_mcsm_ppi2]
+    
+    pivot_cols_mcsm_ppi2 = cols_to_select_mcsm_ppi2[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_ppi2
+    expected_rows_lf = nrow(wf_mcsm_ppi2) * (length(wf_mcsm_ppi2) - length(pivot_cols_mcsm_ppi2))
+    expected_rows_lf
+    
+    # LF data: mcsm-ppi2
+    lf_mcsm_ppi2 = gather(wf_mcsm_ppi2
+                          , key = param_type
+                          , value = param_value
+                          , all_of(mcsm_ppi2_dn):tail(static_cols_end,1)
+                          , factor_key = TRUE)
+    
+    if (nrow(lf_mcsm_ppi2) == expected_rows_lf){ # row-count sanity check on the gather() result
+      cat("\nPASS: long format data created for ", mcsm_ppi2_dn)
+    }else{
+      cat("\nFAIL: long format data could not be created for mcsm_ppi2")
+      stop("long format row count mismatch for mcsm_ppi2", call. = FALSE) # quit() would exit with status 0
+    }
+    
+    # NEW columns [outcome and outcome colname]
+    lf_mcsm_ppi2$outcome_colname = "mcsm_ppi2_outcome"
+    lf_mcsm_ppi2$outcome         = lf_mcsm_ppi2$mcsm_ppi2_outcome
+    
+    # DROP static cols
+    lf_mcsm_ppi2  = lf_mcsm_ppi2[!lf_mcsm_ppi2$param_type%in%c(static_cols_end),]
+    lf_mcsm_ppi2$param_type = factor(lf_mcsm_ppi2$param_type)
+    table(lf_mcsm_ppi2$param_type); colnames(lf_mcsm_ppi2)
+    
+    # Assign them to the output list
+    wf_lf_dataL[['wf_mcsm_ppi2']] = wf_mcsm_ppi2
+    wf_lf_dataL[['lf_mcsm_ppi2']] = lf_mcsm_ppi2
+    
+  }
+  
+  
+  
+  #====================
+  # mcsm-NA affinity
+  # data filtered by cut off
+  #====================
+  if (tolower(gene)%in%geneL_na){
+    #---------------
+    # mCSM-NA: WF and lF
+    #-----------------
+    # WF data: mcsm-na
+    cols_to_select_mcsm_na = c(static_cols_start, c("mcsm_na_outcome", mcsm_na_dn), static_cols_end)
+    #wf_mcsm_na = comb_df_sl[, cols_to_select_mcsm_na]
+    wf_mcsm_na = comb_df_sl_na[, cols_to_select_mcsm_na]
+    
+    pivot_cols_mcsm_na = cols_to_select_mcsm_na[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_na
+    expected_rows_lf = nrow(wf_mcsm_na) * (length(wf_mcsm_na) - length(pivot_cols_mcsm_na))
+    expected_rows_lf
+    
+    # LF data: mcsm-na
+    lf_mcsm_na = gather(wf_mcsm_na
+                        , key = param_type
+                        , value = param_value
+                        , all_of(mcsm_na_dn):tail(static_cols_end,1)
+                        , factor_key = TRUE)
+    
+    if (nrow(lf_mcsm_na) == expected_rows_lf){ # row-count sanity check on the gather() result
+      cat("\nPASS: long format data created for ", mcsm_na_dn)
+    }else{
+      cat("\nFAIL: long format data could not be created for mcsm_na")
+      stop("long format row count mismatch for mcsm_na", call. = FALSE) # quit() would exit with status 0
+    }
+    
+    # NEW columns [outcome and outcome colname]
+    lf_mcsm_na$outcome_colname = "mcsm_na_outcome"
+    lf_mcsm_na$outcome         = lf_mcsm_na$mcsm_na_outcome
+    
+    # DROP static cols
+    lf_mcsm_na  = lf_mcsm_na[!lf_mcsm_na$param_type%in%c(static_cols_end),]
+    lf_mcsm_na$param_type = factor(lf_mcsm_na$param_type)
+    table(lf_mcsm_na$param_type); colnames(lf_mcsm_na)
+    
+    # Assign them to the output list
+    wf_lf_dataL[['wf_mcsm_na']] = wf_mcsm_na
+    wf_lf_dataL[['lf_mcsm_na']] = lf_mcsm_na
+    
+  }
+  
+  return(wf_lf_dataL)
 }
 ############################################################################
diff --git a/scripts/functions/plotting_data.R b/scripts/functions/plotting_data.R
index 47c707d..ea17d82 100755
--- a/scripts/functions/plotting_data.R
+++ b/scripts/functions/plotting_data.R
@@ -12,20 +12,19 @@ geneL_na      = c("gid", "rpob")
 geneL_ppi2    = c("alr", "embb", "katg", "rpob")
 
 if (tolower(gene)%in%geneL_na){
-  
   infilename_nca = paste0("/home/tanu/git/Misc/mcsm_na_dist/"
-                        , tolower(gene), "_nca_distances.csv")
+                          , tolower(gene), "_nca_distances.csv")
 }
 #========================================================
 # plotting_data(): formatting data for plots
 # input args: 
- ## input csv file
- ## lig cut off dist, default = 10 Ang
+## input csv file
+## lig cut off dist, default = 10 Ang
 # output: list of 4 dfs, that need to be decompressed
-  ## my_df
-  ## my_df_u
-  ## my_df_u_lig
-  ## dup_muts
+## my_df
+## my_df_u
+## my_df_u_lig
+## dup_muts
 #========================================================
 #lig_dist_colname = 'ligand_distance' or global var LigDist_colname
 #lig_dist_cutoff  =  10 or global var LigDist_cutoff
@@ -34,80 +33,121 @@ plotting_data <- function(df
                           , gene # ADDED
                           , lig_dist_colname 
                           , lig_dist_cutoff) {
-my_df       = data.frame()
-my_df_u     = data.frame()
-my_df_u_lig = data.frame()
-dup_muts    = data.frame()
+  my_df       = data.frame()
+  my_df_u     = data.frame()
+  my_df_u_lig = data.frame()
+  dup_muts    = data.frame()
   
-#===========================
-# Read file: struct params 
-#===========================
-#df = read.csv(infile_params, header = T)
-
-cat("\nInput dimensions:", dim(df)) 
-
-#==================================
-# extract unique mutation entries
-#==================================
-
-# check for duplicate mutations
-if ( length(unique(df$mutationinformation)) != length(df$mutationinformation)){
-  cat(paste0("\nCAUTION:", " Duplicate mutations identified"
-             , "\nExtracting these...\n"))
-  #cat(my_df[duplicated(my_df$mutationinformation),])
-  dup_muts = df[duplicated(df$mutationinformation),]
-  dup_muts_nu = length(unique(dup_muts$mutationinformation))
-  cat(paste0("\nDim of duplicate mutation df:", nrow(dup_muts)
-             , "\nNo. of unique duplicate mutations:", dup_muts_nu
-             , "\n\nExtracting df with unique mutations only\n"))
-  my_df_u = df[!duplicated(df$mutationinformation),]
-}else{
-  cat(paste0("\nNo duplicate mutations detected\n"))
-  my_df_u = df
-}
-
-upos = unique(my_df_u$position)
-cat("\nDim of clean df:"); cat(dim(my_df_u), "\n")
-cat("\nNo. of unique mutational positions:"); cat(length(upos), "\n")
-#===============================================
-# ADD : na distance column for genes with nucleic acid affinity
-#===============================================
-#gid_na_distcol
-if (tolower(gene)%in%geneL_na){
-
-  distcol_nca_name = read.csv(infilename_nca, header = F)
-  head(distcol_nca_name)
-  colnames(distcol_nca_name) <- c("mutationinformation", "nca_distance")
-  head(distcol_nca_name)
-  class(distcol_nca_name)
-
-  mcol = colnames(distcol_nca_name)[colnames(distcol_nca_name)%in%colnames(my_df_u)]
-  mcol
-  head(my_df_u$mutationinformation)
-  head(distcol_nca_name$mutationinformation)
+  #===========================
+  # Read file: struct params 
+  #===========================
+  #df = read.csv(infile_params, header = T)
   
-  my_df_u = merge(my_df_u, distcol_nca_name, 
-                     by = "mutationinformation",
-                     all = T)
-
-} 
-#===============================================
-# extract mutations <10 Angstroms and symbol
-#===============================================
-table(my_df_u[[lig_dist_colname]] < lig_dist_cutoff)
-
-my_df_u_lig = my_df_u[my_df_u[[lig_dist_colname]] < lig_dist_cutoff,]
-
-cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within 10\u212b of the ligand\n"))
-
-# return list of DFs
-my_df = df
-#df_names = c("my_df", "my_df_u", "my_df_u_lig", "dup_muts")
-all_df = list(my_df, my_df_u, my_df_u_lig, dup_muts)
-#all_df = Map(setNames, all_df, df_names)
-
-return(all_df)
+  cat("\nInput dimensions:", dim(df)) 
+  
+  #==================================
+  # extract unique mutation entries
+  #==================================
+  
+  # check for duplicate mutations
+  if ( length(unique(df$mutationinformation)) != length(df$mutationinformation)){
+    cat(paste0("\nCAUTION:", " Duplicate mutations identified"
+               , "\nExtracting these...\n"))
+    #cat(my_df[duplicated(my_df$mutationinformation),])
+    dup_muts = df[duplicated(df$mutationinformation),]
+    dup_muts_nu = length(unique(dup_muts$mutationinformation))
+    cat(paste0("\nDim of duplicate mutation df:", nrow(dup_muts)
+               , "\nNo. of unique duplicate mutations:", dup_muts_nu
+               , "\n\nExtracting df with unique mutations only\n"))
+    my_df_u = df[!duplicated(df$mutationinformation),]
+  } else {
+    cat(paste0("\nNo duplicate mutations detected\n"))
+    my_df_u = df
+  }
+  
+  upos = unique(my_df_u$position)
+  cat("\nDim of clean df:"); cat(dim(my_df_u), "\n")
+  cat("\nNo. of unique mutational positions:"); cat(length(upos), "\n")
+  #===============================================
+  # ADD : na distance column for genes with nucleic acid affinity
+  #===============================================
+  # if (tolower(gene)%in%geneL_na){
+  # 
+  #   distcol_nca_name = read.csv(infilename_nca, header = F)
+  #   head(distcol_nca_name)
+  #   colnames(distcol_nca_name) <- c("mutationinformation", "nca_distance")
+  #   head(distcol_nca_name)
+  #   class(distcol_nca_name)
+  # 
+  #   mcol = colnames(distcol_nca_name)[colnames(distcol_nca_name)%in%colnames(my_df_u)]
+  #   mcol
+  #   head(my_df_u$mutationinformation)
+  #   head(distcol_nca_name$mutationinformation)
+  #   
+  #   my_df_u = merge(my_df_u, distcol_nca_name, 
+  #                      by = "mutationinformation",
+  #                      all = T)
+  # 
+  # } 
+  
+  if (tolower(gene)%in%geneL_na){
+    distcol_nca_name = read.csv(infilename_nca, header = FALSE)
+    # file is read without a header; its two columns are named below
+    if (tolower(gene)=='rpob'){
+      print('WARNING: running special-case handler for rpoB')
+      
+      # create 5uhc equivalent column for mutationinformation
+      my_df_u$X5uhc_mutationinformation = paste0(my_df_u$wild_type,
+                                                 my_df_u$X5uhc_position,
+                                                 my_df_u$mutant_type)
+      
+      colnames(distcol_nca_name) <- c("X5uhc_mutationinformation", "nca_distance")
+      
+      # report which column(s) the two data frames share before merging
+      mcol = colnames(distcol_nca_name)[colnames(distcol_nca_name)%in%colnames(my_df_u)]
+      cat(paste0("\nMerging for gene: ", tolower(gene), "\non column: ", mcol))
+      
+      # all = TRUE keeps rows found in only one of the two data frames
+      # (full outer join); their missing columns are filled with NA
+      
+      my_df_u = merge(my_df_u, distcol_nca_name, 
+                      by = "X5uhc_mutationinformation",
+                      all = TRUE)
+      
+    } else {
+      # default: the distance file is keyed directly on mutationinformation
+      colnames(distcol_nca_name) <- c("mutationinformation", "nca_distance")
+      
+      # report which column(s) the two data frames share before merging
+      mcol = colnames(distcol_nca_name)[colnames(distcol_nca_name)%in%colnames(my_df_u)]
+      cat(paste0("\nMerging for gene: ", tolower(gene), "\non column: ", mcol))
+      
+      # full outer join, as in the rpoB branch above
+      
+      my_df_u = merge(my_df_u, distcol_nca_name, 
+                      by = "mutationinformation",
+                      all = TRUE)
+    }
+  } 
+  
+  #===============================================
+  # extract mutations closer to the ligand than lig_dist_cutoff
+  #===============================================
+  # which() drops rows whose distance is NA (the outer merge above, for geneL_na
+  # genes, can add such rows); plain logical indexing would return all-NA rows
+  my_df_u_lig = my_df_u[which(my_df_u[[lig_dist_colname]] < lig_dist_cutoff),]
+  
+  cat(paste0("There are ", nrow(my_df_u_lig), " sites lying within ", lig_dist_cutoff, "\u212b of the ligand\n"))
+  
+  # return list of DFs
+  my_df = df
+  #df_names = c("my_df", "my_df_u", "my_df_u_lig", "dup_muts")
+  all_df = list(my_df, my_df_u, my_df_u_lig, dup_muts)
+  #all_df = Map(setNames, all_df, df_names)
+  
+  return(all_df)
 }
 ########################################################################
 #               end of data extraction and cleaning for plots          #
 ########################################################################
+
diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R
index f06f5d7..e4df5be 100644
--- a/scripts/plotting/get_plotting_dfs.R
+++ b/scripts/plotting/get_plotting_dfs.R
@@ -60,8 +60,8 @@ pd_df = plotting_data(mcsm_df
 my_df   = pd_df[[1]] 
 my_df_u = pd_df[[2]] # this forms one of the input for combining_dfs_plotting()
 
-max_ang <- round(max(my_df_u[LigDist_colname]))
-min_ang <- round(min(my_df_u[LigDist_colname]))
+max_ang <- round(max(my_df_u[[LigDist_colname]], na.rm = TRUE)) # na.rm: rows added by the outer merge in plotting_data() can have NA distances
+min_ang <- round(min(my_df_u[[LigDist_colname]], na.rm = TRUE))
 
 cat("\nLigand distance colname:", LigDist_colname
     , "\nThe max distance", gene, "structure df" , ":", max_ang, "\u212b"
@@ -128,6 +128,11 @@ geneL_normal  = c("pnca")
 geneL_na      = c("gid", "rpob")
 geneL_ppi2    = c("alr", "embb", "katg", "rpob")
 
+# geneL_normal  = c("pnca")
+# geneL_both    = c("rpob")
+# geneL_ppi2    = c("alr", "embb", "katg")
+# geneL_na      = c("gid")
+
 all_dm_om_df = dm_om_wf_lf_data(df = merged_df3, gene = gene)
 
 wf_duet      = all_dm_om_df[['wf_duet']]
@@ -158,15 +163,27 @@ lf_provean   = all_dm_om_df[['lf_provean']]
 wf_dist_gen   = all_dm_om_df[['wf_dist_gen']]
 lf_dist_gen   = all_dm_om_df[['lf_dist_gen']]
 
+# ppi2 genes
+if (tolower(gene)%in%geneL_ppi2){
+  wf_mcsm_ppi2 = all_dm_om_df[['wf_mcsm_ppi2']]
+  lf_mcsm_ppi2 = all_dm_om_df[['lf_mcsm_ppi2']]
+}
+
+# na genes
 if (tolower(gene)%in%geneL_na){
   wf_mcsm_na   = all_dm_om_df[['wf_mcsm_na']]
   lf_mcsm_na   = all_dm_om_df[['lf_mcsm_na']]
 }
 
-if (tolower(gene)%in%geneL_ppi2){
-  wf_mcsm_ppi2 = all_dm_om_df[['wf_mcsm_ppi2']]
-  lf_mcsm_ppi2 = all_dm_om_df[['lf_mcsm_ppi2']]
-}
+# both ppi2+na genes: NOT NEEDED here, as this is handled by the two ifs above
+# if (tolower(gene)%in%geneL_both){
+#   wf_mcsm_ppi2 = all_dm_om_df[['wf_mcsm_ppi2']]
+#   lf_mcsm_ppi2 = all_dm_om_df[['lf_mcsm_ppi2']]
+#   
+#   wf_mcsm_na   = all_dm_om_df[['wf_mcsm_na']]
+#   lf_mcsm_na   = all_dm_om_df[['lf_mcsm_na']]
+# }
+
 
 s2 = c("\nSuccessfully sourced other_plots_data.R")
 cat(s2)