From 05ab89ec09b6475da86ad28baeef9bcb75c29fb2 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Fri, 5 Aug 2022 14:36:02 +0100
Subject: [PATCH] git trimmed down the dm_om_data.R

---
 scripts/functions/combining_dfs_plotting.R    |  47 +-
 scripts/functions/dm_om_data.R                | 423 +++++-------------
 scripts/functions/plotting_globals.R          |   4 +
 scripts/plotting/get_plotting_dfs.R           |   2 +-
 .../plotting/plotting_thesis/preformatting.R  |  43 +-
 5 files changed, 168 insertions(+), 351 deletions(-)

diff --git a/scripts/functions/combining_dfs_plotting.R b/scripts/functions/combining_dfs_plotting.R
index 7354bba..ee9df5e 100644
--- a/scripts/functions/combining_dfs_plotting.R
+++ b/scripts/functions/combining_dfs_plotting.R
@@ -343,20 +343,45 @@ combining_dfs_plotting <- function(  my_df_u
         , "\nNo. of rows merged_df3: ", nrow(merged_df3))
     quit()
   }
-  #---------------------------------------------
-  # add columns that are needed to generate plots with revised colnames and strings
-  #----------------------------------------------
-  merged_df3['sensitivity'] = ifelse(merged_df3['dst_mode'] == 1, "R", "S")
-  merged_df3['mutation_info_labels'] = ifelse(merged_df3['mutation_info_labels'] == "DM", "R", "S")
+  #=========================================
+  # NEW: add consurf outcome
+  #=========================================
+  consurf_colOld = "consurf_colour_rev"
+  consurf_colNew = "consurf_outcome"
+  merged_df3[[consurf_colNew]] = merged_df3[[consurf_colOld]]
+  merged_df3[[consurf_colNew]] = as.factor(merged_df3[[consurf_colNew]])
+  merged_df3[[consurf_colNew]]
+  #levels(merged_df3$consurf_outcome) = c("nsd", 1, 2, 3, 4, 5, 6, 7, 8, 9)
 
-  merged_df2['sensitivity'] = ifelse(merged_df2['dst_mode'] == 1, "R", "S")
-  merged_df2['mutation_info_labels'] = ifelse(merged_df2['mutation_info_labels'] == "DM", "R", "S")
+  merged_df2[[consurf_colNew]] = merged_df2[[consurf_colOld]]
+  merged_df2[[consurf_colNew]] = as.factor(merged_df2[[consurf_colNew]])
+  merged_df2[[consurf_colNew]]
   
-  #check1 = all(table(merged_df3["mutation_info_labels"]) == table(merged_df3['sensitivity']))
-  #check2 = all(table(merged_df2["mutation_info_labels"]) == table(merged_df2['sensitivity']))
+  #=========================================
+  # NEW: fixed case for SNAP2 labels
+  #=========================================
+  snap2_colname = "snap2_outcome"
+  merged_df3[[snap2_colname]] <- str_replace(merged_df3[[snap2_colname]], "effect", "Effect")
+  merged_df3[[snap2_colname]] <- str_replace(merged_df3[[snap2_colname]], "neutral", "Neutral")
+  
+  merged_df2[[snap2_colname]] <- str_replace(merged_df2[[snap2_colname]], "effect", "Effect")
+  merged_df2[[snap2_colname]] <- str_replace(merged_df2[[snap2_colname]], "neutral", "Neutral")
+  
+  #---------------------------------------------
+  # NEW: add columns that are needed to generate
+  # plots with revised colnames and strings
+  #----------------------------------------------
+  merged_df3$sensitivity = ifelse(merged_df3$dst_mode == 1, "R", "S")
+  merged_df3$mutation_info_labels = ifelse(merged_df3$mutation_info_labels == "DM", "R", "S")
 
-  check1 = all(merged_df3["mutation_info_labels"] == merged_df3['sensitivity'])
-  check2 = all(merged_df2["mutation_info_labels"] == merged_df2['sensitivity'])
+  merged_df2$sensitivity = ifelse(merged_df2$dst_mode == 1, "R", "S")
+  merged_df2$mutation_info_labels = ifelse(merged_df2$mutation_info_labels == "DM", "R", "S")
+  
+  # for epistasis: dst2 = dst with NA filled from dst_mode (merged_df2 only; no equivalent in merged_df3)
+  merged_df2$dst2 = ifelse(is.na(merged_df2$dst), merged_df2$dst_mode, merged_df2$dst)
+  
+  check1 = all(merged_df3$mutation_info_labels == merged_df3$sensitivity)
+  check2 = all(merged_df2$mutation_info_labels == merged_df2$sensitivity)
                               
   if(check1 && check2){
     cat("PASS: merged_df3 and merged_df2 have mutation info labels as R and S" 
diff --git a/scripts/functions/dm_om_data.R b/scripts/functions/dm_om_data.R
index f80103f..21e4245 100644
--- a/scripts/functions/dm_om_data.R
+++ b/scripts/functions/dm_om_data.R
@@ -5,47 +5,16 @@
     # duet, mcsm-lig, foldx, deepddg, dynamut2, mcsm-na, mcsm-ppi2, encom, dynamut..etc
   # Called by get_plotting_dfs.R
 
-# dm_om_wf_lf_data()
-# INPUT: 
-    # df: merged_df3 (data with all parameters)
-      # NOTE*: merged_df2 will not be appropriate as it brings up most params as significant!?,atleast for gid
-    # gene: [conditional generation of dfs like mcsm-NA, mcsm-ppi2 as not all genes have all these values]
-    # colnames_to_extract     : columns to extract, either user-specified. 
-      #By default it is c("mutationinformation" , "duet_affinity_change...")
-    # ligand_dist_colname     : column name containing ligand distance. By deafult, it is LigDist_colname (imported from globals)
-    # dr_muts                 : dr_muts_col (imported from globals; dr_mutations_<drug>)
-    # other_muts              : other_muts_col (imported from globals ...other_mutations_<drug>)
-    # snp_colname             : SNP column name. By default it is "mutationinformation"
-    # aa_pos_colname          : Column name containing the aa position. This is used to sort the df by.
-    # mut_colname             : Column name containing snp info in format "<abc_pXXdef>. By default, it is "mutation"
-    # mut_info_colname        : Column name containing mutation info whether it is DM or OM. By default, it is "mutation_info"
-    # mut_info_label_colname  : Column containing pre-formatted labels for mutation info. 
-      # For my use case, this is called "mutation_info_labels"
-      # This column has short labels like DM and OM coresponding to dr_muts and other_muts.
-      # NOTE*: if this is left empty, then the arg ('dr_other_muts_labels') will be used
-    # dr_other_muts_labels    : User specified labels, must correspond to dr_muts and other_muts. 
-      # NOTE*: Only used if the arg (mut_info_label_colname) is empty!
-    # categ_cols_to_factor    : Column names to convert to factors. These mainly correspond to the outcome columns associated with the
-      # arg ('colnames_to_extract'). These have the suffix "_outcome" in their colnames. Additionally column 'mutation_info' is also 
-      # converted to factor. By default, it converts the cols with '_outcome'and 'info' to factor.
-      # Users are able to provide a vector of their corresponding column names
-
-# RETURNS: List
-    # WF nd LF data grouped by mutation_info i.e DM (drug mutations) and OM (other mutations)
-    
-# TO DO: SHINY
-#1) df to choose (merged_df3 or merged_df2)
-#2)
 ##################################################################
-DistCutOff = 10
-#LigDist_colname  # = "ligand_distance" # from globals 
-ppi2Dist_colname  = "interface_dist"
-naDist_colname    = "TBC"
+# from plotting_globals.R
+# DistCutOff, LigDist_colname, ppi2Dist_colname, naDist_colname 
 
 dm_om_wf_lf_data <- function(df
                           , gene_name               = gene # from globals
                           , colnames_to_extract
                           , ligand_dist_colname     = LigDist_colname # from globals
+                          #, ppi2Dist_colname #from globals used 
+                          #, naDist_colname #from globals used
                           , dr_muts                 = dr_muts_col # from globals
                           , other_muts              = other_muts_col # from globals
                           , snp_colname             = "mutationinformation"
@@ -53,28 +22,19 @@ dm_om_wf_lf_data <- function(df
                           , mut_colname             = "mutation"
                           , mut_info_colname        = "mutation_info"
                           , mut_info_label_colname  = "mutation_info_labels" # if empty, below used
-                          , dr_other_muts_labels    = c("DM", "OM") # only used if ^^ = ""
+                          #, dr_other_muts_labels    = c("DM", "OM") # only used if ^^ = ""
                           , categ_cols_to_factor){
   
   df = as.data.frame(df)
- 
-  df['sensitivity'] = ifelse(df['dst_mode'] == 1, "R", "S")
-  table(df['sensitivity'])
-  
-  df[[mut_info_label_colname]] = ifelse(df[[mut_info_label_colname]] == "DM", "R", "S")
-  table(df[[mut_info_label_colname]])
-  
-  
+
   # Initialise the required dfs based on gene name
   geneL_normal  = c("pnca")
-  #geneL_na_dy   = c("gid")
   geneL_na      = c("gid", "rpob")
-  geneL_dy      = c("gid")
   geneL_ppi2    = c("alr", "embb", "katg", "rpob")
   
   # common_dfs
   common_dfsL     = list(
-      wf_duet       = data.frame()
+      wf_duet     = data.frame()
     , lf_duet     = data.frame()
     , wf_mcsm_lig = data.frame()
     , lf_mcsm_lig = data.frame()
@@ -110,24 +70,6 @@ dm_om_wf_lf_data <- function(df
     )
     wf_lf_dataL      = c(common_dfsL, additional_dfL)
   }
-  
-  if (tolower(gene_name)%in%geneL_dy){
-    additional_dfL  = list( 
-      wf_mcsm_na    = data.frame()
-      , lf_mcsm_na  = data.frame()
-      , wf_dynamut  = data.frame()
-      , lf_dynamut  = data.frame()
-      , wf_encomddg = data.frame()
-      , lf_encomddg = data.frame()
-      , wf_encomdds = data.frame()
-      , lf_encomdds = data.frame()
-      , wf_sdm      = data.frame()
-      , lf_sdm      = data.frame()
-      , wf_mcsm     = data.frame()
-      , lf_mcsm     = data.frame() 
-    )
-    wf_lf_dataL     = c(common_dfsL, additional_dfL)
-  }
   cat("\nInitializing an empty list of length:"
       , length(wf_lf_dataL))
   
@@ -137,26 +79,21 @@ dm_om_wf_lf_data <- function(df
   colnames_to_extract = c(snp_colname
         , mut_colname, mut_info_colname, mut_info_label_colname
         , aa_pos_colname
-        ,  LigDist_colname
-        , ppi2Dist_colname, naDist_colname
+        , LigDist_colname  # from globals
+        , ppi2Dist_colname # from globals
+        , naDist_colname   # from globals
         , "duet_stability_change" , "duet_scaled"        , "duet_outcome"
         , "ligand_affinity_change", "affinity_scaled"    , "ligand_outcome"
         , "ddg_foldx"             , "foldx_scaled"       , "foldx_outcome"
         , "deepddg"               , "deepddg_scaled"     , "deepddg_outcome"
         , "asa"                   , "rsa"
         , "rd_values"             , "kd_values"
-        , "log10_or_mychisq"      , "neglog_pval_fisher" , "af"
-        , "ddg_dynamut2"          , "ddg_dynamut2_scaled",  "ddg_dynamut2_outcome"
+        , "log10_or_mychisq"      , "neglog_pval_fisher" , "maf" #"af"
+        , "ddg_dynamut2"          , "ddg_dynamut2_scaled", "ddg_dynamut2_outcome"
         , "mcsm_ppi2_affinity"    , "mcsm_ppi2_scaled"   , "mcsm_ppi2_outcome"
-        , "consurf_score"         , "consurf_scaled"    #, "consurf_outcome"
+        , "consurf_score"         , "consurf_scaled"     , "consurf_outcome" # exists now
         , "snap2_score"           , "snap2_scaled"       , "snap2_outcome"
-        , "mcsm_na_affinity"      , "mcsm_na_scaled"     , "mcsm_na_outcome"
-        , "ddg_dynamut"           , "ddg_dynamut_scaled" , "ddg_dynamut_outcome"
-        , "ddg_encom"             , "ddg_encom_scaled"   ,  "ddg_encom_outcome"
-        , "dds_encom"             , "dds_encom_scaled"   ,  "dds_encom_outcome"
-        , "ddg_mcsm"              , "ddg_mcsm_scaled"    ,  "ddg_mcsm_outcome"    
-        , "ddg_sdm"               , "ddg_sdm_scaled"     ,  "ddg_sdm_outcome"
-        , "ddg_duet"              , "ddg_duet_scaled"    ,  "ddg_duet_outcome")
+        , "mcsm_na_affinity"      , "mcsm_na_scaled"     , "mcsm_na_outcome")
   }else{
     colnames_to_extract = c(mut_colname, mut_info_colname, mut_info_label_colname
                             , aa_pos_colname, LigDist_colname
@@ -186,47 +123,29 @@ dm_om_wf_lf_data <- function(df
 #=======================================================================
 table(comb_df_s[[mut_info_colname]])
 
-# further checks to make sure dr and other muts are indeed unique
-dr_muts = comb_df_s[comb_df_s[[mut_info_colname]] == dr_muts,]
-dr_muts_names = unique(dr_muts$mutation)
-
-other_muts = comb_df_s[comb_df_s[[mut_info_colname]] == other_muts,]
-other_muts_names = unique(other_muts$mutation)
-
-if ( table(dr_muts_names%in%other_muts_names)[[1]] == length(dr_muts_names) &&
-  table(other_muts_names%in%dr_muts_names)[[1]] == length(other_muts_names) ){
-  cat("PASS: dr and other muts are indeed unique")
-}else{
-  cat("FAIL: dr and others muts are NOT unique!")
-  quit()
-}
-
 # pretty display names i.e. labels to reduce major code duplication later
 foo_cnames = data.frame(colnames(comb_df_s))
 names(foo_cnames) <- "old_name"
 
 stability_suffix <- paste0(delta_symbol, delta_symbol, "G")
-flexibility_suffix <- paste0(delta_symbol, delta_symbol, "S")
+#flexibility_suffix <- paste0(delta_symbol, delta_symbol, "S")
 
-lig_dn       = paste0("Ligand distance (", angstroms_symbol, ")"); lig_dn
-mcsm_lig_dn  = paste0("Ligand affinity (log fold change)"); mcsm_lig_dn
+#lig_dn       = paste0("Ligand distance (", angstroms_symbol, ")"); lig_dn
+#mcsm_lig_dn  = paste0("Ligand affinity (log fold change)"); mcsm_lig_dn
+
+lig_dn       = paste0("Lig Dist(", angstroms_symbol, ")"); lig_dn
+mcsm_lig_dn  = paste0("mCSM-lig"); mcsm_lig_dn
 
 duet_dn      = paste0("DUET ", stability_suffix); duet_dn
 foldx_dn     = paste0("FoldX ", stability_suffix); foldx_dn
 deepddg_dn   = paste0("Deepddg " , stability_suffix); deepddg_dn
 dynamut2_dn  = paste0("Dynamut2 " , stability_suffix); dynamut2_dn
 
-mcsm_na_dn   = paste0("mCSM-NA affinity ", stability_suffix); mcsm_na_dn
-mcsm_ppi2_dn = paste0("mCSM-PPI2 affinity ", stability_suffix); mcsm_ppi2_dn
+mcsm_na_dn   = paste0("mCSM-NA ", stability_suffix); mcsm_na_dn
+mcsm_ppi2_dn = paste0("mCSM-PPI2 ", stability_suffix); mcsm_ppi2_dn
 consurf_dn   = paste0("Consurf"); consurf_dn
 snap2_dn     = paste0("SNAP2"); snap2_dn
 
-dynamut_dn   = paste0("Dynamut ", stability_suffix); dynamut_dn
-encom_ddg_dn = paste0("EnCOM " , stability_suffix); encom_ddg_dn
-encom_dds_dn = paste0("EnCOM " , flexibility_suffix ); encom_dds_dn
-sdm_dn       = paste0("SDM " , stability_suffix); sdm_dn
-mcsm_dn      = paste0("mCSM " , stability_suffix ); mcsm_dn
-
 
 # change column names: plyr
 new_colnames = c(asa  = "ASA"
@@ -235,7 +154,8 @@ new_colnames = c(asa  = "ASA"
                 , kd_values           = "KD"
                 , log10_or_mychisq    = "Log10 (OR)"
                 , neglog_pval_fisher  = "-Log (P)"
-                , af                  = "MAF"
+                #, af                  = "MAF"
+                , maf                  = "MAF"
                 #, ligand_dist_colname     = lig_dn # cannot handle variable name 'ligand_dist_colname'
                 , affinity_scaled     = mcsm_lig_dn
                 , duet_scaled         = duet_dn
@@ -245,12 +165,7 @@ new_colnames = c(asa  = "ASA"
                 , mcsm_na_scaled      = mcsm_na_dn
                 , mcsm_ppi2_affinity  = mcsm_ppi2_dn
                 , consurf_score       = consurf_dn
-                , snap2_score         = snap2_dn
-                , ddg_dynamut_scaled  = dynamut_dn
-                , ddg_encom_scaled    = encom_ddg_dn
-                , dds_encom_scaled    = encom_dds_dn
-                , ddg_sdm             = sdm_dn
-                , ddg_mcsm            = mcsm_dn)
+                , snap2_score         = snap2_dn)
 
 comb_df_sl1 = plyr::rename(comb_df_s
                           , replace = new_colnames
@@ -260,29 +175,26 @@ comb_df_sl1 = plyr::rename(comb_df_s
 # renaming colname using variable i.e ligand_dist_colname: dplyr
 comb_df_sl = comb_df_sl1 %>% dplyr::rename(!!lig_dn := all_of(ligand_dist_colname))
 names(comb_df_sl)
-#####################################################################
-if (mut_info_label_colname == "") {
-  cat("\nAssigning labels:", dr_other_muts_labels, "--> to column:", mut_info_colname)
-  table(comb_df_sl[[mut_info_colname]])
 
-  # dr_muts
-  levels(comb_df_sl[[mut_info_colname]])[levels(comb_df_sl[[mut_info_colname]])==dr_muts] <- dr_other_muts_labels[[1]]
-  # other_muts
-  levels(comb_df_sl[[mut_info_colname]])[levels(comb_df_sl[[mut_info_colname]])==other_muts] <- dr_other_muts_labels[[2]]
-  table(comb_df_sl[[mut_info_colname]])
-  
-  static_cols1 = mut_info_colname
-}else{
-  table(comb_df_sl[[mut_info_label_colname]])
-  static_cols1 = mut_info_label_colname
-  
-}
+#=======================
+# NEW: Affinity filtered data: keep only rows whose distance is below DistCutOff
+#========================
+# mcsm-lig --> LigDist_colname (renamed to lig_dn); which() drops NA distances instead of emitting all-NA rows
+comb_df_sl_lig = comb_df_sl[which(comb_df_sl[[lig_dn]] < DistCutOff), ]

+# mcsm-ppi2 --> ppi2Dist_colname
+comb_df_sl_ppi2 = comb_df_sl[which(comb_df_sl[[ppi2Dist_colname]] < DistCutOff), ]

+# mcsm-na --> naDist_colname; NOTE(review): still the "TBC" placeholder in globals, an absent column gives 0 rows
+comb_df_sl_na = comb_df_sl[which(comb_df_sl[[naDist_colname]] < DistCutOff), ]
+
+#####################################################################
+static_cols1 = mut_info_label_colname
 #######################################################################
 #======================
 # Selecting dfs
 # with appropriate cols
 #=======================
-
 static_cols_start =  c(snp_colname
                        , aa_pos_colname
                        , mut_colname
@@ -296,7 +208,8 @@ static_cols_end = c(lig_dn
                     , "KD"
                     , "MAF"
                     , "Log10 (OR)"
-                    , "-Log (P)")
+                    #, "-Log (P)"
+                    )
 
 #########################################################################
 #==============
@@ -312,7 +225,7 @@ expected_rows_lf = nrow(wf_duet) * (length(wf_duet) - length(pivot_cols_duet))
 expected_rows_lf
 
 # LF data: duet
-lf_duet = gather(wf_duet
+lf_duet = tidyr::gather(wf_duet
                   , key = param_type
                   , value = param_value
                   , all_of(duet_dn):tail(static_cols_end,1)
@@ -329,35 +242,6 @@ if (nrow(lf_duet) == expected_rows_lf){
 wf_lf_dataL[['wf_duet']] = wf_duet
 wf_lf_dataL[['lf_duet']] = lf_duet
 
-############################################################################
-#==============
-# mCSM-lig
-#==============
-# WF data: mcsm_lig
-cols_to_select_mcsm_lig = c(static_cols_start,  c("ligand_outcome", mcsm_lig_dn), static_cols_end)
-wf_mcsm_lig = comb_df_sl[, cols_to_select_mcsm_lig]
-
-pivot_cols_mcsm_lig = cols_to_select_mcsm_lig[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_lig
-expected_rows_lf = nrow(wf_mcsm_lig) * (length(wf_mcsm_lig) - length(pivot_cols_mcsm_lig))
-expected_rows_lf
-
-# LF data: mcsm_lig
-lf_mcsm_lig = gather(wf_mcsm_lig
-                     , key = param_type
-                     , value = param_value
-                     , all_of(mcsm_lig_dn):tail(static_cols_end,1)
-                     , factor_key = TRUE)
-
-if (nrow(lf_mcsm_lig) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", mcsm_lig_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for mcsm_lig")
-  quit()
-}
-
-# Assign them to the output list
-wf_lf_dataL[['wf_mcsm_lig']] = wf_mcsm_lig
-wf_lf_dataL[['lf_mcsm_lig']] = lf_mcsm_lig
 ############################################################################
 #==============
 # FoldX
@@ -446,7 +330,9 @@ if (nrow(lf_dynamut2) == expected_rows_lf){
 # Assign them to the output list
 wf_lf_dataL[['wf_dynamut2']] = wf_dynamut2
 wf_lf_dataL[['lf_dynamut2']] = lf_dynamut2
-############################################################################
+
+
+######################################################################################
 #==================
 # Consurf: LF
 #https://consurf.tau.ac.il/overview.php
@@ -459,9 +345,9 @@ wf_lf_dataL[['lf_dynamut2']] = lf_dynamut2
 #5-->"", 6-->"", 7-->"", 8-->"", 9-->"most_conserved"
 #====================
 # FIXME: if you add category column to consurf
-#cols_to_select_consurf = c(static_cols_start, c("consurf_outcome", consurf_dn), static_cols_end)
-#wf_consurf = comb_df_sl[, cols_to_select_consurf]
-#pivot_cols_consurf = cols_to_select_consurf[1: (length(static_cols_start) + 1)]; pivot_cols_consurf
+cols_to_select_consurf = c(static_cols_start, c("consurf_outcome", consurf_dn), static_cols_end)
+wf_consurf = comb_df_sl[, cols_to_select_consurf]
+pivot_cols_consurf = cols_to_select_consurf[1: (length(static_cols_start) + 1)]; pivot_cols_consurf
 
 # WF data: consurf
 cols_to_select_consurf = c(static_cols_start, c(consurf_dn), static_cols_end)
@@ -517,15 +403,54 @@ if (nrow(lf_snap2) == expected_rows_lf){
 # Assign them to the output list
 wf_lf_dataL[['wf_snap2']] = wf_snap2
 wf_lf_dataL[['lf_snap2']] = lf_snap2
+###########################################################################
+# AFFINITY cols
+###########################################################################
+#=========================
+# mCSM-lig:
+# data filtered by cut off
+#=========================
+#---------------------
+# mCSM-lig: WF and LF
+#----------------------
+# WF data: mcsm_lig
+cols_to_select_mcsm_lig = c(static_cols_start,  c("ligand_outcome", mcsm_lig_dn), static_cols_end)
+wf_mcsm_lig = comb_df_sl_lig[, cols_to_select_mcsm_lig] # filtered df
 
-############################################################################
+pivot_cols_mcsm_lig = cols_to_select_mcsm_lig[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_lig
+expected_rows_lf = nrow(wf_mcsm_lig) * (length(wf_mcsm_lig) - length(pivot_cols_mcsm_lig))
+expected_rows_lf
+
+# LF data: mcsm_lig; gathers columns mcsm_lig_dn..last static_cols_end into param_type/param_value (tidyr:: qualified as for lf_duet)
+lf_mcsm_lig = tidyr::gather(wf_mcsm_lig
+                     , key = param_type
+                     , value = param_value
+                     , all_of(mcsm_lig_dn):tail(static_cols_end,1)
+                     , factor_key = TRUE)
+
+if (nrow(lf_mcsm_lig) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", mcsm_lig_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for mcsm_lig")
+  quit()
+}
+
+# Assign them to the output list
+wf_lf_dataL[['wf_mcsm_lig']] = wf_mcsm_lig
+wf_lf_dataL[['lf_mcsm_lig']] = lf_mcsm_lig
+
+#====================
+# mcsm-NA affinity
+# data filtered by cut off
+#====================
 if (tolower(gene_name)%in%geneL_na){
-  #==============
-  # mCSM-NA: LF
-  #==============
+  #---------------
+  # mCSM-NA: WF and LF
+  #-----------------
   # WF data: mcsm-na
   cols_to_select_mcsm_na = c(static_cols_start, c("mcsm_na_outcome", mcsm_na_dn), static_cols_end)
-  wf_mcsm_na = comb_df_sl[, cols_to_select_mcsm_na]
+  #wf_mcsm_na = comb_df_sl[, cols_to_select_mcsm_na]
+  wf_mcsm_na = comb_df_sl_na[, cols_to_select_mcsm_na]
   
   pivot_cols_mcsm_na = cols_to_select_mcsm_na[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_na
   expected_rows_lf = nrow(wf_mcsm_na) * (length(wf_mcsm_na) - length(pivot_cols_mcsm_na))
@@ -550,14 +475,19 @@ if (tolower(gene_name)%in%geneL_na){
   wf_lf_dataL[['lf_mcsm_na']] = lf_mcsm_na
 
 }
-#-------------------------------------------------------------------
+
+#=========================
+# mcsm-ppi2 affinity
+# data filtered by cut off
+#========================
 if (tolower(gene_name)%in%geneL_ppi2){
-  #==============
-  # mCSM-PPI2: LF
-  #==============
+  #-----------------
+  # mCSM-PPI2: WF and LF
+  #-----------------
   # WF data: mcsm-ppi2
   cols_to_select_mcsm_ppi2 = c(static_cols_start, c("mcsm_ppi2_outcome", mcsm_ppi2_dn), static_cols_end)
-  wf_mcsm_ppi2 = comb_df_sl[, cols_to_select_mcsm_ppi2]
+  #wf_mcsm_ppi2 = comb_df_sl[, cols_to_select_mcsm_ppi2]
+  wf_mcsm_ppi2 = comb_df_sl_ppi2[, cols_to_select_mcsm_ppi2]
   
   pivot_cols_mcsm_ppi2 = cols_to_select_mcsm_ppi2[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_ppi2
   expected_rows_lf = nrow(wf_mcsm_ppi2) * (length(wf_mcsm_ppi2) - length(pivot_cols_mcsm_ppi2))
@@ -582,156 +512,7 @@ if (tolower(gene_name)%in%geneL_ppi2){
   wf_lf_dataL[['lf_mcsm_ppi2']] = lf_mcsm_ppi2
   
 }
-#-------------------------------------------------------------------
-if (tolower(gene_name)%in%geneL_dy){
-  #==============
-  # Dynamut: LF
-  #==============
-  # WF data: dynamut
-  cols_to_select_dynamut  = c(static_cols_start, c("ddg_dynamut_outcome", dynamut_dn), static_cols_end)
-  wf_dynamut = comb_df_sl[, cols_to_select_dynamut]
-  
-  pivot_cols_dynamut = cols_to_select_dynamut[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut
-  expected_rows_lf = nrow(wf_dynamut) * (length(wf_dynamut) - length(pivot_cols_dynamut))
-  expected_rows_lf
-  
-  # LF data: dynamut
-  lf_dynamut = gather(wf_dynamut
-                      , key = param_type
-                      , value = param_value
-                      , all_of(dynamut_dn):tail(static_cols_end,1)
-                      , factor_key = TRUE)
-  
-  if (nrow(lf_dynamut) == expected_rows_lf){
-    cat("\nPASS: long format data created for ", dynamut_dn)
-  }else{
-    cat("\nFAIL: long format data could not be created for duet")
-    quit()
-  }
-  
-  # Assign them to the output list
-  wf_lf_dataL[['wf_dynamut']] = wf_dynamut
-  wf_lf_dataL[['lf_dynamut']] = lf_dynamut
 
-#-------------------------------------------------------------------------
-  #==============
-  # EnCOM ddg: LF
-  #==============
-  # WF data: encomddg
-  cols_to_select_encomddg = c(static_cols_start, c("ddg_encom_outcome", encom_ddg_dn), static_cols_end)
-  wf_encomddg = comb_df_sl[, cols_to_select_encomddg]
-  
-  pivot_cols_encomddg = cols_to_select_encomddg[1: (length(static_cols_start) + 1)]; pivot_cols_encomddg 
-  expected_rows_lf = nrow(wf_encomddg ) * (length(wf_encomddg ) - length(pivot_cols_encomddg))
-  expected_rows_lf
-  
-  # LF data: encomddg 
-  lf_encomddg  = gather(wf_encomddg 
-                       , key = param_type
-                       , value = param_value
-                       , all_of(encom_ddg_dn):tail(static_cols_end,1)
-                       , factor_key = TRUE)
-  
-  if (nrow(lf_encomddg) == expected_rows_lf){
-    cat("\nPASS: long format data created for ", encom_ddg_dn)
-  }else{
-    cat("\nFAIL: long format data could not be created for duet")
-    quit()
-  }
-  
-  # Assign them to the output list
-  wf_lf_dataL[['wf_encomddg']] = wf_encomddg
-  wf_lf_dataL[['lf_encomddg']] = lf_encomddg
-#-------------------------------------------------------------------------
-  #==============
-  # EnCOM dds: LF
-  #==============
-  # WF data: encomdds
-  cols_to_select_encomdds = c(static_cols_start, c("dds_encom_outcome", encom_dds_dn), static_cols_end)
-  wf_encomdds = comb_df_sl[, cols_to_select_encomdds]
-  
-  pivot_cols_encomdds = cols_to_select_encomdds[1: (length(static_cols_start) + 1)]; pivot_cols_encomdds 
-  expected_rows_lf = nrow(wf_encomdds) * (length(wf_encomdds) - length(pivot_cols_encomdds))
-  expected_rows_lf
-  
-  # LF data: encomdds 
-  lf_encomdds  = gather(wf_encomdds
-                        , key = param_type
-                        , value = param_value
-                        , all_of(encom_dds_dn):tail(static_cols_end,1)
-                        , factor_key = TRUE)
-  
-  if (nrow(lf_encomdds) == expected_rows_lf){
-    cat("\nPASS: long format data created for", encom_dds_dn)
-  }else{
-    cat("\nFAIL: long format data could not be created for duet")
-    quit()
-  }
-  
-  # Assign them to the output list
-  wf_lf_dataL[['wf_encomdds']] = wf_encomdds
-  wf_lf_dataL[['lf_encomdds']] = lf_encomdds
-#-------------------------------------------------------------------------
-  #==============
-  # SDM: LF
-  #==============
-  # WF data: sdm
-  cols_to_select_sdm = c(static_cols_start, c("ddg_sdm_outcome", sdm_dn), static_cols_end)
-  wf_sdm = comb_df_sl[, cols_to_select_sdm]
-  
-  pivot_cols_sdm = cols_to_select_sdm[1: (length(static_cols_start) + 1)]; pivot_cols_sdm
-  expected_rows_lf = nrow(wf_sdm) * (length(wf_sdm) - length(pivot_cols_sdm))
-  expected_rows_lf
-  
-  # LF data: sdm
-  lf_sdm  = gather(wf_sdm
-                   , key = param_type
-                   , value = param_value
-                   , all_of(sdm_dn):tail(static_cols_end,1)
-                   , factor_key = TRUE)
-  
-  if (nrow(lf_sdm) == expected_rows_lf){
-    cat("\nPASS: long format data created for", sdm_dn)
-  }else{
-    cat("\nFAIL: long format data could not be created for duet")
-    quit()
-  }
-  
-  # Assign them to the output list
-  wf_lf_dataL[['wf_sdm']] = wf_sdm
-  wf_lf_dataL[['lf_sdm']] = lf_sdm
-#-------------------------------------------------------------------------
-  #==============
-  # mCSM: LF
-  #==============
-  # WF data: mcsm
-  cols_to_select_mcsm  = c(static_cols_start, c("ddg_mcsm_outcome", mcsm_dn), static_cols_end)
-  wf_mcsm = comb_df_sl[, cols_to_select_mcsm]
-  
-  pivot_cols_mcsm = cols_to_select_mcsm[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm
-  expected_rows_lf = nrow(wf_mcsm) * (length(wf_mcsm) - length(pivot_cols_mcsm))
-  expected_rows_lf
-  
-  # LF data: mcsm
-  lf_mcsm  = gather(wf_mcsm
-                   , key = param_type
-                   , value = param_value
-                   , all_of(mcsm_dn):tail(static_cols_end,1)
-                   , factor_key = TRUE)
-  
-  if (nrow(lf_mcsm) == expected_rows_lf){
-    cat("\nPASS: long format data created for", mcsm_dn)
-  }else{
-    cat("\nFAIL: long format data could not be created for duet")
-    quit()
-  }
-  
-  # Assign them to the output list
-  wf_lf_dataL[['wf_mcsm']] = wf_mcsm
-  wf_lf_dataL[['lf_mcsm']] = lf_mcsm
-
-  }
-#-------------------------------------------------------------------------
 return(wf_lf_dataL)
 }
 ############################################################################
diff --git a/scripts/functions/plotting_globals.R b/scripts/functions/plotting_globals.R
index b2a29b9..0dc1a78 100644
--- a/scripts/functions/plotting_globals.R
+++ b/scripts/functions/plotting_globals.R
@@ -39,6 +39,10 @@ resistance_col <<- "drtype"
 LigDist_colname <<- "ligand_distance" 
 LigDist_cutoff <<- 10
 
+DistCutOff <<- 10 # distance cut-off for affinity-filtered data in dm_om_data.R; NOTE(review): same value as LigDist_cutoff above, consider consolidating
+ppi2Dist_colname <<- "interface_dist" # column filtered for mCSM-PPI2 data
+naDist_colname <<- "TBC" # placeholder: nucleic-acid distance column name not yet known
+
 #==================
 # Angstroms symbol
 #==================
diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R
index de43c74..e1423d0 100644
--- a/scripts/plotting/get_plotting_dfs.R
+++ b/scripts/plotting/get_plotting_dfs.R
@@ -112,7 +112,7 @@ cat(s1)
 #source("other_plots_data.R")
 ####################################################################
 
-source(paste0(plot_script_path, "dm_om_data.R"))
+#source(paste0(plot_script_path, "dm_om_data.R"))
 s2 = c("\nSuccessfully sourced other_plots_data.R")
 cat(s2)
 
diff --git a/scripts/plotting/plotting_thesis/preformatting.R b/scripts/plotting/plotting_thesis/preformatting.R
index 37e7cc0..e53bc8e 100644
--- a/scripts/plotting/plotting_thesis/preformatting.R
+++ b/scripts/plotting/plotting_thesis/preformatting.R
@@ -10,8 +10,10 @@ source("~/git/LSHTM_analysis/config/embb.R")
 source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
 ###################################################################
 # FIXME: ADD distance to NA when SP replies
+# DONE: plotting_globals.R
 dist_columns = c("ligand_distance", "interface_dist")
 DistCutOff = 10
+
 common_cols  = c("mutationinformation"
                  , "X5uhc_position"
                  , "X5uhc_offset"
@@ -98,22 +100,24 @@ df3 = merged_df3
 #=================
 # PREFORMATTING: for consistency
 #=================
-df3$sensitivity = ifelse(df3$dst_mode == 1, "R", "S")
-table(df3$sensitivity)
+# DONE: in combining_dfs_plotting.R
+# df3$sensitivity = ifelse(df3$dst_mode == 1, "R", "S")
+# table(df3$sensitivity)
 
 # ConSurf labels
-consurf_colOld = "consurf_colour_rev"
-consurf_colNew = "consurf_outcome"
-df3[[consurf_colNew]] = df3[[consurf_colOld]]
-df3[[consurf_colNew]] = as.factor(df3[[consurf_colNew]])
-df3[[consurf_colNew]]
+#consurf_colOld = "consurf_colour_rev"
+#consurf_colNew = "consurf_outcome"
+#df3[[consurf_colNew]] = df3[[consurf_colOld]]
+#df3[[consurf_colNew]] = as.factor(df3[[consurf_colNew]])
+#df3[[consurf_colNew]]
+# not this bit
 levels(df3$consurf_outcome) = c( "nsd", 1, 2, 3, 4, 5, 6, 7, 8, 9)
-levels(df3$consurf_outcome)
+#levels(df3$consurf_outcome)
 
 # SNAP2 labels
-snap2_colname = "snap2_outcome"
-df3[[snap2_colname]] <- str_replace(df3[[snap2_colname]], "effect", "Effect")
-df3[[snap2_colname]] <- str_replace(df3[[snap2_colname]], "neutral", "Neutral")
+#snap2_colname = "snap2_outcome"
+#df3[[snap2_colname]] <- str_replace(df3[[snap2_colname]], "effect", "Effect")
+#df3[[snap2_colname]] <- str_replace(df3[[snap2_colname]], "neutral", "Neutral")
 
 #  for ref: not needed perse as function already does this and assigns labels for barplots
 # labels_duet = levels(as.factor(df3$duet_outcome))
@@ -138,14 +142,16 @@ df2 = merged_df2
 #=================
 # PREFORMATTING: for consistency
 #=================
-df2$sensitivity = ifelse(df2$dst_mode == 1, "R", "S")
-table(df2$sensitivity)
+# DONE: in combining_dfs_plotting.R
+# df2$sensitivity = ifelse(df2$dst_mode == 1, "R", "S")
+# table(df2$sensitivity)
 
 #----------------------------------------------------
 # Create dst2: fill na in dst with value of dst_mode
 # for epistasis
 #----------------------------------------------------
-df2$dst2 = ifelse(is.na(df2$dst), df2$dst_mode, df2f$dst)
+# DONE: in combining_dfs_plotting.R
+# df2$dst2 = ifelse(is.na(df2$dst), df2$dst_mode, df2f$dst)
 
 #----------------------------------------------------
 # reverse signs for foldx scaled values for
@@ -168,10 +174,11 @@ scaled_cols_stab_revised = c(scaled_cols_stab_revised, "foldx_scaled_signC")
 
 ######################################################
 # Affinity related variables
-DistCutOff = 10
-LigDist_colname  # = "ligand_distance" # from globals 
-ppi2Dist_colname  = "interface_dist"
-naDist_colname    = "TBC"
+# DONE:in plotting_globals.R
+# DistCutOff = 10
+# LigDist_colname  # = "ligand_distance" # from globals 
+# ppi2Dist_colname  = "interface_dist"
+# naDist_colname    = "TBC"
 
 ######################################################
 # corr colnames