From 6d9412d23266ed1833629c84133e8b7b8c0eafd5 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 26 Aug 2021 16:35:46 +0100
Subject: [PATCH 01/51] playing with dm_om (other)plots data and graph on gid
 branch

---
 scripts/functions/plotting_globals.R    |   3 +-
 scripts/plotting/get_plotting_dfs.R     | 203 +------
 scripts/plotting/other_plots_combined.R |  13 +-
 scripts/plotting/other_plots_data.R     | 693 ++++++++++++++++--------
 4 files changed, 502 insertions(+), 410 deletions(-)

diff --git a/scripts/functions/plotting_globals.R b/scripts/functions/plotting_globals.R
index cfd2848..c28047e 100644
--- a/scripts/functions/plotting_globals.R
+++ b/scripts/functions/plotting_globals.R
@@ -32,7 +32,8 @@ import_dirs <- function(drug_name, gene_name) {
 #===============================
 # mcsm ligand distance cut off
 #===============================
-#mcsm_lig_cutoff <<- 10
+LigDist_colname <<- "ligand_distance" 
+LigDist_cutoff <<- 10
 
 #==================
 # Angstroms symbol
diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R
index a9e78e9..2dae471 100644
--- a/scripts/plotting/get_plotting_dfs.R
+++ b/scripts/plotting/get_plotting_dfs.R
@@ -25,8 +25,8 @@ source("../functions/bp_subcolours.R")
 # variables for lig
 #====================
 
-LigDist_colname = "ligand_distance"
-LigDist_cutoff = 10
+#LigDist_colname = "ligand_distance"
+#LigDist_cutoff = 10
 
 #===========
 # input
@@ -54,10 +54,15 @@ pd_df = plotting_data(mcsm_df
                       , lig_dist_colname = LigDist_colname
                       , lig_dist_cutoff = LigDist_cutoff)
 
-my_df       = pd_df[[1]] 
-my_df_u     = pd_df[[2]] # this forms one of the input for combining_dfs_plotting()
-my_df_u_lig = pd_df[[3]] 
-dup_muts    = pd_df[[4]] 
+my_df   = pd_df[[1]] 
+my_df_u = pd_df[[2]] # this forms one of the input for combining_dfs_plotting()
+
+max_ang <- round(max(my_df_u[LigDist_colname]))
+min_ang <- round(min(my_df_u[LigDist_colname]))
+
+cat("\nLigand distance cut off, colname:", LigDist_colname
+    , "\nThe max distance", gene, "structure df" , ":", max_ang, "\u212b"
+    , "\nThe min distance", gene, "structure df" , ":", min_ang, "\u212b")
 
 #--------------------------------
 # call: combining_dfs_plotting()
@@ -81,14 +86,22 @@ all_plot_dfs = combining_dfs_plotting(my_df_u
                                       , lig_dist_colname = LigDist_colname
                                       , lig_dist_cutoff = LigDist_cutoff)
 
-merged_df2          = all_plot_dfs[[1]]
-merged_df3          = all_plot_dfs[[2]]
-merged_df2_comp     = all_plot_dfs[[3]]
-merged_df3_comp     = all_plot_dfs[[4]]
-merged_df2_lig      = all_plot_dfs[[5]]
-merged_df3_lig      = all_plot_dfs[[6]]
-merged_df2_comp_lig = all_plot_dfs[[7]]
-merged_df3_comp_lig = all_plot_dfs[[8]]
+merged_df2 = all_plot_dfs[[1]]
+merged_df3 = all_plot_dfs[[2]]
+#======================================================================
+# read other files
+infilename_dynamut = paste0("~/git/Data/", drug, "/output/dynamut_results/", gene
+                            , "_complex_dynamut_norm.csv")
+
+infilename_dynamut2  = paste0("~/git/Data/", drug, "/output/dynamut_results/dynamut2/", gene
+                              , "_complex_dynamut2_norm.csv")
+
+infilename_mcsm_na = paste0("~/git/Data/", drug, "/output/mcsm_na_results/", gene
+                            , "_complex_mcsm_na_norm.csv")
+
+dynamut_df   = read.csv(infilename_dynamut)
+dynamut2_df  = read.csv(infilename_dynamut2)
+mcsm_na_df   = read.csv(infilename_mcsm_na)
 
 ####################################################################
 #                        Data for subcols barplot (~heatmpa)
@@ -168,61 +181,6 @@ subcolsR_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp_r")
 print(paste0("Colour palette generated for my_grp: ", length(subcols_ps), " colours"))
 print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_ps), " colours"))
 
-#=======================
-# Data for sub colours
-# barplot: LIG
-#=======================
-cat("\nNo. of cols to select:", length(cols_to_select))
-
-subcols_df_lig = merged_df3_lig[, cols_to_select]
-
-cat("\nNo of unique positions for LIG:"
-    , length(unique(subcols_df_lig$position)))
-
-# should be a factor
-if (is.factor(subcols_df_lig$ligand_outcome)){
-  cat("\nLigand_outcome is factor")
-  table(subcols_df_lig$ligand_outcome)
-}else{
-  cat("\nConverting ligand_outcome to factor")
-  subcols_df_lig$ligand_outcome = as.factor(subcols_df_lig$ligand_outcome)
-  table(subcols_df_lig$ligand_outcome)
-}
-
-# should be -1 and 1
-min(subcols_df_lig$affinity_scaled)
-max(subcols_df_lig$affinity_scaled)
-
-tapply(subcols_df_lig$affinity_scaled, subcols_df_lig$ligand_outcome, min)
-tapply(subcols_df_lig$affinity_scaled, subcols_df_lig$ligand_outcome, max)
-
-# check unique values in normalised data
-cat("\nNo. of unique values in affinity scaled, no rounding:"
-    , length(unique(subcols_df_lig$affinity_scaled)))
-
-# No rounding    
-my_grp_lig = subcols_df_lig$affinity_scaled; length(my_grp_lig)
-
-# Add rounding is to be used
-n = 3 
-subcols_df_lig$affinity_scaledR = round(subcols_df_lig$affinity_scaled, n)
-
-cat("\nNo. of unique values in duet scaled", n, "places rounding:"
-    , length(unique(subcols_df_lig$affinity_scaledR)))
-
-my_grp_lig_r = subcols_df_lig$affinity_scaledR  # rounding
-
-# Add grp cols
-subcols_df_lig$group_lig <- paste0(subcols_df_lig$ligand_outcome, "_", my_grp_lig, sep = "")
-subcols_df_lig$group_ligR <- paste0(subcols_df_lig$ligand_outcome, "_", my_grp_lig_r, sep = "")
-
-# Call the function to create the palette based on the group defined above
-subcols_lig <- ColourPalleteMulti(subcols_df_lig, "ligand_outcome", "my_grp_lig")
-subcolsR_lig <- ColourPalleteMulti(subcols_df_lig, "ligand_outcome", "my_grp_lig_r")
-
-print(paste0("Colour palette generated for my_grp: ", length(subcols_lig), " colours"))
-print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_lig), " colours"))
-
 ####################################################################
 #                        Data for logoplots
 ####################################################################
@@ -472,113 +430,6 @@ if (nrow(corr_ps_df3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) {
       , "\nGot: ", check1)
 }
 
-#=================================
-# Data for Correlation plots: LIG
-#=================================
-cat("\n=========================================="
-    , "\nCORR PLOTS data: LIG"
-    , "\n===========================================")
-
-df_lig = merged_df2_lig
-
-table(df_lig$ligand_outcome)
-
-#--------------------
-# adding log cols : NEW UNCOMMENT
-#--------------------
-#df_lig$log10_or_mychisq = log10(df_lig$or_mychisq)
-#df_lig$neglog_pval_fisher = -log10(df_lig$pval_fisher)
-
-##df_lig$log10_or_kin = log10(df_lig$or_kin)
-##df_lig$neglog_pwald_kin = -log10(df_lig$pwald_kin)
-
-#----------------------------
-# columns for corr plots:PS
-#----------------------------
-# subset data to generate pairwise correlations
-cols_to_select =  c("mutationinformation"
-                    , "affinity_scaled"
-                    #, "mutation_info_labels"
-                    , "asa"
-                    , "rsa"
-                    , "rd_values"
-                    , "kd_values"
-                    , "log10_or_mychisq"
-                    , "neglog_pval_fisher"
-                    ##, "or_kin"
-                    ##, "neglog_pwald_kin"
-                    , "af"
-                    ##, "af_kin"
-                    , "ligand_outcome"
-                    , drug)
-
-corr_data_lig = df_lig[, cols_to_select]
-
-dim(corr_data_lig)
-
-#--------------------------------------
-# assign nice colnames (for display)
-#--------------------------------------
-my_corr_colnames = c("Mutation"
-                     , "Ligand Affinity"
-                     #, "Mutation class"
-                     , "ASA"
-                     , "RSA"
-                     , "RD"
-                     , "KD"
-                     , "Log (OR)"
-                     , "-Log (P)"
-                     ##, "Adjusted (OR)"
-                     ##, "-Log (P wald)"
-                     , "MAF"
-                     ##, "MAF_kin"
-                     , "ligand_outcome"
-                     , drug)
-
-length(my_corr_colnames)
-
-colnames(corr_data_lig)
-colnames(corr_data_lig) <- my_corr_colnames
-colnames(corr_data_lig)
-
-start = 1
-end = which(colnames(corr_data_lig) == drug); end # should be the last column
-offset = 1
-
-#=============================
-# Corr data for plots: LIG
-# big_df lig: ~ merged_df2_lig
-#==============================
-#corr_lig_df2 = corr_data_lig[start:(end-offset)] # without drug
-corr_lig_df2 = corr_data_lig[start:end]
-head(corr_lig_df2)
-
-#=============================
-# Corr data for plots: LIG
-# short_df lig: ~ merged_df3_lig
-#==============================
-corr_lig_df3 = corr_lig_df2[!duplicated(corr_lig_df2$Mutation),]
-
-na_or_lig = sum(is.na(corr_lig_df3$`Log (OR)`))
-check1_lig = nrow(corr_lig_df3) - na_or_lig
-
-if (nrow(corr_lig_df3) == nrow(merged_df3_lig) && nrow(merged_df3_comp_lig) == check1_lig) {
-  cat( "\nPASS: No. of rows for corr_lig_df3 match"
-       , "\nPASS: No. of OR values checked: " , check1_lig)
-} else {
-  cat("\nFAIL: Numbers  mismatch:"
-      , "\nExpected nrows: ", nrow(merged_df3_lig)
-      , "\nGot: ", nrow(corr_ps_df3_lig)
-      , "\nExpected OR values: ", nrow(merged_df3_comp_lig)
-      , "\nGot: ", check1_lig)
-}
-
-# remove unnecessary columns
-identical(corr_data_lig, corr_lig_df2)
-identical(corr_data_ps, corr_ps_df2)
-
-#rm(df_ps, df_lig, corr_data_ps, corr_data_lig)
-
 ########################################################################
 #                           End of script
 ########################################################################
diff --git a/scripts/plotting/other_plots_combined.R b/scripts/plotting/other_plots_combined.R
index d927808..3047f38 100644
--- a/scripts/plotting/other_plots_combined.R
+++ b/scripts/plotting/other_plots_combined.R
@@ -35,7 +35,7 @@ plot_dr_other_combined_labelled  =  paste0(plotdir,"/", dr_other_combined_labell
 #my_comparisons <- list( c(dr_muts_col, other_muts_col) )
 my_comparisons <- list( c("DM", "OM") )
 
-my_ats = 22# axis text size
+my_ats = 22 # axis text size
 my_als = 20 # axis label size
 my_fls = 20 # facet label size
 my_pts = 22 # plot title size
@@ -45,12 +45,15 @@ my_pts = 22 # plot title size
 #===========
 # Plot1: PS
 #===========
-my_stat_ps = compare_means(param_value~mutation_info, group.by = "param_type"
-                        , data = df_lf_ps,  paired = FALSE, p.adjust.method = "BH")
+# my_stat_ps = compare_means(param_value~mutation_info
+#                            , group.by = "param_type"
+#                            , data = df_lf_ps
+#                            , paired = FALSE
+#                            , p.adjust.method = "BH")
 
 y_value = "param_value"
 
-p1 = ggplot(df_lf_ps, aes(x = mutation_info
+p1 = ggplot(lf_duet, aes(x = mutation_info
                     , y = eval(parse(text=y_value)) ))  + 
   facet_wrap(~ param_type
              , nrow = 1
@@ -61,7 +64,7 @@ p1 = ggplot(df_lf_ps, aes(x = mutation_info
   geom_point(position = position_jitterdodge(dodge.width=0.01)
              , alpha = 0.5
              , show.legend = FALSE
-             , aes(colour = factor(duet_outcome))) +
+             , aes(colour = duet_outcome)) +
   theme(axis.text.x = element_text(size = my_ats)
         , axis.text.y = element_text(size = my_ats
                                      , angle = 0
diff --git a/scripts/plotting/other_plots_data.R b/scripts/plotting/other_plots_data.R
index df5c1e3..8fc9e00 100644
--- a/scripts/plotting/other_plots_data.R
+++ b/scripts/plotting/other_plots_data.R
@@ -5,21 +5,18 @@
 #########################################################
 #=======================================================================
 # working dir and loading libraries
-getwd()
-setwd("~/git/LSHTM_analysis/scripts/plotting")
-getwd()
+# getwd()
+# setwd("~/git/LSHTM_analysis/scripts/plotting")
+# getwd()
 
-#source("Header_TT.R")
-library(ggplot2)
-library(data.table)
-library(dplyr)
-library(tidyverse)
-source("combining_dfs_plotting.R")
-
-rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig
-   , merged_df3_comp, merged_df3_comp_lig
-   , my_df_u, my_df_u_lig)
+# make cmd
+# globals
+# drug = "streptomycin"
+# gene = "gid"
 
+#source("get_plotting_dfs.R")
+#=======================================================================
+# MOVE TO COMBINE or singular file for deepddg
 
 cols_to_select = c("mutation", "mutationinformation"
                    , "wild_type", "position", "mutant_type"
@@ -27,275 +24,515 @@ cols_to_select = c("mutation", "mutationinformation"
 
 merged_df3_short = merged_df3[, cols_to_select]
 
-# write merged_df3 to generate structural figure
-write.csv(merged_df3_short, "merged_df3_short.csv")
+infilename_mcsm_f_snps <- paste0("~/git/Data/", drug, "/output/", gene
+                      , "_mcsm_formatted_snps.csv")
 
+mcsm_f_snps<- read.csv(infilename_mcsm_f_snps, header = FALSE) # FALSE, not the reassignable alias F
+names(mcsm_f_snps) <- "mutationinformation"
+
+# write merged_df3 to generate structural figure on chimera
+#write.csv(merged_df3_short, "merged_df3_short.csv")
 #========================================================================
-#%%%%%%%%%%%%%%%%%%%
-# REASSIGNMENT: PS
-#%%%%%%%%%%%%%%%%%%%%
-df_ps = merged_df3
+# MOVE TO COMBINE or singular file for deepddg
 
 #============================
-# adding foldx scaled values
+# adding deepddg scaled values
 # scale data b/w -1 and 1
 #============================
-n = which(colnames(df_ps) == "ddg"); n 
+n = which(colnames(merged_df3) == "deepddg"); n 
 
-my_min = min(df_ps[,n]); my_min 
-my_max = max(df_ps[,n]); my_max 
+my_min = min(merged_df3[,n]); my_min 
+my_max = max(merged_df3[,n]); my_max 
 
-df_ps$foldx_scaled = ifelse(df_ps[,n] < 0
-                            , df_ps[,n]/abs(my_min)
-                            , df_ps[,n]/my_max) 
+merged_df3$deepddg_scaled = ifelse(merged_df3[,n] < 0
+                            , merged_df3[,n]/abs(my_min)
+                            , merged_df3[,n]/my_max) 
 # sanity check
-my_min = min(df_ps$foldx_scaled); my_min 
-my_max = max(df_ps$foldx_scaled); my_max
+my_min = min(merged_df3$deepddg_scaled); my_min 
+my_max = max(merged_df3$deepddg_scaled); my_max
 
 if (my_min == -1 && my_max == 1){
-  cat("PASS: foldx ddg successfully scaled b/w -1 and 1"
-      , "\nProceeding with assigning foldx outcome category")
+  cat("PASS: DeepDDG successfully scaled b/w -1 and 1"
+      #, "\nProceeding with assigning deep outcome category")
+      , "\n")
 }else{
-  cat("FAIL: could not scale foldx ddg values"
+  cat("FAIL: could not scale DeepDDG ddg values"
       , "Aborting!")
 }
 
-#================================
-# adding foldx outcome category
-# ddg<0 = "Stabilising" (-ve)
-#=================================
+#========================================================================
+# cols to select
 
-c1 = table(df_ps$ddg < 0)
-df_ps$foldx_outcome = ifelse(df_ps$ddg < 0, "Stabilising", "Destabilising")
-c2 = table(df_ps$ddg < 0)
+cols_mcsm_df <- merged_df3[, c("mutationinformation", "mutation"
+                               , "mutation_info", "position"
+                               , LigDist_colname
+                               , "duet_stability_change", "duet_scaled", "duet_outcome"
+                               , "ligand_affinity_change", "affinity_scaled", "ligand_outcome"
+                               , "ddg_foldx", "foldx_scaled", "foldx_outcome"
+                               , "deepddg", "deepddg_scaled", "deepddg_outcome"
+                               , "asa", "rsa"
+                               , "rd_values", "kd_values"
+                               , "log10_or_mychisq", "neglog_pval_fisher", "af")]
 
-if ( all(c1 == c2) ){
-  cat("PASS: foldx outcome successfully created")
-}else{
-  cat("FAIL: foldx outcome could not be created. Aborting!")
-  exit()
+cols_mcsm_na_df <- mcsm_na_df[, c("mutationinformation" 
+                                  , "mcsm_na_affinity", "mcsm_na_scaled"
+                                  , "mcsm_na_outcome")]
+# entire dynamut_df
+
+cols_dynamut2_df <- dynamut2_df[, c("mutationinformation"
+                                    , "ddg_dynamut2", "ddg_dynamut2_scaled"
+                                    , "ddg_dynamut2_outcome")]
+
+n_comb_cols = length(cols_mcsm_df) + length(cols_mcsm_na_df) + 
+  length(dynamut_df) + length(cols_dynamut2_df); n_comb_cols
+
+i1<- intersect(names(cols_mcsm_df), names(cols_mcsm_na_df))
+i2<- intersect(names(dynamut_df), names(cols_dynamut2_df))
+merging_cols <- intersect(i1, i2)
+cat("\nmerging_cols:", merging_cols)
+
+if (identical(merging_cols, "mutationinformation")) { # identical() stays a single TRUE/FALSE even when intersect() yields 0 or >1 names
+  cat("\nStage 1: Found common col between dfs, checking values in it...")
+  c1 <- all(mcsm_f_snps[[merging_cols]]%in%cols_mcsm_df[[merging_cols]])
+  c2 <- all(mcsm_f_snps[[merging_cols]]%in%cols_mcsm_na_df[[merging_cols]])
+  c3 <- all(mcsm_f_snps[[merging_cols]]%in%dynamut_df[[merging_cols]])
+  c4 <- all(mcsm_f_snps[[merging_cols]]%in%cols_dynamut2_df[[merging_cols]])
+  cols_check <- c(c1, c2, c3, c4)
+  expected_cols = n_comb_cols - ( length(cols_check) - 1)
+  if (all(cols_check)){
+    cat("\nStage 2:Proceeding with merging dfs:\n")
+    comb_df <- Reduce(inner_join, list(cols_mcsm_df
+                                       , cols_mcsm_na_df
+                                       , dynamut_df
+                                       , cols_dynamut2_df))
+    comb_df_s = arrange(comb_df, position)
+    
+    # if ( nrow(comb_df_s) == nrow(mcsm_f_snps) && ncol(comb_df_s) == expected_cols) {
+    #   cat("\nStage3, PASS: dfs merged successfully"
+    #       , "\nnrow of merged_df: ", nrow(comb_df_s)
+    #       , "\nncol of merged_df:", ncol(comb_df_s))
+    #   }
+    
+    }
 }
+names(comb_df_s)
 #=======================================================================
-# name tidying
-df_ps$mutation_info = as.factor(df_ps$mutation_info)
-df_ps$duet_outcome = as.factor(df_ps$duet_outcome)
-df_ps$foldx_outcome  = as.factor(df_ps$foldx_outcome)
-df_ps$ligand_outcome  = as.factor(df_ps$ligand_outcome)
+fact_cols = colnames(comb_df_s)[grepl( "_outcome|_info", colnames(comb_df_s) )]
+fact_cols
+lapply(comb_df_s[, fact_cols], class)
+comb_df_s[,fact_cols] <- lapply(comb_df_s[,fact_cols],as.factor) # convert the outcome/info columns selected above
 
-# check
-table(df_ps$mutation_info)
+if (any(lapply(comb_df_s[, fact_cols], class) == "character")){
+  cat("\nChanging cols to factor")
+  comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols],as.factor)
+  if (all(lapply(comb_df_s[, fact_cols], class) == "factor")){
+    cat("\nSuccessful: cols changed to factor")
+  }
+}
+lapply(comb_df_s[, fact_cols], class)
+
+#=======================================================================
+table(comb_df_s$mutation_info)
 
  # further checks to make sure dr and other muts are indeed unique
-dr_muts = df_ps[df_ps$mutation_info == dr_muts_col,]
+dr_muts = comb_df_s[comb_df_s$mutation_info == dr_muts_col,]
 dr_muts_names = unique(dr_muts$mutation)
 
-other_muts = df_ps[df_ps$mutation_info == other_muts_col,]
+other_muts = comb_df_s[comb_df_s$mutation_info == other_muts_col,]
 other_muts_names = unique(other_muts$mutation)
 
 if ( table(dr_muts_names%in%other_muts_names)[[1]] == length(dr_muts_names) &&
   table(other_muts_names%in%dr_muts_names)[[1]] == length(other_muts_names) ){
   cat("PASS: dr and other muts are indeed unique")
 }else{
-  cat("FAIL: dr adn others muts are NOT unique!")
+  cat("FAIL: dr and others muts are NOT unique!")
   quit()
 }
 
+# pretty display names i.e. labels to reduce major code duplication later
+foo_cnames = data.frame(colnames(comb_df_s))
+names(foo_cnames) <- "old_name"
 
-#%%%%%%%%%%%%%%%%%%%
-# REASSIGNMENT: LIG
-#%%%%%%%%%%%%%%%%%%%%
+stability_suffix <- paste0(delta_symbol, delta_symbol, "G")
+flexibility_suffix <- paste0(delta_symbol, delta_symbol, "S")
 
-df_lig = merged_df3_lig
+lig_dn       = paste0("Ligand distance (", angstroms_symbol, ")"); lig_dn
+duet_dn      = paste0("DUET ", stability_suffix); duet_dn
+foldx_dn     = paste0("FoldX ", stability_suffix); foldx_dn
+deepddg_dn   = paste0("Deepddg " , stability_suffix); deepddg_dn
+mcsm_na_dn   = paste0("mCSM-NA affinity ", stability_suffix); mcsm_na_dn
+dynamut_dn   = paste0("Dynamut ", stability_suffix); dynamut_dn
+dynamut2_dn  = paste0("Dynamut2 " , stability_suffix); dynamut2_dn
+encom_ddg_dn = paste0("EnCOM " , stability_suffix); encom_ddg_dn
+encom_dds_dn = paste0("EnCOM " , flexibility_suffix ); encom_dds_dn
+sdm_dn       = paste0("SDM " , stability_suffix); sdm_dn
+mcsm_dn      = paste0("mCSM " , stability_suffix ); mcsm_dn
 
-# name tidying
-df_lig$mutation_info = as.factor(df_lig$mutation_info)
-df_lig$duet_outcome = as.factor(df_lig$duet_outcome)
-#df_lig$ligand_outcome  = as.factor(df_lig$ligand_outcome)
-
-# check
-table(df_lig$mutation_info)
-
-#========================================================================
-#===========
-# Data: ps
-#===========
-# keep similar dtypes cols together
-cols_to_select_ps = c("mutationinformation", "mutation", "position", "mutation_info"
-                   , "duet_outcome"
+# Change colnames of some columns using datatable 
+comb_df_sl = comb_df_s
+names(comb_df_sl)
 
+setnames(comb_df_sl
+         , old = c("asa", "rsa", "rd_values", "kd_values"
+                   , "log10_or_mychisq", "neglog_pval_fisher", "af"
+                   , LigDist_colname
                    , "duet_scaled"
-                   , "ligand_distance"
-                   , "asa"
-                   , "rsa"
-                   , "rd_values"
-                   , "kd_values")
+                   , "foldx_scaled"
+                   , "deepddg_scaled"
+                   , "mcsm_na_scaled"
+                   , "ddg_dynamut_scaled"
+                   , "ddg_dynamut2_scaled"
+                   , "ddg_encom_scaled"
+                   , "dds_encom_scaled"
+                   , "ddg_sdm"
+                   , "ddg_mcsm")
+                   
+         , new = c("ASA", "RSA", "RD", "KD"
+                   , "Log10 (OR)", "-Log (P)", "MAF"
+                   , lig_dn
+                   , duet_dn
+                   , foldx_dn
+                   , deepddg_dn
+                   , mcsm_na_dn
+                   , dynamut_dn
+                   , dynamut2_dn
+                   , encom_ddg_dn
+                   , encom_dds_dn
+                   , sdm_dn
+                   , mcsm_dn)
+         )
 
-df_wf_ps = df_ps[, cols_to_select_ps]
+foo_cnames <- cbind(foo_cnames, colnames(comb_df_sl))
 
-pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps
+# some more pretty labels
+table(comb_df_sl$mutation_info)
 
-expected_rows_lf_ps = nrow(df_wf_ps) * (length(df_wf_ps) - length(pivot_cols_ps))
-expected_rows_lf_ps
+levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==dr_muts_col] <- "DM"
+levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==other_muts_col] <- "OM"
+
+table(comb_df_sl$mutation_info)
+
+#######################################################################
+#======================
+# Selecting dfs
+# with appropriate cols
+#=======================
+static_cols_start =  c("mutationinformation"
+                       , "position"
+                       , "mutation"
+                       , "mutation_info")
+
+static_cols_end = c(lig_dn
+                    , "ASA"
+                    , "RSA"
+                    , "RD"
+                    , "KD")
+
+# ordering is important!
+
+#########################################################################
+#==============
+# DUET: LF
+#==============
+cols_to_select_duet = c(static_cols_start,  c("duet_outcome", duet_dn), static_cols_end)
+wf_duet = comb_df_sl[, cols_to_select_duet]
+
+#pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps
+pivot_cols_duet = cols_to_select_duet[1: (length(static_cols_start) + 1)]; pivot_cols_duet
+
+expected_rows_lf = nrow(wf_duet) * (length(wf_duet) - length(pivot_cols_duet))
+expected_rows_lf
 
 # LF data: duet
-df_lf_ps = gather(df_wf_ps, param_type, param_value, duet_scaled:kd_values, factor_key=TRUE)
+lf_duet = gather(wf_duet
+                  , key = param_type
+                  , value = param_value
+                  , all_of(duet_dn):tail(static_cols_end,1)
+                  , factor_key = TRUE)
 
-if (nrow(df_lf_ps) == expected_rows_lf_ps){
-  cat("PASS: long format data created for duet")
+if (nrow(lf_duet) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", duet_dn)
 }else{
-  cat("FAIL: long format data could not be created for duet")
-  exit()
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
 }
 
-str(df_wf_ps)
-str(df_lf_ps)
-
-# assign pretty labels: param_type
-levels(df_lf_ps$param_type); table(df_lf_ps$param_type)
-
-ligand_dist_colname = paste0("Distance to ligand (", angstroms_symbol, ")")
-ligand_dist_colname
-
-duet_stability_name = paste0(delta_symbol, delta_symbol, "G")
-duet_stability_name
-  
-#levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="duet_scaled"] <- "Stability"
-levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="duet_scaled"] <- duet_stability_name
-#levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="ligand_distance"] <- "Ligand Distance"
-levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="ligand_distance"] <- ligand_dist_colname
-levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="asa"] <- "ASA"
-levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="rsa"] <- "RSA"
-levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="rd_values"] <- "RD"
-levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="kd_values"] <- "KD"
-# check
-levels(df_lf_ps$param_type); table(df_lf_ps$param_type)
-
-# assign pretty labels: mutation_info
-levels(df_lf_ps$mutation_info); table(df_lf_ps$mutation_info)
-sum(table(df_lf_ps$mutation_info)) == nrow(df_lf_ps)
-
-levels(df_lf_ps$mutation_info)[levels(df_lf_ps$mutation_info)==dr_muts_col] <- "DM"
-levels(df_lf_ps$mutation_info)[levels(df_lf_ps$mutation_info)==other_muts_col] <- "OM"
-# check
-levels(df_lf_ps$mutation_info); table(df_lf_ps$mutation_info)
-
 ############################################################################
+#==============
+# FoldX: LF
+#==============
+cols_to_select_foldx= c(static_cols_start, c("foldx_outcome", foldx_dn), static_cols_end)
+wf_foldx = comb_df_sl[, cols_to_select_foldx]
 
-#===========
-# LF data: LIG
-#===========
-# keep similar dtypes cols together
-cols_to_select_lig = c("mutationinformation", "mutation", "position", "mutation_info"
-                       , "ligand_outcome"
-                       
-                       , "affinity_scaled"
-                       #, "ligand_distance"
-                       , "asa"
-                       , "rsa"
-                       , "rd_values"
-                       , "kd_values")
+pivot_cols_foldx = cols_to_select_foldx[1: (length(static_cols_start) + 1)]; pivot_cols_foldx
 
-df_wf_lig = df_lig[, cols_to_select_lig]
+expected_rows_lf = nrow(wf_foldx) * (length(wf_foldx) - length(pivot_cols_foldx))
+expected_rows_lf
 
-pivot_cols_lig = cols_to_select_lig[1:5]; pivot_cols_lig
+# LF data: foldx
+# Gather every column from the FoldX score through the last static column into key/value pairs.
+lf_foldx = gather(wf_foldx
+                 , key = param_type
+                 , value = param_value
+                 , all_of(foldx_dn):tail(static_cols_end,1)
+                 , factor_key = TRUE)
 
-expected_rows_lf_lig = nrow(df_wf_lig) * (length(df_wf_lig) - length(pivot_cols_lig))
-expected_rows_lf_lig
-
-# LF data: foldx
-df_lf_lig = gather(df_wf_lig, param_type, param_value, affinity_scaled:kd_values, factor_key=TRUE)
-
-if (nrow(df_lf_lig) == expected_rows_lf_lig){
-  cat("PASS: long format data created for foldx")
+if (nrow(lf_foldx) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", foldx_dn)
 }else{
-  cat("FAIL: long format data could not be created for foldx")
-  exit()
+  cat("\nFAIL: long format data could not be created for ", foldx_dn)
+  quit()
 }
 
-# assign pretty labels: param_type
-levels(df_lf_lig$param_type); table(df_lf_lig$param_type)
-
-levels(df_lf_lig$param_type)[levels(df_lf_lig$param_type)=="affinity_scaled"] <- "Ligand Affinity"
-#levels(df_lf_lig$param_type)[levels(df_lf_lig$param_type)=="ligand_distance"] <- "Ligand Distance"
-levels(df_lf_lig$param_type)[levels(df_lf_lig$param_type)=="asa"] <- "ASA"
-levels(df_lf_lig$param_type)[levels(df_lf_lig$param_type)=="rsa"] <- "RSA"
-levels(df_lf_lig$param_type)[levels(df_lf_lig$param_type)=="rd_values"] <- "RD"
-levels(df_lf_lig$param_type)[levels(df_lf_lig$param_type)=="kd_values"] <- "KD"
-#check
-levels(df_lf_lig$param_type); table(df_lf_lig$param_type)
-
-# assign pretty labels: mutation_info
-levels(df_lf_lig$mutation_info); table(df_lf_lig$mutation_info)
-sum(table(df_lf_lig$mutation_info)) == nrow(df_lf_lig)
-
-levels(df_lf_lig$mutation_info)[levels(df_lf_lig$mutation_info)==dr_muts_col] <- "DM"
-levels(df_lf_lig$mutation_info)[levels(df_lf_lig$mutation_info)==other_muts_col] <- "OM"
-# check
-levels(df_lf_lig$mutation_info); table(df_lf_lig$mutation_info)
-
-#############################################################################
-#===========
-# Data: foldx
-#===========
-# keep similar dtypes cols together
-cols_to_select_foldx = c("mutationinformation", "mutation", "position", "mutation_info"
-                      , "foldx_outcome"
-                      
-                      , "foldx_scaled")
-                      #, "ligand_distance"
-                      #, "asa"
-                      #, "rsa"
-                      #, "rd_values"
-                      #, "kd_values")
-
-
-df_wf_foldx = df_ps[, cols_to_select_foldx]
-
-pivot_cols_foldx = cols_to_select_foldx[1:5]; pivot_cols_foldx
-  
-expected_rows_lf_foldx = nrow(df_wf_foldx) * (length(df_wf_foldx) - length(pivot_cols_foldx))
-expected_rows_lf_foldx
-
-# LF data: foldx
-df_lf_foldx = gather(df_wf_foldx, param_type, param_value, foldx_scaled, factor_key=TRUE)
-
-if (nrow(df_lf_foldx) == expected_rows_lf_foldx){
-  cat("PASS: long format data created for foldx")
-}else{
-  cat("FAIL: long format data could not be created for foldx")
-  exit()
-}
-
-foldx_stability_name = paste0(delta_symbol, delta_symbol, "G")
-foldx_stability_name
-
-# assign pretty labels: param type
-levels(df_lf_foldx$param_type); table(df_lf_foldx$param_type)
-
-#levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="foldx_scaled"] <- "Stability"
-levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="foldx_scaled"] <- foldx_stability_name
-#levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="ligand_distance"] <- "Ligand Distance"
-#levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="asa"] <- "ASA"
-#levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="rsa"] <- "RSA"
-#levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="rd_values"] <- "RD"
-#levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="kd_values"] <- "KD"
-# check
-levels(df_lf_foldx$param_type); table(df_lf_foldx$param_type)
-
-# assign pretty labels: mutation_info
-levels(df_lf_foldx$mutation_info); table(df_lf_foldx$mutation_info)
-sum(table(df_lf_foldx$mutation_info)) == nrow(df_lf_foldx)
-
-levels(df_lf_foldx$mutation_info)[levels(df_lf_foldx$mutation_info)==dr_muts_col] <- "DM"
-levels(df_lf_foldx$mutation_info)[levels(df_lf_foldx$mutation_info)==other_muts_col] <- "OM"
-# check
-levels(df_lf_foldx$mutation_info); table(df_lf_foldx$mutation_info)
-
 ############################################################################
+#==============
+# Deepddg: LF
+#==============
+cols_to_select_deepddg  = c(static_cols_start, c("deepddg_outcome", deepddg_dn), static_cols_end)
+wf_deepddg = comb_df_sl[, cols_to_select_deepddg]
 
-# clear excess variables
-rm(cols_to_select_ps, cols_to_select_foldx, cols_to_select_lig
-   , pivot_cols_ps, pivot_cols_foldx, pivot_cols_lig
-  , expected_rows_lf_ps, expected_rows_lf_foldx, expected_rows_lf_lig
-  , my_max, my_min, na_count, na_count_df2, na_count_df3, dup_muts_nu
-  , c1, c2, n)
+pivot_cols_deepddg = cols_to_select_deepddg[1: (length(static_cols_start) + 1)]; pivot_cols_deepddg
+
+expected_rows_lf = nrow(wf_deepddg) * (length(wf_deepddg) - length(pivot_cols_deepddg))
+expected_rows_lf
+
+# LF data: duet
+lf_deepddg = gather(wf_deepddg
+                  , key = param_type
+                  , value = param_value
+                  , all_of(deepddg_dn):tail(static_cols_end,1)
+                  , factor_key = TRUE)
+
+if (nrow(lf_deepddg) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", deepddg_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+
+############################################################################
+#==============
+# mCSM-NA: LF
+#==============
+cols_to_select_mcsm_na = c(static_cols_start, c("mcsm_na_outcome", mcsm_na_dn), static_cols_end)
+wf_mcsm_na = comb_df_sl[, cols_to_select_mcsm_na]
+
+pivot_cols_mcsm_na = cols_to_select_mcsm_na[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_na
+
+expected_rows_lf = nrow(wf_mcsm_na) * (length(wf_mcsm_na) - length(pivot_cols_mcsm_na))
+expected_rows_lf
+
+# LF data: duet
+lf_mcsm_na = gather(wf_mcsm_na
+                    , key = param_type
+                    , value = param_value
+                    , all_of(mcsm_na_dn):tail(static_cols_end,1)
+                    , factor_key = TRUE)
+
+if (nrow(lf_mcsm_na) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", mcsm_na_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+
+############################################################################
+#==============
+# Dynamut: LF
+#==============
+cols_to_select_dynamut  = c(static_cols_start, c("ddg_dynamut_outcome", dynamut_dn), static_cols_end)
+wf_dynamut = comb_df_sl[, cols_to_select_dynamut]
+
+pivot_cols_dynamut = cols_to_select_dynamut[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut
+
+expected_rows_lf = nrow(wf_dynamut) * (length(wf_dynamut) - length(pivot_cols_dynamut))
+expected_rows_lf
+
+# LF data: duet
+lf_dynamut = gather(wf_dynamut
+                    , key = param_type
+                    , value = param_value
+                    , all_of(dynamut_dn):tail(static_cols_end,1)
+                    , factor_key = TRUE)
+
+if (nrow(lf_dynamut) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", dynamut_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+
+############################################################################
+#==============
+# Dynamut2: LF
+#==============
+cols_to_select_dynamut2 = c(static_cols_start, c("ddg_dynamut2_outcome", dynamut2_dn), static_cols_end)
+
+wf_dynamut2 = comb_df_sl[, cols_to_select_dynamut2]
+
+pivot_cols_dynamut2 = cols_to_select_dynamut2[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut2
+
+expected_rows_lf = nrow(wf_dynamut2) * (length(wf_dynamut2) - length(pivot_cols_dynamut2))
+expected_rows_lf
+
+# LF data: duet
+lf_dynamut2 = gather(wf_dynamut2
+                    , key = param_type
+                    , value = param_value
+                    , all_of(dynamut2_dn):tail(static_cols_end,1)
+                    , factor_key = TRUE)
+
+if (nrow(lf_dynamut2) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", dynamut2_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+
+############################################################################
+#==============
+# EnCOM ddg: LF
+#==============
+cols_to_select_encomddg = c(static_cols_start, c("ddg_encom_outcome", encom_ddg_dn), static_cols_end)
+wf_encomddg = comb_df_sl[, cols_to_select_encomddg]
+
+pivot_cols_encomddg = cols_to_select_encomddg[1: (length(static_cols_start) + 1)]; pivot_cols_encomddg 
+
+expected_rows_lf = nrow(wf_encomddg ) * (length(wf_encomddg ) - length(pivot_cols_encomddg))
+expected_rows_lf
+
+# LF data: encomddg 
+lf_encomddg  = gather(wf_encomddg 
+                     , key = param_type
+                     , value = param_value
+                     , all_of(encom_ddg_dn):tail(static_cols_end,1)
+                     , factor_key = TRUE)
+
+if (nrow(lf_encomddg) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", encom_ddg_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+############################################################################
+#==============
+# EnCOM dds: LF
+#==============
+cols_to_select_encomdds = c(static_cols_start, c("dds_encom_outcome", encom_dds_dn), static_cols_end)
+wf_encomdds = comb_df_sl[, cols_to_select_encomdds]
+
+pivot_cols_encomdds = cols_to_select_encomdds[1: (length(static_cols_start) + 1)]; pivot_cols_encomdds 
+
+expected_rows_lf = nrow(wf_encomdds) * (length(wf_encomdds) - length(pivot_cols_encomdds))
+expected_rows_lf
+
+# LF data: encomddg 
+lf_encomdds  = gather(wf_encomdds
+                      , key = param_type
+                      , value = param_value
+                      , all_of(encom_dds_dn):tail(static_cols_end,1)
+                      , factor_key = TRUE)
+
+if (nrow(lf_encomdds) == expected_rows_lf){
+  cat("\nPASS: long format data created for", encom_dds_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+
+############################################################################
+#==============
+# SDM: LF
+#==============
+cols_to_select_sdm = c(static_cols_start, c("ddg_sdm_outcome", sdm_dn), static_cols_end)
+wf_sdm = comb_df_sl[, cols_to_select_sdm]
+
+pivot_cols_sdm = cols_to_select_sdm[1: (length(static_cols_start) + 1)]; pivot_cols_sdm
+
+expected_rows_lf = nrow(wf_sdm) * (length(wf_sdm) - length(pivot_cols_sdm))
+expected_rows_lf
+
+# LF data: encomddg 
+lf_sdm  = gather(wf_sdm
+                 , key = param_type
+                 , value = param_value
+                 , all_of(sdm_dn):tail(static_cols_end,1)
+                 , factor_key = TRUE)
+
+if (nrow(lf_sdm) == expected_rows_lf){
+  cat("\nPASS: long format data created for", sdm_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+
+############################################################################
+#==============
+# mCSM: LF
+#==============
+cols_to_select_mcsm  = c(static_cols_start, c("ddg_mcsm_outcome", mcsm_dn), static_cols_end)
+wf_mcsm = comb_df_sl[, cols_to_select_mcsm]
+
+pivot_cols_mcsm = cols_to_select_mcsm[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm
+
+expected_rows_lf = nrow(wf_mcsm) * (length(wf_mcsm) - length(pivot_cols_mcsm))
+expected_rows_lf
+
+# LF data: encomddg 
+lf_mcsm  = gather(wf_mcsm
+                 , key = param_type
+                 , value = param_value
+                 , all_of(mcsm_dn):tail(static_cols_end,1)
+                 , factor_key = TRUE)
+
+if (nrow(lf_mcsm) == expected_rows_lf){
+  cat("\nPASS: long format data created for", mcsm_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+############################################################################
+# # clear excess variables
+# rm(all_plot_dfs
+#    , cols_dynamut2_df
+#    , cols_mcsm_df
+#    , cols_mcsm_na_df
+#    , comb_df
+#    , corr_data_ps
+#    , corr_ps_df3
+#    , df_lf_ps
+#    , foo
+#    , foo_cnames
+#    , gene_metadata
+#    , logo_data
+#    , logo_data_or_mult
+#    , logo_data_plot
+#    , logo_data_plot_logor
+#    , logo_data_plot_or
+#    , my_data_snp
+#    , my_df
+#    , my_df_u
+#    , ols_mcsm_df
+#    , other_muts
+#    , pd_df
+#    , subcols_df_ps
+#    , tab_mt
+#    , wide_df_logor
+#    , wide_df_logor_m
+#    , wide_df_or
+#    , wide_df_or_mult
+#    , wt)
+# 
+# 
+# rm(c3, c4, check1
+#    , cols_check
+#    , cols_to_select
+#    , cols_to_select_deepddg
+#    , cols_to_select_duet
+#    , cols_to_select_dynamut
+#    , cols_to_select_dynamut2
+#    , cols_to_select_encomddg
+#    , cols_to_select_encomdds
+#    , cols_to_select_mcsm
+#    , cols_to_select_mcsm_na
+#    , cols_to_select_sdm)

From 6e01ef22c00070ef820817b2f71c0709129f81a6 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 26 Aug 2021 16:37:56 +0100
Subject: [PATCH 02/51] added stat_bp_stability.R which needs to be a function
 for generating stat plots

---
 scripts/functions/stat_bp_stability.R | 51 +++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 scripts/functions/stat_bp_stability.R

diff --git a/scripts/functions/stat_bp_stability.R b/scripts/functions/stat_bp_stability.R
new file mode 100644
index 0000000..a34b66f
--- /dev/null
+++ b/scripts/functions/stat_bp_stability.R
@@ -0,0 +1,51 @@
+my_stat_ps = compare_means(param_value~mutation_info, group.by = "param_type"
+                           , data = df_lf_ps,  paired = FALSE, p.adjust.method = "BH")
+
+y_value = "param_value"
+
+stat_bp_mut <- function(df
+                        , x_bp_cateog = "mutation_info"
+                        , y_var = "param_value"
+                        , facet_var = "param_type"
+                        , scales = "free_y"
+                        , title = ""
+                        , col_categ = "duet_outcome"
+                        , grp_comp = "my_comparisons"
+                        , stat_method = "wilcox.test"
+                        , my_paired = FALSE
+                        #, stat_label = "p.format")
+                        , stat_label = "p.signif" )
+
+p1 = ggplot(df_lf_ps, aes(x = mutation_info
+                          , y = eval(parse(text = y_value)) ))  + 
+  facet_wrap(~ param_type
+             , nrow = 1
+             , scales = "free_y") + 
+  geom_boxplot(fill = "white", outlier.colour = NA
+               #, position = position_dodge(width = 0.9)
+               , width = 0.2) +
+  geom_point(position = position_jitterdodge(dodge.width=0.01)
+             , alpha = 0.5
+             , show.legend = FALSE
+             , aes(colour = factor(duet_outcome))) +
+  theme(axis.text.x = element_text(size = my_ats)
+        , axis.text.y = element_text(size = my_ats
+                                     , angle = 0
+                                     , hjust = 1
+                                     , vjust = 0)
+        , axis.title.x = element_text(size = my_ats)
+        , axis.title.y = element_text(size = my_ats)
+        , plot.title = element_text(size = my_pts , hjust = 0.5, colour = "black", face = "bold")
+        , strip.background = element_rect(fill = "khaki2")
+        , strip.text.x = element_text(size = my_fls, colour = "black")
+        , legend.title = element_text(color = "black", size = my_als)
+        , legend.text = element_text(size = my_ats)
+        , legend.direction = "vertical") +
+  labs(title = "DUET"
+       , x = ""
+       , y = "")+ 
+  stat_compare_means(comparisons = my_comparisons
+                     , method = "wilcox.test"
+                     , paired = FALSE
+                     #, label = "p.format")
+                     , label = "p.signif")
\ No newline at end of file

From da9bb677060a468a8fe7bcbd996bff5a5183d6eb Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Fri, 27 Aug 2021 13:01:52 +0100
Subject: [PATCH 03/51] added function for stats from lf data

---
 scripts/functions/lf_unpaired_stats.R      | 21 +++++
 scripts/functions/stat_bp_stability.R      | 57 +++++++++++++-
 scripts/functions/test_lf_unpaired_stats.R | 17 ++++
 scripts/plotting/other_plots_data.R        | 92 +++++++++++-----------
 4 files changed, 137 insertions(+), 50 deletions(-)
 create mode 100644 scripts/functions/lf_unpaired_stats.R
 create mode 100644 scripts/functions/test_lf_unpaired_stats.R

diff --git a/scripts/functions/lf_unpaired_stats.R b/scripts/functions/lf_unpaired_stats.R
new file mode 100644
index 0000000..28a8ad0
--- /dev/null
+++ b/scripts/functions/lf_unpaired_stats.R
@@ -0,0 +1,21 @@
+library(ggpubr)
+###################################################################
+
+lf_unpaired_stats <- function(lf_data
+                              , lf_stat_value = "param_value"
+                              , lf_stat_group = "mutation_info"
+                              , lf_col_statvars = "param_type"
+                              , my_paired = FALSE
+                              , stat_adj = "none"){
+  
+  stat_formula = as.formula(paste0(lf_stat_value, "~", lf_stat_group))
+  
+  my_stat_df = compare_means(stat_formula
+                             , group.by = lf_col_statvars
+                             , data = lf_data
+                             , paired = my_paired
+                             , p.adjust.method =  stat_adj)  
+  
+  
+  return(my_stat_df)
+}
\ No newline at end of file
diff --git a/scripts/functions/stat_bp_stability.R b/scripts/functions/stat_bp_stability.R
index a34b66f..8ca4a7f 100644
--- a/scripts/functions/stat_bp_stability.R
+++ b/scripts/functions/stat_bp_stability.R
@@ -1,8 +1,49 @@
-my_stat_ps = compare_means(param_value~mutation_info, group.by = "param_type"
-                           , data = df_lf_ps,  paired = FALSE, p.adjust.method = "BH")
+library(ggpubr)
+###################################################################
+
+my_unpaired_stats <- function(lf_data
+                              , lf_stat_value = "param_value"
+                              , lf_stat_group = "mutation_info"
+                              , lf_col_statvars = "param_type"
+                              , my_paired = FALSE
+                              , stat_adj = "none"){
+  
+  stat_formula = as.formula(paste0(lf_stat_value, "~", lf_stat_group))
+  
+  my_stat_df = compare_means(stat_formula
+                             , group.by = lf_col_statvars
+                             , data = lf_data
+                             , paired = my_paired
+                             , p.adjust.method =  stat_adj)  
+  
+  
+  return(my_stat_df)
+}
+  
+#####################
+# call stat function
+#####################
+stat_results_df <- my_unpaired_stats(lf_data =  lf_duet
+                  , lf_stat_value = "param_value"
+                  , lf_stat_group = "mutation_info"
+                  , lf_col_statvars = "param_type"
+                  , my_paired = FALSE
+                  , stat_adj = "none"
+)
 
 y_value = "param_value"
 
+#################################
+my_comparisons <- list( c("DM", "OM") )
+
+my_ats = 22 # axis text size
+my_als = 20 # axis label size
+my_fls = 20 # facet label size
+my_pts = 22 # plot title size
+
+####################################
+
+
 stat_bp_mut <- function(df
                         , x_bp_cateog = "mutation_info"
                         , y_var = "param_value"
@@ -16,7 +57,12 @@ stat_bp_mut <- function(df
                         #, stat_label = "p.format")
                         , stat_label = "p.signif" )
 
-p1 = ggplot(df_lf_ps, aes(x = mutation_info
+
+#############################
+y_value = "param_value"
+
+
+p1 = ggplot(lf_duet, aes(x = mutation_info
                           , y = eval(parse(text = y_value)) ))  + 
   facet_wrap(~ param_type
              , nrow = 1
@@ -48,4 +94,7 @@ p1 = ggplot(df_lf_ps, aes(x = mutation_info
                      , method = "wilcox.test"
                      , paired = FALSE
                      #, label = "p.format")
-                     , label = "p.signif")
\ No newline at end of file
+                     , label = "p.signif")
+
+p1
+
diff --git a/scripts/functions/test_lf_unpaired_stats.R b/scripts/functions/test_lf_unpaired_stats.R
new file mode 100644
index 0000000..9ec4aac
--- /dev/null
+++ b/scripts/functions/test_lf_unpaired_stats.R
@@ -0,0 +1,17 @@
+setwd("~/git/LSHTM_analysis/scripts/functions")
+source("lf_unpaired_stats.R")
+
+# run other_plots_data.R
+# to get the df you want to test this function 
+
+
+#####################
+# call stat function
+#####################
+stat_results_df <- lf_unpaired_stats(lf_data =  lf_duet
+                  , lf_stat_value = "param_value"
+                  , lf_stat_group = "mutation_info"
+                  , lf_col_statvars = "param_type"
+                  , my_paired = FALSE
+                  , stat_adj = "none"
+)
\ No newline at end of file
diff --git a/scripts/plotting/other_plots_data.R b/scripts/plotting/other_plots_data.R
index 8fc9e00..d2229b8 100644
--- a/scripts/plotting/other_plots_data.R
+++ b/scripts/plotting/other_plots_data.R
@@ -6,7 +6,7 @@
 #=======================================================================
 # working dir and loading libraries
 # getwd()
-# setwd("~/git/LSHTM_analysis/scripts/plotting")
+setwd("~/git/LSHTM_analysis/scripts/plotting")
 # getwd()
 
 # make cmd
@@ -14,7 +14,7 @@
 # drug = "streptomycin"
 # gene = "gid"
 
-#source("get_plotting_dfs.R")
+source("get_plotting_dfs.R")
 #=======================================================================
 # MOVE TO COMBINE or singular file for deepddg
 
@@ -492,47 +492,47 @@ if (nrow(lf_mcsm) == expected_rows_lf){
   quit()
 }
 ############################################################################
-# # clear excess variables
-# rm(all_plot_dfs
-#    , cols_dynamut2_df
-#    , cols_mcsm_df
-#    , cols_mcsm_na_df
-#    , comb_df
-#    , corr_data_ps
-#    , corr_ps_df3
-#    , df_lf_ps
-#    , foo
-#    , foo_cnames
-#    , gene_metadata
-#    , logo_data
-#    , logo_data_or_mult
-#    , logo_data_plot
-#    , logo_data_plot_logor
-#    , logo_data_plot_or
-#    , my_data_snp
-#    , my_df
-#    , my_df_u
-#    , ols_mcsm_df
-#    , other_muts
-#    , pd_df
-#    , subcols_df_ps
-#    , tab_mt
-#    , wide_df_logor
-#    , wide_df_logor_m
-#    , wide_df_or
-#    , wide_df_or_mult
-#    , wt)
-# 
-# 
-# rm(c3, c4, check1
-#    , cols_check
-#    , cols_to_select
-#    , cols_to_select_deepddg
-#    , cols_to_select_duet
-#    , cols_to_select_dynamut
-#    , cols_to_select_dynamut2
-#    , cols_to_select_encomddg
-#    , cols_to_select_encomdds
-#    , cols_to_select_mcsm
-#    , cols_to_select_mcsm_na
-#    , cols_to_select_sdm)
+# clear excess variables
+rm(all_plot_dfs
+   , cols_dynamut2_df
+   , cols_mcsm_df
+   , cols_mcsm_na_df
+   , comb_df
+   , corr_data_ps
+   , corr_ps_df3
+   , df_lf_ps
+   , foo
+   , foo_cnames
+   , gene_metadata
+   , logo_data
+   , logo_data_or_mult
+   , logo_data_plot
+   , logo_data_plot_logor
+   , logo_data_plot_or
+   , my_data_snp
+   , my_df
+   , my_df_u
+   , ols_mcsm_df
+   , other_muts
+   , pd_df
+   , subcols_df_ps
+   , tab_mt
+   , wide_df_logor
+   , wide_df_logor_m
+   , wide_df_or
+   , wide_df_or_mult
+   , wt)
+
+
+rm(c3, c4, check1
+   , cols_check
+   , cols_to_select
+   , cols_to_select_deepddg
+   , cols_to_select_duet
+   , cols_to_select_dynamut
+   , cols_to_select_dynamut2
+   , cols_to_select_encomddg
+   , cols_to_select_encomdds
+   , cols_to_select_mcsm
+   , cols_to_select_mcsm_na
+   , cols_to_select_sdm)

From edb409baef92d835d73e0c4ae9533290c0702d39 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Fri, 27 Aug 2021 13:03:39 +0100
Subject: [PATCH 04/51] renamed dm_om barplot function script to
 lf_bp_stability.R

---
 scripts/functions/{stat_bp_stability.R => lf_bp_stability.R} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename scripts/functions/{stat_bp_stability.R => lf_bp_stability.R} (100%)

diff --git a/scripts/functions/stat_bp_stability.R b/scripts/functions/lf_bp_stability.R
similarity index 100%
rename from scripts/functions/stat_bp_stability.R
rename to scripts/functions/lf_bp_stability.R

From 826d3c72b724a27739bf5225beb145ec7eadbe72 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Fri, 27 Aug 2021 14:05:00 +0100
Subject: [PATCH 05/51] added functions for bp with stat and tested them

---
 scripts/functions/lf_bp_stability.R        | 100 ---------------------
 scripts/functions/lf_bp_with_stats.R       |  68 ++++++++++++++
 scripts/functions/test_lf_bp_with_stats.R  |  28 ++++++
 scripts/functions/test_lf_unpaired_stats.R |  12 +--
 4 files changed, 103 insertions(+), 105 deletions(-)
 delete mode 100644 scripts/functions/lf_bp_stability.R
 create mode 100644 scripts/functions/lf_bp_with_stats.R
 create mode 100644 scripts/functions/test_lf_bp_with_stats.R

diff --git a/scripts/functions/lf_bp_stability.R b/scripts/functions/lf_bp_stability.R
deleted file mode 100644
index 8ca4a7f..0000000
--- a/scripts/functions/lf_bp_stability.R
+++ /dev/null
@@ -1,100 +0,0 @@
-library(ggpubr)
-###################################################################
-
-my_unpaired_stats <- function(lf_data
-                              , lf_stat_value = "param_value"
-                              , lf_stat_group = "mutation_info"
-                              , lf_col_statvars = "param_type"
-                              , my_paired = FALSE
-                              , stat_adj = "none"){
-  
-  stat_formula = as.formula(paste0(lf_stat_value, "~", lf_stat_group))
-  
-  my_stat_df = compare_means(stat_formula
-                             , group.by = lf_col_statvars
-                             , data = lf_data
-                             , paired = my_paired
-                             , p.adjust.method =  stat_adj)  
-  
-  
-  return(my_stat_df)
-}
-  
-#####################
-# call stat function
-#####################
-stat_results_df <- my_unpaired_stats(lf_data =  lf_duet
-                  , lf_stat_value = "param_value"
-                  , lf_stat_group = "mutation_info"
-                  , lf_col_statvars = "param_type"
-                  , my_paired = FALSE
-                  , stat_adj = "none"
-)
-
-y_value = "param_value"
-
-#################################
-my_comparisons <- list( c("DM", "OM") )
-
-my_ats = 22 # axis text size
-my_als = 20 # axis label size
-my_fls = 20 # facet label size
-my_pts = 22 # plot title size
-
-####################################
-
-
-stat_bp_mut <- function(df
-                        , x_bp_cateog = "mutation_info"
-                        , y_var = "param_value"
-                        , facet_var = "param_type"
-                        , scales = "free_y"
-                        , title = ""
-                        , col_categ = "duet_outcome"
-                        , grp_comp = "my_comparisons"
-                        , stat_method = "wilcox.test"
-                        , my_paired = FALSE
-                        #, stat_label = "p.format")
-                        , stat_label = "p.signif" )
-
-
-#############################
-y_value = "param_value"
-
-
-p1 = ggplot(lf_duet, aes(x = mutation_info
-                          , y = eval(parse(text = y_value)) ))  + 
-  facet_wrap(~ param_type
-             , nrow = 1
-             , scales = "free_y") + 
-  geom_boxplot(fill = "white", outlier.colour = NA
-               #, position = position_dodge(width = 0.9)
-               , width = 0.2) +
-  geom_point(position = position_jitterdodge(dodge.width=0.01)
-             , alpha = 0.5
-             , show.legend = FALSE
-             , aes(colour = factor(duet_outcome))) +
-  theme(axis.text.x = element_text(size = my_ats)
-        , axis.text.y = element_text(size = my_ats
-                                     , angle = 0
-                                     , hjust = 1
-                                     , vjust = 0)
-        , axis.title.x = element_text(size = my_ats)
-        , axis.title.y = element_text(size = my_ats)
-        , plot.title = element_text(size = my_pts , hjust = 0.5, colour = "black", face = "bold")
-        , strip.background = element_rect(fill = "khaki2")
-        , strip.text.x = element_text(size = my_fls, colour = "black")
-        , legend.title = element_text(color = "black", size = my_als)
-        , legend.text = element_text(size = my_ats)
-        , legend.direction = "vertical") +
-  labs(title = "DUET"
-       , x = ""
-       , y = "")+ 
-  stat_compare_means(comparisons = my_comparisons
-                     , method = "wilcox.test"
-                     , paired = FALSE
-                     #, label = "p.format")
-                     , label = "p.signif")
-
-p1
-
diff --git a/scripts/functions/lf_bp_with_stats.R b/scripts/functions/lf_bp_with_stats.R
new file mode 100644
index 0000000..336c270
--- /dev/null
+++ b/scripts/functions/lf_bp_with_stats.R
@@ -0,0 +1,68 @@
+library(ggpubr)
+###################################################################
+
+####################################
+lf_bp_with_stats <- function(lf_df
+                        , x_grp = "mutation_info"
+                        , y_var = "param_value"
+                        , facet_var = "param_type"
+                        , n_facet_row = 1
+                        , y_scales = "free_y"
+                        , p_title = ""
+                        , colour_categ = ""
+                        , colour_bp_strip = "khaki2"
+                        , stat_grp_comp = c("DM", "OM")
+                        , stat_method = "wilcox.test"
+                        , my_paired = FALSE
+                        #, stat_label = "p.format")
+                        , stat_label = c("p.format", "p.signif")
+                        , my_ats = 22 # axis text size
+                        , my_als = 20 # axis label size
+                        , my_fls = 20 # facet label size
+                        , my_pts = 22 # plot title size
+) {
+  my_comparisonsL <- list( stat_grp_comp )
+
+  bp_statP <- ggplot(lf_df, aes(x = eval(parse(text = x_grp))
+                    , y = eval(parse(text = y_var)) ))  + 
+    
+    facet_wrap(~ eval(parse(text = facet_var))
+               , nrow = n_facet_row
+               , scales = y_scales) +
+    
+    geom_boxplot(fill = "white", outlier.colour = NA
+                 #, position = position_dodge(width = 0.9)
+                 , width = 0.2) +
+    
+    geom_point(position = position_jitterdodge(dodge.width = 0.01)
+               , alpha = 0.5
+               , show.legend = FALSE
+               , aes(colour = factor(eval(parse(text = colour_categ))) )) +
+    
+    theme(axis.text.x = element_text(size = my_ats)
+          , axis.text.y = element_text(size = my_ats
+                                       , angle = 0
+                                       , hjust = 1
+                                       , vjust = 0)
+          , axis.title.x = element_text(size = my_ats)
+          , axis.title.y = element_text(size = my_ats)
+          , plot.title = element_text(size = my_pts , hjust = 0.5, colour = "black", face = "bold")
+          , strip.background = element_rect(fill = colour_bp_strip)
+          , strip.text.x = element_text(size = my_fls, colour = "black")
+          , legend.title = element_text(color = "black", size = my_als)
+          , legend.text = element_text(size = my_ats)
+          , legend.direction = "vertical") +
+    
+    labs(title = p_title
+         , x = ""
+         , y = "")+ 
+  
+    stat_compare_means(comparisons = my_comparisonsL
+                       , method = stat_method
+                       , paired = my_paired
+                       #, label = "p.format")
+                       , label = stat_label[1])
+  
+  return(bp_statP)
+
+}
diff --git a/scripts/functions/test_lf_bp_with_stats.R b/scripts/functions/test_lf_bp_with_stats.R
new file mode 100644
index 0000000..4cfff9d
--- /dev/null
+++ b/scripts/functions/test_lf_bp_with_stats.R
@@ -0,0 +1,28 @@
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+
+source("../functions/lf_bp_with_stats.R")
+
+######################
+# call function
+######################
+# Note: Data
+# run other_plots_data.R
+# to get the long format data to test this function 
+
+lf_bp_with_stats(lf_df = lf_dynamut2
+                       , x_grp = "mutation_info"
+                       , y_var = "param_value"
+                       , facet_var = "param_type"
+                       , n_facet_row = 1
+                       , y_scales = "free_y"
+                       , p_title = "Dynamut2"
+                       , colour_categ = "ddg_dynamut2_outcome"
+                       , stat_grp_comp = c("DM", "OM")
+                       , stat_method = "wilcox.test"
+                       , my_paired = FALSE
+                       #, stat_label = "p.format")
+                       , stat_label = c("p.format", "p.signif")
+                       , my_ats = 22 # axis text size
+                       , my_als = 20 # axis label size
+                       , my_fls = 20 # facet label size
+                       , my_pts = 22 )# plot title size 
diff --git a/scripts/functions/test_lf_unpaired_stats.R b/scripts/functions/test_lf_unpaired_stats.R
index 9ec4aac..a1ff9d1 100644
--- a/scripts/functions/test_lf_unpaired_stats.R
+++ b/scripts/functions/test_lf_unpaired_stats.R
@@ -1,13 +1,15 @@
 setwd("~/git/LSHTM_analysis/scripts/functions")
 source("lf_unpaired_stats.R")
 
+#####################
+# call stat function()
+# a useful way to check stats
+# for any lf data
+#####################
+# Note: Data
 # run other_plots_data.R
-# to get the df you want to test this function 
+# to get the long format data to test this function 
 
-
-#####################
-# call stat function
-#####################
 stat_results_df <- lf_unpaired_stats(lf_data =  lf_duet
                   , lf_stat_value = "param_value"
                   , lf_stat_group = "mutation_info"

From fcb4b85747c50ef5318f40254f3852d5d20cf21f Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 2 Sep 2021 12:50:24 +0100
Subject: [PATCH 06/51] modified bp with option for adding stats and
 boxplots. Moved old one to redundant

---
 scripts/functions/lf_bp.R                     | 193 ++++++++++++++++++
 .../{ => redundant}/lf_bp_with_stats.R        |  55 +++--
 .../redundant/test_lf_bp_with_stats.R         |  83 ++++++++
 scripts/functions/test_lf_bp.R                |  55 +++++
 scripts/functions/test_lf_bp_with_stats.R     |  28 ---
 scripts/plotting/Header_TT.R                  |  67 +++---
 scripts/plotting/get_plotting_dfs.R           |  25 ++-
 scripts/plotting/other_plots_data.R           |  39 ++--
 8 files changed, 443 insertions(+), 102 deletions(-)
 create mode 100644 scripts/functions/lf_bp.R
 rename scripts/functions/{ => redundant}/lf_bp_with_stats.R (54%)
 create mode 100644 scripts/functions/redundant/test_lf_bp_with_stats.R
 create mode 100644 scripts/functions/test_lf_bp.R
 delete mode 100644 scripts/functions/test_lf_bp_with_stats.R

diff --git a/scripts/functions/lf_bp.R b/scripts/functions/lf_bp.R
new file mode 100644
index 0000000..4677548
--- /dev/null
+++ b/scripts/functions/lf_bp.R
@@ -0,0 +1,193 @@
+#############################
+# lf_bp(): violin + beeswarm plot (ggplot) of long format data,
+# one facet per value of facet_var; column arguments are column
+# names given as strings. Optional extras:
+#   boxplot overlay +/- (make_boxplot)
+#   stats           +/- (add_stats)
+#############################
+
+lf_bp <- function(lf_df
+                  , p_title = ""
+                  , colour_categ = ""
+                  , x_grp = "mutation_info"
+                  , y_var = "param_value"
+                  , facet_var = "param_type"
+                  , n_facet_row = 1
+                  , y_scales = "free_y"
+                  , colour_bp_strip = "khaki2"
+                  , dot_size = 3
+                  , dot_transparency = 0.3
+                  , violin_quantiles = c(0.25, 0.5, 0.75) # can be NULL
+                  , my_ats = 22 # axis text size
+                  , my_als = 20 # legend title size (axis titles use my_ats)
+                  , my_fls = 20 # facet label size
+                  , my_pts = 22 # plot title size
+                  , make_boxplot = FALSE
+                  , bp_width = c("auto", 0.5) # "auto" or a number; first element is used
+                  , add_stats = FALSE
+                  , stat_grp_comp = c("DM", "OM")
+                  , stat_method = "wilcox.test"
+                  , my_paired = FALSE
+                  , stat_label = c("p.format", "p.signif") ){ # first element is used
+
+  # Base layers: x/y/facet columns are looked up by name; violins are always drawn
+  p1 <- ggplot(lf_df, aes(x = eval(parse(text = x_grp))
+                          , y = eval(parse(text = y_var)))) +
+
+    facet_wrap(~ eval(parse(text = facet_var))
+               , nrow = n_facet_row
+               , scales = y_scales) +
+
+    geom_violin(trim = TRUE
+                , scale = "width"
+                , draw_quantiles = violin_quantiles)
+
+  p2 <- p1
+
+  if (make_boxplot){
+    # The default bp_width is c("auto", 0.5), a length-2 vector: only its
+    # first element is meaningful, and if() needs a length-1 condition
+    # (a length-2 condition is an error since R 4.2.0).
+    bp_width <- bp_width[1]
+
+    if (identical(bp_width, "auto")){
+      # 0.5 divided by the number of distinct groups on the x axis
+      bp_width <- 0.5/length(unique(lf_df[[x_grp]]))
+      cat("\nAutomatically calculated boxplot width, using bp_width:\n", bp_width, "\n")
+    }else{
+      cat("\nBoxplot width value provided, using:",  bp_width, "\n")
+      # a width taken from a c("auto", 0.5)-style vector arrives as character
+      bp_width <- as.numeric(bp_width)
+    }
+
+    # white boxplot with outlier points hidden (outlier.colour = NA)
+    p2 <- p2 + geom_boxplot(fill = "white"
+                            , outlier.colour = NA
+                            , width = bp_width)
+  }
+
+  # ggbeeswarm (better than geom_point): dots coloured by colour_categ, added last
+  p2 <- p2 + geom_beeswarm(priority = "density"
+                           , size = dot_size
+                           , alpha = dot_transparency
+                           , show.legend = FALSE
+                           , cex = 0.8
+                           , aes(colour = factor(eval(parse(text = colour_categ))) ))
+
+  # Add formatting to graph
+  OutPlot <- p2 + theme(axis.text.x = element_text(size = my_ats)
+                   , axis.text.y = element_text(size = my_ats
+                                       , angle = 0
+                                       , hjust = 1
+                                       , vjust = 0)
+                   , axis.title.x = element_text(size = my_ats)
+                   , axis.title.y = element_text(size = my_ats)
+                   , plot.title = element_text(size = my_pts
+                                      , hjust = 0.5
+                                      , colour = "black"
+                                      , face = "bold")
+                   , strip.background = element_rect(fill = colour_bp_strip)
+                   , strip.text.x = element_text(size = my_fls
+                                                  , colour = "black")
+                   , legend.title = element_text(color = "black"
+                                                  , size = my_als)
+                   , legend.text = element_text(size = my_ats)
+                   , legend.direction = "vertical") +
+
+    labs(title = p_title
+         , x = ""
+         , y = "")
+
+  if (add_stats){
+    # a single comparison made of the groups listed in stat_grp_comp;
+    # stat_label[1] picks the first of the offered label formats
+    OutPlot <- OutPlot + stat_compare_means(comparisons = list(stat_grp_comp)
+                                            , method = stat_method
+                                            , paired = my_paired
+                                            , label = stat_label[1])
+  }
+  return(OutPlot)
+}
+
+#############################
+# lf_bp_plotly(): plotly version of lf_bp(), NO stats
+# violin + beeswarm built with ggplot, converted by ggplotly()
+# column arguments are column names given as strings
+# bp_width is accepted but not used (no boxplot layer yet)
+#
+# TODO: plot_ly()
+#############################
+lf_bp_plotly <- function(lf_df
+                         , p_title = ""
+                         , colour_categ = ""
+                         , x_grp = "mutation_info"
+                         , y_var = "param_value"
+                         , facet_var = "param_type"
+                         , n_facet_row = 1
+                         , y_scales = "free_y"
+                         , colour_bp_strip = "khaki2"
+                         , dot_size = 3
+                         , dot_transparency = 0.3
+                         , violin_quantiles = c(0.25, 0.5, 0.75) # can be NULL
+                         , my_ats = 20 # axis text size
+                         , my_als = 18 # axis label size
+                         , my_fls = 18 # facet label size
+                         , my_pts = 22 # plot title size
+                         #, make_boxplot = FALSE
+                         , bp_width = c("auto", 0.5)
+                         #, add_stats = FALSE
+                         #, stat_grp_comp = c("DM", "OM")
+                         #, stat_method = "wilcox.test"
+                         #, my_paired = FALSE
+                         #, stat_label = c("p.format", "p.signif") 
+                         ){
+  # label1..label3: extra aesthetics, presumably for the ggplotly tooltip (see TODO)
+  OutPlotly = ggplot(lf_df, aes(x = eval(parse(text = x_grp))
+                              , y = eval(parse(text = y_var))
+                              , label1 = x_grp
+                              , label2 = y_var
+                              , label3 = colour_categ) )  +
+
+      facet_wrap(~ eval(parse(text = facet_var))
+               , nrow = n_facet_row
+               , scales = y_scales) +
+
+    geom_violin(trim = TRUE
+                , scale = "width"
+                , draw_quantiles = violin_quantiles) +
+
+   geom_beeswarm(priority = "density"
+                  , size = dot_size
+                  , alpha = dot_transparency
+                  , show.legend = FALSE
+                  , cex = 0.8
+                  , aes(colour = factor(eval(parse(text = colour_categ) ) ) ) ) +
+    theme(axis.text.x = element_text(size = my_ats)
+          , axis.text.y = element_text(size = my_ats
+                                       , angle = 0
+                                       , hjust = 1
+                                       , vjust = 0)
+          , axis.title.x = element_text(size = my_ats)
+          , axis.title.y = element_text(size = my_ats)
+          , plot.title = element_text(size = my_pts
+                             , hjust = 0.5
+                             , colour = "black"
+                             , face = "bold")
+          , strip.background = element_rect(fill = colour_bp_strip)
+          , strip.text.x = element_text(size = my_fls
+                                         , colour = "black")
+          , legend.title = element_text(color = "black"
+                                         , size = my_als)
+          , legend.text = element_text(size = my_ats)
+          , legend.position = "none")+
+
+    labs(title = p_title
+         , x = ""
+         , y = "") 
+
+  OutPlotly = ggplotly(OutPlotly
+                       #, tooltip = c("label")
+                       )
+  return(OutPlotly)
+
+}
diff --git a/scripts/functions/lf_bp_with_stats.R b/scripts/functions/redundant/lf_bp_with_stats.R
similarity index 54%
rename from scripts/functions/lf_bp_with_stats.R
rename to scripts/functions/redundant/lf_bp_with_stats.R
index 336c270..22533d5 100644
--- a/scripts/functions/lf_bp_with_stats.R
+++ b/scripts/functions/redundant/lf_bp_with_stats.R
@@ -14,13 +14,23 @@ lf_bp_with_stats <- function(lf_df
                         , stat_grp_comp = c("DM", "OM")
                         , stat_method = "wilcox.test"
                         , my_paired = FALSE
-                        #, stat_label = "p.format")
+                        , bp_width = c("auto", 0.5)
+                        , dot_size = 3
+                        , dot_transparency = 0.3
                         , stat_label = c("p.format", "p.signif")
                         , my_ats = 22 # axis text size
                         , my_als = 20 # axis label size
                         , my_fls = 20 # facet label size
                         , my_pts = 22 # plot title size
 ) {
+  if (bp_width == "auto"){
+  bp_width = 0.5/length(unique(lf_df[[x_grp]]))
+  cat("\nAutomatically calculated boxplot width, using bp_width:\n", bp_width, "\n")
+  }else{
+  cat("\nBoxplot width value provided, using:",  bp_width, "\n")
+  bp_width = bp_width
+  }
+  
   my_comparisonsL <- list( stat_grp_comp )
 
   bp_statP <- ggplot(lf_df, aes(x = eval(parse(text = x_grp))
@@ -30,15 +40,30 @@ lf_bp_with_stats <- function(lf_df
                , nrow = n_facet_row
                , scales = y_scales) +
     
-    geom_boxplot(fill = "white", outlier.colour = NA
-                 #, position = position_dodge(width = 0.9)
-                 , width = 0.2) +
-    
-    geom_point(position = position_jitterdodge(dodge.width = 0.01)
-               , alpha = 0.5
-               , show.legend = FALSE
-               , aes(colour = factor(eval(parse(text = colour_categ))) )) +
+    geom_violin(trim = T
+                , scale = "width"
+                #, position = position_dodge(width = 0.9)
+                , draw_quantiles = c(0.25, 0.5, 0.75)) + 
     
+    # geom_boxplot(fill = "white"
+    #              , outlier.colour = NA
+    #              #, position = position_dodge(width = 0.9)
+    #              , width = bp_width) +
+
+    # geom_point(position = position_jitterdodge(dodge.width = 0.5)
+    #            , alpha = 0.5
+    #            , show.legend = FALSE
+    #            , aes(colour = factor(eval(parse(text = colour_categ))) )) +
+
+    # ggbeeswarm (better than geom_point)
+      geom_beeswarm(priority = "density"
+                    #, shape = 21
+                    , size = dot_size
+                    , alpha = dot_transparency
+                    , show.legend = FALSE
+                    , cex = 0.8
+                    , aes(colour = factor(eval(parse(text = colour_categ))) )) +
+
     theme(axis.text.x = element_text(size = my_ats)
           , axis.text.y = element_text(size = my_ats
                                        , angle = 0
@@ -46,10 +71,15 @@ lf_bp_with_stats <- function(lf_df
                                        , vjust = 0)
           , axis.title.x = element_text(size = my_ats)
           , axis.title.y = element_text(size = my_ats)
-          , plot.title = element_text(size = my_pts , hjust = 0.5, colour = "black", face = "bold")
+          , plot.title = element_text(size = my_pts
+                                      , hjust = 0.5
+                                      , colour = "black"
+                                      , face = "bold")
           , strip.background = element_rect(fill = colour_bp_strip)
-          , strip.text.x = element_text(size = my_fls, colour = "black")
-          , legend.title = element_text(color = "black", size = my_als)
+          , strip.text.x = element_text(size = my_fls
+                                        , colour = "black")
+          , legend.title = element_text(color = "black"
+                                        , size = my_als)
           , legend.text = element_text(size = my_ats)
           , legend.direction = "vertical") +
     
@@ -60,7 +90,6 @@ lf_bp_with_stats <- function(lf_df
     stat_compare_means(comparisons = my_comparisonsL
                        , method = stat_method
                        , paired = my_paired
-                       #, label = "p.format")
                        , label = stat_label[1])
   
   return(bp_statP)
diff --git a/scripts/functions/redundant/test_lf_bp_with_stats.R b/scripts/functions/redundant/test_lf_bp_with_stats.R
new file mode 100644
index 0000000..51654a1
--- /dev/null
+++ b/scripts/functions/redundant/test_lf_bp_with_stats.R
@@ -0,0 +1,83 @@
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+# lf_bp_with_stats.R now lives in functions/redundant/ (renamed in this commit)
+source("../functions/redundant/lf_bp_with_stats.R")
+source("../functions/lf_bp.R")
+
+######################
+# Make  plot
+######################
+# Note: Data
+# run other_plots_data.R
+# to get the long format data to test this function 
+
+lf_bp(lf_df = lf_dynamut2
+                       , p_title = "Dynamut2"
+                       , colour_categ = "ddg_dynamut2_outcome"
+                       , x_grp = "mutation_info"
+                       , y_var = "param_value"
+                       , facet_var = "param_type"
+                       , n_facet_row = 1
+                       , y_scales = "free_y"
+                       , colour_bp_strip = "khaki2"
+                       , dot_size = 3
+                       , dot_transparency = 0.3
+                       , violin_quantiles = c(0.25, 0.5, 0.75)
+                       , my_ats = 22 # axis text size
+                       , my_als = 20 # axis label size
+                       , my_fls = 20 # facet label size
+                       , my_pts = 22 # plot title size 
+                       , make_boxplot = F
+                       , bp_width = "auto"
+                       , add_stats = T
+                       , stat_grp_comp = c("DM", "OM")
+                       , stat_method = "wilcox.test"
+                       , my_paired = FALSE
+                       , stat_label = c("p.format", "p.signif") )
+ 
+# foo = lf_dynamut2 %>%
+#   group_by(mutation_info, param_type) %>%
+#   summarise( Mean = mean(param_value, na.rm = T)
+#              , SD = sd(param_value, na.rm = T)
+#              , Median = median(param_value, na.rm = T)
+#              , IQR = IQR(param_value, na.rm = T) )
+
+# Quick tests
+plotdata_sel = subset(lf_dynamut2
+             , lf_dynamut2$param_type == "ASA")
+
+plot_sum = plotdata_sel %>%
+  group_by(mutation_info, param_type) %>%
+  summarise(n = n()
+             , Mean = mean(param_value, na.rm = T)
+             , SD = sd(param_value, na.rm = T)
+             , Min = min(param_value, na.rm = T)
+             , Q1 = quantile(param_value, na.rm = T, 0.25)
+             , Median = median(param_value, na.rm = T)
+             , Q3 = quantile(param_value, na.rm = T, 0.75)
+             , Max = max(param_value, na.rm = T) ) %>%
+  rename('Mutation Class' = mutation_info
+         , Parameter = param_type)
+plot_sum = as.data.frame(plot_sum, row.names = NULL)
+plot_sum
+
+bar = compare_means(param_value ~ mutation_info
+              , group.by = "param_type"
+              , data = plotdata_sel
+              , paired = FALSE
+              , p.adjust.method = "BH")
+bar2 = bar[c("param_type"
+              , "group1"
+              , "group2"
+              , "p.format"
+              , "p.signif"
+              , "p.adj")] %>%
+  rename(Parameter = param_type
+          , Group1 = group1
+          , Group2 = group2
+          , "P-value" = p.format
+          , "P-sig" = p.signif
+          , "P-adj" = p.adj)
+bar2 = data.frame(bar2); bar2
+
+library(Hmisc)
+describe(lf_dynamut2)
diff --git a/scripts/functions/test_lf_bp.R b/scripts/functions/test_lf_bp.R
new file mode 100644
index 0000000..42b78bf
--- /dev/null
+++ b/scripts/functions/test_lf_bp.R
@@ -0,0 +1,55 @@
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+source("Header_TT.R")
+source("../functions/lf_bp.R")
+# ================================================
+# Data: run get_plotting_dfs.R
+# to get the long format data to test this function 
+# drug = "streptomycin" 
+# gene = "gid"
+# source("get_plotting_dfs.R")
+# ==================================================
+
+######################
+# Make plot: ggplot
+######################
+lf_bp(lf_df = lf_dynamut2
+      , p_title = "Dynamut2"
+      , colour_categ = "ddg_dynamut2_outcome"
+      , x_grp = "mutation_info"
+      , y_var = "param_value"
+      , facet_var = "param_type"
+      , n_facet_row = 1
+      , y_scales = "free_y"
+      , colour_bp_strip = "khaki2"
+      , dot_size = 3
+      , dot_transparency = 0.3
+      , violin_quantiles = c(0.25, 0.5, 0.75)
+      , my_ats = 22 # axis text size
+      , my_als = 20 # axis label size
+      , my_fls = 20 # facet label size
+      , my_pts = 22 # plot title size 
+      , make_boxplot = F
+      , bp_width = "auto"
+      , add_stats = T
+      , stat_grp_comp = c("DM", "OM")
+      , stat_method = "wilcox.test"
+      , my_paired = FALSE
+      , stat_label = c("p.format", "p.signif") )
+
+######################
+# Make plot: plotly
+######################
+# FIXME: These labels are not working as I want!
+# lf_bp_plotly(lf_df = lf_deepddg
+#       , p_title = "DeepDDG"
+#       , colour_categ = "deepddg_outcome"
+#       , x_grp = "mutation_info"
+#       , y_var = "param_value"
+#       , facet_var = "param_type"
+#       , n_facet_row = 1
+#       , y_scales = "free_y"
+#       , colour_bp_strip = "khaki2"
+#       , dot_size = 3
+#       , dot_transparency = 0.3
+#       , violin_quantiles = c(0.25, 0.5, 0.75)
+#  )
diff --git a/scripts/functions/test_lf_bp_with_stats.R b/scripts/functions/test_lf_bp_with_stats.R
deleted file mode 100644
index 4cfff9d..0000000
--- a/scripts/functions/test_lf_bp_with_stats.R
+++ /dev/null
@@ -1,28 +0,0 @@
-setwd("~/git/LSHTM_analysis/scripts/plotting/")
-
-source("../functions/lf_bp_with_stats.R")
-
-######################
-# call function
-######################
-# Note: Data
-# run other_plots_data.R
-# to get the long format data to test this function 
-
-lf_bp_with_stats(lf_df = lf_dynamut2
-                       , x_grp = "mutation_info"
-                       , y_var = "param_value"
-                       , facet_var = "param_type"
-                       , n_facet_row = 1
-                       , y_scales = "free_y"
-                       , p_title = "Dynamut2"
-                       , colour_categ = "ddg_dynamut2_outcome"
-                       , stat_grp_comp = c("DM", "OM")
-                       , stat_method = "wilcox.test"
-                       , my_paired = FALSE
-                       #, stat_label = "p.format")
-                       , stat_label = c("p.format", "p.signif")
-                       , my_ats = 22 # axis text size
-                       , my_als = 20 # axis label size
-                       , my_fls = 20 # facet label size
-                       , my_pts = 22 )# plot title size 
diff --git a/scripts/plotting/Header_TT.R b/scripts/plotting/Header_TT.R
index 199031b..e4593d0 100755
--- a/scripts/plotting/Header_TT.R
+++ b/scripts/plotting/Header_TT.R
@@ -3,12 +3,6 @@
 #########################################################
 #lib_loc = "/usr/local/lib/R/site-library")
 
-#if (!require("gplots")) {
-#  install.packages("gplots", dependencies = TRUE)
-#  library(gplots)
-#}
-require(extrafont)
-
 require("getopt", quietly = TRUE) # cmd parse arguments
 
 if (!require("tidyverse")) {
@@ -16,9 +10,23 @@ if (!require("tidyverse")) {
   library(tidyverse)
 }
 
-if (!require("ggplot2")) {
-  install.packages("ggplot2", dependencies = TRUE)
-  library(ggplot2)
+# if (!require("ggplot2")) {
+#   install.packages("ggplot2", dependencies = TRUE)
+#   library(ggplot2)
+# }
+
+# if (!require ("dplyr")){
+#   install.packages("dplyr")
+#   library(dplyr)
+# }
+
+# Install
+#if(!require(devtools)) install.packages("devtools")
+#devtools::install_github("kassambara/ggcorrplot")
+
+if (!require ("ggbeeswarm")){
+   install.packages("ggbeeswarm")
+   library(ggbeeswarm)
 }
 
 if (!require("plotly")) {
@@ -101,11 +109,6 @@ if (!require ("psych")){
   library(psych)
 }
 
-if (!require ("dplyr")){
-  install.packages("dplyr")
-  library(dplyr)
-}
-
 if (!require ("compare")){
   install.packages("compare")
   library(compare)
@@ -116,31 +119,25 @@ if (!require ("arsenal")){
   library(arsenal)
 }
 
+if(!require(ggseqlogo)){
+  install.packages("ggseqlogo")
+  library(ggseqlogo)
+}
 
-#if (!requireNamespace("BiocManager", quietly = TRUE))
-#  install.packages("BiocManager")
-
-#BiocManager::install("Logolas")
-library("Logolas")
-
-#install.packages("ggseqlogo")
-library(ggseqlogo)
-
-
-####TIDYVERSE
-# Install
-#if(!require(devtools)) install.packages("devtools")
-#devtools::install_github("kassambara/ggcorrplot")
-
-library(ggcorrplot)
-
-
-###for PDB files
-#install.packages("bio3d") 
+# for PDB files
 if(!require(bio3d)){
   install.packages("bio3d")
   library(bio3d)
 }
 
-#install.packages("protr")
 library(protr)
+if(!require(protr)){
+  install.packages("protr")
+  library(protr)
+}
+
+#if (!requireNamespace("BiocManager", quietly = TRUE))
+#  install.packages("BiocManager")
+
+#BiocManager::install("Logolas")
+library("Logolas")
\ No newline at end of file
diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R
index 2dae471..2fc1c19 100644
--- a/scripts/plotting/get_plotting_dfs.R
+++ b/scripts/plotting/get_plotting_dfs.R
@@ -86,8 +86,10 @@ all_plot_dfs = combining_dfs_plotting(my_df_u
                                       , lig_dist_colname = LigDist_colname
                                       , lig_dist_cutoff = LigDist_cutoff)
 
-merged_df2 = all_plot_dfs[[1]]
-merged_df3 = all_plot_dfs[[2]]
+merged_df2      = all_plot_dfs[[1]]
+merged_df3      = all_plot_dfs[[2]]
+merged_df2_comp = all_plot_dfs[[3]]
+merged_df3_comp = all_plot_dfs[[4]]
 #======================================================================
 # read other files
 infilename_dynamut = paste0("~/git/Data/", drug, "/output/dynamut_results/", gene
@@ -98,10 +100,15 @@ infilename_dynamut2  = paste0("~/git/Data/", drug, "/output/dynamut_results/dyna
 
 infilename_mcsm_na = paste0("~/git/Data/", drug, "/output/mcsm_na_results/", gene
                             , "_complex_mcsm_na_norm.csv")
-
+                            
+infilename_mcsm_f_snps <- paste0("~/git/Data/", drug, "/output/", gene
+                      , "_mcsm_formatted_snps.csv")
+                      
 dynamut_df   = read.csv(infilename_dynamut)
 dynamut2_df  = read.csv(infilename_dynamut2)
 mcsm_na_df   = read.csv(infilename_mcsm_na)
+mcsm_f_snps  = read.csv(infilename_mcsm_f_snps, header = F)
+names(mcsm_f_snps) = "mutationinformation"
 
 ####################################################################
 #                        Data for subcols barplot (~heatmpa)
@@ -430,11 +437,17 @@ if (nrow(corr_ps_df3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) {
       , "\nGot: ", check1)
 }
 
+
+rm(foo)
+####################################################################
+#                        Data for DM OM Plots: Long format dfs
+####################################################################
+source("other_plots_data.R")
+
 ########################################################################
 #                           End of script
 ########################################################################
-rm(foo)
 
-cat("\n===================================================\n"
+cat("\n######################################################\n"
       , "\nSuccessful: get_plotting_dfs.R worked!"
-      , "\n====================================================")
\ No newline at end of file
+      , "\n###################################################\n")
diff --git a/scripts/plotting/other_plots_data.R b/scripts/plotting/other_plots_data.R
index d2229b8..8eb2020 100644
--- a/scripts/plotting/other_plots_data.R
+++ b/scripts/plotting/other_plots_data.R
@@ -3,10 +3,9 @@
 # TASK: producing boxplots for dr and other muts
 
 #########################################################
-#=======================================================================
 # working dir and loading libraries
 # getwd()
-setwd("~/git/LSHTM_analysis/scripts/plotting")
+# setwd("~/git/LSHTM_analysis/scripts/plotting")
 # getwd()
 
 # make cmd
@@ -14,21 +13,21 @@ setwd("~/git/LSHTM_analysis/scripts/plotting")
 # drug = "streptomycin"
 # gene = "gid"
 
-source("get_plotting_dfs.R")
+# source("get_plotting_dfs.R")
 #=======================================================================
 # MOVE TO COMBINE or singular file for deepddg
+# 
+# cols_to_select = c("mutation", "mutationinformation"
+#                    , "wild_type", "position", "mutant_type"
+#                    , "mutation_info")
+# 
+# merged_df3_short = merged_df3[, cols_to_select]
 
-cols_to_select = c("mutation", "mutationinformation"
-                   , "wild_type", "position", "mutant_type"
-                   , "mutation_info")
-
-merged_df3_short = merged_df3[, cols_to_select]
-
-infilename_mcsm_f_snps <- paste0("~/git/Data/", drug, "/output/", gene
-                      , "_mcsm_formatted_snps.csv")
-
-mcsm_f_snps<- read.csv(infilename_mcsm_f_snps, header = F)
-names(mcsm_f_snps) <- "mutationinformation"
+# infilename_mcsm_f_snps <- paste0("~/git/Data/", drug, "/output/", gene
+#                       , "_mcsm_formatted_snps.csv")
+# 
+# mcsm_f_snps<- read.csv(infilename_mcsm_f_snps, header = F)
+# names(mcsm_f_snps) <- "mutationinformation"
 
 # write merged_df3 to generate structural figure on chimera
 #write.csv(merged_df3_short, "merged_df3_short.csv")
@@ -52,11 +51,11 @@ my_min = min(merged_df3$deepddg_scaled); my_min
 my_max = max(merged_df3$deepddg_scaled); my_max
 
 if (my_min == -1 && my_max == 1){
-  cat("PASS: DeepDDG successfully scaled b/w -1 and 1"
+  cat("\nPASS: DeepDDG successfully scaled b/w -1 and 1"
       #, "\nProceeding with assigning deep outcome category")
       , "\n")
 }else{
-  cat("FAIL: could not scale DeepDDG ddg values"
+  cat("\nFAIL: could not scale DeepDDG ddg values"
       , "Aborting!")
 }
 
@@ -100,7 +99,7 @@ if (merging_cols == "mutationinformation") {
   cols_check <- c(c1, c2, c3, c4)
   expected_cols = n_comb_cols - ( length(cols_check) - 1)
   if (all(cols_check)){
-    cat("\nStage 2:Proceeding with merging dfs:\n")
+    cat("\nStage 2: Proceeding with merging dfs:\n")
     comb_df <- Reduce(inner_join, list(cols_mcsm_df
                                        , cols_mcsm_na_df
                                        , dynamut_df
@@ -115,12 +114,13 @@ if (merging_cols == "mutationinformation") {
     
     }
 }
-names(comb_df_s)
+#names(comb_df_s)
+cat("\n!!!IT GOT TO HERE!!!!")
 #=======================================================================
 fact_cols = colnames(comb_df_s)[grepl( "_outcome|_info", colnames(comb_df_s) )]
 fact_cols
 lapply(comb_df_s[, fact_cols], class)
-comb_df_s[,fact_cols] <- lapply(comb_df_s[,cols],as.factor)
+comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols], as.factor)
 
 if (any(lapply(comb_df_s[, fact_cols], class) == "character")){
   cat("\nChanging cols to factor")
@@ -512,7 +512,6 @@ rm(all_plot_dfs
    , my_data_snp
    , my_df
    , my_df_u
-   , ols_mcsm_df
    , other_muts
    , pd_df
    , subcols_df_ps

From a981580b7a70bc33cddee5e42addb799835163c2 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 2 Sep 2021 12:51:31 +0100
Subject: [PATCH 07/51] separated get_plotting_dfs_with_lig.R

---
 scripts/plotting/get_plotting_dfs_with_lig.R | 589 +++++++++++++++++++
 1 file changed, 589 insertions(+)
 create mode 100644 scripts/plotting/get_plotting_dfs_with_lig.R

diff --git a/scripts/plotting/get_plotting_dfs_with_lig.R b/scripts/plotting/get_plotting_dfs_with_lig.R
new file mode 100644
index 0000000..f17e997
--- /dev/null
+++ b/scripts/plotting/get_plotting_dfs_with_lig.R
@@ -0,0 +1,589 @@
+#!/usr/bin/env Rscript
+#########################################################
+# TASK: Get formatted data for plots
+#=======================================================================
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting")
+getwd()
+
+source("Header_TT.R")
+source("../functions/my_pairs_panel.R") # with lower panel turned off
+source("../functions/plotting_globals.R")
+source("../functions/plotting_data.R")
+source("../functions/combining_dfs_plotting.R")
+source("../functions/bp_subcolours.R")
+
+#********************
+# cmd args passed 
+# in from other scripts
+# to call this
+#********************
+#drug = 'streptomycin'
+#gene = 'gid'
+#====================
+# variables for lig
+#====================
+
+LigDist_colname = "ligand_distance"
+LigDist_cutoff = 10
+
+#===========
+# input
+#===========
+#---------------------
+# call: import_dirs()
+#---------------------
+import_dirs(drug, gene)
+
+#---------------------------
+# call: plotting_data()
+#---------------------------
+if (!exists("infile_params") && exists("gene")){
+#if (!is.character(infile_params) && exists("gene")){ # when running as cmd
+  #in_filename_params = paste0(tolower(gene), "_all_params.csv") #for pncA
+  in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid
+  infile_params = paste0(outdir, "/", in_filename_params)
+  cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n")
+}
+
+# Input 1: read <gene>_comb_afor.csv
+cat("\nReading mcsm combined data file: ", infile_params)
+mcsm_df = read.csv(infile_params, header = T)
+pd_df = plotting_data(mcsm_df
+                      , lig_dist_colname = LigDist_colname
+                      , lig_dist_cutoff = LigDist_cutoff)
+
+my_df       = pd_df[[1]] 
+my_df_u     = pd_df[[2]] # this forms one of the input for combining_dfs_plotting()
+my_df_u_lig = pd_df[[3]] 
+dup_muts    = pd_df[[4]] 
+
+#--------------------------------
+# call: combining_dfs_plotting()
+#--------------------------------
+if (!exists("infile_metadata") && exists("gene")){
+#if (!is.character(infile_metadata) && exists("gene")){ # when running as cmd
+  in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid
+  infile_metadata = paste0(outdir, "/", in_filename_metadata)
+  cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n")
+}
+
+# Input 2: read <gene>_metadata.csv
+cat("\nReading meta data file: ", infile_metadata)
+
+gene_metadata <- read.csv(infile_metadata
+                          , stringsAsFactors = F
+                          , header = T)
+
+all_plot_dfs = combining_dfs_plotting(my_df_u
+                                      , gene_metadata
+                                      , lig_dist_colname = LigDist_colname
+                                      , lig_dist_cutoff = LigDist_cutoff)
+
+merged_df2          = all_plot_dfs[[1]]
+merged_df3          = all_plot_dfs[[2]]
+merged_df2_comp     = all_plot_dfs[[3]]
+merged_df3_comp     = all_plot_dfs[[4]]
+merged_df2_lig      = all_plot_dfs[[5]]
+merged_df3_lig      = all_plot_dfs[[6]]
+merged_df2_comp_lig = all_plot_dfs[[7]]
+merged_df3_comp_lig = all_plot_dfs[[8]]
+
+####################################################################
+#                        Data for subcols barplot (~heatmap)
+####################################################################
+# can include: mutation, or_kin, pwald, af_kin
+cols_to_select = c("mutationinformation", "drtype"
+                   , "wild_type"
+                   , "position"
+                   , "mutant_type"
+                   , "chain", "ligand_id", "ligand_distance"
+                   , "duet_stability_change", "duet_outcome", "duet_scaled"
+                   , "ligand_affinity_change", "ligand_outcome", "affinity_scaled"
+                   , "ddg_foldx", "foldx_scaled", "foldx_outcome"
+                   , "deepddg", "deepddg_outcome" # comment out as not available for pnca
+                   , "asa", "rsa", "rd_values", "kd_values"
+                   , "af", "or_mychisq", "pval_fisher" 
+                   , "or_fisher", "or_logistic", "pval_logistic"
+                   , "wt_prop_water", "mut_prop_water", "wt_prop_polarity", "mut_prop_polarity"
+                   , "wt_calcprop", "mut_calcprop")
+
+#=======================
+# Data for sub colours
+# barplot: PS
+#=======================
+
+cat("\nNo. of cols to select:", length(cols_to_select))
+
+subcols_df_ps = merged_df3[, cols_to_select]
+
+cat("\nNo of unique positions for ps:"
+    , length(unique(subcols_df_ps$position)))
+
+# add count_pos col that counts the no. of nsSNPS at a position
+setDT(subcols_df_ps)[, pos_count := .N, by = .(position)]
+
+# should be a factor
+if (is.factor(subcols_df_ps$duet_outcome)){
+  cat("\nDuet_outcome is factor")
+  table(subcols_df_ps$duet_outcome)
+}else{
+  cat("\nConverting duet_outcome to factor")
+  subcols_df_ps$duet_outcome = as.factor(subcols_df_ps$duet_outcome)
+  table(subcols_df_ps$duet_outcome)
+}
+
+# should be -1 and 1
+min(subcols_df_ps$duet_scaled)
+max(subcols_df_ps$duet_scaled)
+
+tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, min)
+tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, max)
+
+# check unique values in normalised data
+cat("\nNo. of unique values in duet scaled, no rounding:"
+    , length(unique(subcols_df_ps$duet_scaled)))
+
+# No rounding    
+my_grp = subcols_df_ps$duet_scaled; length(my_grp)
+
+# Add rounding, if it is to be used
+n = 3 
+subcols_df_ps$duet_scaledR = round(subcols_df_ps$duet_scaled, n)
+
+cat("\nNo. of unique values in duet scaled", n, "places rounding:"
+    , length(unique(subcols_df_ps$duet_scaledR)))
+
+my_grp_r = subcols_df_ps$duet_scaledR  # rounding
+
+# Add grp cols
+subcols_df_ps$group <- paste0(subcols_df_ps$duet_outcome, "_", my_grp, sep = "")
+subcols_df_ps$groupR <- paste0(subcols_df_ps$duet_outcome, "_", my_grp_r, sep = "")
+
+# Call the function to create the palette based on the group defined above
+subcols_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp")
+subcolsR_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp_r")
+
+print(paste0("Colour palette generated for my_grp: ", length(subcols_ps), " colours"))
+print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_ps), " colours"))
+
+#=======================
+# Data for sub colours
+# barplot: LIG
+#=======================
+cat("\nNo. of cols to select:", length(cols_to_select))
+
+subcols_df_lig = merged_df3_lig[, cols_to_select]
+
+cat("\nNo of unique positions for LIG:"
+    , length(unique(subcols_df_lig$position)))
+
+# should be a factor
+if (is.factor(subcols_df_lig$ligand_outcome)){
+  cat("\nLigand_outcome is factor")
+  table(subcols_df_lig$ligand_outcome)
+}else{
+  cat("\nConverting ligand_outcome to factor")
+  subcols_df_lig$ligand_outcome = as.factor(subcols_df_lig$ligand_outcome)
+  table(subcols_df_lig$ligand_outcome)
+}
+
+# should be -1 and 1
+min(subcols_df_lig$affinity_scaled)
+max(subcols_df_lig$affinity_scaled)
+
+tapply(subcols_df_lig$affinity_scaled, subcols_df_lig$ligand_outcome, min)
+tapply(subcols_df_lig$affinity_scaled, subcols_df_lig$ligand_outcome, max)
+
+# check unique values in normalised data
+cat("\nNo. of unique values in affinity scaled, no rounding:"
+    , length(unique(subcols_df_lig$affinity_scaled)))
+
+# No rounding    
+my_grp_lig = subcols_df_lig$affinity_scaled; length(my_grp_lig)
+
+# Add rounding, if it is to be used
+n = 3 
+subcols_df_lig$affinity_scaledR = round(subcols_df_lig$affinity_scaled, n)
+
+cat("\nNo. of unique values in affinity scaled", n, "places rounding:" # this section reports affinity_scaled, not duet_scaled
+    , length(unique(subcols_df_lig$affinity_scaledR)))
+
+my_grp_lig_r = subcols_df_lig$affinity_scaledR  # rounding
+
+# Add grp cols
+subcols_df_lig$group_lig <- paste0(subcols_df_lig$ligand_outcome, "_", my_grp_lig, sep = "")
+subcols_df_lig$group_ligR <- paste0(subcols_df_lig$ligand_outcome, "_", my_grp_lig_r, sep = "")
+
+# Call the function to create the palette based on the group defined above
+subcols_lig <- ColourPalleteMulti(subcols_df_lig, "ligand_outcome", "my_grp_lig")
+subcolsR_lig <- ColourPalleteMulti(subcols_df_lig, "ligand_outcome", "my_grp_lig_r")
+
+print(paste0("Colour palette generated for my_grp: ", length(subcols_lig), " colours"))
+print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_lig), " colours"))
+
+####################################################################
+#                        Data for logoplots
+####################################################################
+#-------------------------
+# choose df for logoplot
+#-------------------------
+logo_data = merged_df3
+#logo_data = merged_df3_comp
+
+# quick checks
+colnames(logo_data)
+str(logo_data)
+
+c1 = unique(logo_data$position) 
+nrow(logo_data)
+cat("No. of rows in my_data:", nrow(logo_data)
+    , "\nDistinct positions corresponding to snps:", length(c1)
+    , "\n===========================================================")
+#=======================================================================
+#==================
+# logo data: OR
+#==================
+foo = logo_data[, c("position"
+                      , "mutant_type","duet_scaled", "or_mychisq"
+                      , "mut_prop_polarity", "mut_prop_water")] 
+
+logo_data$log10or = log10(logo_data$or_mychisq)
+logo_data_plot = logo_data[, c("position"
+                            , "mutant_type", "or_mychisq", "log10or")]
+
+logo_data_plot_or = logo_data[, c("position", "mutant_type", "or_mychisq")]
+wide_df_or <- logo_data_plot_or %>% spread(position, or_mychisq, fill = 0.0)
+
+wide_df_or = as.matrix(wide_df_or)
+rownames(wide_df_or) = wide_df_or[,1]
+dim(wide_df_or)
+wide_df_or = wide_df_or[,-1]
+str(wide_df_or)
+
+position_or = as.numeric(colnames(wide_df_or))
+
+#==================
+# logo data: logOR
+#==================
+logo_data_plot_logor = logo_data[, c("position", "mutant_type", "log10or")]
+wide_df_logor <- logo_data_plot_logor %>% spread(position, log10or, fill = 0.0)
+
+wide_df_logor = as.matrix(wide_df_logor)
+
+rownames(wide_df_logor) = wide_df_logor[,1]
+wide_df_logor = subset(wide_df_logor, select = -c(1) )
+colnames(wide_df_logor)
+wide_df_logor_m = data.matrix(wide_df_logor)
+
+rownames(wide_df_logor_m)
+colnames(wide_df_logor_m)
+
+position_logor = as.numeric(colnames(wide_df_logor_m))
+
+#===============================
+# logo data: multiple nsSNPs (>1)
+#=================================
+#require(data.table)
+
+# get freq count of positions so you can subset freq>1
+setDT(logo_data)[, mut_pos_occurrence := .N, by = .(position)] 
+
+table(logo_data$position)
+table(logo_data$mut_pos_occurrence)
+
+max_mut = max(table(logo_data$position))
+
+# extract freq_pos > 1
+my_data_snp = logo_data[logo_data$mut_pos_occurrence!=1,] 
+u = unique(my_data_snp$position)
+max_mult_mut = max(table(my_data_snp$position))
+
+if (nrow(my_data_snp) == nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]] ){
+  
+  cat("PASS: positions with  multiple muts extracted"
+      , "\nNo. of mutations:", nrow(my_data_snp)
+      , "\nNo. of positions:", length(u)
+      , "\nMax no. of muts at any position", max_mult_mut)
+}else{
+  cat("FAIL: positions with multiple muts could NOT be extracted"
+      , "\nExpected:",nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]]
+      , "\nGot:", nrow(my_data_snp) )
+}
+
+cat("\nNo. of sites with only 1 mutations:", table(logo_data$mut_pos_occurrence)[[1]])
+
+#--------------------------------------
+# matrix for mutant type
+# frequency of mutant type by position
+#---------------------------------------
+table(my_data_snp$mutant_type, my_data_snp$position)
+tab_mt = table(my_data_snp$mutant_type, my_data_snp$position)
+class(tab_mt)
+
+# unclass to convert to matrix
+tab_mt = unclass(tab_mt)
+tab_mt = as.matrix(tab_mt, rownames = T)
+
+# should be TRUE
+is.matrix(tab_mt)
+
+rownames(tab_mt) #aa
+colnames(tab_mt) #pos
+
+#-------------------------------------
+# matrix for wild type
+# frequency of wild type by position
+#-------------------------------------
+tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt
+tab_wt = unclass(tab_wt)
+
+# remove wt duplicates
+wt = my_data_snp[, c("position", "wild_type")]
+wt = wt[!duplicated(wt),]
+
+tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1
+
+rownames(tab_wt) # aa
+colnames(tab_wt) # pos
+
+identical(colnames(tab_mt), colnames(tab_wt))
+identical(ncol(tab_mt), ncol(tab_wt))
+
+#----------------------------------
+# logo data OR: multiple nsSNPs (>1)
+#----------------------------------
+logo_data_or_mult = my_data_snp[, c("position", "mutant_type", "or_mychisq")]
+#wide_df_or <- logo_data_or %>% spread(position, or_mychisq, fill = 0.0)
+wide_df_or_mult <- logo_data_or_mult %>% spread(position, or_mychisq, fill = NA)
+
+wide_df_or_mult = as.matrix(wide_df_or_mult)
+rownames(wide_df_or_mult) = wide_df_or_mult[,1]
+wide_df_or_mult = wide_df_or_mult[,-1]
+str(wide_df_or_mult)
+
+position_or_mult = as.numeric(colnames(wide_df_or_mult))
+
+####################################################################
+#                        Data for Corrplots
+####################################################################
+cat("\n=========================================="
+    , "\nCORR PLOTS data: PS"
+    , "\n===========================================")
+
+df_ps = merged_df2
+
+#--------------------
+# adding log cols : NEW UNCOMMENT
+#--------------------
+#df_ps$log10_or_mychisq = log10(df_ps$or_mychisq)
+#df_ps$neglog_pval_fisher = -log10(df_ps$pval_fisher)
+
+##df_ps$log10_or_kin = log10(df_ps$or_kin)
+##df_ps$neglog_pwald_kin = -log10(df_ps$pwald_kin)
+
+#df_ps$mutation_info_labels = ifelse(df_ps$mutation_info == dr_muts_col, 1, 0)
+
+#----------------------------
+# columns for corr plots:PS
+#----------------------------
+# subset data to generate pairwise correlations
+cols_to_select =  c("mutationinformation"
+                    , "duet_scaled"
+                    , "foldx_scaled"
+                    #, "mutation_info_labels"
+                    , "asa"
+                    , "rsa"
+                    , "rd_values"
+                    , "kd_values"
+                    , "log10_or_mychisq"
+                    , "neglog_pval_fisher"
+                    ##, "or_kin"
+                    ##, "neglog_pwald_kin"
+                    , "af"
+                    ##, "af_kin"
+                    , "duet_outcome"
+                    , drug)
+
+corr_data_ps = df_ps[cols_to_select]
+
+dim(corr_data_ps)
+
+#--------------------------------------
+# assign nice colnames (for display)
+#--------------------------------------
+my_corr_colnames = c("Mutation"
+                     , "DUET"
+                     , "FoldX"
+                     #, "Mutation class"
+                     , "ASA"
+                     , "RSA"
+                     , "RD"
+                     , "KD"
+                     , "Log (OR)"
+                     , "-Log (P)"
+                     ##, "Adjusted (OR)"
+                     ##, "-Log (P wald)"
+                     , "MAF"
+                     ##, "AF_kin"
+                     , "duet_outcome"
+                     , drug)
+
+length(my_corr_colnames)
+
+colnames(corr_data_ps)
+colnames(corr_data_ps) <- my_corr_colnames
+colnames(corr_data_ps)
+
+start = 1
+end = which(colnames(corr_data_ps) == drug); end # should be the last column
+offset = 1
+
+#===========================
+# Corr data for plots: PS
+# big_df ps: ~ merged_df2
+#===========================
+
+#corr_ps_df2 = corr_data_ps[start:(end-offset)] # without drug
+corr_ps_df2 = corr_data_ps[start:end]
+head(corr_ps_df2)
+
+#===========================
+# Corr data for plots: PS
+# short_df ps: ~merged_df3
+#===========================
+corr_ps_df3 = corr_ps_df2[!duplicated(corr_ps_df2$Mutation),]
+
+na_or = sum(is.na(corr_ps_df3$`Log (OR)`))
+check1 = nrow(corr_ps_df3) - na_or
+
+##na_adj_or = sum(is.na(corr_ps_df3$`adjusted (OR)`))
+##check2 = nrow(corr_ps_df3) - na_adj_or 
+
+if (nrow(corr_ps_df3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) {
+  cat( "\nPASS: No. of rows for corr_ps_df3 match"
+       , "\nPASS: No. of OR values checked: " , check1)
+} else {
+  cat("\nFAIL: Numbers  mismatch:"
+      , "\nExpected nrows: ", nrow(merged_df3)
+      , "\nGot: ", nrow(corr_ps_df3)
+      , "\nExpected OR values: ", nrow(merged_df3_comp)
+      , "\nGot: ", check1)
+}
+
+#=================================
+# Data for Correlation plots: LIG
+#=================================
+cat("\n=========================================="
+    , "\nCORR PLOTS data: LIG"
+    , "\n===========================================")
+
+df_lig = merged_df2_lig
+
+table(df_lig$ligand_outcome)
+
+#--------------------
+# adding log cols : NEW UNCOMMENT
+#--------------------
+#df_lig$log10_or_mychisq = log10(df_lig$or_mychisq)
+#df_lig$neglog_pval_fisher = -log10(df_lig$pval_fisher)
+
+##df_lig$log10_or_kin = log10(df_lig$or_kin)
+##df_lig$neglog_pwald_kin = -log10(df_lig$pwald_kin)
+
+#----------------------------
+# columns for corr plots:LIG
+#----------------------------
+# subset data to generate pairwise correlations
+cols_to_select =  c("mutationinformation"
+                    , "affinity_scaled"
+                    #, "mutation_info_labels"
+                    , "asa"
+                    , "rsa"
+                    , "rd_values"
+                    , "kd_values"
+                    , "log10_or_mychisq"
+                    , "neglog_pval_fisher"
+                    ##, "or_kin"
+                    ##, "neglog_pwald_kin"
+                    , "af"
+                    ##, "af_kin"
+                    , "ligand_outcome"
+                    , drug)
+
+corr_data_lig = df_lig[, cols_to_select]
+
+dim(corr_data_lig)
+
+#--------------------------------------
+# assign nice colnames (for display)
+#--------------------------------------
+my_corr_colnames = c("Mutation"
+                     , "Ligand Affinity"
+                     #, "Mutation class"
+                     , "ASA"
+                     , "RSA"
+                     , "RD"
+                     , "KD"
+                     , "Log (OR)"
+                     , "-Log (P)"
+                     ##, "Adjusted (OR)"
+                     ##, "-Log (P wald)"
+                     , "MAF"
+                     ##, "MAF_kin"
+                     , "ligand_outcome"
+                     , drug)
+
+length(my_corr_colnames)
+
+colnames(corr_data_lig)
+colnames(corr_data_lig) <- my_corr_colnames
+colnames(corr_data_lig)
+
+start = 1
+end = which(colnames(corr_data_lig) == drug); end # should be the last column
+offset = 1
+
+#=============================
+# Corr data for plots: LIG
+# big_df lig: ~ merged_df2_lig
+#==============================
+#corr_lig_df2 = corr_data_lig[start:(end-offset)] # without drug
+corr_lig_df2 = corr_data_lig[start:end]
+head(corr_lig_df2)
+
+#=============================
+# Corr data for plots: LIG
+# short_df lig: ~ merged_df3_lig
+#==============================
+corr_lig_df3 = corr_lig_df2[!duplicated(corr_lig_df2$Mutation),]
+
+na_or_lig = sum(is.na(corr_lig_df3$`Log (OR)`))
+check1_lig = nrow(corr_lig_df3) - na_or_lig
+
+if (nrow(corr_lig_df3) == nrow(merged_df3_lig) && nrow(merged_df3_comp_lig) == check1_lig) { # row count and non-NA OR count must both match
+  cat( "\nPASS: No. of rows for corr_lig_df3 match"
+       , "\nPASS: No. of OR values checked: " , check1_lig)
+} else {
+  cat("\nFAIL: Numbers  mismatch:"
+      , "\nExpected nrows: ", nrow(merged_df3_lig)
+      , "\nGot: ", nrow(corr_lig_df3) # report the df actually checked in the condition above
+      , "\nExpected OR values: ", nrow(merged_df3_comp_lig)
+      , "\nGot: ", check1_lig)
+}
+
+# remove unnecessary columns
+identical(corr_data_lig, corr_lig_df2)
+identical(corr_data_ps, corr_ps_df2)
+
+#rm(df_ps, df_lig, corr_data_ps, corr_data_lig)
+
+########################################################################
+#                           End of script
+########################################################################
+rm(foo)
+
+cat("\n===================================================\n"
+      , "\nSuccessful: get_plotting_dfs_with_lig.R worked!" # name this script, not the one it was split from
+      , "\n====================================================")
\ No newline at end of file

From 605eb54526205738a8bc2e2d4b6d993c4573b007 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 2 Sep 2021 17:40:24 +0100
Subject: [PATCH 08/51] saving work for the day

---
 dynamut/split_csv.sh           | 4 +++-
 scripts/functions/lf_bp.R      | 8 +++++---
 scripts/functions/test_lf_bp.R | 9 ++++++---
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/dynamut/split_csv.sh b/dynamut/split_csv.sh
index b5f15f1..18103c6 100755
--- a/dynamut/split_csv.sh
+++ b/dynamut/split_csv.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# FIXME: This is written for expediency to kickstart running dynamut and mcsm-NA
+# FIXME: This is written for expediency to kickstart running dynamut, mCSM-PPI2 (batch of 50) and mCSM-NA (batch of 20)
 
 # Usage: ~/git/LSHTM_analysis/dynamut/split_csv.sh <input file> <output dir> <chunk size in lines>
 # copy your snp file to split into the dynamut dir
@@ -13,7 +13,9 @@ mkdir -p ${OUTDIR}/${CHUNK}
 cd ${OUTDIR}/${CHUNK}
 
 split ../../${INFILE} -l ${CHUNK} -d snp_batch_
+#split ${INFILE} -l ${CHUNK} -d snp_batch_
 
 # use case
 #~/git/LSHTM_analysis/dynamut/split_csv.sh gid_mcsm_formatted_snps.csv snp_batches 50
 #~/git/LSHTM_analysis/dynamut/split_csv.sh embb_mcsm_formatted_snps.csv snp_batches 50
+~/git/LSHTM_analysis/dynamut/split_csv.sh pnca_mcsm_formatted_snps.csv snp_batches 50
diff --git a/scripts/functions/lf_bp.R b/scripts/functions/lf_bp.R
index 4677548..608247d 100644
--- a/scripts/functions/lf_bp.R
+++ b/scripts/functions/lf_bp.R
@@ -24,7 +24,7 @@ lf_bp <- function(lf_df
                   , my_pts = 22 # plot title size)
                   , make_boxplot = FALSE
                   , bp_width = c("auto", 0.5)
-                  , add_stats = FALSE
+                  , add_stats = TRUE
                   , stat_grp_comp = c("DM", "OM")
                   , stat_method = "wilcox.test"
                   , my_paired = FALSE
@@ -104,8 +104,10 @@ lf_bp <- function(lf_df
     OutPlot = OutPlot + stat_compare_means(comparisons = my_comparisonsL
                        , method = stat_method
                        , paired = my_paired
-                       , label = stat_label[1])
-    } 
+                       , label = stat_label[2])
+    return(OutPlot)
+    }
+   
    return(OutPlot)
 }
 
diff --git a/scripts/functions/test_lf_bp.R b/scripts/functions/test_lf_bp.R
index 42b78bf..f3d2327 100644
--- a/scripts/functions/test_lf_bp.R
+++ b/scripts/functions/test_lf_bp.R
@@ -12,9 +12,9 @@ source("../functions/lf_bp.R")
 ######################
 # Make plot: ggplot
 ######################
-lf_bp(lf_df = lf_dynamut2
-      , p_title = "Dynamut2"
-      , colour_categ = "ddg_dynamut2_outcome"
+lf_bp(lf_df = lf_encomddg
+      , p_title = "ENCoM-DDG"
+      , colour_categ = "ddg_encom_outcome"
       , x_grp = "mutation_info"
       , y_var = "param_value"
       , facet_var = "param_type"
@@ -36,6 +36,9 @@ lf_bp(lf_df = lf_dynamut2
       , my_paired = FALSE
       , stat_label = c("p.format", "p.signif") )
 
+#wilcox.test(wf_encomdds$`EnCOM ΔΔS`[wf_encomdds$mutation_info == "DM"]
+#            , wf_encomdds$`EnCOM ΔΔS`[wf_encomdds$mutation_info == "OM"])
+
 ######################
 # Make plot: plotly
 ######################

From 869fca7f945a823c67248d381726058bcec45c92 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Mon, 6 Sep 2021 19:50:50 +0100
Subject: [PATCH 09/51] added function for generating lineage barplots and also
 test script along with adding script for processing data and added it to
 get_plotting_dfs.R

---
 dynamut/split_csv.sh                |   6 +-
 scripts/functions/bp_lineage.R      | 172 +++++++++++++++++++++++++++
 scripts/functions/test_bp_lineage.R | 111 ++++++++++++++++++
 scripts/plotting/get_plotting_dfs.R |   8 +-
 scripts/plotting/lineage_bp_data.R  | 173 ++++++++++++++++++++++++++++
 scripts/plotting/other_plots_data.R |   5 +-
 6 files changed, 470 insertions(+), 5 deletions(-)
 create mode 100644 scripts/functions/bp_lineage.R
 create mode 100644 scripts/functions/test_bp_lineage.R
 create mode 100644 scripts/plotting/lineage_bp_data.R

diff --git a/dynamut/split_csv.sh b/dynamut/split_csv.sh
index 18103c6..17c1a03 100755
--- a/dynamut/split_csv.sh
+++ b/dynamut/split_csv.sh
@@ -12,10 +12,12 @@ CHUNK=$3
 mkdir -p ${OUTDIR}/${CHUNK}
 cd ${OUTDIR}/${CHUNK}
 
+# makes the 2 dirs, hence ../..
 split ../../${INFILE} -l ${CHUNK} -d snp_batch_
-#split ${INFILE} -l ${CHUNK} -d snp_batch_
 
 # use case
 #~/git/LSHTM_analysis/dynamut/split_csv.sh gid_mcsm_formatted_snps.csv snp_batches 50
 #~/git/LSHTM_analysis/dynamut/split_csv.sh embb_mcsm_formatted_snps.csv snp_batches 50
-~/git/LSHTM_analysis/dynamut/split_csv.sh pnca_mcsm_formatted_snps.csv snp_batches 50
+#~/git/LSHTM_analysis/dynamut/split_csv.sh pnca_mcsm_formatted_snps.csv snp_batches 50
+
+# add .txt to the files
diff --git a/scripts/functions/bp_lineage.R b/scripts/functions/bp_lineage.R
new file mode 100644
index 0000000..86eb9f1
--- /dev/null
+++ b/scripts/functions/bp_lineage.R
@@ -0,0 +1,172 @@
+########################################
+# Lineage and within SNP count barplot
+########################################
+# Builds a stacked, labelled bar chart with ggplot2 and returns the plot object.
+lin_count_bp <- function( lf_data # data passed to ggplot(); presumably long format -- confirm
+                          , x_categ = "" # column (as string) for x; wrapped in an ordered factor
+                          , y_count = "" # column (as string) for the bar heights
+                          , bar_fill_categ = "" # column (as string) mapped to the bar fill
+                          , display_label_col = "" # column (as string) shown as the label text
+                          , bar_stat_stype = "identity" # 'stat' passed to geom_bar()
+                          , x_lab_angle = 90 # x axis text angle
+                          , d_lab_size = 5 # label text size
+                          , d_lab_hjust = 0.5 # label hjust
+                          , d_lab_vjust = 0.5 # label vjust
+                          , d_lab_col = "black" # label colour
+                          , my_xats = 20 # x axis text size
+                          , my_yats = 20 # y axis text size
+                          , my_xals = 22 # x axis label size
+                          , my_yals = 22 # y axis label size
+                          , my_lls   = 22 # legend label size
+                          , bar_col_labels = c("Mutations", "Total Samples") # legend labels for the fill scale
+                          , bar_col_values = c("grey50", "gray75") # fill colours for the fill scale
+                          , bar_leg_name = "" # legend title
+                          , leg_location = "top" # legend.position
+                          , y_log10 = FALSE # if TRUE, use a log10 y scale
+                          , y_scale_percent = FALSE # if TRUE, format y tick labels as percentages
+                          , y_label = c("Count", "SNP diversity") # y axis title; first element is used
+                          ) {
+  g = ggplot(lf_data
+             , aes(  x    = factor( eval(parse(text = x_categ)), ordered = TRUE )
+                     , y    = eval(parse(text = y_count))
+                     , fill = eval(parse(text = bar_fill_categ)) ) )
+  if (is.character(y_label)) y_label = y_label[1] # default lists 2 choices; use a single axis title
+  OutPlot = g + geom_bar( stat          = bar_stat_stype
+                          , position    = position_stack(reverse = TRUE)
+                          #, alpha    = 1
+                          #, colour   = "grey75"
+  ) + 
+    theme(axis.text.x     = element_text(size = my_xats
+                                         , angle = x_lab_angle)
+          , axis.text.y   = element_text(size = my_yats
+                                         , angle = 90
+                                         , hjust = 1
+                                         , vjust = 0)
+          , axis.title.x = element_text(size     = my_xals
+                                        , colour = "black")
+          , axis.title.y = element_text(size     = my_yals
+                                        , colour = "black")
+          , legend.position = leg_location
+          , legend.text = element_text(size = my_lls)) + 
+
+    geom_label(aes(label = eval(parse(text = display_label_col)))
+               , size    = d_lab_size
+               , hjust   = d_lab_hjust
+               , vjust   = d_lab_vjust
+               , colour  = d_lab_col
+               , show.legend = FALSE
+               #, check_overlap = TRUE
+               , position = position_stack(reverse = TRUE)) + 
+
+    scale_fill_manual(values   = bar_col_values
+                      , name   = bar_leg_name
+                      , labels = bar_col_labels) +
+    labs(title    = ""
+         , x      = ""
+         , y      = y_label
+         , colour = "black")
+
+
+  if (y_log10){
+    # log10 y axis with 10^x tick labels; scales:: qualified like scales::percent below
+   OutPlot = OutPlot + 
+     scale_y_continuous(trans = "log10"
+                        , labels = scales::trans_format("log10", scales::math_format(10^.x) ) )
+
+  }
+
+  if (y_scale_percent){
+    # y tick labels formatted as percentages
+    OutPlot = OutPlot +
+      #scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
+      scale_y_continuous(labels = scales::percent) +
+
+      labs(title    = ""
+           , x      = ""
+           , y      = y_label
+           , colour = "black")
+  }
+
+  return(OutPlot)
+}
+
+############################
+# Lineage diversity barplot
+############################
+# lin_diversity_bp <- function( wf_data
+#                              , x_categ = "sel_lineages"
+#                              , y_count = "snp_diversity"
+#                              , bar_stat_stype = "identity"
+#                              , display_label_col = "snp_diversity_f"
+#                              , x_lab_angle = 90
+#                              , d_lab_size = 5
+#                              , d_lab_hjust = 0.5
+#                              , d_lab_vjust = 0.5
+#                              , d_lab_col = "black"
+#                              , my_xats = 20 # x axis text size
+#                              , my_yats = 20 # y axis text size
+#                              , my_xals = 22 # x axis label size
+#                              , my_yals = 22 # y axis label size
+#                              , my_lls  = 22 # legend label size
+#                              , bar_leg_name = ""
+#                              , leg_location = "top"
+#                              , y_scale_percent = TRUE
+#                              , y_label = "SNP diversity" )
+#   
+#                              {
+#   g = ggplot(wf_data
+#              , aes(  x    = factor( eval(parse(text = x_categ)), ordered = T )
+#                    , y    = eval(parse(text = y_count)) ) )
+#   
+#   OutPlot = g + geom_bar( stat     = bar_stat_stype
+#                      , position    = position_stack(reverse = TRUE)
+#                      ) + 
+#     
+#     theme(axis.text.x     = element_text(size = my_xats
+#                                          , angle = x_lab_angle)
+#           , axis.text.y   = element_text(size = my_yats
+#                                          , angle = 90
+#                                          , hjust = 1
+#                                          , vjust = 0)
+#           , axis.title.x = element_text(size     = my_xals
+#                                         , colour = "black")
+#           , axis.title.y = element_text(size     = my_yals
+#                                         , colour = "black")
+#           , legend.position = leg_location
+#           , legend.text = element_text(size = my_lls)) + 
+#     
+#     geom_label(aes(label = eval(parse(text = display_label_col)))
+#                , size    = d_lab_size
+#                , hjust   = d_lab_hjust
+#                , vjust   = d_lab_vjust
+#                , colour  = d_lab_col
+#                , show.legend = FALSE
+#                #, check_overlap = TRUE
+#                , position = position_stack(reverse = T))
+# #  return(OutPlot)
+#   
+#   if (y_scale_percent){
+# 
+#     OutPlot = OutPlot + 
+#       scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
+#       labs(title = ""
+#            , x   = ""
+#            , y   = y_label
+#            , colour = "black")
+#     
+#     return(OutPlot) 
+#     }
+     
+#  return(OutPlot)
+  
+#  }
+  
+  
+
+
+# ggp <- ggplot(bar_sel, aes(sel_lineages, snp_diversity)) +
+#   geom_bar(stat = "identity")
+# ggp +  scale_y_continuous(labels = scales::percent_format(accuracy = 1)
+#                           #, limits = c(0,1)
+#                           , breaks = seq(0, 30, 5)
+# )    
diff --git a/scripts/functions/test_bp_lineage.R b/scripts/functions/test_bp_lineage.R
new file mode 100644
index 0000000..6742429
--- /dev/null
+++ b/scripts/functions/test_bp_lineage.R
@@ -0,0 +1,111 @@
+setwd("~/git/LSHTM_analysis/scripts/plotting")
+
+#source ('get_plotting_dfs.R')
+source("../functions/bp_lineage.R")
+
+#########################################
+# Lineage and SNP count: lineage lf data
+#########################################
+# Relevel factors so that x-axis categ appear as you want
+lin_lf_plot = lin_lf
+lin_lf_plot
+is.factor(lin_lf_plot$sel_lineages_f)
+
+lin_lf_plot$sel_lineages_f =  factor(lin_lf_plot$sel_lineages_f, c(""
+                                                                   , "L1"
+                                                                   , "L2"
+                                                                   , "L3"
+                                                                   , "L4"
+                                                                   , "L5"
+                                                                   , "L6"
+                                                                   , "L7"
+                                                                   , "LBOV"
+                                                                   , "L1;L2"
+                                                                   , "L1;L3"
+                                                                   , "L1;L4"
+                                                                   , "L2;L3"
+                                                                   , "L2;L3;L4"
+                                                                   , "L2;L4"  
+                                                                   , "L2;L6"
+                                                                   , "L2;LBOV"  
+                                                                   , "L3;L4" 
+                                                                   , "L4;L6"
+                                                                   , "L4;L7"))
+
+levels(lin_lf_plot$sel_lineages_f)
+
+lin_count_bp(lin_lf_plot
+             , x_categ = "sel_lineages_f"
+             , y_count = "p_count"
+             , bar_fill_categ = "count_categ"
+             , display_label_col = "p_count"
+             , bar_stat_stype = "identity"
+             , x_lab_angle = 90
+             , my_xats = 20
+             , bar_col_labels = c("Mutations", "Total Samples")
+             , bar_col_values = c("grey50", "gray75")
+             , y_log10 = T
+             , y_label = "Count"
+             , y_scale_percent = F)
+
+###############################################
+# Lineage SNP diversity count: lineage wf data
+###############################################
+# Relevel factors so that x-axis categ appear as you want
+lin_wf_plot = lin_wf
+is.factor(lin_wf_plot$sel_lineages_f)
+
+lin_wf_plot$sel_lineages_f =  factor(lin_wf_plot$sel_lineages_f, c(""
+                                                         , "L1"
+                                                         , "L2"
+                                                         , "L3"
+                                                         , "L4"
+                                                         , "L5"
+                                                         , "L6"
+                                                         , "L7"
+                                                         , "LBOV"
+                                                         , "L1;L2"
+                                                         , "L1;L3"
+                                                         , "L1;L4"
+                                                         , "L2;L3"
+                                                         , "L2;L3;L4"
+                                                         , "L2;L4"  
+                                                         , "L2;L6"
+                                                         , "L2;LBOV"  
+                                                         , "L3;L4" 
+                                                         , "L4;L6"
+                                                         , "L4;L7"))
+
+levels(lin_wf_plot$sel_lineages_f)
+
+#==========
+# Plot
+#==========
+lin_count_bp(lin_wf_plot
+                 , x_categ = "sel_lineages_f"
+                 , y_count = "snp_diversity"
+                 , display_label_col = "snp_diversity_f"
+
+                 , bar_stat_stype = "identity"
+                 , x_lab_angle = 90
+                 , my_xats = 20
+                 , y_scale_percent = T
+                 , y_label = "SNP diversity"
+             
+
+)
+
+
+
+
+, x_categ = "sel_lineages_f"
+, y_count = "p_count"
+, bar_fill_categ = "count_categ"
+, display_label_col = "p_count"
+, bar_stat_stype = "identity"
+, x_lab_angle = 90
+, my_xats = 15
+, bar_col_labels = c("Mutations", "Total Samples")
+, bar_col_values = c("grey50", "gray75")
+, y_log10 = T
+, y_scale_percent = F
\ No newline at end of file
diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R
index 2fc1c19..89b477c 100644
--- a/scripts/plotting/get_plotting_dfs.R
+++ b/scripts/plotting/get_plotting_dfs.R
@@ -437,13 +437,19 @@ if (nrow(corr_ps_df3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) {
       , "\nGot: ", check1)
 }
 
-
 rm(foo)
 ####################################################################
 #                        Data for DM OM Plots: Long format dfs
 ####################################################################
+
 source("other_plots_data.R")
 
+####################################################################
+#                  Data for Lineage barplots: WF and LF dfs
+####################################################################
+
+source("lineage_bp_data.R")
+
 ########################################################################
 #                           End of script
 ########################################################################
diff --git a/scripts/plotting/lineage_bp_data.R b/scripts/plotting/lineage_bp_data.R
new file mode 100644
index 0000000..2cdfbe8
--- /dev/null
+++ b/scripts/plotting/lineage_bp_data.R
@@ -0,0 +1,173 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: Script to format data for lineage barplots:
+# WF and LF data with lineage sample, and snp counts
+# sourced by get_plotting_dfs.R
+#########################################################
+# working dir and loading libraries
+# getwd()
+# setwd("~/git/LSHTM_analysis/scripts/plotting")
+# getwd()
+
+# make cmd
+# globals
+# drug = "streptomycin"
+# gene = "gid"
+
+# source("get_plotting_dfs.R")
+#=======================================================================
+#################################################
+# Get data with lineage count, and snp diversity
+#################################################
+table(merged_df2$lineage)
+
+# sum() instead of table(...)[[2]]: the latter fails with "subscript out of
+# bounds" when no lineage (or every lineage) is blank, i.e. a one-cell table
+if (sum(merged_df2$lineage == "", na.rm = TRUE) > 0) {
+  cat("\nMissing samples with lineage classification:", sum(merged_df2$lineage == "", na.rm = TRUE))
+}
+
+##################################
+# WF data: lineages with 
+# snp count
+# total_samples
+# snp diversity (perc)
+##################################
+sel_lineages = levels(as.factor(merged_df2$lineage))
+
+lin_wf = data.frame(sel_lineages) #4, 1
+total_snps_u = NULL
+total_samples = NULL
+
+for (i in sel_lineages){
+  # samples per lineage: subset ids by lineage FIRST, then de-duplicate
+  curr_total = length(unique(merged_df2$id[merged_df2$lineage==i]))
+  #print(curr_total)
+  total_samples = c(total_samples, curr_total)
+  print(total_samples)
+
+  foo = merged_df2[merged_df2$lineage==i,]
+  print(paste0(i, "=======\n"))
+  print(length(unique(foo$mutationinformation)))
+  curr_count = length(unique(foo$mutationinformation))
+  
+  total_snps_u = c(total_snps_u, curr_count)
+}
+lin_wf
+
+# Add these counts as columns to the df
+lin_wf$num_snps_u = total_snps_u
+lin_wf$total_samples = total_samples
+
+# Add SNP diversity
+lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples
+lin_wf
+
+#=====================
+# Add some formatting
+#=====================
+# SNP diversity 
+lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0)
+lin_wf
+
+# Lineage names
+lin_wf$sel_lineages_f = gsub("lineage", "L", lin_wf$sel_lineages)
+lin_wf
+
+# # Lineage names
+# lin_wf = lin_wf %>%
+#   mutate(ordering_category = case_when(
+#     sel_lineages_f   == ""    ~ 0
+#     , sel_lineages_f == "L1"   ~ 1
+#     , sel_lineages_f == "L2"   ~ 2
+#     , sel_lineages_f == "L3"   ~ 3
+#     , sel_lineages_f == "L4"   ~ 4
+#     , sel_lineages_f == "L5"   ~ 5
+#     , sel_lineages_f == "L6"   ~ 6
+#     , sel_lineages_f == "L7"   ~ 7
+#     , sel_lineages_f == "LBOV" ~ 8
+#     
+#     , sel_lineages_f == "L1;L2" ~ 9
+#     , sel_lineages_f == "L1;L3" ~ 10
+#     , sel_lineages_f == "L1;L4" ~ 11
+#     
+#     , sel_lineages_f == "L2;L3"    ~ 12
+#     , sel_lineages_f == "L2;L3;L4" ~ 13
+#     , sel_lineages_f == "L2;L4"    ~ 14
+#     , sel_lineages_f == "L2;L6"    ~ 15
+#     , sel_lineages_f == "L2;LBOV"  ~ 16
+#     
+#     , sel_lineages_f == "L3;L4" ~ 17
+#     
+#     , sel_lineages_f == "L4;L6" ~ 18
+#     , sel_lineages_f == "L4;L7" ~ 19
+#     
+#     , FALSE ~ -1)
+#   )
+
+##################################
+# LF data: lineages with 
+# snp count
+# total_samples
+# snp diversity (perc)
+##################################
+names(lin_wf)
+tot_cols = ncol(lin_wf)
+pivot_cols = c("sel_lineages", "sel_lineages_f", "snp_diversity", "snp_diversity_f")
+pivot_cols_n = length(pivot_cols)
+
+expected_rows =  nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n )
+  
+lin_lf <- gather(lin_wf
+                 , count_categ
+                 , p_count
+                 , num_snps_u:total_samples
+                 , factor_key = TRUE)
+lin_lf
+
+# quick checks
+if ( nrow(lin_lf)  ==  expected_rows ){
+  cat("\nPASS: Lineage LF data created"
+      , "\nnrow: ", nrow(lin_lf)
+      , "\nncol: ", ncol(lin_lf))
+} else {
+  cat("\nFAIL: numbers mismatch"
+      , "\nExpected nrow: ", expected_rows)
+}
+#######################################
+# #=====================
+# # Add some formatting
+# #=====================
+# lin_lf$sel_lineages_f = gsub("lineage", "L", lin_lf$sel_lineages)
+# lin_lf
+
+
+# lin_lf = lin_lf %>%
+#   mutate(ordering_category = case_when(
+#      sel_lineages_f   == ""    ~ 0
+#     , sel_lineages_f == "L1"   ~ 1
+#     , sel_lineages_f == "L2"   ~ 2
+#     , sel_lineages_f == "L3"   ~ 3
+#     , sel_lineages_f == "L4"   ~ 4
+#     , sel_lineages_f == "L5"   ~ 5
+#     , sel_lineages_f == "L6"   ~ 6
+#     , sel_lineages_f == "L7"   ~ 7
+#     , sel_lineages_f == "LBOV" ~ 8
+# 
+#     , sel_lineages_f == "L1;L2" ~ 9
+#     , sel_lineages_f == "L1;L3" ~ 10
+#     , sel_lineages_f == "L1;L4" ~ 11
+# 
+#     , sel_lineages_f == "L2;L3"    ~ 12
+#     , sel_lineages_f == "L2;L3;L4" ~ 13
+#     , sel_lineages_f == "L2;L4"    ~ 14
+#     , sel_lineages_f == "L2;L6"    ~ 15
+#     , sel_lineages_f == "L2;LBOV"  ~ 16
+# 
+#     , sel_lineages_f == "L3;L4" ~ 17
+# 
+#     , sel_lineages_f == "L4;L6" ~ 18
+#     , sel_lineages_f == "L4;L7" ~ 19
+# 
+#     , FALSE ~ -1)
+#   )
diff --git a/scripts/plotting/other_plots_data.R b/scripts/plotting/other_plots_data.R
index 8eb2020..a55303b 100644
--- a/scripts/plotting/other_plots_data.R
+++ b/scripts/plotting/other_plots_data.R
@@ -1,7 +1,8 @@
 #!/usr/bin/env Rscript  
 #########################################################
-# TASK: producing boxplots for dr and other muts
-
+# TASK: Script to format data for dm om plots: 
+# generating LF data
+# sourced by get_plotting_dfs.R
 #########################################################
 # working dir and loading libraries
 # getwd()

From 50b89cdcd7afd3875bfe95de89f33a8d71897d1f Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Mon, 6 Sep 2021 19:52:56 +0100
Subject: [PATCH 10/51] one function with tuned params to generate count and
 diversity barplot

---
 scripts/functions/bp_lineage.R | 85 ++--------------------------------
 1 file changed, 3 insertions(+), 82 deletions(-)

diff --git a/scripts/functions/bp_lineage.R b/scripts/functions/bp_lineage.R
index 86eb9f1..6b7090c 100644
--- a/scripts/functions/bp_lineage.R
+++ b/scripts/functions/bp_lineage.R
@@ -1,5 +1,7 @@
 ########################################
-# Lineage and within SNP count barplot
+# Lineage barplot
+# Lineage and nsSNP count barplot
+# Lineage Diversity barplot
 ########################################
 
 lin_count_bp <- function( lf_data
@@ -89,84 +91,3 @@ lin_count_bp <- function( lf_data
   
   return(OutPlot)
 }
-
-############################
-# Lineage diversity barplot
-############################
-# lin_diversity_bp <- function( wf_data
-#                              , x_categ = "sel_lineages"
-#                              , y_count = "snp_diversity"
-#                              , bar_stat_stype = "identity"
-#                              , display_label_col = "snp_diversity_f"
-#                              , x_lab_angle = 90
-#                              , d_lab_size = 5
-#                              , d_lab_hjust = 0.5
-#                              , d_lab_vjust = 0.5
-#                              , d_lab_col = "black"
-#                              , my_xats = 20 # x axis text size
-#                              , my_yats = 20 # y axis text size
-#                              , my_xals = 22 # x axis label size
-#                              , my_yals = 22 # y axis label size
-#                              , my_lls  = 22 # legend label size
-#                              , bar_leg_name = ""
-#                              , leg_location = "top"
-#                              , y_scale_percent = TRUE
-#                              , y_label = "SNP diversity" )
-#   
-#                              {
-#   g = ggplot(wf_data
-#              , aes(  x    = factor( eval(parse(text = x_categ)), ordered = T )
-#                    , y    = eval(parse(text = y_count)) ) )
-#   
-#   OutPlot = g + geom_bar( stat     = bar_stat_stype
-#                      , position    = position_stack(reverse = TRUE)
-#                      ) + 
-#     
-#     theme(axis.text.x     = element_text(size = my_xats
-#                                          , angle = x_lab_angle)
-#           , axis.text.y   = element_text(size = my_yats
-#                                          , angle = 90
-#                                          , hjust = 1
-#                                          , vjust = 0)
-#           , axis.title.x = element_text(size     = my_xals
-#                                         , colour = "black")
-#           , axis.title.y = element_text(size     = my_yals
-#                                         , colour = "black")
-#           , legend.position = leg_location
-#           , legend.text = element_text(size = my_lls)) + 
-#     
-#     geom_label(aes(label = eval(parse(text = display_label_col)))
-#                , size    = d_lab_size
-#                , hjust   = d_lab_hjust
-#                , vjust   = d_lab_vjust
-#                , colour  = d_lab_col
-#                , show.legend = FALSE
-#                #, check_overlap = TRUE
-#                , position = position_stack(reverse = T))
-# #  return(OutPlot)
-#   
-#   if (y_scale_percent){
-# 
-#     OutPlot = OutPlot + 
-#       scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
-#       labs(title = ""
-#            , x   = ""
-#            , y   = y_label
-#            , colour = "black")
-#     
-#     return(OutPlot) 
-#     }
-     
-#  return(OutPlot)
-  
-#  }
-  
-  
-
-
-# ggp <- ggplot(bar_sel, aes(sel_lineages, snp_diversity)) +
-#   geom_bar(stat = "identity")
-# ggp +  scale_y_continuous(labels = scales::percent_format(accuracy = 1)
-#                           #, limits = c(0,1)
-#                           , breaks = seq(0, 30, 5)
-# )    

From 3cee341170b2c30a1fb74b5503cc9aa6b21cc38d Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 7 Sep 2021 09:27:47 +0100
Subject: [PATCH 11/51] replaced old lineage barplot with count and diversity
 combined plots sourced from function

---
 scripts/functions/bp_lineage.R                |  12 +-
 scripts/functions/test_bp_lineage.R           | 107 +++------
 scripts/plotting/basic_barplots_combined.R    |   2 +-
 scripts/plotting/lineage_basic_barplot.R      | 214 ------------------
 .../lineage_basic_barplots_combined.R         | 128 +++++++++++
 scripts/plotting/lineage_bp_data.R            | 112 ++++-----
 scripts/plotting/running_plotting_scripts.txt |   8 +
 7 files changed, 217 insertions(+), 366 deletions(-)
 delete mode 100644 scripts/plotting/lineage_basic_barplot.R
 create mode 100644 scripts/plotting/lineage_basic_barplots_combined.R

diff --git a/scripts/functions/bp_lineage.R b/scripts/functions/bp_lineage.R
index 6b7090c..ad43386 100644
--- a/scripts/functions/bp_lineage.R
+++ b/scripts/functions/bp_lineage.R
@@ -20,8 +20,8 @@ lin_count_bp <- function( lf_data
                           , my_xals = 22 # x axis label size
                           , my_yals = 22 # y axis label size
                           , my_lls   = 22 # legend label size
-                          , bar_col_labels = c("Mutations", "Total Samples")
-                          , bar_col_values = c("grey50", "gray75")
+                          , bar_col_labels = ""
+                          , bar_col_values = ""
                           , bar_leg_name = ""
                           , leg_location = "top"
                           , y_log10 = FALSE
@@ -68,20 +68,18 @@ lin_count_bp <- function( lf_data
          , y      = y_label
          , colour = "black")
     
-  
   if (y_log10){
     
    OutPlot = OutPlot + 
      scale_y_continuous(trans = "log10"
                         , labels = trans_format("log10", math_format(10^.x) ) )
-
-  }
+   }
   
   if (y_scale_percent){
     
     OutPlot = OutPlot +
-      #scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
-      scale_y_continuous(labels = scales::percent) +
+      scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
+      #scale_y_continuous(labels = scales::percent) +
       
       labs(title    = ""
            , x      = ""
diff --git a/scripts/functions/test_bp_lineage.R b/scripts/functions/test_bp_lineage.R
index 6742429..876f237 100644
--- a/scripts/functions/test_bp_lineage.R
+++ b/scripts/functions/test_bp_lineage.R
@@ -1,39 +1,25 @@
 setwd("~/git/LSHTM_analysis/scripts/plotting")
 
-#source ('get_plotting_dfs.R')
+source ('get_plotting_dfs.R')
 source("../functions/bp_lineage.R")
 
 #########################################
 # Lineage and SNP count: lineage lf data
 #########################################
-# Relevel factors so that x-axis categ appear as you want
-lin_lf_plot = lin_lf
-lin_lf_plot
-is.factor(lin_lf_plot$sel_lineages_f)
-
-lin_lf_plot$sel_lineages_f =  factor(lin_lf_plot$sel_lineages_f, c(""
-                                                                   , "L1"
-                                                                   , "L2"
-                                                                   , "L3"
-                                                                   , "L4"
-                                                                   , "L5"
-                                                                   , "L6"
-                                                                   , "L7"
-                                                                   , "LBOV"
-                                                                   , "L1;L2"
-                                                                   , "L1;L3"
-                                                                   , "L1;L4"
-                                                                   , "L2;L3"
-                                                                   , "L2;L3;L4"
-                                                                   , "L2;L4"  
-                                                                   , "L2;L6"
-                                                                   , "L2;LBOV"  
-                                                                   , "L3;L4" 
-                                                                   , "L4;L6"
-                                                                   , "L4;L7"))
+#=========================
+# Data: All lineages or
+# selected few
+#=========================
+sel_lineages = levels(lin_lf$sel_lineages_f)
+sel_lineages
+lin_lf_plot = lin_lf[lin_lf$sel_lineages_f%in%sel_lineages,]
 
+# drop unused factor levels
+lin_lf_plot$sel_lineages_f = factor(lin_lf_plot$sel_lineages_f)
 levels(lin_lf_plot$sel_lineages_f)
-
+#=========================
+# Lineage count plot
+#=========================
 lin_count_bp(lin_lf_plot
              , x_categ = "sel_lineages_f"
              , y_count = "p_count"
@@ -44,68 +30,33 @@ lin_count_bp(lin_lf_plot
              , my_xats = 20
              , bar_col_labels = c("Mutations", "Total Samples")
              , bar_col_values = c("grey50", "gray75")
-             , y_log10 = T
-             , y_label = "Count"
-             , y_scale_percent = F)
+             , y_scale_percent = F # T for diversity
+             , y_log10 = F
+             , y_label = "Count")
 
 ###############################################
 # Lineage SNP diversity count: lineage wf data
 ###############################################
-# Relevel factors so that x-axis categ appear as you want
-lin_wf_plot = lin_wf
-is.factor(lin_wf_plot$sel_lineages_f)
-
-lin_wf_plot$sel_lineages_f =  factor(lin_wf_plot$sel_lineages_f, c(""
-                                                         , "L1"
-                                                         , "L2"
-                                                         , "L3"
-                                                         , "L4"
-                                                         , "L5"
-                                                         , "L6"
-                                                         , "L7"
-                                                         , "LBOV"
-                                                         , "L1;L2"
-                                                         , "L1;L3"
-                                                         , "L1;L4"
-                                                         , "L2;L3"
-                                                         , "L2;L3;L4"
-                                                         , "L2;L4"  
-                                                         , "L2;L6"
-                                                         , "L2;LBOV"  
-                                                         , "L3;L4" 
-                                                         , "L4;L6"
-                                                         , "L4;L7"))
+#=========================
+# Data: All lineages or
+# selected few
+#=========================
+sel_lineages = levels(lin_wf$sel_lineages_f)
+sel_lineages
+lin_wf_plot = lin_wf[lin_wf$sel_lineages_f%in%sel_lineages,]
 
+# drop unused factor levels
+lin_wf_plot$sel_lineages_f = factor(lin_wf_plot$sel_lineages_f)
 levels(lin_wf_plot$sel_lineages_f)
-
-#==========
-# Plot
-#==========
+#=========================
+# Lineage Diversity plot
+#=========================
 lin_count_bp(lin_wf_plot
                  , x_categ = "sel_lineages_f"
                  , y_count = "snp_diversity"
                  , display_label_col = "snp_diversity_f"
-
                  , bar_stat_stype = "identity"
                  , x_lab_angle = 90
                  , my_xats = 20
                  , y_scale_percent = T
-                 , y_label = "SNP diversity"
-             
-
-)
-
-
-
-
-, x_categ = "sel_lineages_f"
-, y_count = "p_count"
-, bar_fill_categ = "count_categ"
-, display_label_col = "p_count"
-, bar_stat_stype = "identity"
-, x_lab_angle = 90
-, my_xats = 15
-, bar_col_labels = c("Mutations", "Total Samples")
-, bar_col_values = c("grey50", "gray75")
-, y_log10 = T
-, y_scale_percent = F
\ No newline at end of file
+                 , y_label = "SNP diversity")
diff --git a/scripts/plotting/basic_barplots_combined.R b/scripts/plotting/basic_barplots_combined.R
index 7fee2d7..2643b0d 100644
--- a/scripts/plotting/basic_barplots_combined.R
+++ b/scripts/plotting/basic_barplots_combined.R
@@ -23,7 +23,7 @@ plot_basic_bp_combined_labelled  =  paste0(plotdir,"/", basic_bp_combined_labell
 
 #=======================================================================
 #=======
-# combin DUET and Ligand affinity plots
+# combine DUET and Ligand affinity plots
 #=======
 svg(plot_basic_bp_combined_labelled , width = 12, height = 12 )
 
diff --git a/scripts/plotting/lineage_basic_barplot.R b/scripts/plotting/lineage_basic_barplot.R
deleted file mode 100644
index e4503d1..0000000
--- a/scripts/plotting/lineage_basic_barplot.R
+++ /dev/null
@@ -1,214 +0,0 @@
-#!/usr/bin/env Rscript       
-getwd()
-setwd("~/git/LSHTM_analysis/scripts/plotting/")
-getwd()
-#########################################################
-# TASK: Basic lineage barplot showing numbers
-
-# Output: Basic barplot with lineage samples and mut count
-
-##########################################################
-# 				Installing and loading required packages 			 
-##########################################################
-source("Header_TT.R")
-require(data.table)
-source("combining_dfs_plotting.R")
-# should return the following dfs, directories and variables
-
-# PS combined: 
-# 1) merged_df2
-# 2) merged_df2_comp
-# 3) merged_df3
-# 4) merged_df3_comp
-
-# LIG combined: 
-# 5) merged_df2_lig
-# 6) merged_df2_comp_lig
-# 7) merged_df3_lig
-# 8) merged_df3_comp_lig
-
-# 9) my_df_u
-# 10) my_df_u_lig
-
-cat("Directories imported:"
-    , "\n===================="
-    , "\ndatadir:", datadir
-    , "\nindir:", indir
-    , "\noutdir:", outdir
-    , "\nplotdir:", plotdir)
-
-cat("Variables imported:"
-    , "\n====================="
-    , "\ndrug:", drug
-    , "\ngene:", gene
-    , "\ngene_match:", gene_match
-    , "\nAngstrom symbol:", angstroms_symbol
-    , "\nNo. of duplicated muts:", dup_muts_nu
-    , "\nNA count for ORs:", na_count
-    , "\nNA count in df2:", na_count_df2
-    , "\nNA count in df3:", na_count_df3
-    , "\ndr_muts_col:", dr_muts_col
-    , "\nother_muts_col:", other_muts_col
-    , "\ndrtype_col:", resistance_col)
-
-
-#===========
-# input
-#===========
-# output of combining_dfs_plotting.R
-
-#=======
-# output
-#=======
-# plot 1
-basic_bp_lineage = "basic_lineage_barplot.svg"
-plot_basic_bp_lineage  =  paste0(plotdir,"/", basic_bp_lineage)
-
-#=======================================================================
-#================
-# Data for plots:
-# you need merged_df2, comprehensive one
-# since this has one-many relationship
-# i.e the same SNP can belong to multiple lineages
-#================
-# REASSIGNMENT as necessary
-my_df  = merged_df2
-
-# clear excess variable
-rm(merged_df2_comp, merged_df3, merged_df3_comp)
-
-# quick checks
-colnames(my_df)
-str(my_df)
-
-# Ensure correct data type in columns to plot: need to be factor
-is.factor(my_df$lineage)
-my_df$lineage = as.factor(my_df$lineage)
-is.factor(my_df$lineage)
-
-#==========================
-# Plot: Lineage barplot
-# x = lineage y = No. of samples
-# col = Lineage
-# fill = lineage
-#============================
-table(my_df$lineage)
-as.data.frame(table(my_df$lineage))
-
-#=============
-# Data for plots
-#=============
-# REASSIGNMENT
-df <- my_df
-
-rm(my_df)
-
-# get freq count of positions so you can subset freq<1
-#setDT(df)[, lineage_count := .N, by = .(lineage)]
-
-#******************
-# generate plot: barplot of mutation by lineage
-#******************
-sel_lineages = c("lineage1"
-                 , "lineage2"
-                 , "lineage3"
-                 , "lineage4"
-                 #, "lineage5"
-                 #, "lineage6"
-                 #, "lineage7"
-                 )
-
-df_lin = subset(df, subset = lineage %in% sel_lineages)
-
-# Create df with lineage inform & no. of unique mutations
-# per lineage and total samples within lineage
-# this is essentially barplot with two y axis
-
-bar = bar = as.data.frame(sel_lineages) #4, 1
-total_snps_u = NULL
-total_samples = NULL
-
-for (i in sel_lineages){
-  #print(i)
-  curr_total = length(unique(df$id)[df$lineage==i])
-  total_samples = c(total_samples, curr_total)
-  print(total_samples)
-  
-  foo = df[df$lineage==i,]
-  print(paste0(i, "======="))
-  print(length(unique(foo$mutationinformation)))
-  curr_count = length(unique(foo$mutationinformation))
-
-  total_snps_u = c(total_snps_u, curr_count)
-}
-
-print(total_snps_u)
-bar$num_snps_u = total_snps_u
-bar$total_samples = total_samples
-bar
-
-#*****************
-# generate plot: lineage barplot with two y-axis
-#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2
-#*****************
-
-y1 = bar$num_snps_u
-y2 = bar$total_samples
-x = sel_lineages
-
-to_plot = data.frame(x = x
-                      , y1 = y1
-                      , y2 = y2)
-to_plot
-
-# FIXME later: will be depricated!
-melted = melt(to_plot, id = "x")
-melted
-
-
-svg(plot_basic_bp_lineage)
-
-my_ats = 20 # axis text size
-my_als = 22 # axis label size
-
-g = ggplot(melted, aes(x = x
-                 , y = value
-                 , fill = variable))
-
-printFile = g + geom_bar(stat = "identity"
-                         , position = position_stack(reverse = TRUE)
-                         , alpha=.75
-                         , colour='grey75') + 
-  theme(axis.text.x = element_text(size = my_ats)
-        , axis.text.y = element_text(size = my_ats
-                                     #, angle = 30
-                                     , hjust = 1
-                                     , vjust = 0)
-        , axis.title.x = element_text(size = my_als
-                                      , colour = 'black')
-        , axis.title.y = element_text(size = my_als
-                                      , colour = 'black')
-        , legend.position = "top"
-        , legend.text = element_text(size = my_als)) + 
-          #geom_text() +
-          geom_label(aes(label = value)
-                     , size = 5
-                     , hjust = 0.5
-                     , vjust = 0.5
-                     , colour = 'black'
-                     , show.legend = FALSE
-                     #, check_overlap = TRUE
-                     , position = position_stack(reverse = T)) + 
-          labs(title = ''
-               , x = ''
-               , y = "Number"
-               , fill = 'Variable'
-               , colour = 'black') + 
-          scale_fill_manual(values = c('grey50', 'gray75')
-                            , name=''
-                            , labels=c('Mutations', 'Total Samples')) +
-          scale_x_discrete(breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
-                           , labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4'))
-
-print(printFile)
-dev.off()
diff --git a/scripts/plotting/lineage_basic_barplots_combined.R b/scripts/plotting/lineage_basic_barplots_combined.R
new file mode 100644
index 0000000..4b63587
--- /dev/null
+++ b/scripts/plotting/lineage_basic_barplots_combined.R
@@ -0,0 +1,128 @@
+#!/usr/bin/env Rscript       
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
+#########################################################
+# TASK: Basic lineage barplot showing numbers
+
+# Output: Basic barplot with lineage samples and mut count
+
+##########################################################
+# 				Installing and loading required packages 			 
+##########################################################
+source("Header_TT.R")
+source("../functions/bp_lineage.R")
+
+#===========
+# input
+#===========
+source ('get_plotting_dfs.R')
+
+cat("Directories imported:"
+    , "\n===================="
+    , "\ndatadir:", datadir
+    , "\nindir:", indir
+    , "\noutdir:", outdir
+    , "\nplotdir:", plotdir)
+
+cat("Variables imported:"
+    , "\n====================="
+    , "\ndrug:", drug
+    , "\ngene:", gene
+    , "\ngene_match:", gene_match
+    , "\nAngstrom symbol:", angstroms_symbol
+    #, "\nNo. of duplicated muts:", dup_muts_nu
+    , "\ndr_muts_col:", dr_muts_col
+    , "\nother_muts_col:", other_muts_col
+    , "\ndrtype_col:", resistance_col)
+
+#=======
+# output
+#=======
+# plot 1
+basic_bp_lineage_cl = "basic_lineage_barplots_combined.svg"
+plot_basic_bp_lineage_cl  =  paste0(plotdir,"/", basic_bp_lineage_cl)
+plot_basic_bp_lineage_cl 
+#################################################################
+#=============================
+# PLOT 1: Lineage count plot:
+# LF data
+#=============================
+#------------------------
+# Data: All lineages or
+# selected few
+#------------------------
+sel_lineages = levels(lin_lf$sel_lineages_f)[1:4]
+sel_lineages
+lin_lf_plot = lin_lf[lin_lf$sel_lineages_f%in%sel_lineages,]
+str(lin_lf_plot)
+
+# drop unused factor levels
+lin_lf_plot$sel_lineages_f = factor(lin_lf_plot$sel_lineages_f)
+levels(lin_lf_plot$sel_lineages_f)
+str(lin_lf_plot)
+
+#------------------------
+# plot from my function:
+#------------------------
+lin_countP = lin_count_bp(lin_lf_plot
+             , x_categ = "sel_lineages_f"
+             , y_count = "p_count"
+             , bar_fill_categ = "count_categ"
+             , display_label_col = "p_count"
+             , bar_stat_stype = "identity"
+             , x_lab_angle = 90
+             , my_xats = 20
+             , bar_col_labels = c("Mutations", "Total Samples")
+             , bar_col_values = c("grey50", "gray75")
+             , y_scale_percent = F # T for diversity
+             , y_log10 = F
+             , y_label = "Count")
+lin_countP
+#================================
+# PLOT 2: Lineage Diversity plot
+# WF data
+#================================
+#------------------------
+# Data: All lineages or
+# selected few
+#------------------------
+sel_lineages = levels(lin_wf$sel_lineages_f)[1:4]
+sel_lineages
+lin_wf_plot = lin_wf[lin_wf$sel_lineages_f%in%sel_lineages,]
+str(lin_wf_plot)
+
+# drop unused factor levels
+lin_wf_plot$sel_lineages_f = factor(lin_wf_plot$sel_lineages_f)
+levels(lin_wf_plot$sel_lineages_f)
+str(lin_wf_plot)
+
+#------------------------
+# plot from my function:
+#------------------------
+lin_diversityP = lin_count_bp(lin_wf_plot
+             , x_categ = "sel_lineages_f"
+             , y_count = "snp_diversity"
+             , display_label_col = "snp_diversity_f"
+             , bar_stat_stype = "identity"
+             , x_lab_angle = 90
+             , my_xats = 20
+             , y_scale_percent = T
+             , y_label = "SNP diversity")
+
+lin_diversityP
+#########################################################################333
+#================================
+# Combine plots
+#================================
+
+svg(plot_basic_bp_lineage_cl , width = 8, height = 15 )
+
+lineage_bp_combined = cowplot::plot_grid(lin_countP, lin_diversityP
+                                      #, labels = c("(a)", "(b)", "(c)", "(d)")
+                                      , nrow = 2
+                                      , labels = "AUTO"
+                                      , label_size = 25)
+
+lineage_bp_combined
+dev.off()
diff --git a/scripts/plotting/lineage_bp_data.R b/scripts/plotting/lineage_bp_data.R
index 2cdfbe8..2d51ab0 100644
--- a/scripts/plotting/lineage_bp_data.R
+++ b/scripts/plotting/lineage_bp_data.R
@@ -68,42 +68,35 @@ lin_wf
 #=====================
 # SNP diversity 
 lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0)
-lin_wf
+lin_wf$snp_diversity_f = paste0(lin_wf$snp_diversity_f, "%")
 
 # Lineage names
 lin_wf$sel_lineages_f = gsub("lineage", "L", lin_wf$sel_lineages)
 lin_wf
 
-# # Lineage names
-# lin_wf = lin_wf %>%
-#   mutate(ordering_category = case_when(
-#     sel_lineages_f   == ""    ~ 0
-#     , sel_lineages_f == "L1"   ~ 1
-#     , sel_lineages_f == "L2"   ~ 2
-#     , sel_lineages_f == "L3"   ~ 3
-#     , sel_lineages_f == "L4"   ~ 4
-#     , sel_lineages_f == "L5"   ~ 5
-#     , sel_lineages_f == "L6"   ~ 6
-#     , sel_lineages_f == "L7"   ~ 7
-#     , sel_lineages_f == "LBOV" ~ 8
-#     
-#     , sel_lineages_f == "L1;L2" ~ 9
-#     , sel_lineages_f == "L1;L3" ~ 10
-#     , sel_lineages_f == "L1;L4" ~ 11
-#     
-#     , sel_lineages_f == "L2;L3"    ~ 12
-#     , sel_lineages_f == "L2;L3;L4" ~ 13
-#     , sel_lineages_f == "L2;L4"    ~ 14
-#     , sel_lineages_f == "L2;L6"    ~ 15
-#     , sel_lineages_f == "L2;LBOV"  ~ 16
-#     
-#     , sel_lineages_f == "L3;L4" ~ 17
-#     
-#     , sel_lineages_f == "L4;L6" ~ 18
-#     , sel_lineages_f == "L4;L7" ~ 19
-#     
-#     , FALSE ~ -1)
-#   )
+# Important: Relevel factors so that x-axis categ appear as you want
+lin_wf$sel_lineages_f =  factor(lin_wf$sel_lineages_f, c("L1"
+                                                         , "L2"
+                                                         , "L3"
+                                                         , "L4"
+                                                         , "L5"
+                                                         , "L6"
+                                                         , "L7"
+                                                         , "LBOV"
+                                                         , "L1;L2"
+                                                         , "L1;L3"
+                                                         , "L1;L4"
+                                                         , "L2;L3"
+                                                         , "L2;L3;L4"
+                                                         , "L2;L4"  
+                                                         , "L2;L6"
+                                                         , "L2;LBOV"  
+                                                         , "L3;L4" 
+                                                         , "L4;L6"
+                                                         , "L4;L7"
+                                                         , ""))
+
+levels(lin_wf$sel_lineages_f)
 
 ##################################
 # LF data: lineages with 
@@ -134,40 +127,27 @@ if ( nrow(lin_lf)  ==  expected_rows ){
   cat("\nFAIL: numbers mismatch"
       , "\nExpected nrow: ", expected_rows)
 }
-#######################################
-# #=====================
-# # Add some formatting
-# #=====================
-# lin_lf$sel_lineages_f = gsub("lineage", "L", lin_lf$sel_lineages)
-# lin_lf
 
+# Important: Relevel factors so that x-axis categ appear as you want
+lin_lf$sel_lineages_f =  factor(lin_lf$sel_lineages_f, c("L1"
+                                                         , "L2"
+                                                         , "L3"
+                                                         , "L4"
+                                                         , "L5"
+                                                         , "L6"
+                                                         , "L7"
+                                                         , "LBOV"
+                                                         , "L1;L2"
+                                                         , "L1;L3"
+                                                         , "L1;L4"
+                                                         , "L2;L3"
+                                                         , "L2;L3;L4"
+                                                         , "L2;L4"  
+                                                         , "L2;L6"
+                                                         , "L2;LBOV"  
+                                                         , "L3;L4" 
+                                                         , "L4;L6"
+                                                         , "L4;L7"
+                                                         , ""))
 
-# lin_lf = lin_lf %>%
-#   mutate(ordering_category = case_when(
-#      sel_lineages_f   == ""    ~ 0
-#     , sel_lineages_f == "L1"   ~ 1
-#     , sel_lineages_f == "L2"   ~ 2
-#     , sel_lineages_f == "L3"   ~ 3
-#     , sel_lineages_f == "L4"   ~ 4
-#     , sel_lineages_f == "L5"   ~ 5
-#     , sel_lineages_f == "L6"   ~ 6
-#     , sel_lineages_f == "L7"   ~ 7
-#     , sel_lineages_f == "LBOV" ~ 8
-# 
-#     , sel_lineages_f == "L1;L2" ~ 9
-#     , sel_lineages_f == "L1;L3" ~ 10
-#     , sel_lineages_f == "L1;L4" ~ 11
-# 
-#     , sel_lineages_f == "L2;L3"    ~ 12
-#     , sel_lineages_f == "L2;L3;L4" ~ 13
-#     , sel_lineages_f == "L2;L4"    ~ 14
-#     , sel_lineages_f == "L2;L6"    ~ 15
-#     , sel_lineages_f == "L2;LBOV"  ~ 16
-# 
-#     , sel_lineages_f == "L3;L4" ~ 17
-# 
-#     , sel_lineages_f == "L4;L6" ~ 18
-#     , sel_lineages_f == "L4;L7" ~ 19
-# 
-#     , FALSE ~ -1)
-#   )
+levels(lin_lf$sel_lineages_f)
diff --git a/scripts/plotting/running_plotting_scripts.txt b/scripts/plotting/running_plotting_scripts.txt
index 8d7ba0c..332293e 100644
--- a/scripts/plotting/running_plotting_scripts.txt
+++ b/scripts/plotting/running_plotting_scripts.txt
@@ -112,6 +112,14 @@ note:
  - fa flag has default if not supplied 
  - fb flag has default if not supplied
  
+ 
+#===================
+# Add LINEAGE ONE
+#=================== 
+# Lineage_bp.R
+creates Count and Diversity plot
+
+ 
 ########################################################################
 # TODO
 Delete: dirs.R

From c9519b3b56194aad06ce8394292cce3a166444d6 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 7 Sep 2021 10:52:26 +0100
Subject: [PATCH 12/51] moved old lineage_basic_barplot.R to redundant

---
 .../redundant/lineage_basic_barplot.R         | 214 +++++++++++++
 .../redundant/other_plots_data_SAFEGUARD.R    | 301 ++++++++++++++++++
 2 files changed, 515 insertions(+)
 create mode 100644 scripts/plotting/redundant/lineage_basic_barplot.R
 create mode 100644 scripts/plotting/redundant/other_plots_data_SAFEGUARD.R

diff --git a/scripts/plotting/redundant/lineage_basic_barplot.R b/scripts/plotting/redundant/lineage_basic_barplot.R
new file mode 100644
index 0000000..e4503d1
--- /dev/null
+++ b/scripts/plotting/redundant/lineage_basic_barplot.R
@@ -0,0 +1,214 @@
+#!/usr/bin/env Rscript       
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
+#########################################################
+# TASK: Basic lineage barplot showing numbers
+
+# Output: Basic barplot with lineage samples and mut count
+
+##########################################################
+# 				Installing and loading required packages 			 
+##########################################################
+source("Header_TT.R")
+require(data.table)
+source("combining_dfs_plotting.R")
+# should return the following dfs, directories and variables
+
+# PS combined: 
+# 1) merged_df2
+# 2) merged_df2_comp
+# 3) merged_df3
+# 4) merged_df3_comp
+
+# LIG combined: 
+# 5) merged_df2_lig
+# 6) merged_df2_comp_lig
+# 7) merged_df3_lig
+# 8) merged_df3_comp_lig
+
+# 9) my_df_u
+# 10) my_df_u_lig
+
+cat("Directories imported:"
+    , "\n===================="
+    , "\ndatadir:", datadir
+    , "\nindir:", indir
+    , "\noutdir:", outdir
+    , "\nplotdir:", plotdir)
+
+cat("Variables imported:"
+    , "\n====================="
+    , "\ndrug:", drug
+    , "\ngene:", gene
+    , "\ngene_match:", gene_match
+    , "\nAngstrom symbol:", angstroms_symbol
+    , "\nNo. of duplicated muts:", dup_muts_nu
+    , "\nNA count for ORs:", na_count
+    , "\nNA count in df2:", na_count_df2
+    , "\nNA count in df3:", na_count_df3
+    , "\ndr_muts_col:", dr_muts_col
+    , "\nother_muts_col:", other_muts_col
+    , "\ndrtype_col:", resistance_col)
+
+
+#===========
+# input
+#===========
+# output of combining_dfs_plotting.R
+
+#=======
+# output
+#=======
+# plot 1
+basic_bp_lineage = "basic_lineage_barplot.svg"
+plot_basic_bp_lineage  =  paste0(plotdir,"/", basic_bp_lineage)
+
+#=======================================================================
+#================
+# Data for plots:
+# you need merged_df2, comprehensive one
+# since this has one-many relationship
+# i.e the same SNP can belong to multiple lineages
+#================
+# REASSIGNMENT as necessary
+my_df  = merged_df2
+
+# clear excess variable
+rm(merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+is.factor(my_df$lineage)
+my_df$lineage = as.factor(my_df$lineage)
+is.factor(my_df$lineage)
+
+#==========================
+# Plot: Lineage barplot
+# x = lineage y = No. of samples
+# col = Lineage
+# fill = lineage
+#============================
+table(my_df$lineage)
+as.data.frame(table(my_df$lineage))
+
+#=============
+# Data for plots
+#=============
+# REASSIGNMENT
+df <- my_df
+
+rm(my_df)
+
+# get freq count of positions so you can subset freq<1
+#setDT(df)[, lineage_count := .N, by = .(lineage)]
+
+#******************
+# generate plot: barplot of mutation by lineage
+#******************
+sel_lineages = c("lineage1"
+                 , "lineage2"
+                 , "lineage3"
+                 , "lineage4"
+                 #, "lineage5"
+                 #, "lineage6"
+                 #, "lineage7"
+                 )
+
+df_lin = subset(df, subset = lineage %in% sel_lineages)
+
+# Create df with lineage inform & no. of unique mutations
+# per lineage and total samples within lineage
+# this is essentially barplot with two y axis
+
+bar = as.data.frame(sel_lineages) #4, 1
+total_snps_u = NULL   # unique mutation count per lineage, filled in loop order
+total_samples = NULL  # unique id count per lineage, filled in loop order
+
+for (i in sel_lineages){
+  # rows of this lineage; which() drops NA lineages instead of returning NA rows
+  foo = df[which(df$lineage == i),]
+  # subset ids BEFORE de-duplicating: the row mask is as long as df, not unique(df$id)
+  curr_total = length(unique(foo$id))
+  total_samples = c(total_samples, curr_total)
+  print(total_samples)
+
+  print(paste0(i, "======="))
+  curr_count = length(unique(foo$mutationinformation))
+  print(curr_count)
+  total_snps_u = c(total_snps_u, curr_count)
+}
+
+print(total_snps_u)
+bar$num_snps_u = total_snps_u
+bar$total_samples = total_samples
+bar
+
+#*****************
+# generate plot: lineage barplot with two y-axis
+#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2
+#*****************
+
+y1 = bar$num_snps_u
+y2 = bar$total_samples
+x = sel_lineages
+
+to_plot = data.frame(x = x
+                      , y1 = y1
+                      , y2 = y2)
+to_plot
+
+# FIXME later: will be deprecated!
+melted = melt(to_plot, id = "x")
+melted
+
+
+svg(plot_basic_bp_lineage)
+
+my_ats = 20 # axis text size
+my_als = 22 # axis label size
+
+g = ggplot(melted, aes(x = x
+                 , y = value
+                 , fill = variable))
+
+printFile = g + geom_bar(stat = "identity"
+                         , position = position_stack(reverse = TRUE)
+                         , alpha=.75
+                         , colour='grey75') + 
+  theme(axis.text.x = element_text(size = my_ats)
+        , axis.text.y = element_text(size = my_ats
+                                     #, angle = 30
+                                     , hjust = 1
+                                     , vjust = 0)
+        , axis.title.x = element_text(size = my_als
+                                      , colour = 'black')
+        , axis.title.y = element_text(size = my_als
+                                      , colour = 'black')
+        , legend.position = "top"
+        , legend.text = element_text(size = my_als)) + 
+          #geom_text() +
+          geom_label(aes(label = value)
+                     , size = 5
+                     , hjust = 0.5
+                     , vjust = 0.5
+                     , colour = 'black'
+                     , show.legend = FALSE
+                     #, check_overlap = TRUE
+                     , position = position_stack(reverse = T)) + 
+          labs(title = ''
+               , x = ''
+               , y = "Number"
+               , fill = 'Variable'
+               , colour = 'black') + 
+          scale_fill_manual(values = c('grey50', 'gray75')
+                            , name=''
+                            , labels=c('Mutations', 'Total Samples')) +
+          scale_x_discrete(breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
+                           , labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4'))
+
+print(printFile)
+dev.off()
diff --git a/scripts/plotting/redundant/other_plots_data_SAFEGUARD.R b/scripts/plotting/redundant/other_plots_data_SAFEGUARD.R
new file mode 100644
index 0000000..df5c1e3
--- /dev/null
+++ b/scripts/plotting/redundant/other_plots_data_SAFEGUARD.R
@@ -0,0 +1,301 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: producing boxplots for dr and other muts
+
+#########################################################
+#=======================================================================
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting")
+getwd()
+
+#source("Header_TT.R")
+library(ggplot2)
+library(data.table)
+library(dplyr)
+library(tidyverse)
+source("combining_dfs_plotting.R")
+
+rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig
+   , merged_df3_comp, merged_df3_comp_lig
+   , my_df_u, my_df_u_lig)
+
+
+cols_to_select = c("mutation", "mutationinformation"
+                   , "wild_type", "position", "mutant_type"
+                   , "mutation_info")
+
+merged_df3_short = merged_df3[, cols_to_select]
+
+# write merged_df3 to generate structural figure
+write.csv(merged_df3_short, "merged_df3_short.csv")
+
+#========================================================================
+#%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT: PS
+#%%%%%%%%%%%%%%%%%%%%
+df_ps = merged_df3
+
+#============================
+# adding foldx scaled values
+# scale data b/w -1 and 1
+#============================
+n = which(colnames(df_ps) == "ddg"); n 
+
+my_min = min(df_ps[,n]); my_min 
+my_max = max(df_ps[,n]); my_max 
+
+df_ps$foldx_scaled = ifelse(df_ps[,n] < 0
+                            , df_ps[,n]/abs(my_min)
+                            , df_ps[,n]/my_max) 
+# sanity check
+my_min = min(df_ps$foldx_scaled); my_min 
+my_max = max(df_ps$foldx_scaled); my_max
+
+if (my_min == -1 && my_max == 1){
+  cat("PASS: foldx ddg successfully scaled b/w -1 and 1"
+      , "\nProceeding with assigning foldx outcome category")
+}else{
+  cat("FAIL: could not scale foldx ddg values"
+      , "Aborting!")
+}
+
+#================================
+# adding foldx outcome category
+# ddg<0 = "Stabilising" (-ve)
+#=================================
+
+# counts of negative vs non-negative ddg, taken from the raw values
+c1 = table(df_ps$ddg < 0)
+df_ps$foldx_outcome = ifelse(df_ps$ddg < 0, "Stabilising", "Destabilising")
+# same counts recovered from the new column, so the check can actually fail
+c2 = table(df_ps$foldx_outcome == "Stabilising")
+if ( length(c1) == length(c2) && all(c1 == c2) ){
+  cat("PASS: foldx outcome successfully created")
+}else{
+  stop("FAIL: foldx outcome could not be created. Aborting!")
+}
+#=======================================================================
+# name tidying
+df_ps$mutation_info = as.factor(df_ps$mutation_info)
+df_ps$duet_outcome = as.factor(df_ps$duet_outcome)
+df_ps$foldx_outcome  = as.factor(df_ps$foldx_outcome)
+df_ps$ligand_outcome  = as.factor(df_ps$ligand_outcome)
+
+# check
+table(df_ps$mutation_info)
+
+ # further checks to make sure dr and other muts are indeed unique
+dr_muts = df_ps[df_ps$mutation_info == dr_muts_col,]
+dr_muts_names = unique(dr_muts$mutation)
+
+other_muts = df_ps[df_ps$mutation_info == other_muts_col,]
+other_muts_names = unique(other_muts$mutation)
+
+# no mutation name may appear in both sets, i.e. the intersection must be empty
+if ( length(intersect(dr_muts_names, other_muts_names)) == 0 ){
+  cat("PASS: dr and other muts are indeed unique")
+}else{
+  cat("FAIL: dr adn others muts are NOT unique!")
+  quit()
+}
+
+
+#%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT: LIG
+#%%%%%%%%%%%%%%%%%%%%
+
+df_lig = merged_df3_lig
+
+# name tidying
+df_lig$mutation_info = as.factor(df_lig$mutation_info)
+df_lig$duet_outcome = as.factor(df_lig$duet_outcome)
+#df_lig$ligand_outcome  = as.factor(df_lig$ligand_outcome)
+
+# check
+table(df_lig$mutation_info)
+
+#========================================================================
+#===========
+# Data: ps
+#===========
+# keep similar dtypes cols together
+cols_to_select_ps = c("mutationinformation", "mutation", "position", "mutation_info"
+                   , "duet_outcome"
+
+                   , "duet_scaled"
+                   , "ligand_distance"
+                   , "asa"
+                   , "rsa"
+                   , "rd_values"
+                   , "kd_values")
+
+df_wf_ps = df_ps[, cols_to_select_ps]
+
+pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps
+
+expected_rows_lf_ps = nrow(df_wf_ps) * (length(df_wf_ps) - length(pivot_cols_ps))
+expected_rows_lf_ps
+
+# LF data: duet
+df_lf_ps = gather(df_wf_ps, param_type, param_value, duet_scaled:kd_values, factor_key=TRUE)
+
+# long format must hold nrow(df_wf_ps) rows for every gathered (non-pivot) column
+if (nrow(df_lf_ps) == expected_rows_lf_ps){
+  cat("PASS: long format data created for duet")
+}else{
+  stop("FAIL: long format data could not be created for duet")
+}
+
+str(df_wf_ps)
+str(df_lf_ps)
+
+# assign pretty labels: param_type
+levels(df_lf_ps$param_type); table(df_lf_ps$param_type)
+
+ligand_dist_colname = paste0("Distance to ligand (", angstroms_symbol, ")")
+ligand_dist_colname
+
+duet_stability_name = paste0(delta_symbol, delta_symbol, "G")
+duet_stability_name
+  
+#levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="duet_scaled"] <- "Stability"
+levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="duet_scaled"] <- duet_stability_name
+#levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="ligand_distance"] <- "Ligand Distance"
+levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="ligand_distance"] <- ligand_dist_colname
+levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="asa"] <- "ASA"
+levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="rsa"] <- "RSA"
+levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="rd_values"] <- "RD"
+levels(df_lf_ps$param_type)[levels(df_lf_ps$param_type)=="kd_values"] <- "KD"
+# check
+levels(df_lf_ps$param_type); table(df_lf_ps$param_type)
+
+# assign pretty labels: mutation_info
+levels(df_lf_ps$mutation_info); table(df_lf_ps$mutation_info)
+sum(table(df_lf_ps$mutation_info)) == nrow(df_lf_ps)
+
+levels(df_lf_ps$mutation_info)[levels(df_lf_ps$mutation_info)==dr_muts_col] <- "DM"
+levels(df_lf_ps$mutation_info)[levels(df_lf_ps$mutation_info)==other_muts_col] <- "OM"
+# check
+levels(df_lf_ps$mutation_info); table(df_lf_ps$mutation_info)
+
+############################################################################
+
+#===========
+# LF data: LIG
+#===========
+# keep similar dtypes cols together
+cols_to_select_lig = c("mutationinformation", "mutation", "position", "mutation_info"
+                       , "ligand_outcome"
+                       
+                       , "affinity_scaled"
+                       #, "ligand_distance"
+                       , "asa"
+                       , "rsa"
+                       , "rd_values"
+                       , "kd_values")
+
+df_wf_lig = df_lig[, cols_to_select_lig]
+
+pivot_cols_lig = cols_to_select_lig[1:5]; pivot_cols_lig
+
+expected_rows_lf_lig = nrow(df_wf_lig) * (length(df_wf_lig) - length(pivot_cols_lig))
+expected_rows_lf_lig
+
+# LF data: lig
+df_lf_lig = gather(df_wf_lig, param_type, param_value, affinity_scaled:kd_values, factor_key=TRUE)
+
+# long format must hold nrow(df_wf_lig) rows for every gathered (non-pivot) column
+if (nrow(df_lf_lig) == expected_rows_lf_lig){
+  cat("PASS: long format data created for lig")
+}else{
+  stop("FAIL: long format data could not be created for lig")
+}
+
+# assign pretty labels: param_type
+levels(df_lf_lig$param_type); table(df_lf_lig$param_type)
+
+levels(df_lf_lig$param_type)[levels(df_lf_lig$param_type)=="affinity_scaled"] <- "Ligand Affinity"
+#levels(df_lf_lig$param_type)[levels(df_lf_lig$param_type)=="ligand_distance"] <- "Ligand Distance"
+levels(df_lf_lig$param_type)[levels(df_lf_lig$param_type)=="asa"] <- "ASA"
+levels(df_lf_lig$param_type)[levels(df_lf_lig$param_type)=="rsa"] <- "RSA"
+levels(df_lf_lig$param_type)[levels(df_lf_lig$param_type)=="rd_values"] <- "RD"
+levels(df_lf_lig$param_type)[levels(df_lf_lig$param_type)=="kd_values"] <- "KD"
+#check
+levels(df_lf_lig$param_type); table(df_lf_lig$param_type)
+
+# assign pretty labels: mutation_info
+levels(df_lf_lig$mutation_info); table(df_lf_lig$mutation_info)
+sum(table(df_lf_lig$mutation_info)) == nrow(df_lf_lig)
+
+levels(df_lf_lig$mutation_info)[levels(df_lf_lig$mutation_info)==dr_muts_col] <- "DM"
+levels(df_lf_lig$mutation_info)[levels(df_lf_lig$mutation_info)==other_muts_col] <- "OM"
+# check
+levels(df_lf_lig$mutation_info); table(df_lf_lig$mutation_info)
+
+#############################################################################
+#===========
+# Data: foldx
+#===========
+# keep similar dtypes cols together
+cols_to_select_foldx = c("mutationinformation", "mutation", "position", "mutation_info"
+                      , "foldx_outcome"
+                      
+                      , "foldx_scaled")
+                      #, "ligand_distance"
+                      #, "asa"
+                      #, "rsa"
+                      #, "rd_values"
+                      #, "kd_values")
+
+
+df_wf_foldx = df_ps[, cols_to_select_foldx]
+
+pivot_cols_foldx = cols_to_select_foldx[1:5]; pivot_cols_foldx
+  
+expected_rows_lf_foldx = nrow(df_wf_foldx) * (length(df_wf_foldx) - length(pivot_cols_foldx))
+expected_rows_lf_foldx
+
+# LF data: foldx
+df_lf_foldx = gather(df_wf_foldx, param_type, param_value, foldx_scaled, factor_key=TRUE)
+
+# long format must hold nrow(df_wf_foldx) rows for every gathered (non-pivot) column
+if (nrow(df_lf_foldx) == expected_rows_lf_foldx){
+  cat("PASS: long format data created for foldx")
+}else{
+  stop("FAIL: long format data could not be created for foldx")
+}
+
+foldx_stability_name = paste0(delta_symbol, delta_symbol, "G")
+foldx_stability_name
+
+# assign pretty labels: param type
+levels(df_lf_foldx$param_type); table(df_lf_foldx$param_type)
+
+#levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="foldx_scaled"] <- "Stability"
+levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="foldx_scaled"] <- foldx_stability_name
+#levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="ligand_distance"] <- "Ligand Distance"
+#levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="asa"] <- "ASA"
+#levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="rsa"] <- "RSA"
+#levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="rd_values"] <- "RD"
+#levels(df_lf_foldx$param_type)[levels(df_lf_foldx$param_type)=="kd_values"] <- "KD"
+# check
+levels(df_lf_foldx$param_type); table(df_lf_foldx$param_type)
+
+# assign pretty labels: mutation_info
+levels(df_lf_foldx$mutation_info); table(df_lf_foldx$mutation_info)
+sum(table(df_lf_foldx$mutation_info)) == nrow(df_lf_foldx)
+
+levels(df_lf_foldx$mutation_info)[levels(df_lf_foldx$mutation_info)==dr_muts_col] <- "DM"
+levels(df_lf_foldx$mutation_info)[levels(df_lf_foldx$mutation_info)==other_muts_col] <- "OM"
+# check
+levels(df_lf_foldx$mutation_info); table(df_lf_foldx$mutation_info)
+
+############################################################################
+
+# clear excess variables
+rm(cols_to_select_ps, cols_to_select_foldx, cols_to_select_lig
+   , pivot_cols_ps, pivot_cols_foldx, pivot_cols_lig
+  , expected_rows_lf_ps, expected_rows_lf_foldx, expected_rows_lf_lig
+  , my_max, my_min, na_count, na_count_df2, na_count_df3, dup_muts_nu
+  , c1, c2, n)

From 686fd0cd808df34a3ac728f0bdbbd62828f51a46 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 7 Sep 2021 11:16:41 +0100
Subject: [PATCH 13/51] updated running_plotting_scripts.R

---
 scripts/plotting/barplots_subcolours.R        |  2 +-
 scripts/plotting/basic_barplots_combined.R    |  0
 scripts/plotting/corr_adjusted_PS_LIG.R       |  0
 scripts/plotting/dirs.R                       |  0
 scripts/plotting/dist_plots_check.R           |  0
 scripts/plotting/extreme_muts.R               |  0
 scripts/plotting/get_plotting_dfs.R           |  8 +++---
 scripts/plotting/get_plotting_dfs_with_lig.R  |  0
 scripts/plotting/ggcorr_all_PS_LIG.R          |  0
 scripts/plotting/hist_af_or_base.R            |  0
 scripts/plotting/hist_af_or_combined.R        |  0
 scripts/plotting/legend_adjustment.R          |  0
 .../lineage_basic_barplots_combined.R         | 24 ++++++++++++++++-
 scripts/plotting/lineage_bp_data.R            |  0
 scripts/plotting/lineage_dist_combined_PS.R   |  0
 .../plotting/lineage_dist_dm_om_combined_PS.R |  0
 scripts/plotting/opp_mcsm_muts.R              |  0
 scripts/plotting/or_plots_combined.R          |  0
 scripts/plotting/other_plots_combined.R       |  0
 scripts/plotting/other_plots_data.R           |  0
 scripts/plotting/output_tables.R              |  0
 scripts/plotting/ps_plots_combined.R          |  0
 scripts/plotting/resolving_ambiguous_muts.R   |  0
 scripts/plotting/running_plotting_scripts.txt | 26 +++++++++++++++----
 24 files changed, 49 insertions(+), 11 deletions(-)
 mode change 100644 => 100755 scripts/plotting/basic_barplots_combined.R
 mode change 100644 => 100755 scripts/plotting/corr_adjusted_PS_LIG.R
 mode change 100644 => 100755 scripts/plotting/dirs.R
 mode change 100644 => 100755 scripts/plotting/dist_plots_check.R
 mode change 100644 => 100755 scripts/plotting/extreme_muts.R
 mode change 100644 => 100755 scripts/plotting/get_plotting_dfs.R
 mode change 100644 => 100755 scripts/plotting/get_plotting_dfs_with_lig.R
 mode change 100644 => 100755 scripts/plotting/ggcorr_all_PS_LIG.R
 mode change 100644 => 100755 scripts/plotting/hist_af_or_base.R
 mode change 100644 => 100755 scripts/plotting/hist_af_or_combined.R
 mode change 100644 => 100755 scripts/plotting/legend_adjustment.R
 mode change 100644 => 100755 scripts/plotting/lineage_basic_barplots_combined.R
 mode change 100644 => 100755 scripts/plotting/lineage_bp_data.R
 mode change 100644 => 100755 scripts/plotting/lineage_dist_combined_PS.R
 mode change 100644 => 100755 scripts/plotting/lineage_dist_dm_om_combined_PS.R
 mode change 100644 => 100755 scripts/plotting/opp_mcsm_muts.R
 mode change 100644 => 100755 scripts/plotting/or_plots_combined.R
 mode change 100644 => 100755 scripts/plotting/other_plots_combined.R
 mode change 100644 => 100755 scripts/plotting/other_plots_data.R
 mode change 100644 => 100755 scripts/plotting/output_tables.R
 mode change 100644 => 100755 scripts/plotting/ps_plots_combined.R
 mode change 100644 => 100755 scripts/plotting/resolving_ambiguous_muts.R

diff --git a/scripts/plotting/barplots_subcolours.R b/scripts/plotting/barplots_subcolours.R
index 4e4806a..1f98bec 100755
--- a/scripts/plotting/barplots_subcolours.R
+++ b/scripts/plotting/barplots_subcolours.R
@@ -124,4 +124,4 @@ print(outPlot_bp_lig)
 dev.off()
 ######################################################################=
 #                             End of script
-######################################################################=
\ No newline at end of file
+######################################################################=
diff --git a/scripts/plotting/basic_barplots_combined.R b/scripts/plotting/basic_barplots_combined.R
old mode 100644
new mode 100755
diff --git a/scripts/plotting/corr_adjusted_PS_LIG.R b/scripts/plotting/corr_adjusted_PS_LIG.R
old mode 100644
new mode 100755
diff --git a/scripts/plotting/dirs.R b/scripts/plotting/dirs.R
old mode 100644
new mode 100755
diff --git a/scripts/plotting/dist_plots_check.R b/scripts/plotting/dist_plots_check.R
old mode 100644
new mode 100755
diff --git a/scripts/plotting/extreme_muts.R b/scripts/plotting/extreme_muts.R
old mode 100644
new mode 100755
diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R
old mode 100644
new mode 100755
index 89b477c..5876d8d
--- a/scripts/plotting/get_plotting_dfs.R
+++ b/scripts/plotting/get_plotting_dfs.R
@@ -39,8 +39,8 @@ import_dirs(drug, gene)
 #---------------------------
 # call: plotting_data()
 #---------------------------
-if (!exists("infile_params") && exists("gene")){
-#if (!is.character(infile_params) && exists("gene")){ # when running as cmd
+#if (!exists("infile_params") && exists("gene")){
+if (!is.character(infile_params) && exists("gene")){ # when running as cmd
   #in_filename_params = paste0(tolower(gene), "_all_params.csv") #for pncA
   in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid
   infile_params = paste0(outdir, "/", in_filename_params)
@@ -67,8 +67,8 @@ cat("\nLigand distance cut off, colname:", LigDist_colname
 #--------------------------------
 # call: combining_dfs_plotting()
 #--------------------------------
-if (!exists("infile_metadata") && exists("gene")){
-#if (!is.character(infile_metadata) && exists("gene")){ # when running as cmd
+#if (!exists("infile_metadata") && exists("gene")){
+if (!is.character(infile_metadata) && exists("gene")){ # when running as cmd
   in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid
   infile_metadata = paste0(outdir, "/", in_filename_metadata)
   cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n")
diff --git a/scripts/plotting/get_plotting_dfs_with_lig.R b/scripts/plotting/get_plotting_dfs_with_lig.R
old mode 100644
new mode 100755
diff --git a/scripts/plotting/ggcorr_all_PS_LIG.R b/scripts/plotting/ggcorr_all_PS_LIG.R
old mode 100644
new mode 100755
diff --git a/scripts/plotting/hist_af_or_base.R b/scripts/plotting/hist_af_or_base.R
old mode 100644
new mode 100755
diff --git a/scripts/plotting/hist_af_or_combined.R b/scripts/plotting/hist_af_or_combined.R
old mode 100644
new mode 100755
diff --git a/scripts/plotting/legend_adjustment.R b/scripts/plotting/legend_adjustment.R
old mode 100644
new mode 100755
diff --git a/scripts/plotting/lineage_basic_barplots_combined.R b/scripts/plotting/lineage_basic_barplots_combined.R
old mode 100644
new mode 100755
index 4b63587..94bcd4a
--- a/scripts/plotting/lineage_basic_barplots_combined.R
+++ b/scripts/plotting/lineage_basic_barplots_combined.R
@@ -3,9 +3,10 @@ getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting/")
 getwd()
 #########################################################
-# TASK: Basic lineage barplot showing numbers
+# TASK: Basic lineage barplots showing numbers
 
 # Output: Basic barplot with lineage samples and mut count
+# + SNP diversity
 
 ##########################################################
 # 				Installing and loading required packages 			 
@@ -16,6 +17,27 @@ source("../functions/bp_lineage.R")
 #===========
 # input
 #===========
+#drug = 'streptomycin'
+#gene = 'gid'
+
+spec = matrix(c(
+    "drug"       , "d",  1, "character",
+    "gene"       , "g",  1, "character",
+    "data_file1" , "fa", 2, "character",
+    "data_file2" , "fb", 2, "character" 
+), byrow = TRUE, ncol = 4)
+
+opt = getopt(spec)
+
+drug            = opt$drug
+gene            = opt$gene
+infile_params   = opt$data_file1
+infile_metadata = opt$data_file2
+
+if(is.null(drug)|is.null(gene)) {
+    stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
+}
+
 source ('get_plotting_dfs.R')
 
 cat("Directories imported:"
diff --git a/scripts/plotting/lineage_bp_data.R b/scripts/plotting/lineage_bp_data.R
old mode 100644
new mode 100755
diff --git a/scripts/plotting/lineage_dist_combined_PS.R b/scripts/plotting/lineage_dist_combined_PS.R
old mode 100644
new mode 100755
diff --git a/scripts/plotting/lineage_dist_dm_om_combined_PS.R b/scripts/plotting/lineage_dist_dm_om_combined_PS.R
old mode 100644
new mode 100755
diff --git a/scripts/plotting/opp_mcsm_muts.R b/scripts/plotting/opp_mcsm_muts.R
old mode 100644
new mode 100755
diff --git a/scripts/plotting/or_plots_combined.R b/scripts/plotting/or_plots_combined.R
old mode 100644
new mode 100755
diff --git a/scripts/plotting/other_plots_combined.R b/scripts/plotting/other_plots_combined.R
old mode 100644
new mode 100755
diff --git a/scripts/plotting/other_plots_data.R b/scripts/plotting/other_plots_data.R
old mode 100644
new mode 100755
diff --git a/scripts/plotting/output_tables.R b/scripts/plotting/output_tables.R
old mode 100644
new mode 100755
diff --git a/scripts/plotting/ps_plots_combined.R b/scripts/plotting/ps_plots_combined.R
old mode 100644
new mode 100755
diff --git a/scripts/plotting/resolving_ambiguous_muts.R b/scripts/plotting/resolving_ambiguous_muts.R
old mode 100644
new mode 100755
diff --git a/scripts/plotting/running_plotting_scripts.txt b/scripts/plotting/running_plotting_scripts.txt
index 332293e..74f67de 100644
--- a/scripts/plotting/running_plotting_scripts.txt
+++ b/scripts/plotting/running_plotting_scripts.txt
@@ -113,12 +113,28 @@ note:
  - fb flag has default if not supplied
  
  
-#===================
-# Add LINEAGE ONE
-#=================== 
-# Lineage_bp.R
-creates Count and Diversity plot
+#====================================
+# lineage_basic_barplots_combined.R
+#====================================
+#-----------------------------------------------------------------------
+./lineage_basic_barplots_combined.R -d streptomycin -g gid
+#-----------------------------------------------------------------------
 
+It replaces
+ ## lineage_basic_barplot.R
+These have been moved to redundant/
+
+
+sources:
+ ## get_plotting_dfs.R
+ ## functions/bp_lineage.R
+
+ outputs: 1 svg in the plotdir
+ ## basic_lineage_barplots_combined.svg
+ 
+note: 
+ - fa flag has default if not supplied 
+ - fb flag has default if not supplied
  
 ########################################################################
 # TODO

From 2ee66c770bbf36cc571e5037895f4c8e09c74e58 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 7 Sep 2021 11:18:10 +0100
Subject: [PATCH 14/51] updated notes

---
 scripts/plotting/running_plotting_scripts.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/plotting/running_plotting_scripts.txt b/scripts/plotting/running_plotting_scripts.txt
index 74f67de..369bc4b 100644
--- a/scripts/plotting/running_plotting_scripts.txt
+++ b/scripts/plotting/running_plotting_scripts.txt
@@ -120,7 +120,7 @@ note:
 ./lineage_basic_barplots_combined.R -d streptomycin -g gid
 #-----------------------------------------------------------------------
 
-It replaces
+It replaces (and has an added diversity plot)
  ## lineage_basic_barplot.R
 These have been moved to redundant/
 

From 03031d2eb6ddd2d27cc142375bc3be90d2f735da Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 9 Sep 2021 13:12:07 +0100
Subject: [PATCH 15/51] moved all test scripts for functions to tests/

---
 .../functions/{ => tests}/test_aa_prop_bp.R   |   0
 .../functions/{ => tests}/test_af_or_calcs.R  |   0
 scripts/functions/{ => tests}/test_bp.R       |   0
 .../functions/{ => tests}/test_bp_lineage.R   |   0
 .../{ => tests}/test_combining_dfs_plotting.R |   0
 scripts/functions/{ => tests}/test_lf_bp.R    |   0
 .../{ => tests}/test_lf_unpaired_stats.R      |   0
 scripts/functions/tests/test_lineage_dist.R   |  32 ++
 .../{ => tests}/test_plotting_data.R          |   0
 scripts/plotting/Header_TT.R                  |  40 +-
 scripts/plotting/get_plotting_dfs.R           |   8 +-
 .../lineage_basic_barplots_combined.R         |  39 +-
 scripts/plotting/lineage_bp_data.R            | 129 +++---
 scripts/plotting/lineage_dist_combined_PS.R   | 303 --------------
 .../plotting/lineage_dist_dm_om_combined_PS.R | 387 ------------------
 15 files changed, 162 insertions(+), 776 deletions(-)
 rename scripts/functions/{ => tests}/test_aa_prop_bp.R (100%)
 rename scripts/functions/{ => tests}/test_af_or_calcs.R (100%)
 rename scripts/functions/{ => tests}/test_bp.R (100%)
 rename scripts/functions/{ => tests}/test_bp_lineage.R (100%)
 rename scripts/functions/{ => tests}/test_combining_dfs_plotting.R (100%)
 rename scripts/functions/{ => tests}/test_lf_bp.R (100%)
 rename scripts/functions/{ => tests}/test_lf_unpaired_stats.R (100%)
 create mode 100644 scripts/functions/tests/test_lineage_dist.R
 rename scripts/functions/{ => tests}/test_plotting_data.R (100%)
 delete mode 100755 scripts/plotting/lineage_dist_combined_PS.R
 delete mode 100755 scripts/plotting/lineage_dist_dm_om_combined_PS.R

diff --git a/scripts/functions/test_aa_prop_bp.R b/scripts/functions/tests/test_aa_prop_bp.R
similarity index 100%
rename from scripts/functions/test_aa_prop_bp.R
rename to scripts/functions/tests/test_aa_prop_bp.R
diff --git a/scripts/functions/test_af_or_calcs.R b/scripts/functions/tests/test_af_or_calcs.R
similarity index 100%
rename from scripts/functions/test_af_or_calcs.R
rename to scripts/functions/tests/test_af_or_calcs.R
diff --git a/scripts/functions/test_bp.R b/scripts/functions/tests/test_bp.R
similarity index 100%
rename from scripts/functions/test_bp.R
rename to scripts/functions/tests/test_bp.R
diff --git a/scripts/functions/test_bp_lineage.R b/scripts/functions/tests/test_bp_lineage.R
similarity index 100%
rename from scripts/functions/test_bp_lineage.R
rename to scripts/functions/tests/test_bp_lineage.R
diff --git a/scripts/functions/test_combining_dfs_plotting.R b/scripts/functions/tests/test_combining_dfs_plotting.R
similarity index 100%
rename from scripts/functions/test_combining_dfs_plotting.R
rename to scripts/functions/tests/test_combining_dfs_plotting.R
diff --git a/scripts/functions/test_lf_bp.R b/scripts/functions/tests/test_lf_bp.R
similarity index 100%
rename from scripts/functions/test_lf_bp.R
rename to scripts/functions/tests/test_lf_bp.R
diff --git a/scripts/functions/test_lf_unpaired_stats.R b/scripts/functions/tests/test_lf_unpaired_stats.R
similarity index 100%
rename from scripts/functions/test_lf_unpaired_stats.R
rename to scripts/functions/tests/test_lf_unpaired_stats.R
diff --git a/scripts/functions/tests/test_lineage_dist.R b/scripts/functions/tests/test_lineage_dist.R
new file mode 100644
index 0000000..1f40d16
--- /dev/null
+++ b/scripts/functions/tests/test_lineage_dist.R
@@ -0,0 +1,32 @@
+###############################
+# TEST function lineage_dist.R
+# to plot lineage
+# dist plots with or without facet
+##############################
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
+
+source("Header_TT.R")
+
+source("get_plotting_dfs.R")
+
+cat("cols imported:"
+    , mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2)
+
+
+#############################################################
+
+lineage_distP(lin_dist_plot
+              , with_facet = F
+              , leg_label = "Mutation Class"
+)
+
+lineage_distP(lin_dist_plot
+              , with_facet = T
+              , facet_wrap_var = "mutation_info_labels"
+              , leg_label = "Mutation Class"
+              , leg_pos_wf = "none"
+              , leg_dir_wf = "horizontal"
+              
+)
\ No newline at end of file
diff --git a/scripts/functions/test_plotting_data.R b/scripts/functions/tests/test_plotting_data.R
similarity index 100%
rename from scripts/functions/test_plotting_data.R
rename to scripts/functions/tests/test_plotting_data.R
diff --git a/scripts/plotting/Header_TT.R b/scripts/plotting/Header_TT.R
index e4593d0..47599d3 100755
--- a/scripts/plotting/Header_TT.R
+++ b/scripts/plotting/Header_TT.R
@@ -1,8 +1,12 @@
 #########################################################
-### A) Installing and loading required packages
+# A) Installing and loading required packages
+# B) My functions
+#########################################################
+
 #########################################################
 #lib_loc = "/usr/local/lib/R/site-library")
 
+
 require("getopt", quietly = TRUE) # cmd parse arguments
 
 if (!require("tidyverse")) {
@@ -10,6 +14,21 @@ if (!require("tidyverse")) {
   library(tidyverse)
 }
 
+if (!require("shiny")) {
+  install.packages("shiny", dependencies = TRUE)
+  library(shiny)
+}
+
+if (!require("gridExtra")) {
+  install.packages("gridExtra", dependencies = TRUE)
+  library(gridExtra)
+}
+
+if (!require("ggridges")) {
+  install.packages("ggridges", dependencies = TRUE)
+  library(ggridges)
+}
+
 # if (!require("ggplot2")) {
 #   install.packages("ggplot2", dependencies = TRUE)
 #   library(ggplot2)
@@ -20,6 +39,11 @@ if (!require("tidyverse")) {
 #   library(dplyr)
 # }
 
+if (!require ("plyr")){
+   install.packages("plyr")
+   library(plyr)
+ }
+
 # Install
 #if(!require(devtools)) install.packages("devtools")
 #devtools::install_github("kassambara/ggcorrplot")
@@ -140,4 +164,16 @@ if(!require(protr)){
 #  install.packages("BiocManager")
 
 #BiocManager::install("Logolas")
-library("Logolas")
\ No newline at end of file
+library("Logolas")
+
+
+####################################
+# Load all my functions:
+# only works if tidyverse is loaded
+# hence included it here!
+####################################
+
+func_path = "~/git/LSHTM_analysis/scripts/functions/"
+source_files <- list.files(func_path, "\\.R$")  # locate all .R files
+map(paste0(func_path, source_files), source)  # source all your R scripts!
+
diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R
index 5876d8d..89b477c 100755
--- a/scripts/plotting/get_plotting_dfs.R
+++ b/scripts/plotting/get_plotting_dfs.R
@@ -39,8 +39,8 @@ import_dirs(drug, gene)
 #---------------------------
 # call: plotting_data()
 #---------------------------
-#if (!exists("infile_params") && exists("gene")){
-if (!is.character(infile_params) && exists("gene")){ # when running as cmd
+if (!exists("infile_params") && exists("gene")){
+#if (!is.character(infile_params) && exists("gene")){ # when running as cmd
   #in_filename_params = paste0(tolower(gene), "_all_params.csv") #for pncA
   in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid
   infile_params = paste0(outdir, "/", in_filename_params)
@@ -67,8 +67,8 @@ cat("\nLigand distance cut off, colname:", LigDist_colname
 #--------------------------------
 # call: combining_dfs_plotting()
 #--------------------------------
-#if (!exists("infile_metadata") && exists("gene")){
-if (!is.character(infile_metadata) && exists("gene")){ # when running as cmd
+if (!exists("infile_metadata") && exists("gene")){
+#if (!is.character(infile_metadata) && exists("gene")){ # when running as cmd
   in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid
   infile_metadata = paste0(outdir, "/", in_filename_metadata)
   cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n")
diff --git a/scripts/plotting/lineage_basic_barplots_combined.R b/scripts/plotting/lineage_basic_barplots_combined.R
index 94bcd4a..b6f25e6 100755
--- a/scripts/plotting/lineage_basic_barplots_combined.R
+++ b/scripts/plotting/lineage_basic_barplots_combined.R
@@ -12,7 +12,6 @@ getwd()
 # 				Installing and loading required packages 			 
 ##########################################################
 source("Header_TT.R")
-source("../functions/bp_lineage.R")
 
 #===========
 # input
@@ -40,24 +39,6 @@ if(is.null(drug)|is.null(gene)) {
 
 source ('get_plotting_dfs.R')
 
-cat("Directories imported:"
-    , "\n===================="
-    , "\ndatadir:", datadir
-    , "\nindir:", indir
-    , "\noutdir:", outdir
-    , "\nplotdir:", plotdir)
-
-cat("Variables imported:"
-    , "\n====================="
-    , "\ndrug:", drug
-    , "\ngene:", gene
-    , "\ngene_match:", gene_match
-    , "\nAngstrom symbol:", angstroms_symbol
-    #, "\nNo. of duplicated muts:", dup_muts_nu
-    , "\ndr_muts_col:", dr_muts_col
-    , "\nother_muts_col:", other_muts_col
-    , "\ndrtype_col:", resistance_col)
-
 #=======
 # output
 #=======
@@ -74,21 +55,21 @@ plot_basic_bp_lineage_cl
 # Data: All lineages or
 # selected few
 #------------------------
-sel_lineages = levels(lin_lf$sel_lineages_f)[1:4]
+sel_lineages = levels(lin_lf$sel_lineages)[1:4]
 sel_lineages
-lin_lf_plot = lin_lf[lin_lf$sel_lineages_f%in%sel_lineages,]
+lin_lf_plot = lin_lf[lin_lf$sel_lineages%in%sel_lineages,]
 str(lin_lf_plot)
 
 # drop unused factor levels
-lin_lf_plot$sel_lineages_f = factor(lin_lf_plot$sel_lineages_f)
-levels(lin_lf_plot$sel_lineages_f)
+lin_lf_plot$sel_lineages = factor(lin_lf_plot$sel_lineages)
+levels(lin_lf_plot$sel_lineages)
 str(lin_lf_plot)
 
 #------------------------
 # plot from my function:
 #------------------------
 lin_countP = lin_count_bp(lin_lf_plot
-             , x_categ = "sel_lineages_f"
+             , x_categ = "sel_lineages"
              , y_count = "p_count"
              , bar_fill_categ = "count_categ"
              , display_label_col = "p_count"
@@ -109,21 +90,21 @@ lin_countP
 # Data: All lineages or
 # selected few
 #------------------------
-sel_lineages = levels(lin_wf$sel_lineages_f)[1:4]
+sel_lineages = levels(lin_wf$sel_lineages)[1:4]
 sel_lineages
-lin_wf_plot = lin_wf[lin_wf$sel_lineages_f%in%sel_lineages,]
+lin_wf_plot = lin_wf[lin_wf$sel_lineages%in%sel_lineages,]
 str(lin_wf_plot)
 
 # drop unused factor levels
-lin_wf_plot$sel_lineages_f = factor(lin_wf_plot$sel_lineages_f)
-levels(lin_wf_plot$sel_lineages_f)
+lin_wf_plot$sel_lineages = factor(lin_wf_plot$sel_lineages)
+levels(lin_wf_plot$sel_lineages)
 str(lin_wf_plot)
 
 #------------------------
 # plot from my function:
 #------------------------
 lin_diversityP = lin_count_bp(lin_wf_plot
-             , x_categ = "sel_lineages_f"
+             , x_categ = "sel_lineages"
              , y_count = "snp_diversity"
              , display_label_col = "snp_diversity_f"
              , bar_stat_stype = "identity"
diff --git a/scripts/plotting/lineage_bp_data.R b/scripts/plotting/lineage_bp_data.R
index 2d51ab0..e9ab929 100755
--- a/scripts/plotting/lineage_bp_data.R
+++ b/scripts/plotting/lineage_bp_data.R
@@ -27,13 +27,44 @@ cat("\nMissing samples with lineage classification:", table(merged_df2$lineage =
   
 }
 
+# Add pretty lineage labels and mut_info_labels
+class(merged_df2$lineage); table(merged_df2$lineage)
+merged_df2$lineage_labels =  gsub("lineage", "L", merged_df2$lineage)
+table(merged_df2$lineage_labels)
+
+class(merged_df2$lineage_labels)
+
+merged_df2$lineage_labels =  factor(merged_df2$lineage_labels, c("L1"
+                                                         , "L2"
+                                                         , "L3"
+                                                         , "L4"
+                                                         , "L5"
+                                                         , "L6"
+                                                         , "L7"
+                                                         , "LBOV"
+                                                         , "L1;L2"
+                                                         , "L1;L3"
+                                                         , "L1;L4"
+                                                         , "L2;L3"
+                                                         , "L2;L3;L4"
+                                                         , "L2;L4"  
+                                                         , "L2;L6"
+                                                         , "L2;LBOV"  
+                                                         , "L3;L4" 
+                                                         , "L4;L6"
+                                                         , "L4;L7"
+                                                         , ""))
+
+class(merged_df2$lineage_labels); nlevels(merged_df2$lineage_labels)
+
+
 ##################################
 # WF data: lineages with 
 # snp count
 # total_samples
 # snp diversity (perc)
 ##################################
-sel_lineages = levels(as.factor(merged_df2$lineage))
+sel_lineages = levels(merged_df2$lineage_labels)
 
 lin_wf = data.frame(sel_lineages) #4, 1
 total_snps_u = NULL
@@ -41,12 +72,12 @@ total_samples = NULL
 
 for (i in sel_lineages){
   #print(i)
-  curr_total = length(unique(merged_df2$id)[merged_df2$lineage==i])
+  curr_total = length(unique(merged_df2$id)[merged_df2$lineage_labels==i])
   #print(curr_total)
   total_samples = c(total_samples, curr_total)
   print(total_samples)
 
-  foo = merged_df2[merged_df2$lineage==i,]
+  foo = merged_df2[merged_df2$lineage_labels==i,]
   print(paste0(i, "=======\n"))
   print(length(unique(foo$mutationinformation)))
   curr_count = length(unique(foo$mutationinformation))
@@ -70,33 +101,29 @@ lin_wf
 lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0)
 lin_wf$snp_diversity_f = paste0(lin_wf$snp_diversity_f, "%")
 
-# Lineage names
-lin_wf$sel_lineages_f = gsub("lineage", "L", lin_wf$sel_lineages)
-lin_wf
+# Important: Check factors so that x-axis categ appear as you want
+lin_wf$sel_lineages =  factor(lin_wf$sel_lineages, c("L1"
+                                                     , "L2"
+                                                     , "L3"
+                                                     , "L4"
+                                                     , "L5"
+                                                     , "L6"
+                                                     , "L7"
+                                                     , "LBOV"
+                                                     , "L1;L2"
+                                                     , "L1;L3"
+                                                     , "L1;L4"
+                                                     , "L2;L3"
+                                                     , "L2;L3;L4"
+                                                     , "L2;L4"  
+                                                     , "L2;L6"
+                                                     , "L2;LBOV"  
+                                                     , "L3;L4" 
+                                                     , "L4;L6"
+                                                     , "L4;L7"
+                                                     , ""))
 
-# Important: Relevel factors so that x-axis categ appear as you want
-lin_wf$sel_lineages_f =  factor(lin_wf$sel_lineages_f, c("L1"
-                                                         , "L2"
-                                                         , "L3"
-                                                         , "L4"
-                                                         , "L5"
-                                                         , "L6"
-                                                         , "L7"
-                                                         , "LBOV"
-                                                         , "L1;L2"
-                                                         , "L1;L3"
-                                                         , "L1;L4"
-                                                         , "L2;L3"
-                                                         , "L2;L3;L4"
-                                                         , "L2;L4"  
-                                                         , "L2;L6"
-                                                         , "L2;LBOV"  
-                                                         , "L3;L4" 
-                                                         , "L4;L6"
-                                                         , "L4;L7"
-                                                         , ""))
-
-levels(lin_wf$sel_lineages_f)
+levels(lin_wf$sel_lineages)
 
 ##################################
 # LF data: lineages with 
@@ -106,7 +133,7 @@ levels(lin_wf$sel_lineages_f)
 ##################################
 names(lin_wf)
 tot_cols = ncol(lin_wf)
-pivot_cols = c("sel_lineages", "sel_lineages_f", "snp_diversity", "snp_diversity_f")
+pivot_cols = c("sel_lineages", "snp_diversity", "snp_diversity_f")
 pivot_cols_n = length(pivot_cols)
 
 expected_rows =  nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n )
@@ -129,25 +156,25 @@ if ( nrow(lin_lf)  ==  expected_rows ){
 }
 
 # Important: Relevel factors so that x-axis categ appear as you want
-lin_lf$sel_lineages_f =  factor(lin_lf$sel_lineages_f, c("L1"
-                                                         , "L2"
-                                                         , "L3"
-                                                         , "L4"
-                                                         , "L5"
-                                                         , "L6"
-                                                         , "L7"
-                                                         , "LBOV"
-                                                         , "L1;L2"
-                                                         , "L1;L3"
-                                                         , "L1;L4"
-                                                         , "L2;L3"
-                                                         , "L2;L3;L4"
-                                                         , "L2;L4"  
-                                                         , "L2;L6"
-                                                         , "L2;LBOV"  
-                                                         , "L3;L4" 
-                                                         , "L4;L6"
-                                                         , "L4;L7"
-                                                         , ""))
+lin_lf$sel_lineages =  factor(lin_lf$sel_lineages, c("L1"
+                                                     , "L2"
+                                                     , "L3"
+                                                     , "L4"
+                                                     , "L5"
+                                                     , "L6"
+                                                     , "L7"
+                                                     , "LBOV"
+                                                     , "L1;L2"
+                                                     , "L1;L3"
+                                                     , "L1;L4"
+                                                     , "L2;L3"
+                                                     , "L2;L3;L4"
+                                                     , "L2;L4"  
+                                                     , "L2;L6"
+                                                     , "L2;LBOV"  
+                                                     , "L3;L4" 
+                                                     , "L4;L6"
+                                                     , "L4;L7"
+                                                     , ""))
 
-levels(lin_lf$sel_lineages_f)
+levels(lin_lf$sel_lineages)
diff --git a/scripts/plotting/lineage_dist_combined_PS.R b/scripts/plotting/lineage_dist_combined_PS.R
deleted file mode 100755
index bf1c75b..0000000
--- a/scripts/plotting/lineage_dist_combined_PS.R
+++ /dev/null
@@ -1,303 +0,0 @@
-#!/usr/bin/env Rscript  
-#########################################################
-# TASK: Lineage dist plots: ggridges
-
-# Output: 2 SVGs for PS stability
-
-# 1) all muts
-# 2) dr_muts
-
-##########################################################
-# 				Installing and loading required packages 			 
-##########################################################
-getwd()
-setwd("~/git/LSHTM_analysis/scripts/plotting/")
-getwd()
-
-source("Header_TT.R")
-library(ggridges)
-source("combining_dfs_plotting.R")
-# PS combined: 
-# 1) merged_df2
-# 2) merged_df2_comp
-# 3) merged_df3
-# 4) merged_df3_comp
-
-# LIG combined: 
-# 5) merged_df2_lig
-# 6) merged_df2_comp_lig
-# 7) merged_df3_lig
-# 8) merged_df3_comp_lig
-
-# 9) my_df_u
-# 10) my_df_u_lig
-
-cat("Directories imported:"
-    , "\n===================="
-    , "\ndatadir:", datadir
-    , "\nindir:", indir
-    , "\noutdir:", outdir
-    , "\nplotdir:", plotdir)
-
-cat("Variables imported:"
-    , "\n====================="
-    , "\ndrug:", drug
-    , "\ngene:", gene
-    , "\ngene_match:", gene_match
-    , "\nAngstrom symbol:", angstroms_symbol
-    , "\nNo. of duplicated muts:", dup_muts_nu
-    , "\nNA count for ORs:", na_count
-    , "\nNA count in df2:", na_count_df2
-    , "\nNA count in df3:", na_count_df3
-    , "\ndr_muts_col:", dr_muts_col
-    , "\nother_muts_col:", other_muts_col
-    , "\ndrtype_col:", resistance_col)
-
-#=======
-# output
-#=======
-lineage_dist_combined = "lineage_dist_combined_PS.svg"
-plot_lineage_dist_combined  =  paste0(plotdir,"/", lineage_dist_combined)
-#========================================================================
-
-###########################
-# Data for plots
-# you need merged_df2 or merged_df2_comp
-# since this is one-many relationship 
-# i.e the same SNP can belong to multiple lineages
-# using the _comp dataset means
-# we lose some muts and at this level, we should use
-# as much info as available, hence use df with NA
-###########################
-# REASSIGNMENT
-my_df  = merged_df2
-
-# delete variables not required
-rm(my_df_u, merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
-
-# quick checks
-colnames(my_df)
-str(my_df)
-
-# Ensure correct data type in columns to plot: need to be factor
-is.factor(my_df$lineage)
-my_df$lineage = as.factor(my_df$lineage)
-is.factor(my_df$lineage)
-
-table(my_df$mutation_info)
-
-# subset df with dr muts only
-my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") 
-table(my_df_dr$mutation_info)
-
-########################################################################
-#               end of data extraction and cleaning for plots          #
-########################################################################
-
-#==========================
-# Plot 1: ALL Muts
-# x = mcsm_values, y = dist
-# fill = stability
-#============================
-
-my_plot_name = 'lineage_dist_PS.svg'
-
-plot_lineage_duet  =  paste0(plotdir,"/", my_plot_name)
-
-#===================
-# Data for plots
-#===================
-table(my_df$lineage); str(my_df$lineage)
-
-# subset only lineages1-4
-sel_lineages = c("lineage1"
-                 , "lineage2"
-                 , "lineage3"
-                 , "lineage4"
-                 #, "lineage5"
-                 #, "lineage6"
-                 #, "lineage7"
-                 )
-
-# uncomment as necessary
-df_lin = subset(my_df, subset = lineage %in% sel_lineages )
-table(df_lin$lineage)
-
-# refactor
-df_lin$lineage = factor(df_lin$lineage)
-
-sum(table(df_lin$lineage)) #{RESULT: Total number of samples for lineage}
-
-table(df_lin$lineage)#{RESULT: No of samples within lineage}
-
-length(unique(df_lin$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to}
-
-length(df_lin$mutationinformation)
-
-u2 = unique(my_df$mutationinformation)
-u = unique(df_lin$mutationinformation)
-check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages}
-
-#%%%%%%%%%%%%%%%%%%%%%%%%%
-# REASSIGNMENT
-df <- df_lin
-#%%%%%%%%%%%%%%%%%%%%%%%%%
-
-rm(df_lin)
-
-#******************
-# generate distribution plot of lineages
-#******************
-# 2 : ggridges (good!)
-my_ats = 15 # axis text size
-my_als = 20 # axis label size
-
-my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4'
-              #, 'Lineage 5', 'Lineage 6', 'Lineage 7'
-              )
-names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4'
-                    # , 'lineage5', 'lineage6', 'lineage7'
-                     )
-# check plot name
-plot_lineage_duet
-
-# output svg
-#svg(plot_lineage_duet)
-p1 = ggplot(df, aes(x = duet_scaled
-                            , y = duet_outcome))+
-  
-  #printFile=geom_density_ridges_gradient(
-  geom_density_ridges_gradient(aes(fill = ..x..)
-                               #, jittered_points = TRUE
-                                , scale = 3
-                                , size = 0.3 ) +
-  facet_wrap( ~lineage
-              , scales = "free"
-             #, switch = 'x'
-              , labeller = labeller(lineage = my_labels) ) +
-  coord_cartesian( xlim = c(-1, 1)) +
-  scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
-                        , name = "DUET" ) + 
-  theme(axis.text.x = element_text(size = my_ats
-                                     , angle = 90
-                                     , hjust = 1
-                                     , vjust = 0.4)
-
-         , axis.text.y = element_blank()
-         , axis.title.x = element_blank()
-         , axis.title.y = element_blank()
-         , axis.ticks.y = element_blank()
-         , plot.title = element_blank()
-         , strip.text = element_text(size = my_als)
-         , legend.text = element_text(size = my_als-5)
-         , legend.title = element_text(size = my_als)
-) 
-
-print(p1)
-#dev.off()
-
-#######################################################################
-# lineage distribution plot for dr_muts
-#######################################################################
-
-#==========================
-# Plot 2: dr muts ONLY
-# x = mcsm_values, y = dist
-# fill = stability
-#============================
-
-my_plot_name_dr = 'lineage_dist_dr_muts_PS.svg'
-
-plot_lineage_dr_duet  =  paste0(plotdir,"/", my_plot_name_dr)
-
-#===================
-# Data for plots
-#===================
-table(my_df_dr$lineage); str(my_df_dr$lineage)
-
-# uncomment as necessary
-df_lin_dr = subset(my_df_dr, subset = lineage %in% sel_lineages)
-table(df_lin_dr$lineage)
-
-# refactor
-df_lin_dr$lineage = factor(df_lin_dr$lineage)
-
-sum(table(df_lin_dr$lineage)) #{RESULT: Total number of samples for lineage}
-
-table(df_lin_dr$lineage)#{RESULT: No of samples within lineage}
-
-length(unique(df_lin_dr$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to}
-
-length(df_lin_dr$mutationinformation)
-
-u2 = unique(my_df_dr$mutationinformation)
-u = unique(df_lin_dr$mutationinformation)
-check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages}
-
-#%%%%%%%%%%%%%%%%%%%%%%%%%
-# REASSIGNMENT
-df_dr <- df_lin_dr
-#%%%%%%%%%%%%%%%%%%%%%%%%%
-
-rm(df_lin_dr)
-
-#******************
-# generate distribution plot of lineages
-#******************
-# 2 : ggridges (good!)
-my_ats = 15 # axis text size
-my_als = 20 # axis label size
-
-
-# check plot name
-plot_lineage_dr_duet
-
-# output svg
-#svg(plot_lineage_dr_duet)
-p2 = ggplot(df_dr, aes(x = duet_scaled
-                       , y = duet_outcome))+
-
-  geom_density_ridges_gradient(aes(fill = ..x..)
-                               #, jittered_points = TRUE
-                               , scale = 3
-                               , size = 0.3) +
-  #geom_point(aes(size = or_mychisq))+
-  facet_wrap( ~lineage
-              , scales = "free"
-              #, switch = 'x'
-              , labeller = labeller(lineage = my_labels) ) +
-  coord_cartesian( xlim = c(-1, 1)
-                   #, ylim = c(0, 6)
-                   #, clip = "off" 
-  ) +
-  scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
-                       , name = "DUET" ) + 
-  theme(axis.text.x = element_text(size = my_ats
-                                   , angle = 90
-                                   , hjust = 1
-                                   , vjust = 0.4)
-        , axis.text.y = element_blank()
-        , axis.title.x = element_blank()
-        , axis.title.y = element_blank()
-        , axis.ticks.y = element_blank()
-        , plot.title = element_blank()
-        , strip.text = element_text(size = my_als)
-        , legend.text = element_text(size = 10)
-        , legend.title = element_text(size = my_als)
-        #, legend.position = "none"
-  ) 
-
-print(p2)
-#dev.off()
-########################################################################
-#==============
-# combine plot
-#===============
-
-svg(plot_lineage_dist_combined, width = 12, height = 6)
-
-printFile = cowplot::plot_grid(p1, p2
-                               , label_size = my_als+10)
-
-print(printFile)
-dev.off()
diff --git a/scripts/plotting/lineage_dist_dm_om_combined_PS.R b/scripts/plotting/lineage_dist_dm_om_combined_PS.R
deleted file mode 100755
index 07912ac..0000000
--- a/scripts/plotting/lineage_dist_dm_om_combined_PS.R
+++ /dev/null
@@ -1,387 +0,0 @@
-#!/usr/bin/env Rscript  
-#########################################################
-# TASK: Lineage dist plots: ggridges
-
-# Output: 2 SVGs for PS stability
-
-# 1) all muts
-# 2) dr_muts
-
-##########################################################
-# 				Installing and loading required packages 			 
-##########################################################
-getwd()
-setwd("~/git/LSHTM_analysis/scripts/plotting/")
-getwd()
-
-source("Header_TT.R")
-library(ggridges)
-library(plyr)
-source("combining_dfs_plotting.R")
-# PS combined: 
-# 1) merged_df2
-# 2) merged_df2_comp
-# 3) merged_df3
-# 4) merged_df3_comp
-
-# LIG combined: 
-# 5) merged_df2_lig
-# 6) merged_df2_comp_lig
-# 7) merged_df3_lig
-# 8) merged_df3_comp_lig
-
-# 9) my_df_u
-# 10) my_df_u_lig
-
-cat("Directories imported:"
-    , "\n===================="
-    , "\ndatadir:", datadir
-    , "\nindir:", indir
-    , "\noutdir:", outdir
-    , "\nplotdir:", plotdir)
-
-cat("Variables imported:"
-    , "\n====================="
-    , "\ndrug:", drug
-    , "\ngene:", gene
-    , "\ngene_match:", gene_match
-    , "\nAngstrom symbol:", angstroms_symbol
-    , "\nNo. of duplicated muts:", dup_muts_nu
-    , "\nNA count for ORs:", na_count
-    , "\nNA count in df2:", na_count_df2
-    , "\nNA count in df3:", na_count_df3
-    , "\ndr_muts_col:", dr_muts_col
-    , "\nother_muts_col:", other_muts_col
-    , "\ndrtype_col:", resistance_col)
-
-cat("cols imported:"
-    , mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2)
-
-#=======
-# output
-#=======
-lineage_dist_combined_dm_om = "lineage_dist_combined_dm_om_PS.svg"
-plot_lineage_dist_combined_dm_om  =  paste0(plotdir,"/", lineage_dist_combined_dm_om)
-
-lineage_dist_combined_dm_om_L = "lineage_dist_combined_dm_om_PS_labelled.svg"
-plot_lineage_dist_combined_dm_om_L  =  paste0(plotdir,"/", lineage_dist_combined_dm_om_L)
-
-#========================================================================
-
-###########################
-# Data for plots
-# you need merged_df2 or merged_df2_comp
-# since this is one-many relationship 
-# i.e the same SNP can belong to multiple lineages
-# using the _comp dataset means
-# we lose some muts and at this level, we should use
-# as much info as available, hence use df with NA
-###########################
-# REASSIGNMENT
-my_df  = merged_df2
-
-# delete variables not required
-rm(my_df_u, merged_df2, merged_df2_comp, merged_df3, merged_df3_comp
-   , merged_df2_lig, merged_df2_comp_lig, merged_df3_lig, merged_df3_comp_lig)
-
-# quick checks
-colnames(my_df)
-str(my_df)
-
-table(my_df$mutation_info)
-
-#===================
-# Data for plots
-#===================
-table(my_df$lineage); str(my_df$lineage)
-
-# select lineages 1-4
-sel_lineages = c("lineage1"
-                 , "lineage2"
-                 , "lineage3"
-                 , "lineage4")
-                 #, "lineage5"
-                 #, "lineage6"
-                 #, "lineage7")
-
-# works nicely with facet wrap using labeller, but not otherwise
-#my_labels = c('Lineage 1'
-#              , 'Lineage 2'
-#              , 'Lineage 3'
-#              , 'Lineage 4')
-#              #, 'Lineage 5'
-#              #, 'Lineage 6'
-#              #, 'Lineage 7')
-
-#names(my_labels) = c('lineage1'
-#                     , 'lineage2'
-#                     , 'lineage3'
-#                     , 'lineage4')
-#                     #, 'lineage5'
-#                     #, 'lineage6'
-#                     #, 'lineage7')
-
-#==========================
-# subset selected lineages
-#==========================
-df_lin = subset(my_df, subset = lineage %in% sel_lineages)
-table(df_lin$lineage)
-
-#{RESULT: Total number of samples for lineage}
-sum(table(df_lin$lineage)) 
-
-#{RESULT: No of samples within lineage}
-table(df_lin$lineage) 
-
-#{Result: No. of unique mutations the 4 lineages contribute to}
-length(unique(df_lin$mutationinformation)) 
-
-u2 = unique(my_df$mutationinformation)
-u = unique(df_lin$mutationinformation)
-
-#{Result:Muts not present within selected lineages}
-check = u2[!u2%in%u]; print(check) 
-
-# workaround to make labels appear nicely for in otherwise cases
-#==================
-# lineage: labels
-# from "plyr"
-#==================
-#{Result:No of samples in selected lineages}
-table(df_lin$lineage)
-
-df_lin$lineage_labels = mapvalues(df_lin$lineage
-                                  , from = c("lineage1","lineage2", "lineage3", "lineage4")
-                                  , to = c("Lineage 1", "Lineage 2", "Lineage 3", "Lineage 4"))
-table(df_lin$lineage_labels)
-
-table(df_lin$lineage_labels) == table(df_lin$lineage)
-                              
-#========================
-# mutation_info: labels
-#========================
-#{Result:No of DM and OM muts in selected lineages}
-table(df_lin$mutation_info)
-
-df_lin$mutation_info_labels = ifelse(df_lin$mutation_info == dr_muts_col, "DM", "OM")
-table(df_lin$mutation_info_labels)
-
-table(df_lin$mutation_info) == table(df_lin$mutation_info_labels)
-
-
-#========================
-# duet_outcome: labels
-#========================
-#{Result: No. of D and S mutations in selected lineages}
-table(df_lin$duet_outcome)
-
-df_lin$duet_outcome_labels = ifelse(df_lin$duet_outcome == "Destabilising", "D", "S")
-table(df_lin$duet_outcome_labels)
-
-table(df_lin$duet_outcome) == table(df_lin$duet_outcome_labels)
-
-
-#=======================
-# subset dr muts only
-#=======================
-#my_df_dr = subset(df_lin, mutation_info == dr_muts_col) 
-#table(my_df_dr$mutation_info)
-#table(my_df_dr$lineage)
-
-#=========================
-# subset other muts only
-#=========================
-#my_df_other = subset(df_lin, mutation_info == other_muts_col) 
-#table(my_df_other$mutation_info)
-#table(my_df_other$lineage)
-
-########################################################################
-#               end of data extraction and cleaning for plots          #
-########################################################################
-
-#==========================
-# Distribution plots
-#============================
-
-#%%%%%%%%%%%%%%%%%%%%%%%%%
-# REASSIGNMENT
-df <- df_lin
-#%%%%%%%%%%%%%%%%%%%%%%%%%
-
-rm(df_lin)
-
-#******************
-# generate distribution plot of lineages
-#******************
-# 2 : ggridges (good!)
-my_ats = 15 # axis text size
-my_als = 20 # axis label size
-n_colours = length(unique(df$duet_scaled))
-my_palette <- colorRampPalette(c(mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2))(n = n_colours+1)
-
-#=======================================
-# Plot 1: lineage dist: geom_density_ridges_gradient (allows aesthetics to vary along ridgeline, no alpha setting!)
-# else same as geom_density_ridges)
-# x = duet_scaled
-# y = duet_outcome
-# fill = duet_scaled
-# Facet: Lineage
-#=======================================
-# output individual svg
-#plot_lineage_dist_duet_f  paste0(plotdir,"/", "lineage_dist_duet_f.svg")
-#plot_lineage_dist_duet_f
-#svg(plot_lineage_dist_duet_f)
-
-p1 = ggplot(df, aes(x = duet_scaled
-                    , y = duet_outcome))+
-  geom_density_ridges_gradient(aes(fill = ..x..)
-                               #, jittered_points = TRUE
-                               , scale = 3
-                               , size = 0.3 ) +
-  facet_wrap( ~lineage_labels
- # , scales = "free"
- # , labeller = labeller(lineage = my_labels)
-   ) +                            
-  coord_cartesian( xlim = c(-1, 1)) +
-  scale_fill_gradientn(colours = my_palette
-                       , name = "DUET"
-                       #, breaks = c(-1, 0, 1)
-                       #, labels = c(-1,0,1)
-                       #, limits = c(-1,1)
-                       ) + 
-  theme(axis.text.x = element_text(size = my_ats
-                                   , angle = 90
-                                   , hjust = 1
-                                   , vjust = 0.4)
-        #, axis.text.y = element_blank()
-        , axis.text.y = element_text(size = my_ats)
-        , axis.title.x = element_text(size = my_ats)
-        , axis.title.y = element_blank()
-        , axis.ticks.y = element_blank()
-        , plot.title = element_blank()
-        , strip.text = element_text(size = my_als)
-        , legend.text = element_text(size = my_als-10)
-        #, legend.title = element_text(size = my_als-6)
-        , legend.title = element_blank()
-        , legend.position = c(-0.08, 0.41) 
-        #, legend.direction = "horizontal"
-        #, legend.position = "left"
-)+
-  labs(x = "DUET") 
-
-p1
-
-
-#p1_with_legend = p1 + guides(fill = guide_colourbar(label = FALSE))
-
-#=======================================
-# Plot 2: lineage dist: geom_density_ridges, allows alpha to be set
-# x = duet_scaled
-# y = lineage_labels
-# fill = mutation_info
-# NO FACET
-#=======================================
-# output svg
-#plot_lineage_dist_duet_dm_om  =  paste0(plotdir,"/", "lineage_dist_duet_dm_om.svg")
-#plot_lineage_dist_duet_dm_om
-#svg(plot_lineage_dist_duet_dm_om)
-
-p2 = ggplot(df, aes(x = duet_scaled
-                    , y = lineage_labels))+
-  geom_density_ridges(aes(fill = factor(mutation_info_labels))
-                      , scale = 3
-                      , size = 0.3
-                      , alpha = 0.8) +
-  coord_cartesian( xlim = c(-1, 1)) +
-  scale_fill_manual(values = c("#E69F00", "#999999")) +
-  theme(axis.text.x = element_text(size = my_ats
-                                   , angle = 90
-                                   , hjust = 1
-                                   , vjust = 0.4)
-        , axis.text.y = element_text(size = my_ats)
-        , axis.title.x = element_text(size = my_ats)
-        , axis.title.y = element_blank()
-        , axis.ticks.y = element_blank()
-        , plot.title = element_blank()
-        , strip.text = element_text(size = my_als)
-        , legend.text = element_text(size = my_als-4)
-        , legend.title = element_text(size = my_als-4)
-        , legend.position = c(0.8, 0.9)) +
-  labs(x = "DUET"
-       , fill = "Mutation class") # legend title
-
-p2
-
-#=======================================
-# Plot 3: lineage dist: geom_density_ridges_gradient (allows aesthetics to vary along ridgeline, no alpha setting!)
-# else same as geom_density_ridges)
-# x = duet_scaled
-# y = lineage_labels
-# fill = duet_scaled 
-# NO FACET (nf)
-#=======================================
-# output individual svg
-#plot_lineage_dist_duet_nf  =  paste0(plotdir,"/", "lineage_dist_duet_nf.svg")
-#plot_lineage_dist_duet_nf
-#svg(plot_lineage_dist_duet_nf)
-
-p3 = ggplot(df, aes(x = duet_scaled
-                    , y = lineage_labels))+
-  geom_density_ridges_gradient(aes(fill = ..x..)
-                               #, jittered_points = TRUE
-                               , scale = 3
-                               , size = 0.3 ) +
-  coord_cartesian( xlim = c(-1, 1)) +
-  scale_fill_gradientn(colours = my_palette, name = "DUET") + 
-  theme(axis.text.x = element_text(size = my_ats
-                                   , angle = 90
-                                   , hjust = 1
-                                   , vjust = 0.4)
-        
-        , axis.text.y = element_text(size = my_ats)
-        , axis.title.x = element_text(size = my_ats)
-        , axis.title.y = element_blank()
-        , axis.ticks.y = element_blank()
-        , plot.title = element_blank()
-        , strip.text = element_text(size = my_als)
-        , legend.text = element_text(size = my_als-10)
-        , legend.title = element_text(size = my_als-3)
-        , legend.position = c(0.8, 0.8)) +
-        #, legend.direction = "horizontal")+
-        #, legend.position = "top")+
-  labs(x = "DUET") 
-
-p3
-
-########################################################################
-#==============
-# combine plots
-#===============
-# 1) without labels
-plot_lineage_dist_combined_dm_om
-svg(plot_lineage_dist_combined_dm_om, width = 12, height = 6)
-
-OutPlot1 = cowplot::plot_grid(p1, p2
-                               , rel_widths = c(0.5/2, 0.5/2))
-
-print(OutPlot1)
-dev.off()
-
-
-# 2) with labels
-plot_lineage_dist_combined_dm_om_L
-svg(plot_lineage_dist_combined_dm_om_L, width = 12, height = 6)
-
-OutPlot2 = cowplot::plot_grid(p1, p2
-                              #, labels = c("(a)", "(b)")
-                              , labels = "AUTO"
-                              #, label_x = -0.045, label_y = 0.92
-                              #, hjust = -0.7, vjust = -0.5
-                              #, align = "h"
-                              , rel_widths = c(0.5/2, 0.5/2)
-                              , label_size = my_als)
-
-print(OutPlot2)
-dev.off()
-
-##############################################################################

From b7d50fbbcd15d0a78b6cfa04b27520759e9e8e47 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 9 Sep 2021 16:10:11 +0100
Subject: [PATCH 16/51] added lineage_labels and mutation_info_labels to
 combining_dfs_plotting

---
 scripts/functions/combining_dfs_plotting.R    |  34 ++++
 scripts/functions/tests/test_lineage_dist.R   |   3 +-
 .../lineage_basic_barplots_combined.R         |   8 +-
 scripts/plotting/lineage_bp_data.R            | 180 ------------------
 4 files changed, 38 insertions(+), 187 deletions(-)
 delete mode 100755 scripts/plotting/lineage_bp_data.R

diff --git a/scripts/functions/combining_dfs_plotting.R b/scripts/functions/combining_dfs_plotting.R
index 18e0374..848face 100644
--- a/scripts/functions/combining_dfs_plotting.R
+++ b/scripts/functions/combining_dfs_plotting.R
@@ -152,6 +152,40 @@ combining_dfs_plotting <- function(  my_df_u
     unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
     quit()
   }
+
+  # Quick formatting: pretty labels
+  #-----------------------
+  # mutation_info_labels
+  #-----------------------
+  merged_df2$mutation_info_labels = ifelse(merged_df2$mutation_info == dr_muts_col
+                                              , "DM", "OM")
+  merged_df2$mutation_info_labels = factor(merged_df2$mutation_info_labels)
+  #-----------------------
+  # lineage labels
+  #-----------------------
+  merged_df2$lineage_labels =  gsub("lineage", "L", merged_df2$lineage)
+  
+  merged_df2$lineage_labels =  factor(merged_df2$lineage_labels, c("L1"
+                                                                   , "L2"
+                                                                   , "L3"
+                                                                   , "L4"
+                                                                   , "L5"
+                                                                   , "L6"
+                                                                   , "L7"
+                                                                   , "LBOV"
+                                                                   , "L1;L2"
+                                                                   , "L1;L3"
+                                                                   , "L1;L4"
+                                                                   , "L2;L3"
+                                                                   , "L2;L3;L4"
+                                                                   , "L2;L4"  
+                                                                   , "L2;L6"
+                                                                   , "L2;LBOV"  
+                                                                   , "L3;L4" 
+                                                                   , "L4;L6"
+                                                                   , "L4;L7"
+                                                                   , ""))
+  
   
   #=================================================================
   # Merge 2: merged_df3
diff --git a/scripts/functions/tests/test_lineage_dist.R b/scripts/functions/tests/test_lineage_dist.R
index 1f40d16..eeeebe5 100644
--- a/scripts/functions/tests/test_lineage_dist.R
+++ b/scripts/functions/tests/test_lineage_dist.R
@@ -16,12 +16,13 @@ cat("cols imported:"
 
 
 #############################################################
-
+# without facet
 lineage_distP(lin_dist_plot
               , with_facet = F
               , leg_label = "Mutation Class"
 )
 
+# with facet
 lineage_distP(lin_dist_plot
               , with_facet = T
               , facet_wrap_var = "mutation_info_labels"
diff --git a/scripts/plotting/lineage_basic_barplots_combined.R b/scripts/plotting/lineage_basic_barplots_combined.R
index b6f25e6..837e57b 100755
--- a/scripts/plotting/lineage_basic_barplots_combined.R
+++ b/scripts/plotting/lineage_basic_barplots_combined.R
@@ -55,9 +55,7 @@ plot_basic_bp_lineage_cl
 # Data: All lineages or
 # selected few
 #------------------------
-sel_lineages = levels(lin_lf$sel_lineages)[1:4]
-sel_lineages
-lin_lf_plot = lin_lf[lin_lf$sel_lineages%in%sel_lineages,]
+lin_lf_plot = lin_lf[lin_lf$sel_lineages%in%c("L1", "L2", "L3", "L4"),]
 str(lin_lf_plot)
 
 # drop unused factor levels
@@ -90,9 +88,7 @@ lin_countP
 # Data: All lineages or
 # selected few
 #------------------------
-sel_lineages = levels(lin_wf$sel_lineages)[1:4]
-sel_lineages
-lin_wf_plot = lin_wf[lin_wf$sel_lineages%in%sel_lineages,]
+lin_wf_plot = lin_wf[lin_wf$sel_lineages%in%c("L1", "L2", "L3", "L4"),]
 str(lin_wf_plot)
 
 # drop unused factor levels
diff --git a/scripts/plotting/lineage_bp_data.R b/scripts/plotting/lineage_bp_data.R
deleted file mode 100755
index e9ab929..0000000
--- a/scripts/plotting/lineage_bp_data.R
+++ /dev/null
@@ -1,180 +0,0 @@
-#!/usr/bin/env Rscript  
-#########################################################
-# TASK: Script to format data for lineage barplots:
-# WF and LF data with lineage sample, and snp counts
-# sourced by get_plotting_dfs.R
-#########################################################
-# working dir and loading libraries
-# getwd()
-# setwd("~/git/LSHTM_analysis/scripts/plotting")
-# getwd()
-
-# make cmd
-# globals
-# drug = "streptomycin"
-# gene = "gid"
-
-# source("get_plotting_dfs.R")
-#=======================================================================
-#################################################
-# Get data with lineage count, and snp diversity
-#################################################
-table(merged_df2$lineage)
-
-if (table(merged_df2$lineage == "")[[2]]) {
-
-cat("\nMissing samples with lineage classification:", table(merged_df2$lineage == "")[[2]])
-  
-}
-
-# Add pretty lineage labels and mut_info_labels
-class(merged_df2$lineage); table(merged_df2$lineage)
-merged_df2$lineage_labels =  gsub("lineage", "L", merged_df2$lineage)
-table(merged_df2$lineage_labels)
-
-class(merged_df2$lineage_labels)
-
-merged_df2$lineage_labels =  factor(merged_df2$lineage_labels, c("L1"
-                                                         , "L2"
-                                                         , "L3"
-                                                         , "L4"
-                                                         , "L5"
-                                                         , "L6"
-                                                         , "L7"
-                                                         , "LBOV"
-                                                         , "L1;L2"
-                                                         , "L1;L3"
-                                                         , "L1;L4"
-                                                         , "L2;L3"
-                                                         , "L2;L3;L4"
-                                                         , "L2;L4"  
-                                                         , "L2;L6"
-                                                         , "L2;LBOV"  
-                                                         , "L3;L4" 
-                                                         , "L4;L6"
-                                                         , "L4;L7"
-                                                         , ""))
-
-class(merged_df2$lineage_labels); nlevels(merged_df2$lineage_labels)
-
-
-##################################
-# WF data: lineages with 
-# snp count
-# total_samples
-# snp diversity (perc)
-##################################
-sel_lineages = levels(merged_df2$lineage_labels)
-
-lin_wf = data.frame(sel_lineages) #4, 1
-total_snps_u = NULL
-total_samples = NULL
-
-for (i in sel_lineages){
-  #print(i)
-  curr_total = length(unique(merged_df2$id)[merged_df2$lineage_labels==i])
-  #print(curr_total)
-  total_samples = c(total_samples, curr_total)
-  print(total_samples)
-
-  foo = merged_df2[merged_df2$lineage_labels==i,]
-  print(paste0(i, "=======\n"))
-  print(length(unique(foo$mutationinformation)))
-  curr_count = length(unique(foo$mutationinformation))
-  
-  total_snps_u = c(total_snps_u, curr_count)
-}
-lin_wf
-
-# Add these counts as columns to the df
-lin_wf$num_snps_u = total_snps_u
-lin_wf$total_samples = total_samples
-
-# Add SNP diversity
-lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples
-lin_wf
-
-#=====================
-# Add some formatting
-#=====================
-# SNP diversity 
-lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0)
-lin_wf$snp_diversity_f = paste0(lin_wf$snp_diversity_f, "%")
-
-# Important: Check factors so that x-axis categ appear as you want
-lin_wf$sel_lineages =  factor(lin_wf$sel_lineages, c("L1"
-                                                     , "L2"
-                                                     , "L3"
-                                                     , "L4"
-                                                     , "L5"
-                                                     , "L6"
-                                                     , "L7"
-                                                     , "LBOV"
-                                                     , "L1;L2"
-                                                     , "L1;L3"
-                                                     , "L1;L4"
-                                                     , "L2;L3"
-                                                     , "L2;L3;L4"
-                                                     , "L2;L4"  
-                                                     , "L2;L6"
-                                                     , "L2;LBOV"  
-                                                     , "L3;L4" 
-                                                     , "L4;L6"
-                                                     , "L4;L7"
-                                                     , ""))
-
-levels(lin_wf$sel_lineages)
-
-##################################
-# LF data: lineages with 
-# snp count
-# total_samples
-# snp diversity (perc)
-##################################
-names(lin_wf)
-tot_cols = ncol(lin_wf)
-pivot_cols = c("sel_lineages", "snp_diversity", "snp_diversity_f")
-pivot_cols_n = length(pivot_cols)
-
-expected_rows =  nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n )
-  
-lin_lf <- gather(lin_wf
-                 , count_categ
-                 , p_count
-                 , num_snps_u:total_samples
-                 , factor_key = TRUE)
-lin_lf
-
-# quick checks
-if ( nrow(lin_lf)  ==  expected_rows ){
-  cat("\nPASS: Lineage LF data created"
-      , "\nnrow: ", nrow(lin_lf)
-      , "\nncol: ", ncol(lin_lf))
-} else {
-  cat("\nFAIL: numbers mismatch"
-      , "\nExpected nrow: ", expected_rows)
-}
-
-# Important: Relevel factors so that x-axis categ appear as you want
-lin_lf$sel_lineages =  factor(lin_lf$sel_lineages, c("L1"
-                                                     , "L2"
-                                                     , "L3"
-                                                     , "L4"
-                                                     , "L5"
-                                                     , "L6"
-                                                     , "L7"
-                                                     , "LBOV"
-                                                     , "L1;L2"
-                                                     , "L1;L3"
-                                                     , "L1;L4"
-                                                     , "L2;L3"
-                                                     , "L2;L3;L4"
-                                                     , "L2;L4"  
-                                                     , "L2;L6"
-                                                     , "L2;LBOV"  
-                                                     , "L3;L4" 
-                                                     , "L4;L6"
-                                                     , "L4;L7"
-                                                     , ""))
-
-levels(lin_lf$sel_lineages)

From 93038fa17c677ace0321efad2025c608121602b4 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 9 Sep 2021 16:14:14 +0100
Subject: [PATCH 17/51] added lineage_dist.R and renamed lineage_bp_data file
 to lineage_data

---
 scripts/functions/lineage_dist.R |  69 ++++++++++++++
 scripts/plotting/lineage_data.R  | 155 +++++++++++++++++++++++++++++++
 2 files changed, 224 insertions(+)
 create mode 100644 scripts/functions/lineage_dist.R
 create mode 100755 scripts/plotting/lineage_data.R

diff --git a/scripts/functions/lineage_dist.R b/scripts/functions/lineage_dist.R
new file mode 100644
index 0000000..aee1b62
--- /dev/null
+++ b/scripts/functions/lineage_dist.R
@@ -0,0 +1,69 @@
+###############################
+# TASK: function to plot lineage
+# dist plots with or without facet
+# think about color palette
+# for stability
+##############################
+
+#n_colours = length(unique(lin_dist_plot$duet_scaled))
+#my_palette <- colorRampPalette(c(mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2))(n = n_colours+1)
+
+
+lineage_distP <- function(plotdf # data frame containing the columns named below
+                          , x_axis = "duet_scaled" # column name (string) mapped to x
+                          , y_axis = "lineage_labels" # column name (string) mapped to y
+                          , x_lab = "DUET" # x-axis title
+                          , with_facet = FALSE # TRUE: facet the plot by facet_wrap_var
+                          , facet_wrap_var = "" # column name (string); used only when with_facet
+                          , fill_categ = "mutation_info_labels" # column name (string) mapped to fill
+                          , fill_categ_cols = c("#E69F00", "#999999") # manual fill colours
+                          , my_ats = 15 # axis text size (also applied to the x-axis title)
+                          , my_als = 20 # axis label size (not referenced in the body)
+                          , my_leg_ts = 16 # legend text size
+                          , my_leg_title = 16 # legend title size
+                          , my_strip_ts = 20 # facet strip text size
+                          , leg_pos = c(0.8, 0.9) # legend position, plot without facet
+                          , leg_pos_wf = c("top", "left", "bottom", "right") # legend position, with facet
+                          , leg_dir_wf = c("horizontal", "vertical") # legend direction, with facet
+                          , leg_label = "") # legend title
+
+{
+# Ridgeline density plot (geom_density_ridges) built from plotdf; returns the ggplot object.
+LinDistP = ggplot(plotdf, aes_string(x = x_axis
+                              , y = y_axis))+
+
+  geom_density_ridges(aes_string(fill = fill_categ)
+                      , scale = 3
+                      , size = 0.3
+                      , alpha = 0.8) +
+  scale_x_continuous(expand = c(0.01, 0.01)) +
+  #coord_cartesian( xlim = c(-1, 1)) +
+  scale_fill_manual(values = fill_categ_cols) +
+  theme(axis.text.x = element_text(size = my_ats
+                                   , angle = 90
+                                   , hjust = 1
+                                   , vjust = 0.4)
+        , axis.text.y = element_text(size = my_ats)
+        , axis.title.x = element_text(size = my_ats)
+        , axis.title.y = element_blank()
+        , strip.text = element_text(size = my_strip_ts)
+        , legend.text = element_text(size = my_leg_ts)
+        , legend.title = element_text(size = my_leg_title)
+        , legend.position = leg_pos) + # was hard-coded to c(0.8, 0.9), ignoring leg_pos
+  labs(x = x_lab
+       , fill = leg_label)
+
+if (with_facet){
+  # the *_wf defaults only list the choices: pick one when the caller supplied none
+  if (missing(leg_pos_wf)) leg_pos_wf = "top"
+  if (missing(leg_dir_wf)) leg_dir_wf = "horizontal"
+  fwv = as.formula(paste0("~", facet_wrap_var)) # one-sided formula for facet_wrap()
+  
+  LinDistP = LinDistP + 
+    facet_wrap(fwv) + 
+    theme(legend.position = leg_pos_wf
+          , legend.direction = leg_dir_wf)
+}
+
+return(LinDistP)
+}
diff --git a/scripts/plotting/lineage_data.R b/scripts/plotting/lineage_data.R
new file mode 100755
index 0000000..29a6348
--- /dev/null
+++ b/scripts/plotting/lineage_data.R
@@ -0,0 +1,155 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: Script to format data for lineage barplots:
+# WF and LF data with lineage sample, and snp counts
+# sourced by get_plotting_dfs.R
+#########################################################
+# working dir and loading libraries
+# getwd()
+# setwd("~/git/LSHTM_analysis/scripts/plotting")
+# getwd()
+
+# make cmd
+# globals
+# drug = "streptomycin"
+# gene = "gid"
+
+# source("get_plotting_dfs.R")
+#=======================================================================
+#################################################
+# Get data with lineage count, and snp diversity
+#################################################
+table(merged_df2$lineage)
+
+# Count rows with no lineage call; sum() is 0 when there are none, unlike table(...)[[2]] which errors
+n_missing_lin = sum(merged_df2$lineage == "", na.rm = TRUE)
+if (n_missing_lin > 0) {
+  cat("\nMissing samples with lineage classification:", n_missing_lin)
+}
+
+table(merged_df2$lineage_labels)
+class(merged_df2$lineage_labels); nlevels(merged_df2$lineage_labels)
+
+##################################
+# WF data: lineages with 
+# snp count
+# total_samples
+# snp diversity (perc)
+##################################
+sel_lineages = levels(merged_df2$lineage_labels)
+
+lin_wf = data.frame(sel_lineages) #4, 1
+total_snps_u = NULL
+total_samples = NULL
+
+for (i in sel_lineages){
+  # row indices for this lineage; which() drops rows whose label is NA
+  in_lin = which(merged_df2$lineage_labels == i)
+  # unique sample ids WITHIN the lineage: subset the ids first, then de-duplicate
+  curr_total = length(unique(merged_df2$id[in_lin]))
+  total_samples = c(total_samples, curr_total)
+  print(total_samples)
+
+  foo = merged_df2[in_lin,]
+  print(paste0(i, "=======\n"))
+  curr_count = length(unique(foo$mutationinformation)) # unique SNPs in this lineage
+  print(curr_count)
+  
+  total_snps_u = c(total_snps_u, curr_count)
+}
+lin_wf
+
+# Add these counts as columns to the df
+lin_wf$num_snps_u = total_snps_u
+lin_wf$total_samples = total_samples
+lin_wf
+
+# Add SNP diversity
+lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples
+lin_wf
+
+#=====================
+# Add some formatting
+#=====================
+# SNP diversity 
+lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0)
+lin_wf$snp_diversity_f = paste0(lin_wf$snp_diversity_f, "%")
+
+lin_wf$sel_lineages
+
+# Important: Check factors so that x-axis categ appear as you want
+lin_wf$sel_lineages =  factor(lin_wf$sel_lineages, c("L1"
+                                                     , "L2"
+                                                     , "L3"
+                                                     , "L4"
+                                                     , "L5"
+                                                     , "L6"
+                                                     , "L7"
+                                                     , "LBOV"
+                                                     , "L1;L2"
+                                                     , "L1;L3"
+                                                     , "L1;L4"
+                                                     , "L2;L3"
+                                                     , "L2;L3;L4"
+                                                     , "L2;L4"  
+                                                     , "L2;L6"
+                                                     , "L2;LBOV"  
+                                                     , "L3;L4" 
+                                                     , "L4;L6"
+                                                     , "L4;L7"
+                                                     , ""))
+
+levels(lin_wf$sel_lineages)
+
+##################################
+# LF data: lineages with 
+# snp count
+# total_samples
+# snp diversity (perc)
+##################################
+names(lin_wf)
+tot_cols = ncol(lin_wf)
+pivot_cols = c("sel_lineages", "snp_diversity", "snp_diversity_f")
+pivot_cols_n = length(pivot_cols)
+
+expected_rows =  nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n )
+  
+lin_lf <- gather(lin_wf
+                 , count_categ
+                 , p_count
+                 , num_snps_u:total_samples
+                 , factor_key = TRUE)
+lin_lf
+
+# quick checks
+if ( nrow(lin_lf )  ==  expected_rows ){
+  cat("\nPASS: Lineage LF data created"
+      , "\nnrow: ", nrow(lin_lf)
+      , "\nncol: ", ncol(lin_lf))
+} else {
+  cat("\nFAIL: numbers mismatch"
+      , "\nExpected nrow: ", expected_rows)
+}
+
+# Important: Relevel factors so that x-axis categ appear as you want
+lin_lf$sel_lineages =  factor(lin_lf$sel_lineages, c("L1"
+                                                     , "L2"
+                                                     , "L3"
+                                                     , "L4"
+                                                     , "L5"
+                                                     , "L6"
+                                                     , "L7"
+                                                     , "LBOV"
+                                                     , "L1;L2"
+                                                     , "L1;L3"
+                                                     , "L1;L4"
+                                                     , "L2;L3"
+                                                     , "L2;L3;L4"
+                                                     , "L2;L4"  
+                                                     , "L2;L6"
+                                                     , "L2;LBOV"  
+                                                     , "L3;L4" 
+                                                     , "L4;L6"
+                                                     , "L4;L7"
+                                                     , ""))
+
+levels(lin_lf$sel_lineages)

From 2bd85f70212f08c9be3e65293270bbe0e3f84079 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 9 Sep 2021 16:15:07 +0100
Subject: [PATCH 18/51] added lineage_dist_plots.R

---
 scripts/plotting/lineage_dist_plots.R | 114 ++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 scripts/plotting/lineage_dist_plots.R

diff --git a/scripts/plotting/lineage_dist_plots.R b/scripts/plotting/lineage_dist_plots.R
new file mode 100644
index 0000000..a425f37
--- /dev/null
+++ b/scripts/plotting/lineage_dist_plots.R
@@ -0,0 +1,114 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: Lineage dist plots: ggridges
+
+# Output: 1 or 2 SVGs for PS stability
+
+##########################################################
+# 				Installing and loading required packages 			 
+##########################################################
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
+
+source("Header_TT.R") # also loads all my functions
+
+#===========
+# input
+#===========
+#drug = "streptomycin"
+#gene = "gid"
+source("get_plotting_dfs.R")
+
+spec = matrix(c(
+  "drug"       , "d",  1, "character",
+  "gene"       , "g",  1, "character",
+  "data_file1" , "fa", 2, "character",
+  "data_file2" , "fb", 2, "character" 
+), byrow = TRUE, ncol = 4)
+
+opt = getopt(spec)
+
+drug            = opt$drug
+gene            = opt$gene
+infile_params   = opt$data_file1
+infile_metadata = opt$data_file2
+
+if(is.null(drug)|is.null(gene)) {
+  stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
+}
+
+#=======
+# output
+#=======
+lineage_dist_dm_om_ps      = "lineage_dist_dm_om_PS.svg"
+plot_lineage_dist_dm_om_ps =  paste0(plotdir,"/", lineage_dist_dm_om_ps)
+#========================================================================
+
+###########################
+# Data for plots
+# you need merged_df2 or merged_df2_comp
+# since this is one-many relationship 
+# i.e the same SNP can belong to multiple lineages
+# using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available, hence use df with NA
+###########################
+
+#===================
+# Data for plots
+#===================
+# quick checks
+table(merged_df2$mutation_info_labels); levels(merged_df2$lineage_labels)
+table(merged_df2$lineage_labels); levels(merged_df2$mutation_info_labels)
+
+lin_dist_plot = merged_df2[merged_df2$lineage_labels%in%c("L1", "L2", "L3", "L4"),]
+table(lin_dist_plot$lineage_labels); nlevels(lin_dist_plot$lineage_labels)
+
+# refactor
+lin_dist_plot$lineage_labels = factor(lin_dist_plot$lineage_labels)
+nlevels(lin_dist_plot$lineage_labels)
+
+#-----------------------------------------------------------------------
+# IMPORTANT RESULTS to put inside table or text for interactive plots
+
+sum(table(lin_dist_plot$lineage_labels)) #{RESULT: Total number of samples for lineage}
+
+table(lin_dist_plot$lineage_labels)#{RESULT: No of samples within lineage}
+
+length(unique(lin_dist_plot$mutationinformation))#{Result: No. of unique mutations selected lineages contribute to}
+length(lin_dist_plot$mutationinformation)
+
+u2 = unique(merged_df2$mutationinformation)
+u = unique(lin_dist_plot$mutationinformation)
+check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages}
+#-----------------------------------------------------------------------
+# without facet
+linP_dm_om = lineage_distP(lin_dist_plot
+              , with_facet = F
+              , x_axis = "deepddg"
+              , y_axis = "lineage_labels"
+              , x_lab = "DeepDDG"
+              , leg_label = "Mutation Class"
+)
+linP_dm_om
+
+# with facet
+linP_dm_om_facet = lineage_distP(lin_dist_plot
+              , with_facet = T
+              , facet_wrap_var = "mutation_info_labels"
+              , leg_label = "Mutation Class"
+              , leg_pos_wf = "none"
+              , leg_dir_wf = "horizontal"
+              
+)
+linP_dm_om_facet
+
+#=================
+# output plot:
+# without facet
+#=================
+svg(plot_lineage_dist_dm_om_ps)
+linP_dm_om
+
+dev.off()

From dda5d1ea9386e8c6405eb29cd4ddd7cef58f1dfd Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 9 Sep 2021 16:16:18 +0100
Subject: [PATCH 19/51] moved old lineage_dist plot scripts to redundant

---
 .../redundant/lineage_dist_combined_PS.R      | 303 +++++++++++++++++
 .../lineage_dist_dm_om_combined_PS.R          | 309 ++++++++++++++++++
 2 files changed, 612 insertions(+)
 create mode 100755 scripts/plotting/redundant/lineage_dist_combined_PS.R
 create mode 100755 scripts/plotting/redundant/lineage_dist_dm_om_combined_PS.R

diff --git a/scripts/plotting/redundant/lineage_dist_combined_PS.R b/scripts/plotting/redundant/lineage_dist_combined_PS.R
new file mode 100755
index 0000000..bf1c75b
--- /dev/null
+++ b/scripts/plotting/redundant/lineage_dist_combined_PS.R
@@ -0,0 +1,303 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: Lineage dist plots: ggridges
+
+# Output: 2 SVGs for PS stability
+
+# 1) all muts
+# 2) dr_muts
+
+##########################################################
+# 				Installing and loading required packages 			 
+##########################################################
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
+
+source("Header_TT.R")
+library(ggridges)
+source("combining_dfs_plotting.R")
+# PS combined: 
+# 1) merged_df2
+# 2) merged_df2_comp
+# 3) merged_df3
+# 4) merged_df3_comp
+
+# LIG combined: 
+# 5) merged_df2_lig
+# 6) merged_df2_comp_lig
+# 7) merged_df3_lig
+# 8) merged_df3_comp_lig
+
+# 9) my_df_u
+# 10) my_df_u_lig
+
+cat("Directories imported:"
+    , "\n===================="
+    , "\ndatadir:", datadir
+    , "\nindir:", indir
+    , "\noutdir:", outdir
+    , "\nplotdir:", plotdir)
+
+cat("Variables imported:"
+    , "\n====================="
+    , "\ndrug:", drug
+    , "\ngene:", gene
+    , "\ngene_match:", gene_match
+    , "\nAngstrom symbol:", angstroms_symbol
+    , "\nNo. of duplicated muts:", dup_muts_nu
+    , "\nNA count for ORs:", na_count
+    , "\nNA count in df2:", na_count_df2
+    , "\nNA count in df3:", na_count_df3
+    , "\ndr_muts_col:", dr_muts_col
+    , "\nother_muts_col:", other_muts_col
+    , "\ndrtype_col:", resistance_col)
+
+#=======
+# output
+#=======
+lineage_dist_combined = "lineage_dist_combined_PS.svg"
+plot_lineage_dist_combined  =  paste0(plotdir,"/", lineage_dist_combined)
+#========================================================================
+
+###########################
+# Data for plots
+# you need merged_df2 or merged_df2_comp
+# since this is one-many relationship 
+# i.e the same SNP can belong to multiple lineages
+# using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available, hence use df with NA
+###########################
+# REASSIGNMENT
+my_df  = merged_df2
+
+# delete variables not required
+rm(my_df_u, merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+is.factor(my_df$lineage)
+my_df$lineage = as.factor(my_df$lineage)
+is.factor(my_df$lineage)
+
+table(my_df$mutation_info)
+
+# subset df with dr muts only
+my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") 
+table(my_df_dr$mutation_info)
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Plot 1: ALL Muts
+# x = mcsm_values, y = dist
+# fill = stability
+#============================
+
+my_plot_name = 'lineage_dist_PS.svg'
+
+plot_lineage_duet  =  paste0(plotdir,"/", my_plot_name)
+
+#===================
+# Data for plots
+#===================
+table(my_df$lineage); str(my_df$lineage)
+
+# subset only lineages1-4
+sel_lineages = c("lineage1"
+                 , "lineage2"
+                 , "lineage3"
+                 , "lineage4"
+                 #, "lineage5"
+                 #, "lineage6"
+                 #, "lineage7"
+                 )
+
+# uncomment as necessary
+df_lin = subset(my_df, subset = lineage %in% sel_lineages )
+table(df_lin$lineage)
+
+# refactor
+df_lin$lineage = factor(df_lin$lineage)
+
+sum(table(df_lin$lineage)) #{RESULT: Total number of samples for lineage}
+
+table(df_lin$lineage)#{RESULT: No of samples within lineage}
+
+length(unique(df_lin$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to}
+
+length(df_lin$mutationinformation)
+
+u2 = unique(my_df$mutationinformation)
+u = unique(df_lin$mutationinformation)
+check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages}
+
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+df <- df_lin
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+
+rm(df_lin)
+
+#******************
+# generate distribution plot of lineages
+#******************
+# 2 : ggridges (good!)
+my_ats = 15 # axis text size
+my_als = 20 # axis label size
+
+my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4'
+              #, 'Lineage 5', 'Lineage 6', 'Lineage 7'
+              )
+names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4'
+                    # , 'lineage5', 'lineage6', 'lineage7'
+                     )
+# check plot name
+plot_lineage_duet
+
+# output svg
+#svg(plot_lineage_duet)
+p1 = ggplot(df, aes(x = duet_scaled
+                            , y = duet_outcome))+
+  
+  #printFile=geom_density_ridges_gradient(
+  geom_density_ridges_gradient(aes(fill = ..x..)
+                               #, jittered_points = TRUE
+                                , scale = 3
+                                , size = 0.3 ) +
+  facet_wrap( ~lineage
+              , scales = "free"
+             #, switch = 'x'
+              , labeller = labeller(lineage = my_labels) ) +
+  coord_cartesian( xlim = c(-1, 1)) +
+  scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
+                        , name = "DUET" ) + 
+  theme(axis.text.x = element_text(size = my_ats
+                                     , angle = 90
+                                     , hjust = 1
+                                     , vjust = 0.4)
+
+         , axis.text.y = element_blank()
+         , axis.title.x = element_blank()
+         , axis.title.y = element_blank()
+         , axis.ticks.y = element_blank()
+         , plot.title = element_blank()
+         , strip.text = element_text(size = my_als)
+         , legend.text = element_text(size = my_als-5)
+         , legend.title = element_text(size = my_als)
+) 
+
+print(p1)
+#dev.off()
+
+#######################################################################
+# lineage distribution plot for dr_muts
+#######################################################################
+
+#==========================
+# Plot 2: dr muts ONLY
+# x = mcsm_values, y = dist
+# fill = stability
+#============================
+
+my_plot_name_dr = 'lineage_dist_dr_muts_PS.svg'
+
+plot_lineage_dr_duet  =  paste0(plotdir,"/", my_plot_name_dr)
+
+#===================
+# Data for plots
+#===================
+table(my_df_dr$lineage); str(my_df_dr$lineage)
+
+# uncomment as necessary
+df_lin_dr = subset(my_df_dr, subset = lineage %in% sel_lineages)
+table(df_lin_dr$lineage)
+
+# refactor
+df_lin_dr$lineage = factor(df_lin_dr$lineage)
+
+sum(table(df_lin_dr$lineage)) #{RESULT: Total number of samples for lineage}
+
+table(df_lin_dr$lineage)#{RESULT: No of samples within lineage}
+
+length(unique(df_lin_dr$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to}
+
+length(df_lin_dr$mutationinformation)
+
+u2 = unique(my_df_dr$mutationinformation)
+u = unique(df_lin_dr$mutationinformation)
+check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages}
+
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+df_dr <- df_lin_dr
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+
+rm(df_lin_dr)
+
+#******************
+# generate distribution plot of lineages
+#******************
+# 2 : ggridges (good!)
+my_ats = 15 # axis text size
+my_als = 20 # axis label size
+
+
+# check plot name
+plot_lineage_dr_duet
+
+# output svg
+#svg(plot_lineage_dr_duet)
+p2 = ggplot(df_dr, aes(x = duet_scaled
+                       , y = duet_outcome))+
+
+  geom_density_ridges_gradient(aes(fill = ..x..)
+                               #, jittered_points = TRUE
+                               , scale = 3
+                               , size = 0.3) +
+  #geom_point(aes(size = or_mychisq))+
+  facet_wrap( ~lineage
+              , scales = "free"
+              #, switch = 'x'
+              , labeller = labeller(lineage = my_labels) ) +
+  coord_cartesian( xlim = c(-1, 1)
+                   #, ylim = c(0, 6)
+                   #, clip = "off" 
+  ) +
+  scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
+                       , name = "DUET" ) + 
+  theme(axis.text.x = element_text(size = my_ats
+                                   , angle = 90
+                                   , hjust = 1
+                                   , vjust = 0.4)
+        , axis.text.y = element_blank()
+        , axis.title.x = element_blank()
+        , axis.title.y = element_blank()
+        , axis.ticks.y = element_blank()
+        , plot.title = element_blank()
+        , strip.text = element_text(size = my_als)
+        , legend.text = element_text(size = 10)
+        , legend.title = element_text(size = my_als)
+        #, legend.position = "none"
+  ) 
+
+print(p2)
+#dev.off()
+########################################################################
+#==============
+# combine plot
+#===============
+
+svg(plot_lineage_dist_combined, width = 12, height = 6)
+
+printFile = cowplot::plot_grid(p1, p2
+                               , label_size = my_als+10)
+
+print(printFile)
+dev.off()
diff --git a/scripts/plotting/redundant/lineage_dist_dm_om_combined_PS.R b/scripts/plotting/redundant/lineage_dist_dm_om_combined_PS.R
new file mode 100755
index 0000000..3875382
--- /dev/null
+++ b/scripts/plotting/redundant/lineage_dist_dm_om_combined_PS.R
@@ -0,0 +1,309 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: Lineage dist plots: ggridges
+
+# Output: 2 SVGs for PS stability
+
+# 1) all muts
+# 2) dr_muts
+
+##########################################################
+# 				Installing and loading required packages 			 
+##########################################################
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
+
+source("Header_TT.R")
+
+source("get_plotting_dfs.R")
+
+cat("cols imported:"
+    , mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2)
+
+#=======
+# output
+#=======
+lineage_dist_combined_dm_om = "lineage_dist_combined_dm_om_PS.svg"
+plot_lineage_dist_combined_dm_om  =  paste0(plotdir,"/", lineage_dist_combined_dm_om)
+
+lineage_dist_combined_dm_om_L = "lineage_dist_combined_dm_om_PS_labelled.svg"
+plot_lineage_dist_combined_dm_om_L  =  paste0(plotdir,"/", lineage_dist_combined_dm_om_L)
+
+#========================================================================
+
+###########################
+# Data for plots
+# you need merged_df2 or merged_df2_comp
+# since this is one-many relationship 
+# i.e the same SNP can belong to multiple lineages
+# using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available, hence use df with NA
+###########################
+
+#===================
+# Data for plots
+#===================
+lin_dist_plot = merged_df2[merged_df2$lineage%in%c("lineage1", "lineage2", "lineage3", "lineage4"),]
+table(lin_dist_plot$lineage)
+
+#{RESULT: Total number of samples for lineage}
+sum(table(lin_dist_plot$lineage)) 
+
+#{RESULT: No of samples within lineage}
+table(lin_dist_plot$lineage) 
+
+#{Result: No. of unique mutations the 4 lineages contribute to}
+length(unique(lin_dist_plot$mutationinformation)) 
+
+u2 = unique(merged_df2$mutationinformation) # all muts (was lin_dist_plot: check was always empty)
+u = unique(lin_dist_plot$mutationinformation) # muts within the selected lineages
+
+#{Result:Muts not present within selected lineages}
+check = u2[!u2%in%u]; print(check) 
+
+# workaround to make labels appear nicely for in otherwise cases
+#==================
+# lineage: labels
+# from "plyr"
+#==================
+#{Result:No of samples in selected lineages}
+table(lin_dist_plot$lineage)
+
+lin_dist_plot$lineage_labels = mapvalues(lin_dist_plot$lineage
+                                  , from = c("lineage1","lineage2", "lineage3", "lineage4")
+                                  , to = c("Lineage 1", "Lineage 2", "Lineage 3", "Lineage 4"))
+table(lin_dist_plot$lineage_labels)
+
+table(lin_dist_plot$lineage_labels) == table(lin_dist_plot$lineage)
+                              
+#========================
+# mutation_info: labels
+#========================
+#{Result:No of DM and OM muts in selected lineages}
+table(lin_dist_plot$mutation_info)
+
+lin_dist_plot$mutation_info_labels = ifelse(lin_dist_plot$mutation_info == dr_muts_col
+                                            , "DM", "OM")
+table(lin_dist_plot$mutation_info_labels)
+
+table(lin_dist_plot$mutation_info) == table(lin_dist_plot$mutation_info_labels)
+
+#========================
+# duet_outcome: labels
+#========================
+#{Result: No. of D and S mutations in selected lineages}
+table(lin_dist_plot$duet_outcome)
+
+lin_dist_plot$duet_outcome_labels = ifelse(lin_dist_plot$duet_outcome == "Destabilising"
+                                           , "D", "S")
+table(lin_dist_plot$duet_outcome_labels)
+
+table(lin_dist_plot$duet_outcome) == table(lin_dist_plot$duet_outcome_labels)
+
+
+#=======================
+# subset dr muts only
+#=======================
+#my_df_dr = subset(df_lin, mutation_info == dr_muts_col) 
+#table(my_df_dr$mutation_info)
+#table(my_df_dr$lineage)
+
+#=========================
+# subset other muts only
+#=========================
+#my_df_other = subset(df_lin, mutation_info == other_muts_col) 
+#table(my_df_other$mutation_info)
+#table(my_df_other$lineage)
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+#******************
+# generate distribution plot of lineages
+#******************
+# 2 : ggridges (good!)
+my_ats = 15 # axis text size
+my_als = 20 # axis label size
+n_colours = length(unique(lin_dist_plot$duet_scaled))
+
+my_palette <- colorRampPalette(c(mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2))(n = n_colours+1)
+
+#=======================================
+# Plot 1: lineage dist: geom_density_ridges_gradient (allows aesthetics to vary along ridgeline, no alpha setting!)
+# else same as geom_density_ridges)
+# x = duet_scaled
+# y = duet_outcome
+# fill = duet_scaled
+# Facet: Lineage
+#=======================================
+# output individual svg
+#plot_lineage_dist_duet_f  paste0(plotdir,"/", "lineage_dist_duet_f.svg")
+#plot_lineage_dist_duet_f
+#svg(plot_lineage_dist_duet_f)
+
+p1 = ggplot(lin_dist_plot, aes(x = duet_scaled
+                    #, y = duet_outcome
+                    , y = mutation_info_labels
+                    ))+
+  geom_density_ridges_gradient(aes(fill = ..x..)
+                               #, jittered_points = TRUE
+                               , scale = 3
+                               , size = 0.3 ) +
+  facet_wrap( ~lineage_labels
+              #~mutation_info_labels
+             # ~mutation_info_labels
+ # , scales = "free"
+ # , labeller = labeller(lineage = my_labels)
+   ) +                            
+  #coord_cartesian( xlim = c(-1, 1)) +
+  scale_x_continuous(expand = c(0.01, 0)) +
+  
+  scale_fill_gradientn(colours = my_palette
+                       , name = "DUET"
+                       #, breaks = c(-1, 0, 1)
+                       #, labels = c(-1,0,1)
+                       #, limits = c(-1,1)
+                       ) + 
+  theme(axis.text.x = element_text(size = my_ats
+                                   , angle = 90
+                                   , hjust = 1
+                                   , vjust = 0.4)
+        #, axis.text.y = element_blank()
+        , axis.text.y = element_text(size = my_ats)
+        , axis.title.x = element_text(size = my_ats)
+        , axis.title.y = element_blank()
+        , axis.ticks.y = element_blank()
+        , plot.title = element_blank()
+        , strip.text = element_text(size = my_als)
+        , legend.text = element_text(size = my_als-10)
+        #, legend.title = element_text(size = my_als-6)
+        , legend.title = element_blank()
+        #, legend.position = c(-0.08, 0.41) # disabled: theme() errors when legend.position is given twice
+        , legend.direction = "horizontal"
+        , legend.position = "top"
+)+
+  labs(x = "DUET") 
+
+p1
+
+
+#p1_with_legend = p1 + guides(fill = guide_colourbar(label = FALSE))
+
+#=======================================
+# Plot 2: lineage dist: geom_density_ridges, allows alpha to be set
+# x = duet_scaled
+# y = lineage_labels
+# fill = mutation_info
+# NO FACET
+#=======================================
+# output svg
+#plot_lineage_dist_duet_dm_om  =  paste0(plotdir,"/", "lineage_dist_duet_dm_om.svg")
+#plot_lineage_dist_duet_dm_om
+#svg(plot_lineage_dist_duet_dm_om)
+
+p2 = ggplot(lin_dist_plot, aes(x = duet_scaled
+                    , y = lineage_labels))+
+  geom_density_ridges(aes(fill = factor(mutation_info_labels))
+                      , scale = 3
+                      , size = 0.3
+                      , alpha = 0.8) +
+  scale_x_continuous(expand = c(0.01, 0)) +
+  #coord_cartesian( xlim = c(-1, 1)) +
+  scale_fill_manual(values = c("#E69F00", "#999999")) +
+  theme(axis.text.x = element_text(size = my_ats
+                                   , angle = 90
+                                   , hjust = 1
+                                   , vjust = 0.4)
+        , axis.text.y = element_text(size = my_ats)
+        , axis.title.x = element_text(size = my_ats)
+        , axis.title.y = element_blank()
+        , axis.ticks.y = element_blank()
+        , plot.title = element_blank()
+        , strip.text = element_text(size = my_als)
+        , legend.text = element_text(size = my_als-4)
+        , legend.title = element_text(size = my_als-4)
+        , legend.position = c(0.8, 0.9)) +
+  labs(x = "DUET"
+       , fill = "Mutation class") # legend title
+
+p2
+
+#=======================================
+# Plot 3: lineage dist: geom_density_ridges_gradient (allows aesthetics to vary along ridgeline, no alpha setting!)
+# else same as geom_density_ridges)
+# x = duet_scaled
+# y = lineage_labels
+# fill = duet_scaled 
+# NO FACET (nf)
+#=======================================
+# output individual svg
+#plot_lineage_dist_duet_nf  =  paste0(plotdir,"/", "lineage_dist_duet_nf.svg")
+#plot_lineage_dist_duet_nf
+#svg(plot_lineage_dist_duet_nf)
+
+p3 = ggplot(lin_dist_plot, aes(x = duet_scaled
+                    , y = lineage_labels))+
+  # geom_density_ridges_gradient(aes(fill = ..x..)
+  #                              #, jittered_points = TRUE
+  #                              , scale = 3
+  #                              , size = 0.3 ) +
+  geom_density_ridges()+
+  #facet_wrap (~mutation_info_labels) +
+  #coord_cartesian( xlim = c(-1, 1)) +
+  scale_x_continuous(expand = c(0.01, 0)) +
+  
+  #scale_fill_gradientn(colours = my_palette, name = "DUET") + 
+  theme(axis.text.x = element_text(size = my_ats
+                                   , angle = 90
+                                   , hjust = 1
+                                   , vjust = 0.4)
+        
+        , axis.text.y = element_text(size = my_ats)
+        , axis.title.x = element_text(size = my_ats)
+        , axis.title.y = element_blank()
+        , axis.ticks.y = element_blank()
+        , plot.title = element_blank()
+        , strip.text = element_text(size = my_als)
+        , legend.text = element_text(size = my_als-10)
+        , legend.title = element_text(size = my_als-3)
+        , legend.position = c(0.8, 0.8)) +
+        #, legend.direction = "horizontal")+
+        #, legend.position = "top")+
+  labs(x = "DUET") 
+
+p3
+
+########################################################################
+#==============
+# combine plots
+#===============
+# 1) without labels
+plot_lineage_dist_combined_dm_om
+svg(plot_lineage_dist_combined_dm_om, width = 12, height = 6)
+
+OutPlot1 = cowplot::plot_grid(p1, p2
+                               , rel_widths = c(0.5/2, 0.5/2))
+
+print(OutPlot1)
+dev.off()
+
+
+# 2) with labels
+plot_lineage_dist_combined_dm_om_L
+svg(plot_lineage_dist_combined_dm_om_L, width = 12, height = 6)
+
+OutPlot2 = cowplot::plot_grid(p1, p2
+                              #, labels = c("(a)", "(b)")
+                              , labels = "AUTO"
+                              #, label_x = -0.045, label_y = 0.92
+                              #, hjust = -0.7, vjust = -0.5
+                              #, align = "h"
+                              , rel_widths = c(0.5/2, 0.5/2)
+                              , label_size = my_als)
+
+print(OutPlot2)
+dev.off()
+
+##############################################################################

From 4ba4ff602e6235298cddb9408fb13a59bc1e437e Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Fri, 10 Sep 2021 16:58:36 +0100
Subject: [PATCH 20/51] added foldx_scaled and deepddg_scaled values added to
 combine_df.py and also used that script to merge all the dfs so that
 merged_df2 and merged_df3 are infact what we need for downstream processing

---
 scripts/combining_dfs.py              | 240 +++++++++---
 scripts/plotting/get_plotting_dfs.R   | 452 ++++++----------------
 scripts/plotting/lineage_data.R       |  30 +-
 scripts/plotting/lineage_dist_plots.R |  71 +++-
 scripts/plotting/other_plots_data.R   | 538 --------------------------
 5 files changed, 354 insertions(+), 977 deletions(-)
 delete mode 100755 scripts/plotting/other_plots_data.R

diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py
index 634af18..4e2781e 100755
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@@ -41,6 +41,7 @@ import pandas as pd
 from pandas import DataFrame
 import numpy as np
 import argparse
+from functools import reduce
 #=======================================================================
 #%% specify input and curr dir
 homedir = os.path.expanduser('~')
@@ -92,19 +93,6 @@ outdir  = args.output_dir
 gene_match = gene + '_p.'
 print('mut pattern for gene', gene, ':',  gene_match)
 
-# !"Redundant, now that improvements have been made!
-# See section "REGEX"
-# nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
-# print('nsSNP for gene', gene, ':',  nssnp_match)
-
-# wt_regex = gene_match.lower()+'([A-Za-z]{3})'
-# print('wt regex:', wt_regex)
-
-# mut_regex = r'[0-9]+(\w{3})$'
-# print('mt regex:', mut_regex)
-
-# pos_regex = r'([0-9]+)'
-# print('position regex:', pos_regex)
 #%%=======================================================================
 #==============
 # directories
@@ -122,49 +110,52 @@ if not outdir:
 # input
 #=======
 #in_filename_mcsm = gene.lower() + '_complex_mcsm_norm.csv' 
-in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SAM.csv' # gidb
-in_filename_foldx = gene.lower() + '_foldx.csv'
-in_filename_deepddg = gene.lower() + '_ni_deepddg.csv' # change to decent filename and put it in the correct dir
-
-in_filename_dssp = gene.lower() + '_dssp.csv'
-in_filename_kd = gene.lower() + '_kd.csv'
-in_filename_rd = gene.lower() + '_rd.csv'
-
+in_filename_mcsm     = gene.lower() + '_complex_mcsm_norm_SAM.csv' # gidb
+in_filename_foldx    = gene.lower() + '_foldx.csv'
+in_filename_deepddg  = gene.lower() + '_ni_deepddg.csv' # change to decent filename and put it in the correct dir
+in_filename_dssp     = gene.lower() + '_dssp.csv'
+in_filename_kd       = gene.lower() + '_kd.csv'
+in_filename_rd       = gene.lower() + '_rd.csv'
 #in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info_f.csv' # gwas f info
-in_filename_afor = gene.lower() + '_af_or.csv'
+in_filename_afor     = gene.lower() + '_af_or.csv'
 #in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv'
+infilename_dynamut   = gene.lower() + '_complex_dynamut_norm.csv'
+infilename_dynamut2  = gene.lower() + '_complex_dynamut2_norm.csv'
+infilename_mcsm_na   = gene.lower() + '_complex_mcsm_na_norm.csv'
+infilename_mcsm_f_snps = gene.lower() + '_mcsm_formatted_snps.csv'
 
-infile_mcsm = outdir + in_filename_mcsm
-infile_foldx = outdir + in_filename_foldx
+infile_mcsm    = outdir + in_filename_mcsm
+infile_foldx   = outdir + in_filename_foldx
 infile_deepddg = outdir + in_filename_deepddg
+infile_dssp    = outdir + in_filename_dssp
+infile_kd      = outdir + in_filename_kd
+infile_rd      = outdir + in_filename_rd
+#infile_snpinfo = outdir + in_filename_snpinfo 
+infile_afor    = outdir + in_filename_afor
+#infile_afor_kin = outdir + in_filename_afor_kin
+infile_dynamut = outdir + 'dynamut_results/' + infilename_dynamut
+infile_dynamut2 = outdir + 'dynamut_results/dynamut2/' + infilename_dynamut2
+infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
+infile_mcsm_f_snps = outdir + infilename_mcsm_f_snps
 
-infile_dssp = outdir + in_filename_dssp
-infile_kd = outdir + in_filename_kd
-infile_rd = outdir + in_filename_rd
-
-#infile_snpinfo = outdir + '/' + in_filename_snpinfo 
-infile_afor = outdir + '/' + in_filename_afor
-#infile_afor_kin = outdir + '/' + in_filename_afor_kin
-
-print('\nInput path:', indir
-      , '\nOutput path:', outdir, '\n'
-      , '\nInput filename mcsm:', infile_mcsm
-      , '\nInput filename foldx:', infile_foldx, '\n'
-      , '\nInput filename deepddg', infile_deepddg , '\n'
-      , '\nInput filename dssp:', infile_dssp
-      , '\nInput filename kd:', infile_kd 
-      , '\nInput filename rd', infile_rd
-     
-      #, '\nInput filename snp info:', infile_snpinfo, '\n'
-      , '\nInput filename af or:', infile_afor
-      #, '\nInput filename afor kinship:', infile_afor_kin
-      , '\n============================================================')
+# read csv
+mcsm_df      = pd.read_csv(infile_mcsm, sep = ',')
+foldx_df     = pd.read_csv(infile_foldx , sep = ',')
+deepddg_df   = pd.read_csv(infile_deepddg, sep = ',')
+dssp_df      = pd.read_csv(infile_dssp, sep = ',')
+kd_df        = pd.read_csv(infile_kd, sep = ',')
+rd_df        = pd.read_csv(infile_rd, sep = ',')
+afor_df      = pd.read_csv(infile_afor, sep = ',') 
+dynamut_df   = pd.read_csv(infile_dynamut, sep = ',')
+dynamut2_df  = pd.read_csv(infile_dynamut2, sep = ',')
+mcsm_na_df   = pd.read_csv(infile_mcsm_na, sep = ',')
+mcsm_f_snps  = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None)
 
 #=======
 # output 
 #=======
 out_filename_comb = gene.lower() + '_all_params.csv'
-outfile_comb =  outdir + '/' + out_filename_comb
+outfile_comb =  outdir + out_filename_comb
 print('Output filename:', outfile_comb
       , '\n===================================================================')
 
@@ -174,12 +165,101 @@ r_join = 'right'
 i_join = 'inner'
 
 # end of variable assignment for input and output files
-#%%============================================================================  
+#%%############################################################################  
+#=====================
+# some preprocessing
+#=====================
+#-------------
+# FoldX
+#-------------
+foldx_df.shape
+#=======================
+# scale foldx values
+#=======================
+  
+# Rescale values in Foldx_change col b/w -1 and 1 so negative numbers
+# stay neg and pos numbers stay positive
+foldx_min = foldx_df['ddg'].min() 
+foldx_max = foldx_df['ddg'].max() 
+foldx_min
+foldx_max
+
+foldx_scale = lambda x : x/abs(foldx_min) if x < 0 else (x/foldx_max if x >= 0 else 'failed')
+
+foldx_df['foldx_scaled'] = foldx_df['ddg'].apply(foldx_scale)
+print('Raw foldx scores:\n', foldx_df['ddg']
+    , '\n---------------------------------------------------------------'
+    , '\nScaled foldx scores:\n', foldx_df['foldx_scaled'])
+
+# additional check added
+fsmi = foldx_df['foldx_scaled'].min()
+fsma = foldx_df['foldx_scaled'].max()
+
+c = foldx_df[foldx_df['ddg']>=0].count()
+foldx_pos = c.get(key = 'ddg')
+
+c2 = foldx_df[foldx_df['foldx_scaled']>=0].count()
+foldx_pos2 = c2.get(key = 'foldx_scaled')
+
+if foldx_pos == foldx_pos2 and fsmi == -1 and fsma == 1:
+    print('\nPASS: Foldx values scaled correctly b/w -1 and 1')
+else:
+    print('\nFAIL: Foldx values scaled numbers MISmatch'
+          , '\nExpected number:', foldx_pos
+          , '\nGot:', foldx_pos2
+          , '\n======================================================')
+
+# rename ddg column to ddg_foldx
+foldx_df['ddg']
+foldx_df = foldx_df.rename(columns = {'ddg':'ddg_foldx'})   
+foldx_df['ddg_foldx']
+
+#-------------
+# Deepddg
+#-------------
+deepddg_df.shape
+
+#=======================
+# scale Deepddg values
+#=======================
+
+# Rescale values in deepddg_change col b/w -1 and 1 so negative numbers
+# stay neg and pos numbers stay positive
+deepddg_min = deepddg_df['deepddg'].min() 
+deepddg_max = deepddg_df['deepddg'].max() 
+
+deepddg_scale = lambda x : x/abs(deepddg_min) if x < 0 else (x/deepddg_max if x >= 0 else 'failed')
+
+deepddg_df['deepddg_scaled'] = deepddg_df['deepddg'].apply(deepddg_scale)
+print('Raw deepddg scores:\n', deepddg_df['deepddg']
+    , '\n---------------------------------------------------------------'
+    , '\nScaled deepddg scores:\n', deepddg_df['deepddg_scaled'])
+    
+# additional check added
+dsmi = deepddg_df['deepddg_scaled'].min()
+dsma = deepddg_df['deepddg_scaled'].max()
+
+c = deepddg_df[deepddg_df['deepddg']>=0].count()
+deepddg_pos = c.get(key = 'deepddg')
+
+c2 = deepddg_df[deepddg_df['deepddg_scaled']>=0].count()
+deepddg_pos2 = c2.get(key = 'deepddg_scaled')
+    
+if deepddg_pos == deepddg_pos2 and dsmi == -1 and dsma == 1:
+    print('\nPASS: deepddg values scaled correctly b/w -1 and 1')
+else:
+    print('\nFAIL: deepddg values scaled numbers MISmatch'
+          , '\nExpected number:', deepddg_pos
+          , '\nGot:', deepddg_pos2
+          , '\n======================================================')
+#%%=============================================================================
+# Now merges begin
+#%%=============================================================================
 print('==================================='
       , '\nFirst merge: mcsm + foldx'
       , '\n===================================')
 
-mcsm_df =  pd.read_csv(infile_mcsm, sep = ',')
+mcsm_df.shape
 
 # add 3 lowercase aa code for wt and mutant
 get_aa_3lower(df = mcsm_df
@@ -189,7 +269,7 @@ get_aa_3lower(df = mcsm_df
               , col_mut = 'mut_aa_3lower')
 
 #mcsm_df.columns = mcsm_df.columns.str.lower()
-foldx_df =  pd.read_csv(infile_foldx , sep = ',')
+# foldx_df.shape
 
 #mcsm_foldx_dfs = combine_dfs_with_checks(mcsm_df, foldx_df, my_join = o_join)
 merging_cols_m1  = detect_common_cols(mcsm_df, foldx_df)
@@ -205,8 +285,8 @@ print('==================================='
       , '\nSecond merge: mcsm_foldx_dfs + deepddg'
       , '\n===================================')
 
-deepddg_df =  pd.read_csv(infile_deepddg, sep = ',')
-deepddg_df.columns
+#deepddg_df =  pd.read_csv(infile_deepddg, sep = ',')
+#deepddg_df.columns
 
 # merge with mcsm_foldx_dfs and deepddg_df
 mcsm_foldx_deepddg_dfs = pd.merge(mcsm_foldx_dfs, deepddg_df, on = 'mutationinformation',  how = l_join)
@@ -218,9 +298,9 @@ print('==================================='
       , '\Third merge: dssp + kd'
       , '\n===================================')
 
-dssp_df = pd.read_csv(infile_dssp, sep = ',')
-kd_df = pd.read_csv(infile_kd, sep = ',')
-rd_df = pd.read_csv(infile_rd, sep = ',')
+dssp_df.shape
+kd_df.shape
+rd_df.shape
 
 #dssp_kd_dfs = combine_dfs_with_checks(dssp_df, kd_df, my_join = o_join)
 merging_cols_m2 = detect_common_cols(dssp_df, kd_df)
@@ -308,8 +388,8 @@ print('\n======================================='
       , '\ncombined_df_clean + afor_df '
       , '\n=======================================')
 
-afor_df = pd.read_csv(infile_afor, sep = ',') 
 afor_cols = afor_df.columns
+afor_df.shape
 
 # create a mapping from the gwas mutation column i.e <gene_match>_abcXXXrst
 #----------------------
@@ -360,16 +440,60 @@ else:
     sys.exit('\nFAIL: merge unsuccessful for af and or')
 
 #%%============================================================================
-# Output columns
+# Output columns: when dynamut, dynamut2 and others weren't being combined
 out_filename_comb_afor = gene.lower() + '_comb_afor.csv'
 outfile_comb_afor =  outdir + '/' + out_filename_comb_afor
 print('Output filename:', outfile_comb_afor
       , '\n===================================================================')
 
-# write csv
+# # write csv
 print('Writing file: combined stability and afor')
 combined_stab_afor.to_csv(outfile_comb_afor, index = False)
 print('\nFinished writing file:'
       , '\nNo. of rows:', combined_stab_afor.shape[0]
       , '\nNo. of cols:', combined_stab_afor.shape[1])
-#%% end of script
+#%%============================================================================
+# combine dynamut, dynamut2, and mcsm_na
+dfs_list = [dynamut_df, dynamut2_df, mcsm_na_df]
+
+dfs_merged = reduce(lambda  left,right: pd.merge(left
+                                                , right
+                                                , on = ['mutationinformation']
+                                                , how = 'inner')
+                   , dfs_list)
+# drop excess columns
+drop_cols = detect_common_cols(dfs_merged, combined_stab_afor)
+drop_cols.remove('mutationinformation')
+
+dfs_merged_clean = dfs_merged.drop(drop_cols, axis = 1)
+merging_cols_m6 = detect_common_cols(dfs_merged_clean, combined_stab_afor)
+
+len(dfs_merged_clean.columns)
+len(combined_stab_afor.columns)
+
+combined_all_params = pd.merge(combined_stab_afor
+                               , dfs_merged_clean
+                               , on = merging_cols_m6
+                               , how  = i_join)
+
+expected_ncols = len(dfs_merged_clean.columns) + len(combined_stab_afor.columns) - len(merging_cols_m6)
+expected_nrows = len(combined_stab_afor)
+
+if len(combined_all_params.columns) == expected_ncols and len(combined_all_params) == expected_nrows:
+    print('\nPASS: All dfs combined')
+else: # report the dimensions of the merged df, i.e. the object tested above
+    print('\nFAIL:lengths mismatch'
+          , '\nExpected ncols:', expected_ncols
+          , '\nGot:', len(combined_all_params.columns) 
+          , '\nExpected nrows:', expected_nrows
+          , '\nGot:', len(combined_all_params) )
+    
+#%% Done for gid on 10/09/2021
+# write csv
+print('Writing file: all params')
+combined_all_params.to_csv(outfile_comb, index = False)
+
+print('\nFinished writing file:'
+      , '\nNo. of rows:', combined_all_params.shape[0]
+      , '\nNo. of cols:', combined_all_params.shape[1])
+#%% end of script
\ No newline at end of file
diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R
index 89b477c..f1a7620 100755
--- a/scripts/plotting/get_plotting_dfs.R
+++ b/scripts/plotting/get_plotting_dfs.R
@@ -8,11 +8,11 @@ setwd("~/git/LSHTM_analysis/scripts/plotting")
 getwd()
 
 source("Header_TT.R")
-source("../functions/my_pairs_panel.R") # with lower panel turned off
-source("../functions/plotting_globals.R")
-source("../functions/plotting_data.R")
-source("../functions/combining_dfs_plotting.R")
-source("../functions/bp_subcolours.R")
+# source("../functions/my_pairs_panel.R") # with lower panel turned off
+# source("../functions/plotting_globals.R")
+# source("../functions/plotting_data.R")
+# source("../functions/combining_dfs_plotting.R")
+# source("../functions/bp_subcolours.R")
 
 #********************
 # cmd args passed 
@@ -41,8 +41,8 @@ import_dirs(drug, gene)
 #---------------------------
 if (!exists("infile_params") && exists("gene")){
 #if (!is.character(infile_params) && exists("gene")){ # when running as cmd
-  #in_filename_params = paste0(tolower(gene), "_all_params.csv") #for pncA
-  in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid
+  in_filename_params = paste0(tolower(gene), "_all_params.csv") #for pncA (and for gid finally) 10/09/21
+  #in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid
   infile_params = paste0(outdir, "/", in_filename_params)
   cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n")
 }
@@ -91,369 +91,139 @@ merged_df3      = all_plot_dfs[[2]]
 merged_df2_comp = all_plot_dfs[[3]]
 merged_df3_comp = all_plot_dfs[[4]]
 #======================================================================
-# read other files
-infilename_dynamut = paste0("~/git/Data/", drug, "/output/dynamut_results/", gene
-                            , "_complex_dynamut_norm.csv")
+#TODO: Think! MOVE TO COMBINE or singular file for deepddg
 
-infilename_dynamut2  = paste0("~/git/Data/", drug, "/output/dynamut_results/dynamut2/", gene
-                              , "_complex_dynamut2_norm.csv")
+#============================
+# adding deepddg scaled values
+# scale data b/w -1 and 1
+#============================
+n = which(colnames(merged_df3) == "deepddg"); n 
 
-infilename_mcsm_na = paste0("~/git/Data/", drug, "/output/mcsm_na_results/", gene
-                            , "_complex_mcsm_na_norm.csv")
-                            
-infilename_mcsm_f_snps <- paste0("~/git/Data/", drug, "/output/", gene
-                      , "_mcsm_formatted_snps.csv")
-                      
-dynamut_df   = read.csv(infilename_dynamut)
-dynamut2_df  = read.csv(infilename_dynamut2)
-mcsm_na_df   = read.csv(infilename_mcsm_na)
-mcsm_f_snps  = read.csv(infilename_mcsm_f_snps, header = F)
-names(mcsm_f_snps) = "mutationinformation"
+my_min = min(merged_df3[,n]); my_min 
+my_max = max(merged_df3[,n]); my_max 
 
-####################################################################
-#                        Data for subcols barplot (~heatmpa)
-####################################################################
-# can include: mutation, or_kin, pwald, af_kin
-cols_to_select = c("mutationinformation", "drtype"
-                   , "wild_type"
-                   , "position"
-                   , "mutant_type"
-                   , "chain", "ligand_id", "ligand_distance"
-                   , "duet_stability_change", "duet_outcome", "duet_scaled"
-                   , "ligand_affinity_change", "ligand_outcome", "affinity_scaled"
-                   , "ddg_foldx", "foldx_scaled", "foldx_outcome"
-                   , "deepddg", "deepddg_outcome" # comment out as not available for pnca
-                   , "asa", "rsa", "rd_values", "kd_values"
-                   , "af", "or_mychisq", "pval_fisher" 
-                   , "or_fisher", "or_logistic", "pval_logistic"
-                   , "wt_prop_water", "mut_prop_water", "wt_prop_polarity", "mut_prop_polarity"
-                   , "wt_calcprop", "mut_calcprop")
+merged_df3$deepddg_scaled = ifelse(merged_df3[,n] < 0
+                                   , merged_df3[,n]/abs(my_min)
+                                   , merged_df3[,n]/my_max) 
+# sanity check
+my_min = min(merged_df3$deepddg_scaled); my_min 
+my_max = max(merged_df3$deepddg_scaled); my_max
 
-#=======================
-# Data for sub colours
-# barplot: PS
-#=======================
-
-cat("\nNo. of cols to select:", length(cols_to_select))
-
-subcols_df_ps = merged_df3[, cols_to_select]
-
-cat("\nNo of unique positions for ps:"
-    , length(unique(subcols_df_ps$position)))
-
-# add count_pos col that counts the no. of nsSNPS at a position
-setDT(subcols_df_ps)[, pos_count := .N, by = .(position)]
-
-# should be a factor
-if (is.factor(subcols_df_ps$duet_outcome)){
-  cat("\nDuet_outcome is factor")
-  table(subcols_df_ps$duet_outcome)
+if (my_min == -1 && my_max == 1){
+   cat("\nPASS: DeepDDG successfully scaled b/w -1 and 1"
+       #, "\nProceeding with assigning deep outcome category")
+       , "\n")
 }else{
-  cat("\nConverting duet_outcome to factor")
-  subcols_df_ps$duet_outcome = as.factor(subcols_df_ps$duet_outcome)
-  table(subcols_df_ps$duet_outcome)
+   cat("\nFAIL: could not scale DeepDDG ddg values"
+       , "Aborting!")
 }
 
-# should be -1 and 1
-min(subcols_df_ps$duet_scaled)
-max(subcols_df_ps$duet_scaled)
 
-tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, min)
-tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, max)
+####################################################################
+#                        Data for combining other dfs
+####################################################################
 
-# check unique values in normalised data
-cat("\nNo. of unique values in duet scaled, no rounding:"
-    , length(unique(subcols_df_ps$duet_scaled)))
+source("other_dfs_data.R")
 
-# No rounding    
-my_grp = subcols_df_ps$duet_scaled; length(my_grp)
+####################################################################
+#                        Data for subcols barplot (~heatmap)
+####################################################################
 
-# Add rounding is to be used
-n = 3 
-subcols_df_ps$duet_scaledR = round(subcols_df_ps$duet_scaled, n)
-
-cat("\nNo. of unique values in duet scaled", n, "places rounding:"
-    , length(unique(subcols_df_ps$duet_scaledR)))
-
-my_grp_r = subcols_df_ps$duet_scaledR  # rounding
-
-# Add grp cols
-subcols_df_ps$group <- paste0(subcols_df_ps$duet_outcome, "_", my_grp, sep = "")
-subcols_df_ps$groupR <- paste0(subcols_df_ps$duet_outcome, "_", my_grp_r, sep = "")
-
-# Call the function to create the palette based on the group defined above
-subcols_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp")
-subcolsR_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp_r")
-
-print(paste0("Colour palette generated for my_grp: ", length(subcols_ps), " colours"))
-print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_ps), " colours"))
+source("coloured_bp_data.R")
 
 ####################################################################
 #                        Data for logoplots
 ####################################################################
-#-------------------------
-# choose df for logoplot
-#-------------------------
-logo_data = merged_df3
-#logo_data = merged_df3_comp
 
-# quick checks
-colnames(logo_data)
-str(logo_data)
+source("logo_data.R")
 
-c1 = unique(logo_data$position) 
-nrow(logo_data)
-cat("No. of rows in my_data:", nrow(logo_data)
-    , "\nDistinct positions corresponding to snps:", length(c1)
-    , "\n===========================================================")
-#=======================================================================
-#==================
-# logo data: OR
-#==================
-foo = logo_data[, c("position"
-                      , "mutant_type","duet_scaled", "or_mychisq"
-                      , "mut_prop_polarity", "mut_prop_water")] 
+s1 = c("\nSuccessfully sourced logo_data.R")
+cat(s1)
 
-logo_data$log10or = log10(logo_data$or_mychisq)
-logo_data_plot = logo_data[, c("position"
-                            , "mutant_type", "or_mychisq", "log10or")]
-
-logo_data_plot_or = logo_data[, c("position", "mutant_type", "or_mychisq")]
-wide_df_or <- logo_data_plot_or %>% spread(position, or_mychisq, fill = 0.0)
-
-wide_df_or = as.matrix(wide_df_or)
-rownames(wide_df_or) = wide_df_or[,1]
-dim(wide_df_or)
-wide_df_or = wide_df_or[,-1]
-str(wide_df_or)
-
-position_or = as.numeric(colnames(wide_df_or))
-
-#==================
-# logo data: logOR
-#==================
-logo_data_plot_logor = logo_data[, c("position", "mutant_type", "log10or")]
-wide_df_logor <- logo_data_plot_logor %>% spread(position, log10or, fill = 0.0)
-
-wide_df_logor = as.matrix(wide_df_logor)
-
-rownames(wide_df_logor) = wide_df_logor[,1]
-wide_df_logor = subset(wide_df_logor, select = -c(1) )
-colnames(wide_df_logor)
-wide_df_logor_m = data.matrix(wide_df_logor)
-
-rownames(wide_df_logor_m)
-colnames(wide_df_logor_m)
-
-position_logor = as.numeric(colnames(wide_df_logor_m))
-
-#===============================
-# logo data: multiple nsSNPs (>1)
-#=================================
-#require(data.table)
-
-# get freq count of positions so you can subset freq<1
-setDT(logo_data)[, mut_pos_occurrence := .N, by = .(position)] 
-
-table(logo_data$position)
-table(logo_data$mut_pos_occurrence)
-
-max_mut = max(table(logo_data$position))
-
-# extract freq_pos > 1
-my_data_snp = logo_data[logo_data$mut_pos_occurrence!=1,] 
-u = unique(my_data_snp$position)
-max_mult_mut = max(table(my_data_snp$position))
-
-if (nrow(my_data_snp) == nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]] ){
-  
-  cat("PASS: positions with  multiple muts extracted"
-      , "\nNo. of mutations:", nrow(my_data_snp)
-      , "\nNo. of positions:", length(u)
-      , "\nMax no. of muts at any position", max_mult_mut)
-}else{
-  cat("FAIL: positions with multiple muts could NOT be extracted"
-      , "\nExpected:",nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]]
-      , "\nGot:", nrow(my_data_snp) )
-}
-
-cat("\nNo. of sites with only 1 mutations:", table(logo_data$mut_pos_occurrence)[[1]])
-
-#--------------------------------------
-# matrix for_mychisq mutant type
-# frequency of mutant type by position
-#---------------------------------------
-table(my_data_snp$mutant_type, my_data_snp$position)
-tab_mt = table(my_data_snp$mutant_type, my_data_snp$position)
-class(tab_mt)
-
-# unclass to convert to matrix
-tab_mt = unclass(tab_mt)
-tab_mt = as.matrix(tab_mt, rownames = T)
-
-# should be TRUE
-is.matrix(tab_mt)
-
-rownames(tab_mt) #aa
-colnames(tab_mt) #pos
-
-#-------------------------------------
-# matrix for wild type
-# frequency of wild type by position
-#-------------------------------------
-tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt
-tab_wt = unclass(tab_wt)
-
-# remove wt duplicates
-wt = my_data_snp[, c("position", "wild_type")]
-wt = wt[!duplicated(wt),]
-
-tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1
-
-rownames(tab_wt)
-rownames(tab_wt)
-
-identical(colnames(tab_mt), colnames(tab_wt))
-identical(ncol(tab_mt), ncol(tab_wt))
-
-#----------------------------------
-# logo data OR: multiple nsSNPs (>1)
-#----------------------------------
-logo_data_or_mult = my_data_snp[, c("position", "mutant_type", "or_mychisq")]
-#wide_df_or <- logo_data_or %>% spread(position, or_mychisq, fill = 0.0)
-wide_df_or_mult <- logo_data_or_mult %>% spread(position, or_mychisq, fill = NA)
-
-wide_df_or_mult = as.matrix(wide_df_or_mult)
-rownames(wide_df_or_mult) = wide_df_or_mult[,1]
-wide_df_or_mult = wide_df_or_mult[,-1]
-str(wide_df_or_mult)
-
-position_or_mult = as.numeric(colnames(wide_df_or_mult))
-
-####################################################################
-#                        Data for Corrplots
-####################################################################
-cat("\n=========================================="
-    , "\nCORR PLOTS data: PS"
-    , "\n===========================================")
-
-df_ps = merged_df2
-
-#--------------------
-# adding log cols : NEW UNCOMMENT
-#--------------------
-#df_ps$log10_or_mychisq = log10(df_ps$or_mychisq)
-#df_ps$neglog_pval_fisher = -log10(df_ps$pval_fisher)
-
-##df_ps$log10_or_kin = log10(df_ps$or_kin)
-##df_ps$neglog_pwald_kin = -log10(df_ps$pwald_kin)
-
-#df_ps$mutation_info_labels = ifelse(df_ps$mutation_info == dr_muts_col, 1, 0)
-
-#----------------------------
-# columns for corr plots:PS
-#----------------------------
-# subset data to generate pairwise correlations
-cols_to_select =  c("mutationinformation"
-                    , "duet_scaled"
-                    , "foldx_scaled"
-                    #, "mutation_info_labels"
-                    , "asa"
-                    , "rsa"
-                    , "rd_values"
-                    , "kd_values"
-                    , "log10_or_mychisq"
-                    , "neglog_pval_fisher"
-                    ##, "or_kin"
-                    ##, "neglog_pwald_kin"
-                    , "af"
-                    ##, "af_kin"
-                    , "duet_outcome"
-                    , drug)
-
-corr_data_ps = df_ps[cols_to_select]
-
-dim(corr_data_ps)
-
-#--------------------------------------
-# assign nice colnames (for display)
-#--------------------------------------
-my_corr_colnames = c("Mutation"
-                     , "DUET"
-                     , "FoldX"
-                     #, "Mutation class"
-                     , "ASA"
-                     , "RSA"
-                     , "RD"
-                     , "KD"
-                     , "Log (OR)"
-                     , "-Log (P)"
-                     ##, "Adjusted (OR)"
-                     ##, "-Log (P wald)"
-                     , "MAF"
-                     ##, "AF_kin"
-                     , "duet_outcome"
-                     , drug)
-
-length(my_corr_colnames)
-
-colnames(corr_data_ps)
-colnames(corr_data_ps) <- my_corr_colnames
-colnames(corr_data_ps)
-
-start = 1
-end = which(colnames(corr_data_ps) == drug); end # should be the last column
-offset = 1
-
-#===========================
-# Corr data for plots: PS
-# big_df ps: ~ merged_df2
-#===========================
-
-#corr_ps_df2 = corr_data_ps[start:(end-offset)] # without drug
-corr_ps_df2 = corr_data_ps[start:end]
-head(corr_ps_df2)
-
-#===========================
-# Corr data for plots: PS
-# short_df ps: ~merged_df3
-#===========================
-corr_ps_df3 = corr_ps_df2[!duplicated(corr_ps_df2$Mutation),]
-
-na_or = sum(is.na(corr_ps_df3$`Log (OR)`))
-check1 = nrow(corr_ps_df3) - na_or
-
-##na_adj_or = sum(is.na(corr_ps_df3$`adjusted (OR)`))
-##check2 = nrow(corr_ps_df3) - na_adj_or 
-
-if (nrow(corr_ps_df3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) {
-  cat( "\nPASS: No. of rows for corr_ps_df3 match"
-       , "\nPASS: No. of OR values checked: " , check1)
-} else {
-  cat("\nFAIL: Numbers  mismatch:"
-      , "\nExpected nrows: ", nrow(merged_df3)
-      , "\nGot: ", nrow(corr_ps_df3)
-      , "\nExpected OR values: ", nrow(merged_df3_comp)
-      , "\nGot: ", check1)
-}
-
-rm(foo)
 ####################################################################
 #                        Data for DM OM Plots: Long format dfs
 ####################################################################
 
-source("other_plots_data.R")
+#source("other_plots_data.R")
+
+source("dm_om_data.R")
+
+s2 = c("\nSuccessfully sourced dm_om_data.R") # status message names the file actually sourced above
+cat(s2)
 
 ####################################################################
 #                  Data for Lineage barplots: WF and LF dfs
 ####################################################################
 
-source("lineage_bp_data.R")
+source("lineage_data.R")
+
+s3 = c("\nSuccessfully sourced lineage_data.R")
+cat(s3)
+
+####################################################################
+#                  Data for corr plots:
+####################################################################
+# make sure the above script works because merged_df2_combined is needed
+source("corr_data.R")
+
+s4 = c("\nSuccessfully sourced corr_data.R")
+cat(s4)
 
 ########################################################################
 #                           End of script
 ########################################################################
+if ( all( c(length(s1), length(s2), length(s3), length(s4)) > 0 ) ){ # each status message must be non-empty
+ cat(
+  "\n##################################################"
+ , "\nSuccessful: get_plotting_dfs.R worked!"
+ , "\n###################################################\n") 
+} else {
+ cat(
+  "\n#################################################"
+ , "\nFAIL: get_plotting_dfs.R didn't complete fully!Please check"
+ , "\n###################################################\n" )
+ }   
+ 
+########################################################################
+# clear excess variables
+rm(c1, c2, c3, c4, check1
+   , curr_count, curr_total
+   , cols_check
+   , cols_to_select
+   , cols_to_select_deepddg
+   , cols_to_select_duet
+   , cols_to_select_dynamut
+   , cols_to_select_dynamut2
+   , cols_to_select_encomddg
+   , cols_to_select_encomdds
+   , cols_to_select_mcsm
+   , cols_to_select_mcsm_na
+   , cols_to_select_sdm
+   , infile_metadata
+   , infile_params
+   #, infilename_dynamut
+   #, infilename_dynamut2
+   #, infilename_mcsm_f_snps
+   #, infilename_mcsm_na
+   )
 
-cat("\n######################################################\n"
-      , "\nSuccessful: get_plotting_dfs.R worked!"
-      , "\n###################################################\n")
+rm(pivot_cols
+, pivot_cols_deepddg
+, pivot_cols_duet
+, pivot_cols_dynamut
+, pivot_cols_dynamut2
+, pivot_cols_encomddg
+, pivot_cols_encomdds
+, pivot_cols_foldx
+, pivot_cols_mcsm
+, pivot_cols_mcsm_na
+, pivot_cols_n
+, pivot_cols_sdm)
+
+rm(expected_cols
+, expected_ncols
+, expected_rows
+, expected_rows_lf
+, fact_cols)
+
+   
diff --git a/scripts/plotting/lineage_data.R b/scripts/plotting/lineage_data.R
index 29a6348..9549863 100755
--- a/scripts/plotting/lineage_data.R
+++ b/scripts/plotting/lineage_data.R
@@ -4,21 +4,10 @@
 # WF and LF data with lineage sample, and snp counts
 # sourced by get_plotting_dfs.R
 #########################################################
-# working dir and loading libraries
-# getwd()
-# setwd("~/git/LSHTM_analysis/scripts/plotting")
-# getwd()
 
-# make cmd
-# globals
-# drug = "streptomycin"
-# gene = "gid"
-
-# source("get_plotting_dfs.R")
-#=======================================================================
-#################################################
+#=================================================
 # Get data with lineage count, and snp diversity
-#################################################
+#=================================================
 table(merged_df2$lineage)
 
 if (table(merged_df2$lineage == "")[[2]]) {
@@ -30,12 +19,12 @@ cat("\nMissing samples with lineage classification:", table(merged_df2$lineage =
 table(merged_df2$lineage_labels)
 class(merged_df2$lineage_labels); nlevels(merged_df2$lineage_labels)
 
-##################################
+#==========================================
 # WF data: lineages with 
 # snp count
 # total_samples
 # snp diversity (perc)
-##################################
+#==========================================
 sel_lineages = levels(merged_df2$lineage_labels)
 
 lin_wf = data.frame(sel_lineages) #4, 1
@@ -67,9 +56,9 @@ lin_wf
 lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples
 lin_wf
 
-#=====================
+#----------------------
 # Add some formatting
-#=====================
+#----------------------
 # SNP diversity 
 lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0)
 lin_wf$snp_diversity_f = paste0(lin_wf$snp_diversity_f, "%")
@@ -100,12 +89,12 @@ lin_wf$sel_lineages =  factor(lin_wf$sel_lineages, c("L1"
 
 levels(lin_wf$sel_lineages)
 
-##################################
+#=================================
 # LF data: lineages with 
 # snp count
 # total_samples
 # snp diversity (perc)
-##################################
+#=================================
 names(lin_wf)
 tot_cols = ncol(lin_wf)
 pivot_cols = c("sel_lineages", "snp_diversity", "snp_diversity_f")
@@ -153,3 +142,6 @@ lin_lf$sel_lineages =  factor(lin_lf$sel_lineages, c("L1"
                                                      , ""))
 
 levels(lin_lf$sel_lineages)
+
+################################################################
+
diff --git a/scripts/plotting/lineage_dist_plots.R b/scripts/plotting/lineage_dist_plots.R
index a425f37..cd1563d 100644
--- a/scripts/plotting/lineage_dist_plots.R
+++ b/scripts/plotting/lineage_dist_plots.R
@@ -16,9 +16,9 @@ source("Header_TT.R") # also loads all my functions
 #===========
 # input
 #===========
-#drug = "streptomycin"
-#gene = "gid"
-source("get_plotting_dfs.R")
+drug = "streptomycin"
+gene = "gid"
+#source("get_plotting_dfs.R")
 
 spec = matrix(c(
   "drug"       , "d",  1, "character",
@@ -47,7 +47,7 @@ plot_lineage_dist_dm_om_ps =  paste0(plotdir,"/", lineage_dist_dm_om_ps)
 
 ###########################
 # Data for plots
-# you need merged_df2 or merged_df2_comp
+# you need merged_df2_combined or merged_df2_combined_comp
 # since this is one-many relationship 
 # i.e the same SNP can belong to multiple lineages
 # using the _comp dataset means
@@ -59,10 +59,12 @@ plot_lineage_dist_dm_om_ps =  paste0(plotdir,"/", lineage_dist_dm_om_ps)
 # Data for plots
 #===================
 # quick checks
-table(merged_df2$mutation_info_labels); levels(merged_df2$lineage_labels)
-table(merged_df2$lineage_labels); levels(merged_df2$mutation_info_labels)
+table(merged_df2_combined$mutation_info_labels); levels(merged_df2_combined$lineage_labels)
+table(merged_df2_combined$lineage_labels); levels(merged_df2_combined$mutation_info_labels)
 
-lin_dist_plot = merged_df2[merged_df2$lineage_labels%in%c("L1", "L2", "L3", "L4"),]
+sel_lineages = c("L1", "L2", "L3", "L4")
+
+lin_dist_plot = merged_df2_combined[merged_df2_combined$lineage_labels%in%sel_lineages,]
 table(lin_dist_plot$lineage_labels); nlevels(lin_dist_plot$lineage_labels)
 
 # refactor
@@ -79,29 +81,55 @@ table(lin_dist_plot$lineage_labels)#{RESULT: No of samples within lineage}
 length(unique(lin_dist_plot$mutationinformation))#{Result: No. of unique mutations selected lineages contribute to}
 length(lin_dist_plot$mutationinformation)
 
-u2 = unique(merged_df2$mutationinformation)
+u2 = unique(merged_df2_combined$mutationinformation)
 u = unique(lin_dist_plot$mutationinformation)
 check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages}
 #-----------------------------------------------------------------------
-# without facet
+
+my_x_and_t = c("duet_scaled", "mCSM-DUET")
+my_x_and_t = c("foldx_scaled", "FoldX")
+#my_x_and_t = c("deepddg_scaled", "DeepDDG")
+
+my_x_and_t = c("ddg_dynamut2_scaled", "Dynamut2")
+my_x_and_t = c("ddg_dynamut_scaled", "Dynamut")
+
+my_x_and_t = c("ddg_mcsm_scaled", "mCSM")
+my_x_and_t = c("ddg_sdm_scaled", "SDM")
+my_x_and_t = c("ddg_duet_scaled", "DUET-d")
+
+my_x_and_t = c("ddg_encom_scaled", "EnCOM-Stability")
+my_x_and_t = c("dds_encom_scaled", "EnCOM-Flexibility")
+
+my_x_and_t = c("mcsm_na_scaled", "mCSM-NA")
+
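+# TO DO: filter to ligand distance < 10 first. NOTE: being the last assignment, this is the pair the plots below use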
+my_x_and_t = c("affinity_scaled", "mCSM-Lig") #ligdist< 10
+
+#=====================
+# Plot: without facet
+#=====================
+
 linP_dm_om = lineage_distP(lin_dist_plot
-              , with_facet = F
-              , x_axis = "deepddg"
+              , x_axis = my_x_and_t[1]
+              , x_lab = my_x_and_t[2]
               , y_axis = "lineage_labels"
-              , x_lab = "DeepDDG"
               , leg_label = "Mutation Class"
-)
+              , with_facet = F)
 linP_dm_om
 
-# with facet
+#=====================
+# Plot: with facet
+#=====================
+
 linP_dm_om_facet = lineage_distP(lin_dist_plot
-              , with_facet = T
-              , facet_wrap_var = "mutation_info_labels"
-              , leg_label = "Mutation Class"
-              , leg_pos_wf = "none"
-              , leg_dir_wf = "horizontal"
-              
-)
+                                 , x_axis = my_x_and_t[1]
+                                 , x_lab = my_x_and_t[2]
+                                 , y_axis = "lineage_labels"
+                                 , with_facet = T
+                                 , facet_wrap_var = "mutation_info_labels"
+                                 , leg_label = "Mutation Class"
+                                 , leg_pos_wf = "none"
+                                 , leg_dir_wf = "horizontal")
 linP_dm_om_facet
 
 #=================
@@ -109,6 +137,7 @@ linP_dm_om_facet
 # without facet
 #=================
 svg(plot_lineage_dist_dm_om_ps)
+
 linP_dm_om
 
 dev.off()
diff --git a/scripts/plotting/other_plots_data.R b/scripts/plotting/other_plots_data.R
deleted file mode 100755
index a55303b..0000000
--- a/scripts/plotting/other_plots_data.R
+++ /dev/null
@@ -1,538 +0,0 @@
-#!/usr/bin/env Rscript  
-#########################################################
-# TASK: Script to format data for dm om plots: 
-# generating LF data
-# sourced by get_plotting_dfs.R
-#########################################################
-# working dir and loading libraries
-# getwd()
-# setwd("~/git/LSHTM_analysis/scripts/plotting")
-# getwd()
-
-# make cmd
-# globals
-# drug = "streptomycin"
-# gene = "gid"
-
-# source("get_plotting_dfs.R")
-#=======================================================================
-# MOVE TO COMBINE or singular file for deepddg
-# 
-# cols_to_select = c("mutation", "mutationinformation"
-#                    , "wild_type", "position", "mutant_type"
-#                    , "mutation_info")
-# 
-# merged_df3_short = merged_df3[, cols_to_select]
-
-# infilename_mcsm_f_snps <- paste0("~/git/Data/", drug, "/output/", gene
-#                       , "_mcsm_formatted_snps.csv")
-# 
-# mcsm_f_snps<- read.csv(infilename_mcsm_f_snps, header = F)
-# names(mcsm_f_snps) <- "mutationinformation"
-
-# write merged_df3 to generate structural figure on chimera
-#write.csv(merged_df3_short, "merged_df3_short.csv")
-#========================================================================
-# MOVE TO COMBINE or singular file for deepddg
-
-#============================
-# adding deepddg scaled values
-# scale data b/w -1 and 1
-#============================
-n = which(colnames(merged_df3) == "deepddg"); n 
-
-my_min = min(merged_df3[,n]); my_min 
-my_max = max(merged_df3[,n]); my_max 
-
-merged_df3$deepddg_scaled = ifelse(merged_df3[,n] < 0
-                            , merged_df3[,n]/abs(my_min)
-                            , merged_df3[,n]/my_max) 
-# sanity check
-my_min = min(merged_df3$deepddg_scaled); my_min 
-my_max = max(merged_df3$deepddg_scaled); my_max
-
-if (my_min == -1 && my_max == 1){
-  cat("\nPASS: DeepDDG successfully scaled b/w -1 and 1"
-      #, "\nProceeding with assigning deep outcome category")
-      , "\n")
-}else{
-  cat("\nFAIL: could not scale DeepDDG ddg values"
-      , "Aborting!")
-}
-
-#========================================================================
-# cols to select
-
-cols_mcsm_df <- merged_df3[, c("mutationinformation", "mutation"
-                               , "mutation_info", "position"
-                               , LigDist_colname
-                               , "duet_stability_change", "duet_scaled", "duet_outcome"
-                               , "ligand_affinity_change", "affinity_scaled", "ligand_outcome"
-                               , "ddg_foldx", "foldx_scaled", "foldx_outcome"
-                               , "deepddg", "deepddg_scaled", "deepddg_outcome"
-                               , "asa", "rsa"
-                               , "rd_values", "kd_values"
-                               , "log10_or_mychisq", "neglog_pval_fisher", "af")]
-
-cols_mcsm_na_df <- mcsm_na_df[, c("mutationinformation" 
-                                  , "mcsm_na_affinity", "mcsm_na_scaled"
-                                  , "mcsm_na_outcome")]
-# entire dynamut_df
-
-cols_dynamut2_df <- dynamut2_df[, c("mutationinformation"
-                                    , "ddg_dynamut2", "ddg_dynamut2_scaled"
-                                    , "ddg_dynamut2_outcome")]
-
-n_comb_cols = length(cols_mcsm_df) + length(cols_mcsm_na_df) + 
-  length(dynamut_df) + length(cols_dynamut2_df); n_comb_cols
-
-i1<- intersect(names(cols_mcsm_df), names(cols_mcsm_na_df))
-i2<- intersect(names(dynamut_df), names(cols_dynamut2_df))
-merging_cols <- intersect(i1, i2)
-cat("\nmerging_cols:", merging_cols)
-
-if (merging_cols == "mutationinformation") {
-  cat("\nStage 1: Found common col between dfs, checking values in it...")
-  c1 <- all(mcsm_f_snps[[merging_cols]]%in%cols_mcsm_df[[merging_cols]])
-  c2 <- all(mcsm_f_snps[[merging_cols]]%in%cols_mcsm_na_df[[merging_cols]])
-  c3 <- all(mcsm_f_snps[[merging_cols]]%in%dynamut_df[[merging_cols]])
-  c4 <- all(mcsm_f_snps[[merging_cols]]%in%cols_dynamut2_df[[merging_cols]])
-  cols_check <- c(c1, c2, c3, c4)
-  expected_cols = n_comb_cols - ( length(cols_check) - 1)
-  if (all(cols_check)){
-    cat("\nStage 2: Proceeding with merging dfs:\n")
-    comb_df <- Reduce(inner_join, list(cols_mcsm_df
-                                       , cols_mcsm_na_df
-                                       , dynamut_df
-                                       , cols_dynamut2_df))
-    comb_df_s = arrange(comb_df, position)
-    
-    # if ( nrow(comb_df_s) == nrow(mcsm_f_snps) && ncol(comb_df_s) == expected_cols) {
-    #   cat("\Stage3, PASS: dfs merged sucessfully"
-    #       , "\nnrow of merged_df: ", nrow(comb_df_s)
-    #       , "\nncol of merged_df:", ncol(comb_df_s))
-    #   }
-    
-    }
-}
-#names(comb_df_s)
-cat("\n!!!IT GOT TO HERE!!!!")
-#=======================================================================
-fact_cols = colnames(comb_df_s)[grepl( "_outcome|_info", colnames(comb_df_s) )]
-fact_cols
-lapply(comb_df_s[, fact_cols], class)
-comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols], as.factor)
-
-if (any(lapply(comb_df_s[, fact_cols], class) == "character")){
-  cat("\nChanging cols to factor")
-  comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols],as.factor)
-  if (all(lapply(comb_df_s[, fact_cols], class) == "factor")){
-    cat("\nSuccessful: cols changed to factor")
-  }
-}
-lapply(comb_df_s[, fact_cols], class)
-
-#=======================================================================
-table(comb_df_s$mutation_info)
-
- # further checks to make sure dr and other muts are indeed unique
-dr_muts = comb_df_s[comb_df_s$mutation_info == dr_muts_col,]
-dr_muts_names = unique(dr_muts$mutation)
-
-other_muts = comb_df_s[comb_df_s$mutation_info == other_muts_col,]
-other_muts_names = unique(other_muts$mutation)
-
-if ( table(dr_muts_names%in%other_muts_names)[[1]] == length(dr_muts_names) &&
-  table(other_muts_names%in%dr_muts_names)[[1]] == length(other_muts_names) ){
-  cat("PASS: dr and other muts are indeed unique")
-}else{
-  cat("FAIL: dr and others muts are NOT unique!")
-  quit()
-}
-
-# pretty display names i.e. labels to reduce major code duplication later
-foo_cnames = data.frame(colnames(comb_df_s))
-names(foo_cnames) <- "old_name"
-
-stability_suffix <- paste0(delta_symbol, delta_symbol, "G")
-flexibility_suffix <- paste0(delta_symbol, delta_symbol, "S")
-
-lig_dn       = paste0("Ligand distance (", angstroms_symbol, ")"); lig_dn
-duet_dn      = paste0("DUET ", stability_suffix); duet_dn
-foldx_dn     = paste0("FoldX ", stability_suffix); foldx_dn
-deepddg_dn   = paste0("Deepddg " , stability_suffix); deepddg_dn
-mcsm_na_dn   = paste0("mCSM-NA affinity ", stability_suffix); mcsm_na_dn
-dynamut_dn   = paste0("Dynamut ", stability_suffix); dynamut_dn
-dynamut2_dn  = paste0("Dynamut2 " , stability_suffix); dynamut2_dn
-encom_ddg_dn = paste0("EnCOM " , stability_suffix); encom_ddg_dn
-encom_dds_dn = paste0("EnCOM " , flexibility_suffix ); encom_dds_dn
-sdm_dn       = paste0("SDM " , stability_suffix); sdm_dn
-mcsm_dn      = paste0("mCSM " , stability_suffix ); mcsm_dn
-
-# Change colnames of some columns using datatable 
-comb_df_sl = comb_df_s
-names(comb_df_sl)
-
-setnames(comb_df_sl
-         , old = c("asa", "rsa", "rd_values", "kd_values"
-                   , "log10_or_mychisq", "neglog_pval_fisher", "af"
-                   , LigDist_colname
-                   , "duet_scaled"
-                   , "foldx_scaled"
-                   , "deepddg_scaled"
-                   , "mcsm_na_scaled"
-                   , "ddg_dynamut_scaled"
-                   , "ddg_dynamut2_scaled"
-                   , "ddg_encom_scaled"
-                   , "dds_encom_scaled"
-                   , "ddg_sdm"
-                   , "ddg_mcsm")
-                   
-         , new = c("ASA", "RSA", "RD", "KD"
-                   , "Log10 (OR)", "-Log (P)", "MAF"
-                   , lig_dn
-                   , duet_dn
-                   , foldx_dn
-                   , deepddg_dn
-                   , mcsm_na_dn
-                   , dynamut_dn
-                   , dynamut2_dn
-                   , encom_ddg_dn
-                   , encom_dds_dn
-                   , sdm_dn
-                   , mcsm_dn)
-         )
-
-foo_cnames <- cbind(foo_cnames, colnames(comb_df_sl))
-
-# some more pretty labels
-table(comb_df_sl$mutation_info)
-
-levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==dr_muts_col] <- "DM"
-levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==other_muts_col] <- "OM"
-
-table(comb_df_sl$mutation_info)
-
-#######################################################################
-#======================
-# Selecting dfs
-# with appropriate cols
-#=======================
-static_cols_start =  c("mutationinformation"
-                       , "position"
-                       , "mutation"
-                       , "mutation_info")
-
-static_cols_end = c(lig_dn
-                    , "ASA"
-                    , "RSA"
-                    , "RD"
-                    , "KD")
-
-# ordering is important!
-
-#########################################################################
-#==============
-# DUET: LF
-#==============
-cols_to_select_duet = c(static_cols_start,  c("duet_outcome", duet_dn), static_cols_end)
-wf_duet = comb_df_sl[, cols_to_select_duet]
-
-#pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps
-pivot_cols_duet = cols_to_select_duet[1: (length(static_cols_start) + 1)]; pivot_cols_duet
-
-expected_rows_lf = nrow(wf_duet) * (length(wf_duet) - length(pivot_cols_duet))
-expected_rows_lf
-
-# LF data: duet
-lf_duet = gather(wf_duet
-                  , key = param_type
-                  , value = param_value
-                  , all_of(duet_dn):tail(static_cols_end,1)
-                  , factor_key = TRUE)
-
-if (nrow(lf_duet) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", duet_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-############################################################################
-#==============
-# FoldX: LF
-#==============
-cols_to_select_foldx= c(static_cols_start, c("foldx_outcome", foldx_dn), static_cols_end)
-wf_foldx = comb_df_sl[, cols_to_select_foldx]
-
-pivot_cols_foldx = cols_to_select_foldx[1: (length(static_cols_start) + 1)]; pivot_cols_foldx
-
-expected_rows_lf = nrow(wf_foldx) * (length(wf_foldx) - length(pivot_cols_foldx))
-expected_rows_lf
-
-# LF data: duet
-print("TESTXXXXXXXXXXXXXXXXXXXXX---------------------->>>>")
-lf_foldx <<- gather(wf_foldx
-                 , key = param_type
-                 , value = param_value
-                 , all_of(foldx_dn):tail(static_cols_end,1)
-                 , factor_key = TRUE)
-
-if (nrow(lf_foldx) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", foldx_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-############################################################################
-#==============
-# Deepddg: LF
-#==============
-cols_to_select_deepddg  = c(static_cols_start, c("deepddg_outcome", deepddg_dn), static_cols_end)
-wf_deepddg = comb_df_sl[, cols_to_select_deepddg]
-
-pivot_cols_deepddg = cols_to_select_deepddg[1: (length(static_cols_start) + 1)]; pivot_cols_deepddg
-
-expected_rows_lf = nrow(wf_deepddg) * (length(wf_deepddg) - length(pivot_cols_deepddg))
-expected_rows_lf
-
-# LF data: duet
-lf_deepddg = gather(wf_deepddg
-                  , key = param_type
-                  , value = param_value
-                  , all_of(deepddg_dn):tail(static_cols_end,1)
-                  , factor_key = TRUE)
-
-if (nrow(lf_deepddg) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", deepddg_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-############################################################################
-#==============
-# mCSM-NA: LF
-#==============
-cols_to_select_mcsm_na = c(static_cols_start, c("mcsm_na_outcome", mcsm_na_dn), static_cols_end)
-wf_mcsm_na = comb_df_sl[, cols_to_select_mcsm_na]
-
-pivot_cols_mcsm_na = cols_to_select_mcsm_na[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_na
-
-expected_rows_lf = nrow(wf_mcsm_na) * (length(wf_mcsm_na) - length(pivot_cols_mcsm_na))
-expected_rows_lf
-
-# LF data: duet
-lf_mcsm_na = gather(wf_mcsm_na
-                    , key = param_type
-                    , value = param_value
-                    , all_of(mcsm_na_dn):tail(static_cols_end,1)
-                    , factor_key = TRUE)
-
-if (nrow(lf_mcsm_na) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", mcsm_na_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-############################################################################
-#==============
-# Dynamut: LF
-#==============
-cols_to_select_dynamut  = c(static_cols_start, c("ddg_dynamut_outcome", dynamut_dn), static_cols_end)
-wf_dynamut = comb_df_sl[, cols_to_select_dynamut]
-
-pivot_cols_dynamut = cols_to_select_dynamut[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut
-
-expected_rows_lf = nrow(wf_dynamut) * (length(wf_dynamut) - length(pivot_cols_dynamut))
-expected_rows_lf
-
-# LF data: duet
-lf_dynamut = gather(wf_dynamut
-                    , key = param_type
-                    , value = param_value
-                    , all_of(dynamut_dn):tail(static_cols_end,1)
-                    , factor_key = TRUE)
-
-if (nrow(lf_dynamut) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", dynamut_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-############################################################################
-#==============
-# Dynamut2: LF
-#==============
-cols_to_select_dynamut2 = c(static_cols_start, c("ddg_dynamut2_outcome", dynamut2_dn), static_cols_end)
-
-wf_dynamut2 = comb_df_sl[, cols_to_select_dynamut2]
-
-pivot_cols_dynamut2 = cols_to_select_dynamut2[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut2
-
-expected_rows_lf = nrow(wf_dynamut2) * (length(wf_dynamut2) - length(pivot_cols_dynamut2))
-expected_rows_lf
-
-# LF data: duet
-lf_dynamut2 = gather(wf_dynamut2
-                    , key = param_type
-                    , value = param_value
-                    , all_of(dynamut2_dn):tail(static_cols_end,1)
-                    , factor_key = TRUE)
-
-if (nrow(lf_dynamut2) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", dynamut2_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-############################################################################
-#==============
-# EnCOM ddg: LF
-#==============
-cols_to_select_encomddg = c(static_cols_start, c("ddg_encom_outcome", encom_ddg_dn), static_cols_end)
-wf_encomddg = comb_df_sl[, cols_to_select_encomddg]
-
-pivot_cols_encomddg = cols_to_select_encomddg[1: (length(static_cols_start) + 1)]; pivot_cols_encomddg 
-
-expected_rows_lf = nrow(wf_encomddg ) * (length(wf_encomddg ) - length(pivot_cols_encomddg))
-expected_rows_lf
-
-# LF data: encomddg 
-lf_encomddg  = gather(wf_encomddg 
-                     , key = param_type
-                     , value = param_value
-                     , all_of(encom_ddg_dn):tail(static_cols_end,1)
-                     , factor_key = TRUE)
-
-if (nrow(lf_encomddg) == expected_rows_lf){
-  cat("\nPASS: long format data created for ", encom_ddg_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-############################################################################
-#==============
-# EnCOM dds: LF
-#==============
-cols_to_select_encomdds = c(static_cols_start, c("dds_encom_outcome", encom_dds_dn), static_cols_end)
-wf_encomdds = comb_df_sl[, cols_to_select_encomdds]
-
-pivot_cols_encomdds = cols_to_select_encomdds[1: (length(static_cols_start) + 1)]; pivot_cols_encomdds 
-
-expected_rows_lf = nrow(wf_encomdds) * (length(wf_encomdds) - length(pivot_cols_encomdds))
-expected_rows_lf
-
-# LF data: encomddg 
-lf_encomdds  = gather(wf_encomdds
-                      , key = param_type
-                      , value = param_value
-                      , all_of(encom_dds_dn):tail(static_cols_end,1)
-                      , factor_key = TRUE)
-
-if (nrow(lf_encomdds) == expected_rows_lf){
-  cat("\nPASS: long format data created for", encom_dds_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-############################################################################
-#==============
-# SDM: LF
-#==============
-cols_to_select_sdm = c(static_cols_start, c("ddg_sdm_outcome", sdm_dn), static_cols_end)
-wf_sdm = comb_df_sl[, cols_to_select_sdm]
-
-pivot_cols_sdm = cols_to_select_sdm[1: (length(static_cols_start) + 1)]; pivot_cols_sdm
-
-expected_rows_lf = nrow(wf_sdm) * (length(wf_sdm) - length(pivot_cols_sdm))
-expected_rows_lf
-
-# LF data: encomddg 
-lf_sdm  = gather(wf_sdm
-                 , key = param_type
-                 , value = param_value
-                 , all_of(sdm_dn):tail(static_cols_end,1)
-                 , factor_key = TRUE)
-
-if (nrow(lf_sdm) == expected_rows_lf){
-  cat("\nPASS: long format data created for", sdm_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-
-############################################################################
-#==============
-# mCSM: LF
-#==============
-cols_to_select_mcsm  = c(static_cols_start, c("ddg_mcsm_outcome", mcsm_dn), static_cols_end)
-wf_mcsm = comb_df_sl[, cols_to_select_mcsm]
-
-pivot_cols_mcsm = cols_to_select_mcsm[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm
-
-expected_rows_lf = nrow(wf_mcsm) * (length(wf_mcsm) - length(pivot_cols_mcsm))
-expected_rows_lf
-
-# LF data: encomddg 
-lf_mcsm  = gather(wf_mcsm
-                 , key = param_type
-                 , value = param_value
-                 , all_of(mcsm_dn):tail(static_cols_end,1)
-                 , factor_key = TRUE)
-
-if (nrow(lf_mcsm) == expected_rows_lf){
-  cat("\nPASS: long format data created for", mcsm_dn)
-}else{
-  cat("\nFAIL: long format data could not be created for duet")
-  quit()
-}
-############################################################################
-# clear excess variables
-rm(all_plot_dfs
-   , cols_dynamut2_df
-   , cols_mcsm_df
-   , cols_mcsm_na_df
-   , comb_df
-   , corr_data_ps
-   , corr_ps_df3
-   , df_lf_ps
-   , foo
-   , foo_cnames
-   , gene_metadata
-   , logo_data
-   , logo_data_or_mult
-   , logo_data_plot
-   , logo_data_plot_logor
-   , logo_data_plot_or
-   , my_data_snp
-   , my_df
-   , my_df_u
-   , other_muts
-   , pd_df
-   , subcols_df_ps
-   , tab_mt
-   , wide_df_logor
-   , wide_df_logor_m
-   , wide_df_or
-   , wide_df_or_mult
-   , wt)
-
-
-rm(c3, c4, check1
-   , cols_check
-   , cols_to_select
-   , cols_to_select_deepddg
-   , cols_to_select_duet
-   , cols_to_select_dynamut
-   , cols_to_select_dynamut2
-   , cols_to_select_encomddg
-   , cols_to_select_encomdds
-   , cols_to_select_mcsm
-   , cols_to_select_mcsm_na
-   , cols_to_select_sdm)

From 5c8a9e8f0013f0970cbf2d607dec3126ecadbc93 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Fri, 10 Sep 2021 18:16:41 +0100
Subject: [PATCH 21/51] sorted combining_dfs.py with all other data files and
 tidied up get_plotting_dfs.R

---
 scripts/combining_dfs.py            | 71 ++++++++++++++++++++--------
 scripts/functions/plotting_data.R   | 72 +++++++++++++++--------------
 scripts/plotting/get_plotting_dfs.R | 44 +++++++++---------
 3 files changed, 111 insertions(+), 76 deletions(-)

diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py
index 4e2781e..faa9677 100755
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@@ -169,25 +169,31 @@ i_join = 'inner'
 #=====================
 # some preprocessing
 #=====================
-#-------------
+
+#===========
 # FoldX
-#-------------
+#===========
 foldx_df.shape
-#=======================
+
+#----------------------
 # scale foldx values
-#=======================
+#----------------------
+# rename ddg column to ddg_foldx
+foldx_df['ddg']
+foldx_df = foldx_df.rename(columns = {'ddg':'ddg_foldx'})   
+foldx_df['ddg_foldx']
   
 # Rescale values in Foldx_change col b/w -1 and 1 so negative numbers
 # stay neg and pos numbers stay positive
-foldx_min = foldx_df['ddg'].min() 
-foldx_max = foldx_df['ddg'].max() 
+foldx_min = foldx_df['ddg_foldx'].min() 
+foldx_max = foldx_df['ddg_foldx'].max() 
 foldx_min
 foldx_max
 
 foldx_scale = lambda x : x/abs(foldx_min) if x < 0 else (x/foldx_max if x >= 0 else 'failed')
 
-foldx_df['foldx_scaled'] = foldx_df['ddg'].apply(foldx_scale)
-print('Raw foldx scores:\n', foldx_df['ddg']
+foldx_df['foldx_scaled'] = foldx_df['ddg_foldx'].apply(foldx_scale)
+print('Raw foldx scores:\n', foldx_df['ddg_foldx']
     , '\n---------------------------------------------------------------'
     , '\nScaled foldx scores:\n', foldx_df['foldx_scaled'])
 
@@ -195,8 +201,8 @@ print('Raw foldx scores:\n', foldx_df['ddg']
 fsmi = foldx_df['foldx_scaled'].min()
 fsma = foldx_df['foldx_scaled'].max()
 
-c = foldx_df[foldx_df['ddg']>=0].count()
-foldx_pos = c.get(key = 'ddg')
+c = foldx_df[foldx_df['ddg_foldx']>=0].count()
+foldx_pos = c.get(key = 'ddg_foldx')
 
 c2 = foldx_df[foldx_df['foldx_scaled']>=0].count()
 foldx_pos2 = c2.get(key = 'foldx_scaled')
@@ -209,20 +215,30 @@ else:
           , '\nGot:', foldx_pos2
           , '\n======================================================')
 
-# rename ddg column to ddg_foldx
-foldx_df['ddg']
-foldx_df = foldx_df.rename(columns = {'ddg':'ddg_foldx'})   
-foldx_df['ddg_foldx']
+#-------------------------
+# foldx outcome category
+#--------------------------
+foldx_df['foldx_outcome'] = foldx_df['ddg_foldx'].apply(lambda x: 'Destabilising' if x >= 0 else 'Stabilising')
+# FoldX: ddG < 0 is stabilising, so the ddG >= 0 counts (foldx_pos, foldx_pos2) must equal the 'Destabilising' total
+foc = foldx_df['foldx_outcome'].value_counts()
 
-#-------------
+if foc.get('Destabilising', 0) == foldx_pos and foc.get('Destabilising', 0) == foldx_pos2:
+    print('\nPASS: Foldx outcome category created')
+else:
+    print('\nFAIL: Foldx outcome category could NOT be created'
+          , '\nExpected number:', foldx_pos
+          , '\nGot:', foc.get('Destabilising', 0)
+          , '\n======================================================')
+    sys.exit()
+
+#=======================
 # Deepddg
-#-------------
+#=======================
 deepddg_df.shape
 
-#=======================
+#-------------------------
 # scale Deepddg values
-#=======================
-
+#-------------------------
 # Rescale values in deepddg_change col b/w -1 and 1 so negative numbers
 # stay neg and pos numbers stay positive
 deepddg_min = deepddg_df['deepddg'].min() 
@@ -252,6 +268,23 @@ else:
           , '\nExpected number:', deepddg_pos
           , '\nGot:', deepddg_pos2
           , '\n======================================================')
+    
+#--------------------------
+# Deepddg outcome category
+#--------------------------
+deepddg_df['deepddg_outcome'] = deepddg_df['deepddg'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
+# value_counts() is indexed by label: look the category up by name (0 when absent), never by position
+doc = deepddg_df['deepddg_outcome'].value_counts()
+
+if doc.get('Stabilising', 0) == deepddg_pos and doc.get('Stabilising', 0) == deepddg_pos2:
+    print('\nPASS: Deepddg outcome category created')
+else:
+    print('\nFAIL: Deepddg outcome category could NOT be created'
+          , '\nExpected number:', deepddg_pos
+          , '\nGot:', doc.get('Stabilising', 0)
+          , '\n======================================================')
+    sys.exit()
+
 #%%=============================================================================
 # Now merges begin
 #%%=============================================================================
diff --git a/scripts/functions/plotting_data.R b/scripts/functions/plotting_data.R
index ddda207..5744faa 100755
--- a/scripts/functions/plotting_data.R
+++ b/scripts/functions/plotting_data.R
@@ -16,7 +16,9 @@ library(dplyr)
   ## my_df_u_lig
   ## dup_muts
 #========================================================
-plotting_data <- function(df, lig_dist_colname = 'ligand_distance', lig_dist_cutoff = 10) {
+plotting_data <- function(df
+                          , lig_dist_colname = 'ligand_distance'
+                          , lig_dist_cutoff = 10) {
 my_df       = data.frame()
 my_df_u     = data.frame()
 my_df_u_lig = data.frame()
@@ -38,51 +40,51 @@ cat("\nInput dimensions:", dim(df))
 #==================================
 
 #------------------------------
-# adding foldx scaled values
-# scale data b/w -1 and 1
-#------------------------------
-n = which(colnames(df) == "ddg"); n 
-
-my_min = min(df[,n]); my_min 
-my_max = max(df[,n]); my_max 
-
-df$foldx_scaled = ifelse(df[,n] < 0
-                         , df[,n]/abs(my_min)
-                         , df[,n]/my_max) 
-# sanity check
-my_min = min(df$foldx_scaled); my_min 
-my_max = max(df$foldx_scaled); my_max
-
-if (my_min == -1 && my_max == 1){
-  cat("\nPASS: foldx ddg successfully scaled b/w -1 and 1"
-      , "\nProceeding with assigning foldx outcome category")
-}else{
-  cat("\nFAIL: could not scale foldx ddg values"
-      , "Aborting!\n")
-}
+# # adding foldx scaled values
+# # scale data b/w -1 and 1
+# #------------------------------
+# n = which(colnames(df) == "ddg"); n 
+# 
+# my_min = min(df[,n]); my_min 
+# my_max = max(df[,n]); my_max 
+# 
+# df$foldx_scaled = ifelse(df[,n] < 0
+#                          , df[,n]/abs(my_min)
+#                          , df[,n]/my_max) 
+# # sanity check
+# my_min = min(df$foldx_scaled); my_min 
+# my_max = max(df$foldx_scaled); my_max
+# 
+# if (my_min == -1 && my_max == 1){
+#   cat("\nPASS: foldx ddg successfully scaled b/w -1 and 1"
+#       , "\nProceeding with assigning foldx outcome category")
+# }else{
+#   cat("\nFAIL: could not scale foldx ddg values"
+#       , "Aborting!\n")
+# }
 
 #------------------------------
 # adding foldx outcome category
 # ddg<0 = "Stabilising" (-ve)
 #------------------------------
-c1 = table(df$ddg < 0)
-df$foldx_outcome = ifelse(df$ddg < 0, "Stabilising", "Destabilising")
-c2 = table(df$ddg < 0)
-
-if ( all(c1 == c2) ){
-  cat("\nPASS: foldx outcome successfully created")
-}else{
-  cat("\nFAIL: foldx outcome could not be created. Aborting!\n")
-  exit()
-}
+# c1 = table(df$ddg < 0)
+# df$foldx_outcome = ifelse(df$ddg < 0, "Stabilising", "Destabilising")
+# c2 = table(df$ddg < 0)
+# 
+# if ( all(c1 == c2) ){
+#   cat("\nPASS: foldx outcome successfully created")
+# }else{
+#   cat("\nFAIL: foldx outcome could not be created. Aborting!\n")
+#   exit()
+# }
 
 #------------------------------
 # renaming foldx column from 
 # "ddg" --> "ddg_foldx"
 #------------------------------
 
-# change name to foldx
-colnames(df)[n] <- "ddg_foldx"
+# # change name to foldx
+# colnames(df)[n] <- "ddg_foldx"
 
 #==================================
 # extract unique mutation entries
diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R
index f1a7620..c1ce5b2 100755
--- a/scripts/plotting/get_plotting_dfs.R
+++ b/scripts/plotting/get_plotting_dfs.R
@@ -97,33 +97,33 @@ merged_df3_comp = all_plot_dfs[[4]]
 # adding deepddg scaled values
 # scale data b/w -1 and 1
 #============================
-n = which(colnames(merged_df3) == "deepddg"); n 
-
-my_min = min(merged_df3[,n]); my_min 
-my_max = max(merged_df3[,n]); my_max 
-
-merged_df3$deepddg_scaled = ifelse(merged_df3[,n] < 0
-                                   , merged_df3[,n]/abs(my_min)
-                                   , merged_df3[,n]/my_max) 
-# sanity check
-my_min = min(merged_df3$deepddg_scaled); my_min 
-my_max = max(merged_df3$deepddg_scaled); my_max
-
-if (my_min == -1 && my_max == 1){
-   cat("\nPASS: DeepDDG successfully scaled b/w -1 and 1"
-       #, "\nProceeding with assigning deep outcome category")
-       , "\n")
-}else{
-   cat("\nFAIL: could not scale DeepDDG ddg values"
-       , "Aborting!")
-}
-
+# n = which(colnames(merged_df3) == "deepddg"); n 
+# 
+# my_min = min(merged_df3[,n]); my_min 
+# my_max = max(merged_df3[,n]); my_max 
+# 
+# merged_df3$deepddg_scaled = ifelse(merged_df3[,n] < 0
+#                                    , merged_df3[,n]/abs(my_min)
+#                                    , merged_df3[,n]/my_max) 
+# # sanity check
+# my_min = min(merged_df3$deepddg_scaled); my_min 
+# my_max = max(merged_df3$deepddg_scaled); my_max
+# 
+# if (my_min == -1 && my_max == 1){
+#    cat("\nPASS: DeepDDG successfully scaled b/w -1 and 1"
+#        #, "\nProceeding with assigning deep outcome category")
+#        , "\n")
+# }else{
+#    cat("\nFAIL: could not scale DeepDDG ddg values"
+#        , "Aborting!")
+# }
+# 
 
 ####################################################################
 #                        Data for combining other dfs
 ####################################################################
 
-source("other_dfs_data.R")
+#source("other_dfs_data.R")
 
 ####################################################################
 #                        Data for subcols barplot (~heatmap)

From 3ddbee8c90428d9203d73c34fcd342d3b995f631 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Fri, 10 Sep 2021 18:19:01 +0100
Subject: [PATCH 22/51] finally moved foldx_outcome and deepddg_outcome calcs
 to combine_dfs.py in python script i.e cleaned source data

---
 scripts/functions/plotting_data.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/functions/plotting_data.R b/scripts/functions/plotting_data.R
index 5744faa..fa4e9c1 100755
--- a/scripts/functions/plotting_data.R
+++ b/scripts/functions/plotting_data.R
@@ -37,6 +37,7 @@ cat("\nInput dimensions:", dim(df))
 
 # This will enable to always have these variables available
 # when calling for plots
+# included this now in combine_dfs.py!!!! finallyS
 #==================================
 
 #------------------------------

From 27f0b15d4c162efccc0b7fa3a9eb4dd297c32b12 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Fri, 10 Sep 2021 18:19:56 +0100
Subject: [PATCH 23/51] tidied script plotting_data.R by removing superseded
 code

---
 scripts/functions/plotting_data.R | 56 -------------------------------
 1 file changed, 56 deletions(-)

diff --git a/scripts/functions/plotting_data.R b/scripts/functions/plotting_data.R
index fa4e9c1..faaebca 100755
--- a/scripts/functions/plotting_data.R
+++ b/scripts/functions/plotting_data.R
@@ -31,62 +31,6 @@ dup_muts    = data.frame()
 
 cat("\nInput dimensions:", dim(df)) 
 
-#==================================
-# add foldx outcome category
-# and foldx scaled values 
-
-# This will enable to always have these variables available
-# when calling for plots
-# included this now in combine_dfs.py!!!! finallyS
-#==================================
-
-#------------------------------
-# # adding foldx scaled values
-# # scale data b/w -1 and 1
-# #------------------------------
-# n = which(colnames(df) == "ddg"); n 
-# 
-# my_min = min(df[,n]); my_min 
-# my_max = max(df[,n]); my_max 
-# 
-# df$foldx_scaled = ifelse(df[,n] < 0
-#                          , df[,n]/abs(my_min)
-#                          , df[,n]/my_max) 
-# # sanity check
-# my_min = min(df$foldx_scaled); my_min 
-# my_max = max(df$foldx_scaled); my_max
-# 
-# if (my_min == -1 && my_max == 1){
-#   cat("\nPASS: foldx ddg successfully scaled b/w -1 and 1"
-#       , "\nProceeding with assigning foldx outcome category")
-# }else{
-#   cat("\nFAIL: could not scale foldx ddg values"
-#       , "Aborting!\n")
-# }
-
-#------------------------------
-# adding foldx outcome category
-# ddg<0 = "Stabilising" (-ve)
-#------------------------------
-# c1 = table(df$ddg < 0)
-# df$foldx_outcome = ifelse(df$ddg < 0, "Stabilising", "Destabilising")
-# c2 = table(df$ddg < 0)
-# 
-# if ( all(c1 == c2) ){
-#   cat("\nPASS: foldx outcome successfully created")
-# }else{
-#   cat("\nFAIL: foldx outcome could not be created. Aborting!\n")
-#   exit()
-# }
-
-#------------------------------
-# renaming foldx column from 
-# "ddg" --> "ddg_foldx"
-#------------------------------
-
-# # change name to foldx
-# colnames(df)[n] <- "ddg_foldx"
-
 #==================================
 # extract unique mutation entries
 #==================================

From 3f3fe89a6b36bee4df1e0d89863aa0285e6ce309 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Fri, 10 Sep 2021 18:20:45 +0100
Subject: [PATCH 24/51] added shorter scripts for each different processing for
 plots to make it easier to read code

---
 scripts/plotting/coloured_bp_data.R           |  80 +++
 scripts/plotting/corr_data.R                  |  67 +++
 scripts/plotting/dm_om_data.R                 | 416 ++++++++++++++++
 scripts/plotting/logo_data.R                  | 142 ++++++
 scripts/plotting/redundant/other_dfs_data.R   | 117 +++++
 scripts/plotting/redundant/other_plots_data.R | 470 ++++++++++++++++++
 6 files changed, 1292 insertions(+)
 create mode 100644 scripts/plotting/coloured_bp_data.R
 create mode 100644 scripts/plotting/corr_data.R
 create mode 100644 scripts/plotting/dm_om_data.R
 create mode 100644 scripts/plotting/logo_data.R
 create mode 100644 scripts/plotting/redundant/other_dfs_data.R
 create mode 100755 scripts/plotting/redundant/other_plots_data.R

diff --git a/scripts/plotting/coloured_bp_data.R b/scripts/plotting/coloured_bp_data.R
new file mode 100644
index 0000000..a1f0964
--- /dev/null
+++ b/scripts/plotting/coloured_bp_data.R
@@ -0,0 +1,80 @@
+#!/usr/bin/env Rscript  
+#################################################################
+# TASK: Script to add bp colours ~ barplot heatmap
+#################################################################
+
+my_df = merged_df3
+
+cols_to_select = c("mutationinformation", "drtype"
+                   , "wild_type"
+                   , "position"
+                   , "mutant_type"
+                   , "chain", "ligand_id", "ligand_distance"
+                   , "duet_stability_change", "duet_outcome", "duet_scaled"
+                   , "ligand_affinity_change", "ligand_outcome", "affinity_scaled"
+                   , "ddg_foldx", "foldx_scaled", "foldx_outcome"
+                   , "deepddg", "deepddg_outcome" # comment out as not available for pnca
+                   , "asa", "rsa", "rd_values", "kd_values"
+                   , "af", "or_mychisq", "pval_fisher" 
+                   , "or_fisher", "or_logistic", "pval_logistic"
+                   , "wt_prop_water", "mut_prop_water", "wt_prop_polarity", "mut_prop_polarity"
+                   , "wt_calcprop", "mut_calcprop")
+
+#=======================
+# Data for sub colours
+# barplot: PS
+#=======================
+
+cat("\nNo. of cols to select:", length(cols_to_select))
+
+subcols_df_ps = my_df[, cols_to_select]
+
+cat("\nNo of unique positions for ps:"
+    , length(unique(subcols_df_ps$position)))
+
+# add pos_count col that counts the no. of nsSNPs at a position
+setDT(subcols_df_ps)[, pos_count := .N, by = .(position)]
+
+# should be a factor
+if (is.factor(subcols_df_ps$duet_outcome)){
+  cat("\nDuet_outcome is factor")
+  table(subcols_df_ps$duet_outcome)
+}else{
+  cat("\nConverting duet_outcome to factor")
+  subcols_df_ps$duet_outcome = as.factor(subcols_df_ps$duet_outcome)
+  table(subcols_df_ps$duet_outcome)
+}
+
+# should be -1 and 1
+min(subcols_df_ps$duet_scaled)
+max(subcols_df_ps$duet_scaled)
+
+tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, min)
+tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, max)
+
+# check unique values in normalised data
+cat("\nNo. of unique values in duet scaled, no rounding:"
+    , length(unique(subcols_df_ps$duet_scaled)))
+
+# No rounding    
+my_grp = subcols_df_ps$duet_scaled; length(my_grp)
+
+# Add rounding if it is to be used
+n = 3 
+subcols_df_ps$duet_scaledR = round(subcols_df_ps$duet_scaled, n)
+
+cat("\nNo. of unique values in duet scaled", n, "places rounding:"
+    , length(unique(subcols_df_ps$duet_scaledR)))
+
+my_grp_r = subcols_df_ps$duet_scaledR  # rounding
+
+# Add grp cols: "<duet_outcome>_<scaled value>" (unrounded and rounded variants)
+subcols_df_ps$group <- paste0(subcols_df_ps$duet_outcome, "_", my_grp)
+subcols_df_ps$groupR <- paste0(subcols_df_ps$duet_outcome, "_", my_grp_r)
+
+# Call the function to create the palette based on the group defined above
+subcols_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp")
+subcolsR_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp_r")
+
+cat("\nColour palette generated for my_grp: ", length(subcols_ps), " colours")
+cat("\nColour palette generated for my_grp_r: ", length(subcolsR_ps), " colours\n")
diff --git a/scripts/plotting/corr_data.R b/scripts/plotting/corr_data.R
new file mode 100644
index 0000000..d33efc5
--- /dev/null
+++ b/scripts/plotting/corr_data.R
@@ -0,0 +1,67 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: Script to format data for corr plots
+#########################################################
+
+#=================================================
+#         Data for Corrplots
+#=================================================
+cat("\n=========================================="
+    , "\nCORR PLOTS data: ALL params"
+    , "\n=========================================")
+
+# use data
+#merged_df2
+
+#----------------------------
+# columns for corr plots:PS
+#----------------------------
+# NOTE: you can add mcsm_ppi column as well, and it will only select what it can find!
+big_df_colnames = data.frame(names(merged_df2))
+
+corr_cols_select <- c("mutationinformation", drug, "mutation_info_labels"
+                   , "duet_stability_change", "ligand_affinity_change", "ddg_foldx", "asa", "rsa"
+                   , "rd_values", "kd_values", "log10_or_mychisq", "neglog_pval_fisher","af"
+                   , "deepddg", "ddg_dynamut", "ddg_dynamut2", "mcsm_na_affinity"
+                   , "ddg_encom", "dds_encom", "ddg_mcsm", "ddg_sdm", "ddg_duet", "ligand_distance")
+
+#===========================
+# Corr data for plots: PS
+# big_df ps: ~ merged_df2
+#===========================
+
+corr_df_m2 = merged_df2[,colnames(merged_df2)%in%corr_cols_select]
+
+#===========================
+# Corr data for plots: PS
+# short_df ps: ~merged_df3
+#===========================
+
+corr_df_m3 = corr_df_m2[!duplicated(corr_df_m2$mutationinformation),]
+
+na_or = sum(is.na(corr_df_m3$log10_or_mychisq))
+check1 = nrow(corr_df_m3) - na_or; check1
+
+if (nrow(corr_df_m3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) {
+  cat( "\nPASS: No. of rows for corr_df_m3 match"
+       , "\nPASS: No. of OR values checked: " , check1)
+} else {
+  cat("\nFAIL: Numbers  mismatch:"
+      , "\nExpected nrows: ", nrow(merged_df3)
+      , "\nGot: ", nrow(corr_df_m3)
+      , "\nExpected OR values: ", nrow(merged_df3_comp)
+      , "\nGot: ", check1)
+}
+
+cat("\nCorr Data created:"
+, "\n==================================="
+, "\ncorr_df_m2: created from merged_df2"
+, "\n==================================="
+, "\nnrows:", nrow(corr_df_m2)
+, "\nncols:", ncol(corr_df_m2)
+, "\n==================================="
+, "\ncorr_df_m3: created from merged_df3"
+, "\n==================================="
+, "\nnrows:", nrow(corr_df_m3)
+, "\nncols:", ncol(corr_df_m3)
+)
diff --git a/scripts/plotting/dm_om_data.R b/scripts/plotting/dm_om_data.R
new file mode 100644
index 0000000..4bd82e7
--- /dev/null
+++ b/scripts/plotting/dm_om_data.R
@@ -0,0 +1,416 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: Script to format data for dm om plots: 
+# generating LF data
+# sourced by get_plotting_dfs.R
+#########################################################
+##========================================================================
+# cols to select: 
+# THINK: whu
+
+comb_df <- merged_df3[, c("mutationinformation", "mutation"
+                               , "mutation_info","mutation_info_labels"
+                               , "position"
+                               , LigDist_colname
+                               , "duet_stability_change", "duet_scaled", "duet_outcome"
+                               , "ligand_affinity_change", "affinity_scaled", "ligand_outcome"
+                               , "ddg_foldx", "foldx_scaled", "foldx_outcome"
+                               , "deepddg", "deepddg_scaled", "deepddg_outcome"
+                               , "asa", "rsa"
+                               , "rd_values", "kd_values"
+                               , "log10_or_mychisq", "neglog_pval_fisher", "af"
+                               , "mcsm_na_affinity", "mcsm_na_scaled", "mcsm_na_outcome"
+                               , "ddg_dynamut", "ddg_dynamut_scaled","ddg_dynamut_outcome"
+                               , "ddg_encom",  "ddg_encom_scaled", "ddg_encom_outcome"
+                               , "dds_encom",  "dds_encom_scaled", "dds_encom_outcome"
+                               , "ddg_mcsm",  "ddg_mcsm_scaled",  "ddg_mcsm_outcome"    
+                               , "ddg_sdm",  "ddg_sdm_scaled",  "ddg_sdm_outcome"
+                               , "ddg_duet", "ddg_duet_scaled",  "ddg_duet_outcome"
+                               , "ddg_dynamut2","ddg_dynamut2_scaled",  "ddg_dynamut2_outcome")]
+                               
+
+comb_df_s = arrange(comb_df, position)
+    
+#=======================================================================
+fact_cols = colnames(comb_df_s)[grepl( "_outcome|_info", colnames(comb_df_s) )]
+fact_cols
+lapply(comb_df_s[, fact_cols], class)
+comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols], as.factor)
+
+if (any(lapply(comb_df_s[, fact_cols], class) == "character")){
+  cat("\nChanging cols to factor")
+  comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols],as.factor)
+  if (all(lapply(comb_df_s[, fact_cols], class) == "factor")){
+    cat("\nSuccessful: cols changed to factor")
+  }
+}
+lapply(comb_df_s[, fact_cols], class)
+
+#=======================================================================
+table(comb_df_s$mutation_info)
+
+ # further checks: no mutation may appear in both the dr and the other muts sets
+dr_muts = comb_df_s[comb_df_s$mutation_info == dr_muts_col,]
+dr_muts_names = unique(dr_muts$mutation)
+
+other_muts = comb_df_s[comb_df_s$mutation_info == other_muts_col,]
+other_muts_names = unique(other_muts$mutation)
+
+if ( !any(dr_muts_names %in% other_muts_names) &&
+  !any(other_muts_names %in% dr_muts_names) ){
+  cat("PASS: dr and other muts are indeed unique")
+}else{
+  cat("FAIL: dr and others muts are NOT unique!")
+  quit()
+}
+
+# pretty display names i.e. labels to reduce major code duplication later
+foo_cnames = data.frame(colnames(comb_df_s))
+names(foo_cnames) <- "old_name"
+
+stability_suffix <- paste0(delta_symbol, delta_symbol, "G")
+flexibility_suffix <- paste0(delta_symbol, delta_symbol, "S")
+
+lig_dn       = paste0("Ligand distance (", angstroms_symbol, ")"); lig_dn
+duet_dn      = paste0("DUET ", stability_suffix); duet_dn
+foldx_dn     = paste0("FoldX ", stability_suffix); foldx_dn
+deepddg_dn   = paste0("Deepddg " , stability_suffix); deepddg_dn
+mcsm_na_dn   = paste0("mCSM-NA affinity ", stability_suffix); mcsm_na_dn
+dynamut_dn   = paste0("Dynamut ", stability_suffix); dynamut_dn
+dynamut2_dn  = paste0("Dynamut2 " , stability_suffix); dynamut2_dn
+encom_ddg_dn = paste0("EnCOM " , stability_suffix); encom_ddg_dn
+encom_dds_dn = paste0("EnCOM " , flexibility_suffix ); encom_dds_dn
+sdm_dn       = paste0("SDM " , stability_suffix); sdm_dn
+mcsm_dn      = paste0("mCSM " , stability_suffix ); mcsm_dn
+
+# Change colnames of some columns using datatable 
+comb_df_sl = comb_df_s
+names(comb_df_sl)
+
+setnames(comb_df_sl
+         , old = c("asa", "rsa", "rd_values", "kd_values"
+                   , "log10_or_mychisq", "neglog_pval_fisher", "af"
+                   , LigDist_colname
+                   , "duet_scaled"
+                   , "foldx_scaled"
+                   , "deepddg_scaled"
+                   , "mcsm_na_scaled"
+                   , "ddg_dynamut_scaled"
+                   , "ddg_dynamut2_scaled"
+                   , "ddg_encom_scaled"
+                   , "dds_encom_scaled"
+                   , "ddg_sdm"
+                   , "ddg_mcsm")
+                   
+         , new = c("ASA", "RSA", "RD", "KD"
+                   , "Log10 (OR)", "-Log (P)", "MAF"
+                   , lig_dn
+                   , duet_dn
+                   , foldx_dn
+                   , deepddg_dn
+                   , mcsm_na_dn
+                   , dynamut_dn
+                   , dynamut2_dn
+                   , encom_ddg_dn
+                   , encom_dds_dn
+                   , sdm_dn
+                   , mcsm_dn)
+         )
+
+foo_cnames <- cbind(foo_cnames, colnames(comb_df_sl))
+
+# some more pretty labels
+table(comb_df_sl$mutation_info)
+
+levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==dr_muts_col] <- "DM"
+levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==other_muts_col] <- "OM"
+
+table(comb_df_sl$mutation_info)
+
+#######################################################################
+#======================
+# Selecting dfs
+# with appropriate cols
+#=======================
+static_cols_start =  c("mutationinformation"
+                       , "position"
+                       , "mutation"
+                       , "mutation_info")
+
+static_cols_end = c(lig_dn
+                    , "ASA"
+                    , "RSA"
+                    , "RD"
+                    , "KD")
+
+# ordering is important!
+
+#########################################################################
+#==============
+# DUET: LF
+#==============
+cols_to_select_duet = c(static_cols_start,  c("duet_outcome", duet_dn), static_cols_end)
+wf_duet = comb_df_sl[, cols_to_select_duet]
+
+#pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps
+pivot_cols_duet = cols_to_select_duet[1: (length(static_cols_start) + 1)]; pivot_cols_duet
+
+expected_rows_lf = nrow(wf_duet) * (length(wf_duet) - length(pivot_cols_duet))
+expected_rows_lf
+
+# LF data: duet
+lf_duet = gather(wf_duet
+                  , key = param_type
+                  , value = param_value
+                  , all_of(duet_dn):tail(static_cols_end,1)
+                  , factor_key = TRUE)
+
+if (nrow(lf_duet) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", duet_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+
+############################################################################
+#==============
+# FoldX: LF
+#==============
+cols_to_select_foldx= c(static_cols_start, c("foldx_outcome", foldx_dn), static_cols_end)
+wf_foldx = comb_df_sl[, cols_to_select_foldx]
+
+pivot_cols_foldx = cols_to_select_foldx[1: (length(static_cols_start) + 1)]; pivot_cols_foldx
+
+expected_rows_lf = nrow(wf_foldx) * (length(wf_foldx) - length(pivot_cols_foldx))
+expected_rows_lf
+
+# LF data: Foldx (plain assignment, consistent with the other lf_* objects)
+lf_foldx = gather(wf_foldx
+                 , key = param_type
+                 , value = param_value
+                 , all_of(foldx_dn):tail(static_cols_end,1)
+                 , factor_key = TRUE)
+
+if (nrow(lf_foldx) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", foldx_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for ", foldx_dn)
+  quit()
+}
+
+############################################################################
+#==============
+# Deepddg: LF
+#==============
+cols_to_select_deepddg  = c(static_cols_start, c("deepddg_outcome", deepddg_dn), static_cols_end)
+wf_deepddg = comb_df_sl[, cols_to_select_deepddg]
+
+pivot_cols_deepddg = cols_to_select_deepddg[1: (length(static_cols_start) + 1)]; pivot_cols_deepddg
+
+expected_rows_lf = nrow(wf_deepddg) * (length(wf_deepddg) - length(pivot_cols_deepddg))
+expected_rows_lf
+
+# LF data: Deepddg
+lf_deepddg = gather(wf_deepddg
+                  , key = param_type
+                  , value = param_value
+                  , all_of(deepddg_dn):tail(static_cols_end,1)
+                  , factor_key = TRUE)
+
+if (nrow(lf_deepddg) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", deepddg_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for ", deepddg_dn)
+  quit()
+}
+
+############################################################################
+#==============
+# mCSM-NA: LF
+#==============
+cols_to_select_mcsm_na = c(static_cols_start, c("mcsm_na_outcome", mcsm_na_dn), static_cols_end)
+wf_mcsm_na = comb_df_sl[, cols_to_select_mcsm_na]
+
+pivot_cols_mcsm_na = cols_to_select_mcsm_na[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_na
+
+expected_rows_lf = nrow(wf_mcsm_na) * (length(wf_mcsm_na) - length(pivot_cols_mcsm_na))
+expected_rows_lf
+
+# LF data: mcsm_na
+lf_mcsm_na = gather(wf_mcsm_na
+                    , key = param_type
+                    , value = param_value
+                    , all_of(mcsm_na_dn):tail(static_cols_end,1)
+                    , factor_key = TRUE)
+
+if (nrow(lf_mcsm_na) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", mcsm_na_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for ", mcsm_na_dn)
+  quit()
+}
+
+############################################################################
+#==============
+# Dynamut: LF
+#==============
+cols_to_select_dynamut  = c(static_cols_start, c("ddg_dynamut_outcome", dynamut_dn), static_cols_end)
+wf_dynamut = comb_df_sl[, cols_to_select_dynamut]
+
+pivot_cols_dynamut = cols_to_select_dynamut[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut
+
+expected_rows_lf = nrow(wf_dynamut) * (length(wf_dynamut) - length(pivot_cols_dynamut))
+expected_rows_lf
+
+# LF data: dynamut
+lf_dynamut = gather(wf_dynamut
+                    , key = param_type
+                    , value = param_value
+                    , all_of(dynamut_dn):tail(static_cols_end,1)
+                    , factor_key = TRUE)
+
+if (nrow(lf_dynamut) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", dynamut_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for ", dynamut_dn)
+  quit()
+}
+
+############################################################################
+#==============
+# Dynamut2: LF
+#==============
+cols_to_select_dynamut2 = c(static_cols_start, c("ddg_dynamut2_outcome", dynamut2_dn), static_cols_end)
+
+wf_dynamut2 = comb_df_sl[, cols_to_select_dynamut2]
+
+pivot_cols_dynamut2 = cols_to_select_dynamut2[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut2
+
+expected_rows_lf = nrow(wf_dynamut2) * (length(wf_dynamut2) - length(pivot_cols_dynamut2))
+expected_rows_lf
+
+# LF data: dynamut2
+lf_dynamut2 = gather(wf_dynamut2
+                    , key = param_type
+                    , value = param_value
+                    , all_of(dynamut2_dn):tail(static_cols_end,1)
+                    , factor_key = TRUE)
+
+if (nrow(lf_dynamut2) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", dynamut2_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for ", dynamut2_dn)
+  quit()
+}
+
+############################################################################
+#==============
+# EnCOM ddg: LF
+#==============
+cols_to_select_encomddg = c(static_cols_start, c("ddg_encom_outcome", encom_ddg_dn), static_cols_end)
+wf_encomddg = comb_df_sl[, cols_to_select_encomddg]
+
+pivot_cols_encomddg = cols_to_select_encomddg[1: (length(static_cols_start) + 1)]; pivot_cols_encomddg 
+
+expected_rows_lf = nrow(wf_encomddg ) * (length(wf_encomddg ) - length(pivot_cols_encomddg))
+expected_rows_lf
+
+# LF data: encomddg 
+lf_encomddg  = gather(wf_encomddg 
+                     , key = param_type
+                     , value = param_value
+                     , all_of(encom_ddg_dn):tail(static_cols_end,1)
+                     , factor_key = TRUE)
+
+if (nrow(lf_encomddg) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", encom_ddg_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for ", encom_ddg_dn)
+  quit()
+}
+############################################################################
+#==============
+# EnCOM dds: LF
+#==============
+cols_to_select_encomdds = c(static_cols_start, c("dds_encom_outcome", encom_dds_dn), static_cols_end)
+wf_encomdds = comb_df_sl[, cols_to_select_encomdds]
+
+pivot_cols_encomdds = cols_to_select_encomdds[1: (length(static_cols_start) + 1)]; pivot_cols_encomdds 
+
+expected_rows_lf = nrow(wf_encomdds) * (length(wf_encomdds) - length(pivot_cols_encomdds))
+expected_rows_lf
+
+# LF data: encomdds 
+lf_encomdds  = gather(wf_encomdds
+                      , key = param_type
+                      , value = param_value
+                      , all_of(encom_dds_dn):tail(static_cols_end,1)
+                      , factor_key = TRUE)
+
+if (nrow(lf_encomdds) == expected_rows_lf){
+  cat("\nPASS: long format data created for", encom_dds_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for", encom_dds_dn)
+  quit()
+}
+
+############################################################################
+#==============
+# SDM: LF
+#==============
+cols_to_select_sdm = c(static_cols_start, c("ddg_sdm_outcome", sdm_dn), static_cols_end)
+wf_sdm = comb_df_sl[, cols_to_select_sdm]
+
+pivot_cols_sdm = cols_to_select_sdm[1: (length(static_cols_start) + 1)]; pivot_cols_sdm
+
+expected_rows_lf = nrow(wf_sdm) * (length(wf_sdm) - length(pivot_cols_sdm))
+expected_rows_lf
+
+# LF data: sdm
+lf_sdm  = gather(wf_sdm
+                 , key = param_type
+                 , value = param_value
+                 , all_of(sdm_dn):tail(static_cols_end,1)
+                 , factor_key = TRUE)
+
+if (nrow(lf_sdm) == expected_rows_lf){
+  cat("\nPASS: long format data created for", sdm_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for", sdm_dn)
+  quit()
+}
+
+############################################################################
+#==============
+# mCSM: LF
+#==============
+cols_to_select_mcsm  = c(static_cols_start, c("ddg_mcsm_outcome", mcsm_dn), static_cols_end)
+wf_mcsm = comb_df_sl[, cols_to_select_mcsm]
+
+pivot_cols_mcsm = cols_to_select_mcsm[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm
+
+expected_rows_lf = nrow(wf_mcsm) * (length(wf_mcsm) - length(pivot_cols_mcsm))
+expected_rows_lf
+
+# LF data: mcsm
+lf_mcsm  = gather(wf_mcsm
+                 , key = param_type
+                 , value = param_value
+                 , all_of(mcsm_dn):tail(static_cols_end,1)
+                 , factor_key = TRUE)
+
+if (nrow(lf_mcsm) == expected_rows_lf){
+  cat("\nPASS: long format data created for", mcsm_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for", mcsm_dn)
+  quit()
+}
+
+#==========================
+# Duet-d(from Dynamut): LF
+#===========================
+
+#Not created, redundant and chaos!
+
+############################################################################
+
diff --git a/scripts/plotting/logo_data.R b/scripts/plotting/logo_data.R
new file mode 100644
index 0000000..7eaf1b6
--- /dev/null
+++ b/scripts/plotting/logo_data.R
@@ -0,0 +1,142 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: Script to format data for Logo_plots
+#########################################################
+#-------------------------
+# choose df for logoplot
+#-------------------------
+logo_data = merged_df3
+#logo_data = merged_df3_comp
+
+# quick checks
+colnames(logo_data)
+str(logo_data)
+
+c1 = unique(logo_data$position) 
+nrow(logo_data)
+cat("No. of rows in my_data:", nrow(logo_data)
+    , "\nDistinct positions corresponding to snps:", length(c1)
+    , "\n===========================================================")
+#=======================================================================
+#==================
+# logo data: OR
+#==================
+foo = logo_data[, c("position"
+                    , "mutant_type","duet_scaled", "or_mychisq"
+                    , "mut_prop_polarity", "mut_prop_water")] 
+
+logo_data$log10or = log10(logo_data$or_mychisq)
+logo_data_plot = logo_data[, c("position"
+                               , "mutant_type", "or_mychisq", "log10or")]
+
+logo_data_plot_or = logo_data[, c("position", "mutant_type", "or_mychisq")]
+wide_df_or  =  logo_data_plot_or %>% spread(position, or_mychisq, fill = 0.0)
+
+wide_df_or = as.matrix(wide_df_or)
+rownames(wide_df_or) = wide_df_or[,1]
+dim(wide_df_or)
+wide_df_or = wide_df_or[,-1]
+str(wide_df_or)
+
+position_or = as.numeric(colnames(wide_df_or))
+
+#==================
+# logo data: logOR
+#==================
+logo_data_plot_logor = logo_data[, c("position", "mutant_type", "log10or")]
+wide_df_logor <- logo_data_plot_logor %>% spread(position, log10or, fill = 0.0)
+
+wide_df_logor = as.matrix(wide_df_logor)
+
+rownames(wide_df_logor) = wide_df_logor[,1]
+wide_df_logor = subset(wide_df_logor, select = -c(1) )
+colnames(wide_df_logor)
+wide_df_logor_m = data.matrix(wide_df_logor)
+
+rownames(wide_df_logor_m)
+colnames(wide_df_logor_m)
+
+position_logor = as.numeric(colnames(wide_df_logor_m))
+
+#===============================
+# logo data: multiple nsSNPs (>1)
+#=================================
+#require(data.table)
+
+# get freq count of positions so you can subset positions with freq > 1
+setDT(logo_data)[, mut_pos_occurrence := .N, by = .(position)] 
+
+table(logo_data$position)
+table(logo_data$mut_pos_occurrence)
+
+max_mut = max(table(logo_data$position))
+
+# extract freq_pos > 1
+my_data_snp = logo_data[logo_data$mut_pos_occurrence!=1,] 
+u = unique(my_data_snp$position)
+max_mult_mut = max(table(my_data_snp$position))
+
+if (nrow(my_data_snp) == nrow(logo_data) - sum(logo_data$mut_pos_occurrence == 1) ){
+  
+  cat("PASS: positions with  multiple muts extracted"
+      , "\nNo. of mutations:", nrow(my_data_snp)
+      , "\nNo. of positions:", length(u)
+      , "\nMax no. of muts at any position", max_mult_mut)
+}else{
+  cat("FAIL: positions with multiple muts could NOT be extracted"
+      , "\nExpected:",nrow(logo_data) - sum(logo_data$mut_pos_occurrence == 1)
+      , "\nGot:", nrow(my_data_snp) )
+}
+
+cat("\nNo. of sites with only 1 mutations:", sum(logo_data$mut_pos_occurrence == 1))
+
+#--------------------------------------
+# matrix for mutant type
+# frequency of mutant type by position
+#---------------------------------------
+table(my_data_snp$mutant_type, my_data_snp$position)
+tab_mt = table(my_data_snp$mutant_type, my_data_snp$position)
+class(tab_mt)
+
+# unclass to convert to matrix
+tab_mt = unclass(tab_mt)
+tab_mt = as.matrix(tab_mt, rownames = T)
+
+# should be TRUE
+is.matrix(tab_mt)
+
+rownames(tab_mt) #aa
+colnames(tab_mt) #pos
+
+#-------------------------------------
+# matrix for wild type
+# frequency of wild type by position
+#-------------------------------------
+tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt
+tab_wt = unclass(tab_wt)
+
+# remove wt duplicates
+wt = my_data_snp[, c("position", "wild_type")]
+wt = wt[!duplicated(wt),]
+
+tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1
+
+rownames(tab_wt)
+colnames(tab_wt)
+
+identical(colnames(tab_mt), colnames(tab_wt))
+identical(ncol(tab_mt), ncol(tab_wt))
+
+#----------------------------------
+# logo data OR: multiple nsSNPs (>1)
+#----------------------------------
+logo_data_or_mult = my_data_snp[, c("position", "mutant_type", "or_mychisq")]
+#wide_df_or  =  logo_data_or %>% spread(position, or_mychisq, fill = 0.0)
+wide_df_or_mult  =  logo_data_or_mult %>% spread(position, or_mychisq, fill = NA)
+
+wide_df_or_mult = as.matrix(wide_df_or_mult)
+rownames(wide_df_or_mult) = wide_df_or_mult[,1]
+wide_df_or_mult = wide_df_or_mult[,-1]
+str(wide_df_or_mult)
+
+position_or_mult = as.numeric(colnames(wide_df_or_mult))
diff --git a/scripts/plotting/redundant/other_dfs_data.R b/scripts/plotting/redundant/other_dfs_data.R
new file mode 100644
index 0000000..97b0567
--- /dev/null
+++ b/scripts/plotting/redundant/other_dfs_data.R
@@ -0,0 +1,117 @@
+#!/usr/bin/env Rscript  
+
+# Didn't end up using it: sorted it at the source
+# .py script to combine all dfs to output all_params
+
+#################################################################
+# TASK: Script to add all other dfs to merged_df2 and merged_df3
+
+#################################################################
+# Combine other dfs:
+# dynamut_df, dynamut2_df, mcsm_na_df, 
+# perhaps : deepddg and mcsm ppi (for embb)
+################################################################
+# read other files
+infilename_dynamut = paste0("~/git/Data/", drug, "/output/dynamut_results/", gene
+                            , "_complex_dynamut_norm.csv")
+
+infilename_dynamut2  = paste0("~/git/Data/", drug, "/output/dynamut_results/dynamut2/", gene
+                              , "_complex_dynamut2_norm.csv")
+
+infilename_mcsm_na = paste0("~/git/Data/", drug, "/output/mcsm_na_results/", gene
+                            , "_complex_mcsm_na_norm.csv")
+
+infilename_mcsm_f_snps <- paste0("~/git/Data/", drug, "/output/", gene
+                                 , "_mcsm_formatted_snps.csv")
+
+dynamut_df   = read.csv(infilename_dynamut)
+dynamut2_df  = read.csv(infilename_dynamut2)
+mcsm_na_df   = read.csv(infilename_mcsm_na)
+mcsm_f_snps  = read.csv(infilename_mcsm_f_snps, header = F)
+names(mcsm_f_snps) = "mutationinformation"
+
+#=================================
+# check with intersect to find the common col, but use 
+c1 = length(intersect(names(dynamut_df), names(dynamut2_df)))
+c2 = length(intersect(names(dynamut2_df), names(mcsm_na_df)))
+
+if (c1 == 1 && c2 == 1) {
+  n_common = 1
+}else{
+  cat("\nMore than one common col found, inspect before merging!")
+}
+
+# mutationinformation column to be on the safe side
+# delete chain from dynamut2_df
+#dynamut2_df = subset(dynamut2_df, select = -chain)
+
+# quick checks
+lapply(list(dynamut_df
+            , dynamut2_df
+            , mcsm_na_df), ncol)
+
+lapply(list(dynamut_df
+            , dynamut2_df
+            , mcsm_na_df), colnames)
+
+lapply(list(dynamut_df
+            , dynamut2_df
+            , mcsm_na_df), nrow)
+
+ncols_comb = lapply(list(dynamut_df
+                         , dynamut2_df
+                         , mcsm_na_df), ncol)
+
+#---------------------------------
+# Combine 1: all other params dfs
+#---------------------------------
+combined_dfs = Reduce(inner_join, list(dynamut_df
+                                       , dynamut2_df
+                                       , mcsm_na_df))
+# Reduce("+", ncols_comb)
+
+#-----------------------------------------
+# Combine 2: combine1 result + merged_df2
+#-----------------------------------------
+drop_cols = intersect(names(combined_dfs), names(merged_df2))
+drop_cols = drop_cols
+
+drop_cols = drop_cols[! drop_cols %in% c("mutationinformation")]
+
+combined_dfs_f = combined_dfs[, !colnames(combined_dfs)%in%drop_cols]
+
+nrow(combined_dfs_f); nrow(merged_df2)
+ncol(combined_dfs_f); ncol(merged_df2)
+
+#-----------------------------------------
+# Combined merged_df2
+#-----------------------------------------
+merged_df2_combined = merge(merged_df2
+                            , combined_dfs_f
+                            , by = "mutationinformation"
+)
+
+expected_ncols = ncol(combined_dfs_f)+ ncol(merged_df2) - 1
+
+if ( nrow(merged_df2_combined) == nrow(merged_df2) && ncol(merged_df2_combined) == expected_ncols ){
+  
+  cat("\nPASS: merged_df2 combined with other parameters dfs."
+      , "\nUse this for lineage distribution plots")
+}else{
+  
+  cat("\nFAIL: merged_df2 didn't combine successfully with other parameters dfs")
+  quit()
+  
+}
+
+rm(combined_dfs, combined_dfs_f)
+
+#================================
+# combined data
+# short_df ps: ~ merged_df3
+# TODO: later integrate properly
+#================================
+#-----------------------------------------
+# Combined merged_df3
+#-----------------------------------------
+merged_df3_combined = merged_df2_combined[!duplicated(merged_df2_combined$mutationinformation),]
diff --git a/scripts/plotting/redundant/other_plots_data.R b/scripts/plotting/redundant/other_plots_data.R
new file mode 100755
index 0000000..61a508f
--- /dev/null
+++ b/scripts/plotting/redundant/other_plots_data.R
@@ -0,0 +1,470 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: Script to format data for dm om plots: 
+# generating LF data
+# sourced by get_plotting_dfs.R
+#########################################################
+# working dir and loading libraries
+# getwd()
+# setwd("~/git/LSHTM_analysis/scripts/plotting")
+# getwd()
+
+# make cmd
+# globals
+# drug = "streptomycin"
+# gene = "gid"
+
+# source("get_plotting_dfs.R")
+#=======================================================================
+# MOVE TO COMBINE or singular file for deepddg
+# 
+# cols_to_select = c("mutation", "mutationinformation"
+#                    , "wild_type", "position", "mutant_type"
+#                    , "mutation_info")
+# 
+# merged_df3_short = merged_df3[, cols_to_select]
+
+# infilename_mcsm_f_snps <- paste0("~/git/Data/", drug, "/output/", gene
+#                       , "_mcsm_formatted_snps.csv")
+# 
+# mcsm_f_snps<- read.csv(infilename_mcsm_f_snps, header = F)
+# names(mcsm_f_snps) <- "mutationinformation"
+
+# write merged_df3 to generate structural figure on chimera
+#write.csv(merged_df3_short, "merged_df3_short.csv")
+#========================================================================
+
+#========================================================================
+# cols to select
+
+cols_mcsm_df <- merged_df3[, c("mutationinformation", "mutation"
+                               , "mutation_info", "position"
+                               , LigDist_colname
+                               , "duet_stability_change", "duet_scaled", "duet_outcome"
+                               , "ligand_affinity_change", "affinity_scaled", "ligand_outcome"
+                               , "ddg_foldx", "foldx_scaled", "foldx_outcome"
+                               , "deepddg", "deepddg_scaled", "deepddg_outcome"
+                               , "asa", "rsa"
+                               , "rd_values", "kd_values"
+                               , "log10_or_mychisq", "neglog_pval_fisher", "af")]
+
+cols_mcsm_na_df <- mcsm_na_df[, c("mutationinformation" 
+                                  , "mcsm_na_affinity", "mcsm_na_scaled"
+                                  , "mcsm_na_outcome")]
+# entire dynamut_df
+
+cols_dynamut2_df <- dynamut2_df[, c("mutationinformation"
+                                    , "ddg_dynamut2", "ddg_dynamut2_scaled"
+                                    , "ddg_dynamut2_outcome")]
+
+n_comb_cols = length(cols_mcsm_df) + length(cols_mcsm_na_df) + 
+  length(dynamut_df) + length(cols_dynamut2_df); n_comb_cols
+
+i1<- intersect(names(cols_mcsm_df), names(cols_mcsm_na_df))
+i2<- intersect(names(dynamut_df), names(cols_dynamut2_df))
+merging_cols <- intersect(i1, i2)
+cat("\nmerging_cols:", merging_cols)
+
+if (identical(merging_cols, "mutationinformation")) {
+  cat("\nStage 1: Found common col between dfs, checking values in it...")
+  c1 <- all(mcsm_f_snps[[merging_cols]]%in%cols_mcsm_df[[merging_cols]])
+  c2 <- all(mcsm_f_snps[[merging_cols]]%in%cols_mcsm_na_df[[merging_cols]])
+  c3 <- all(mcsm_f_snps[[merging_cols]]%in%dynamut_df[[merging_cols]])
+  c4 <- all(mcsm_f_snps[[merging_cols]]%in%cols_dynamut2_df[[merging_cols]])
+  cols_check <- c(c1, c2, c3, c4)
+  expected_cols = n_comb_cols - ( length(cols_check) - 1)
+  if (all(cols_check)){
+    cat("\nStage 2: Proceeding with merging dfs:\n")
+    comb_df <- Reduce(inner_join, list(cols_mcsm_df
+                                       , cols_mcsm_na_df
+                                       , dynamut_df
+                                       , cols_dynamut2_df))
+    comb_df_s = arrange(comb_df, position)
+    
+    # if ( nrow(comb_df_s) == nrow(mcsm_f_snps) && ncol(comb_df_s) == expected_cols) {
+    #   cat("\nStage 3, PASS: dfs merged successfully"
+    #       , "\nnrow of merged_df: ", nrow(comb_df_s)
+    #       , "\nncol of merged_df:", ncol(comb_df_s))
+    #   }
+    
+  } else stop("\nFAIL: not all mcsm_f_snps mutations are present in every df, cannot merge", call. = FALSE)
+} else stop("\nFAIL: expected 'mutationinformation' as the only common col between dfs", call. = FALSE)
+#names(comb_df_s)
+cat("\n!!!IT GOT TO HERE!!!!")
+#=======================================================================
+fact_cols = colnames(comb_df_s)[grepl( "_outcome|_info", colnames(comb_df_s) )]
+fact_cols
+lapply(comb_df_s[, fact_cols], class)
+comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols], as.factor)
+
+if (any(lapply(comb_df_s[, fact_cols], class) == "character")){
+  cat("\nChanging cols to factor")
+  comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols],as.factor)
+  if (all(lapply(comb_df_s[, fact_cols], class) == "factor")){
+    cat("\nSuccessful: cols changed to factor")
+  }
+}
+lapply(comb_df_s[, fact_cols], class)
+
+#=======================================================================
+table(comb_df_s$mutation_info)
+
+ # further checks to make sure dr and other muts are indeed unique
+dr_muts = comb_df_s[comb_df_s$mutation_info == dr_muts_col,]
+dr_muts_names = unique(dr_muts$mutation)
+
+other_muts = comb_df_s[comb_df_s$mutation_info == other_muts_col,]
+other_muts_names = unique(other_muts$mutation)
+
+if ( !any(dr_muts_names %in% other_muts_names) &&
+  !any(other_muts_names %in% dr_muts_names) ){
+  cat("PASS: dr and other muts are indeed unique")
+}else{
+  cat("FAIL: dr and others muts are NOT unique!")
+  quit(status = 1)
+}
+
+# pretty display names i.e. labels to reduce major code duplication later
+foo_cnames = data.frame(colnames(comb_df_s))
+names(foo_cnames) <- "old_name"
+
+stability_suffix <- paste0(delta_symbol, delta_symbol, "G")
+flexibility_suffix <- paste0(delta_symbol, delta_symbol, "S")
+
+lig_dn       = paste0("Ligand distance (", angstroms_symbol, ")"); lig_dn
+duet_dn      = paste0("DUET ", stability_suffix); duet_dn
+foldx_dn     = paste0("FoldX ", stability_suffix); foldx_dn
+deepddg_dn   = paste0("Deepddg " , stability_suffix); deepddg_dn
+mcsm_na_dn   = paste0("mCSM-NA affinity ", stability_suffix); mcsm_na_dn
+dynamut_dn   = paste0("Dynamut ", stability_suffix); dynamut_dn
+dynamut2_dn  = paste0("Dynamut2 " , stability_suffix); dynamut2_dn
+encom_ddg_dn = paste0("EnCOM " , stability_suffix); encom_ddg_dn
+encom_dds_dn = paste0("EnCOM " , flexibility_suffix ); encom_dds_dn
+sdm_dn       = paste0("SDM " , stability_suffix); sdm_dn
+mcsm_dn      = paste0("mCSM " , stability_suffix ); mcsm_dn
+
+# Change colnames of some columns using datatable 
+comb_df_sl = comb_df_s
+names(comb_df_sl)
+
+setnames(comb_df_sl
+         , old = c("asa", "rsa", "rd_values", "kd_values"
+                   , "log10_or_mychisq", "neglog_pval_fisher", "af"
+                   , LigDist_colname
+                   , "duet_scaled"
+                   , "foldx_scaled"
+                   , "deepddg_scaled"
+                   , "mcsm_na_scaled"
+                   , "ddg_dynamut_scaled"
+                   , "ddg_dynamut2_scaled"
+                   , "ddg_encom_scaled"
+                   , "dds_encom_scaled"
+                   , "ddg_sdm"
+                   , "ddg_mcsm")
+                   
+         , new = c("ASA", "RSA", "RD", "KD"
+                   , "Log10 (OR)", "-Log (P)", "MAF"
+                   , lig_dn
+                   , duet_dn
+                   , foldx_dn
+                   , deepddg_dn
+                   , mcsm_na_dn
+                   , dynamut_dn
+                   , dynamut2_dn
+                   , encom_ddg_dn
+                   , encom_dds_dn
+                   , sdm_dn
+                   , mcsm_dn)
+         )
+
+foo_cnames <- cbind(foo_cnames, colnames(comb_df_sl))
+
+# some more pretty labels
+table(comb_df_sl$mutation_info)
+
+levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==dr_muts_col] <- "DM"
+levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==other_muts_col] <- "OM"
+
+table(comb_df_sl$mutation_info)
+
+#######################################################################
+#======================
+# Selecting dfs
+# with appropriate cols
+#=======================
+static_cols_start =  c("mutationinformation"
+                       , "position"
+                       , "mutation"
+                       , "mutation_info")
+
+static_cols_end = c(lig_dn
+                    , "ASA"
+                    , "RSA"
+                    , "RD"
+                    , "KD")
+
+# ordering is important!
+
+#########################################################################
+#==============
+# DUET: LF
+#==============
+cols_to_select_duet = c(static_cols_start,  c("duet_outcome", duet_dn), static_cols_end)
+wf_duet = comb_df_sl[, cols_to_select_duet]
+
+#pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps
+pivot_cols_duet = cols_to_select_duet[1: (length(static_cols_start) + 1)]; pivot_cols_duet
+
+expected_rows_lf = nrow(wf_duet) * (length(wf_duet) - length(pivot_cols_duet))
+expected_rows_lf
+
+# LF data: duet
+lf_duet = gather(wf_duet
+                  , key = param_type
+                  , value = param_value
+                  , all_of(duet_dn):tail(static_cols_end,1)
+                  , factor_key = TRUE)
+
+if (nrow(lf_duet) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", duet_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+
+############################################################################
+#==============
+# FoldX: LF
+#==============
+cols_to_select_foldx= c(static_cols_start, c("foldx_outcome", foldx_dn), static_cols_end)
+wf_foldx = comb_df_sl[, cols_to_select_foldx]
+
+pivot_cols_foldx = cols_to_select_foldx[1: (length(static_cols_start) + 1)]; pivot_cols_foldx
+
+expected_rows_lf = nrow(wf_foldx) * (length(wf_foldx) - length(pivot_cols_foldx))
+expected_rows_lf
+
+# LF data: foldx
+print("TESTXXXXXXXXXXXXXXXXXXXXX---------------------->>>>")
+lf_foldx <<- gather(wf_foldx
+                 , key = param_type
+                 , value = param_value
+                 , all_of(foldx_dn):tail(static_cols_end,1)
+                 , factor_key = TRUE)
+
+if (nrow(lf_foldx) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", foldx_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for", foldx_dn)
+  quit(status = 1)
+}
+
+############################################################################
+#==============
+# Deepddg: LF
+#==============
+cols_to_select_deepddg  = c(static_cols_start, c("deepddg_outcome", deepddg_dn), static_cols_end)
+wf_deepddg = comb_df_sl[, cols_to_select_deepddg]
+
+pivot_cols_deepddg = cols_to_select_deepddg[1: (length(static_cols_start) + 1)]; pivot_cols_deepddg
+
+expected_rows_lf = nrow(wf_deepddg) * (length(wf_deepddg) - length(pivot_cols_deepddg))
+expected_rows_lf
+
+# LF data: deepddg
+lf_deepddg = gather(wf_deepddg
+                  , key = param_type
+                  , value = param_value
+                  , all_of(deepddg_dn):tail(static_cols_end,1)
+                  , factor_key = TRUE)
+
+if (nrow(lf_deepddg) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", deepddg_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for", deepddg_dn)
+  quit(status = 1)
+}
+
+############################################################################
+#==============
+# mCSM-NA: LF
+#==============
+cols_to_select_mcsm_na = c(static_cols_start, c("mcsm_na_outcome", mcsm_na_dn), static_cols_end)
+wf_mcsm_na = comb_df_sl[, cols_to_select_mcsm_na]
+
+pivot_cols_mcsm_na = cols_to_select_mcsm_na[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_na
+
+expected_rows_lf = nrow(wf_mcsm_na) * (length(wf_mcsm_na) - length(pivot_cols_mcsm_na))
+expected_rows_lf
+
+# LF data: mcsm_na
+lf_mcsm_na = gather(wf_mcsm_na
+                    , key = param_type
+                    , value = param_value
+                    , all_of(mcsm_na_dn):tail(static_cols_end,1)
+                    , factor_key = TRUE)
+
+if (nrow(lf_mcsm_na) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", mcsm_na_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for", mcsm_na_dn)
+  quit(status = 1)
+}
+
+############################################################################
+#==============
+# Dynamut: LF
+#==============
+cols_to_select_dynamut  = c(static_cols_start, c("ddg_dynamut_outcome", dynamut_dn), static_cols_end)
+wf_dynamut = comb_df_sl[, cols_to_select_dynamut]
+
+pivot_cols_dynamut = cols_to_select_dynamut[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut
+
+expected_rows_lf = nrow(wf_dynamut) * (length(wf_dynamut) - length(pivot_cols_dynamut))
+expected_rows_lf
+
+# LF data: dynamut
+lf_dynamut = gather(wf_dynamut
+                    , key = param_type
+                    , value = param_value
+                    , all_of(dynamut_dn):tail(static_cols_end,1)
+                    , factor_key = TRUE)
+
+if (nrow(lf_dynamut) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", dynamut_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for", dynamut_dn)
+  quit(status = 1)
+}
+
+############################################################################
+#==============
+# Dynamut2: LF
+#==============
+cols_to_select_dynamut2 = c(static_cols_start, c("ddg_dynamut2_outcome", dynamut2_dn), static_cols_end)
+
+wf_dynamut2 = comb_df_sl[, cols_to_select_dynamut2]
+
+pivot_cols_dynamut2 = cols_to_select_dynamut2[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut2
+
+expected_rows_lf = nrow(wf_dynamut2) * (length(wf_dynamut2) - length(pivot_cols_dynamut2))
+expected_rows_lf
+
+# LF data: dynamut2
+lf_dynamut2 = gather(wf_dynamut2
+                    , key = param_type
+                    , value = param_value
+                    , all_of(dynamut2_dn):tail(static_cols_end,1)
+                    , factor_key = TRUE)
+
+if (nrow(lf_dynamut2) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", dynamut2_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for", dynamut2_dn)
+  quit(status = 1)
+}
+
+############################################################################
+#==============
+# EnCOM ddg: LF
+#==============
+cols_to_select_encomddg = c(static_cols_start, c("ddg_encom_outcome", encom_ddg_dn), static_cols_end)
+wf_encomddg = comb_df_sl[, cols_to_select_encomddg]
+
+pivot_cols_encomddg = cols_to_select_encomddg[1: (length(static_cols_start) + 1)]; pivot_cols_encomddg 
+
+expected_rows_lf = nrow(wf_encomddg ) * (length(wf_encomddg ) - length(pivot_cols_encomddg))
+expected_rows_lf
+
+# LF data: encomddg 
+lf_encomddg  = gather(wf_encomddg 
+                     , key = param_type
+                     , value = param_value
+                     , all_of(encom_ddg_dn):tail(static_cols_end,1)
+                     , factor_key = TRUE)
+
+if (nrow(lf_encomddg) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", encom_ddg_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for", encom_ddg_dn)
+  quit(status = 1)
+}
+############################################################################
+#==============
+# EnCOM dds: LF
+#==============
+cols_to_select_encomdds = c(static_cols_start, c("dds_encom_outcome", encom_dds_dn), static_cols_end)
+wf_encomdds = comb_df_sl[, cols_to_select_encomdds]
+
+pivot_cols_encomdds = cols_to_select_encomdds[1: (length(static_cols_start) + 1)]; pivot_cols_encomdds 
+
+expected_rows_lf = nrow(wf_encomdds) * (length(wf_encomdds) - length(pivot_cols_encomdds))
+expected_rows_lf
+
+# LF data: encomdds
+lf_encomdds  = gather(wf_encomdds
+                      , key = param_type
+                      , value = param_value
+                      , all_of(encom_dds_dn):tail(static_cols_end,1)
+                      , factor_key = TRUE)
+
+if (nrow(lf_encomdds) == expected_rows_lf){
+  cat("\nPASS: long format data created for", encom_dds_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for", encom_dds_dn)
+  quit(status = 1)
+}
+
+############################################################################
+#==============
+# SDM: LF
+#==============
+cols_to_select_sdm = c(static_cols_start, c("ddg_sdm_outcome", sdm_dn), static_cols_end)
+wf_sdm = comb_df_sl[, cols_to_select_sdm]
+
+pivot_cols_sdm = cols_to_select_sdm[1: (length(static_cols_start) + 1)]; pivot_cols_sdm
+
+expected_rows_lf = nrow(wf_sdm) * (length(wf_sdm) - length(pivot_cols_sdm))
+expected_rows_lf
+
+# LF data: sdm
+lf_sdm  = gather(wf_sdm
+                 , key = param_type
+                 , value = param_value
+                 , all_of(sdm_dn):tail(static_cols_end,1)
+                 , factor_key = TRUE)
+
+if (nrow(lf_sdm) == expected_rows_lf){
+  cat("\nPASS: long format data created for", sdm_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for", sdm_dn)
+  quit(status = 1)
+}
+
+############################################################################
+#==============
+# mCSM: LF
+#==============
+cols_to_select_mcsm  = c(static_cols_start, c("ddg_mcsm_outcome", mcsm_dn), static_cols_end)
+wf_mcsm = comb_df_sl[, cols_to_select_mcsm]
+
+pivot_cols_mcsm = cols_to_select_mcsm[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm
+
+expected_rows_lf = nrow(wf_mcsm) * (length(wf_mcsm) - length(pivot_cols_mcsm))
+expected_rows_lf
+
+# LF data: mcsm
+lf_mcsm  = gather(wf_mcsm
+                 , key = param_type
+                 , value = param_value
+                 , all_of(mcsm_dn):tail(static_cols_end,1)
+                 , factor_key = TRUE)
+
+if (nrow(lf_mcsm) == expected_rows_lf){
+  cat("\nPASS: long format data created for", mcsm_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for", mcsm_dn)
+  quit(status = 1)
+}
+############################################################################
+

From 996d67b423bc4ee5033f53c78ca2f8be93a52ae0 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Mon, 13 Sep 2021 10:24:41 +0100
Subject: [PATCH 25/51] added pretty colnames to corr_data.R

---
 scripts/plotting/corr_data.R | 53 ++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/scripts/plotting/corr_data.R b/scripts/plotting/corr_data.R
index d33efc5..3120763 100644
--- a/scripts/plotting/corr_data.R
+++ b/scripts/plotting/corr_data.R
@@ -32,6 +32,32 @@ corr_cols_select <- c("mutationinformation", drug, "mutation_info_labels"
 
 corr_df_m2 = merged_df2[,colnames(merged_df2)%in%corr_cols_select]
 
+#-----------------------
+# formatting: some cols
+# Add pretty colnames
+#-----------------------
+corr_df_m2_f <- corr_df_m2 %>% 
+  rename(
+      DUET       = duet_stability_change
+    , 'mCSM-lig' = ligand_affinity_change
+    , FoldX      = ddg_foldx
+    , DeepDDG    = deepddg
+    , ASA        = asa
+    , RSA        = rsa
+    , KD         = kd_values
+    , RD         = rd_values
+    , MAF        = af
+    , 'Log (OR)' = log10_or_mychisq
+    , '-Log (P)' = neglog_pval_fisher
+    , Dynamut    = ddg_dynamut        
+    , 'ENCoM-DDG'= ddg_encom
+    , mCSM       = ddg_mcsm
+    , SDM        = ddg_sdm           
+    , 'DUET-d'   = ddg_duet
+    , 'ENCoM-DDS'= dds_encom
+    , Dynamut2   = ddg_dynamut2
+    , 'mCSM-NA'  = mcsm_na_affinity ) 
+
 #===========================
 # Corr data for plots: PS
 # short_df ps: ~merged_df3
@@ -53,6 +79,33 @@ if (nrow(corr_df_m3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) {
       , "\nGot: ", check1)
 }
 
+#-----------------------
+# formatting: some cols
+# Add pretty colnames
+#-----------------------
+corr_df_m3_f <- corr_df_m3 %>% 
+  rename(
+      DUET       = duet_stability_change
+    , 'mCSM-lig' = ligand_affinity_change
+    , FoldX      = ddg_foldx
+    , DeepDDG    = deepddg
+    , ASA        = asa
+    , RSA        = rsa
+    , KD         = kd_values
+    , RD         = rd_values
+    , MAF        = af
+    , 'Log (OR)' = log10_or_mychisq
+    , '-Log (P)' = neglog_pval_fisher
+    , Dynamut    = ddg_dynamut        
+    , 'ENCoM-DDG'= ddg_encom
+    , mCSM       = ddg_mcsm
+    , SDM        = ddg_sdm           
+    , 'DUET-d'   = ddg_duet
+    , 'ENCoM-DDS'= dds_encom
+    , Dynamut2   = ddg_dynamut2
+    , 'mCSM-NA'  = mcsm_na_affinity ) 
+
+########################################################################
 cat("\nCorr Data created:"
 , "\n==================================="
 , "\ncorr_df_m2: created from merged_df2"

From b98977336cb1fec4223563188a539de4bf79ef85 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 14 Sep 2021 15:36:05 +0100
Subject: [PATCH 26/51] updated my_pairs_panel.R to make the dots coloured

---
 scripts/functions/my_pairs_panel.R | 44 ++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 14 deletions(-)

diff --git a/scripts/functions/my_pairs_panel.R b/scripts/functions/my_pairs_panel.R
index 0c73192..eb0268a 100644
--- a/scripts/functions/my_pairs_panel.R
+++ b/scripts/functions/my_pairs_panel.R
@@ -1,24 +1,40 @@
-my_corr_pairs <- function (corr_data){
+my_corr_pairs <- function (corr_data_all
+                           , corr_data_range = 1:length(corr_data_all)
+                           , corr_method = "spearman" # other options: "pearson" or "kendall"
+                           , colour_categ_col = "mutation_info_labels"
+                           , categ_colour =  c("#E69F00", "#999999")
+                           , density_show = F
+                           , hist_col = "coral4"
+                           , dot_size = 1.6
+                           , ats = 1
+                           , corr_lab_size = 1
+                           , corr_value_size = 1) 
+  {
   
-  OutPlot_corr = pairs.panels(corr_data
-                              , method = "spearman" # correlation method
-                              , hist.col = "grey" ##00AFBB
-                              , density = TRUE  # show density plots
-                              , ellipses = F # show correlation ellipses
+  corr_data_df =  corr_data_all[corr_data_range]
+  my_bg = categ_colour[corr_data_all[[colour_categ_col]] ]
+
+  OutPlot_corr = pairs.panels(corr_data_df
+                              , method = corr_method
+                              , hist.col = hist_col
+                              , density = density_show  
+                              , ellipses = F 
+                              , smooth = F
                               , stars = T
                               , rug = F
                               , breaks = "Sturges"
                               , show.points = T
-                              #, bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_ps$duet_outcome))] # foldx colours are reveresed
-                              #, pch = 21 # for bg
-                              , jitter = T
+                              #, bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_data$duet_outcome))] # foldx colours are reveresed
+                              , bg = my_bg
+                              , pch = 21 
                               , alpha = 1
-                              , cex = 1.8
-                              , cex.axis = 2
-                              , cex.labels = 3.5
-                              , cex.cor = 1
-                              , smooth = F)
+                              , cex = dot_size
+                              , cex.axis = ats
+                              , cex.labels = corr_lab_size
+                              , cex.cor = corr_value_size
+                              )
   return(OutPlot_corr)
+  #return (my_bg)
   
 }
 

From bf432cd054485d63d9b1f9a722ef5f0893b8af2c Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 14 Sep 2021 18:20:12 +0100
Subject: [PATCH 27/51] more updates to pairs_panels to take colnames for
 plotting

---
 scripts/functions/my_pairs_panel.R | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/scripts/functions/my_pairs_panel.R b/scripts/functions/my_pairs_panel.R
index eb0268a..8968111 100644
--- a/scripts/functions/my_pairs_panel.R
+++ b/scripts/functions/my_pairs_panel.R
@@ -1,17 +1,17 @@
 my_corr_pairs <- function (corr_data_all
-                           , corr_data_range = 1:length(corr_data_all)
+                           , corr_cols = colnames(corr_data_all)
                            , corr_method = "spearman" # other options: "pearson" or "kendall"
                            , colour_categ_col = "mutation_info_labels"
                            , categ_colour =  c("#E69F00", "#999999")
                            , density_show = F
                            , hist_col = "coral4"
                            , dot_size = 1.6
-                           , ats = 1
-                           , corr_lab_size = 1
+                           , ats = 1.5
+                           , corr_lab_size = 3
                            , corr_value_size = 1) 
   {
   
-  corr_data_df =  corr_data_all[corr_data_range]
+  corr_data_df =  corr_data_all[corr_cols]
   my_bg = categ_colour[corr_data_all[[colour_categ_col]] ]
 
   OutPlot_corr = pairs.panels(corr_data_df
@@ -38,6 +38,14 @@ my_corr_pairs <- function (corr_data_all
   
 }
 
+c_plot <- my_corr_pairs(corrplot_df
+                        
+                        , dot_size = 1.6
+                        , ats = 1.5
+                        , corr_lab_size = 1.5
+                        , corr_value_size = 1)
+
+
 ######################################################################
 my_pp = function (x, smooth = TRUE, scale = FALSE, density = TRUE, ellipses = TRUE, 
           digits = 2, method = "pearson", pch = 20, lm = FALSE, cor = TRUE, 

From 449af7acf424dae644e3d40bce3be963066b65f1 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 15 Sep 2021 15:46:42 +0100
Subject: [PATCH 28/51] fixed pos_count calcs in function by specifying dplyr
 and changed summarize to summarise

---
 scripts/functions/my_pairs_panel.R    |  8 --------
 scripts/functions/position_count_bp.R | 21 ++++++++++++++-------
 scripts/plotting/Header_TT.R          | 11 ++++++++++-
 3 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/scripts/functions/my_pairs_panel.R b/scripts/functions/my_pairs_panel.R
index 8968111..808417f 100644
--- a/scripts/functions/my_pairs_panel.R
+++ b/scripts/functions/my_pairs_panel.R
@@ -38,14 +38,6 @@ my_corr_pairs <- function (corr_data_all
   
 }
 
-c_plot <- my_corr_pairs(corrplot_df
-                        
-                        , dot_size = 1.6
-                        , ats = 1.5
-                        , corr_lab_size = 1.5
-                        , corr_value_size = 1)
-
-
 ######################################################################
 my_pp = function (x, smooth = TRUE, scale = FALSE, density = TRUE, ellipses = TRUE, 
           digits = 2, method = "pearson", pch = 20, lm = FALSE, cor = TRUE, 
diff --git a/scripts/functions/position_count_bp.R b/scripts/functions/position_count_bp.R
index ce0767c..2907b4b 100755
--- a/scripts/functions/position_count_bp.R
+++ b/scripts/functions/position_count_bp.R
@@ -42,7 +42,9 @@ site_snp_count_bp <- function (plotdf
              , "\nNo. of cols:", ncol(plotdf)
              , "\nNow adding column: frequency of mutational positions"))
   
-  # adding snpcount for each position 
+  #-------------------------------------------
+  # adding column: snpcount for each position 
+  #-------------------------------------------
   setDT(plotdf)[, pos_count := .N, by = .(eval(parse(text = df_colname)))] 
 
   cat("\nCumulative nssnp count\n"
@@ -64,15 +66,20 @@ site_snp_count_bp <- function (plotdf
   cat(paste0("\nrevised df dimensions:"
              , "\nNo. of rows:", nrow(plotdf)
              , "\nNo. of cols:", ncol(plotdf)))
-  
+
+  #------------------------------------------------------
+  # creating df: average count of snpcount for each position 
+  # created in earlier step
+  #-------------------------------------------------------
   # use group by on pos_count
   snpsBYpos_df <- plotdf %>%
-    group_by(eval(parse(text = df_colname))) %>%
-    summarize(snpsBYpos = mean(pos_count))
-  
-  cat("\nnssnp count\n"
-      , table(snpsBYpos_df$snpsBYpos))
+    dplyr::group_by(eval(parse(text = df_colname))) %>%
+    dplyr::summarise(snpsBYpos = mean(pos_count)) # changed from summarize!
   
+  cat("\nnssnp count per position\n"
+      , table(snpsBYpos_df$snpsBYpos)
+      , "\n")
+
   # calculating total no. of sites associated with nsSNPs
   tot_sites = sum(table(snpsBYpos_df$snpsBYpos))
   
diff --git a/scripts/plotting/Header_TT.R b/scripts/plotting/Header_TT.R
index 47599d3..2fa892c 100755
--- a/scripts/plotting/Header_TT.R
+++ b/scripts/plotting/Header_TT.R
@@ -6,7 +6,6 @@
 #########################################################
 #lib_loc = "/usr/local/lib/R/site-library")
 
-
 require("getopt", quietly = TRUE) # cmd parse arguments
 
 if (!require("tidyverse")) {
@@ -19,6 +18,11 @@ if (!require("shiny")) {
   library(shiny)
 }
 
+if (!require("shinyBS")) {
+  install.packages("shinyBS", dependencies = TRUE)
+  library(shinyBS)
+}
+
 if (!require("gridExtra")) {
   install.packages("gridExtra", dependencies = TRUE)
   library(gridExtra)
@@ -39,6 +43,11 @@ if (!require("ggridges")) {
 #   library(dplyr)
 # }
 
+if (!require ("DT")){
+  install.packages("DT")
+  library(DT)
+}
+
 if (!require ("plyr")){
    install.packages("plyr")
    library(plyr)

From 7550efbd4c45c852af4ebd3fd56fa50e86641188 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 15 Sep 2021 19:29:09 +0100
Subject: [PATCH 29/51] added wideplot subcols generation within
 bp_subcolours.R to make it easier to call the whole thing as a function and
 use merged_df3 to generate plot without having to separately generate special
 data for it. Tested with real data on different stability params

---
 scripts/functions/bp_subcolours.R   | 81 ++++++++++++++++++++++++++++-
 scripts/plotting/get_plotting_dfs.R |  2 +-
 2 files changed, 80 insertions(+), 3 deletions(-)

diff --git a/scripts/functions/bp_subcolours.R b/scripts/functions/bp_subcolours.R
index a3cc403..3db4079 100755
--- a/scripts/functions/bp_subcolours.R
+++ b/scripts/functions/bp_subcolours.R
@@ -3,7 +3,7 @@
 # LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar
 #########################################################
 
-ColourPalleteMulti <- function(df, group, subgroup){
+ColourPalleteMulti = function(df, group, subgroup){
   
   # Find how many colour categories to create and the number of colours in each
   categories <- aggregate(as.formula(paste(subgroup, group, sep="~" ))
@@ -24,4 +24,81 @@ ColourPalleteMulti <- function(df, group, subgroup){
                                                          , category.end[i]))(categories[i,2])}))
   return(colours)
 }
-#########################################################
\ No newline at end of file
+#########################################################################
+
+bp_stability_hmap <- function(plotdf = merged_df3
+                              , xvar_colname = "position"
+                              #, bar_col_colname = "group"
+                              , stability_colname = "duet_scaled"
+                              , stability_outcome_colname = "duet_outcome"
+                              , p_title = ""  # "Protein stability (DUET)"
+                              , my_xaxls = 12 # x-axis label size
+                              , my_yaxls = 20 # y-axis label size
+                              , my_xaxts = 18 # x-axis text size
+                              , my_yaxts = 20 # y-axis text size
+                              , my_pts  = 20  # plot-title size
+                              , my_xlab = "Position"
+                              , my_ylab = "No. of nsSNPs"
+)
+{
+  
+  # order the df by position and ensure it is a factor
+  plotdf = plotdf[order(plotdf[[xvar_colname]]), ]
+  plotdf[[xvar_colname]] = factor(plotdf[[xvar_colname]])
+  
+  #cat("\nSneak peak:\n")
+  head(data.frame( plotdf[[xvar_colname]], plotdf[[stability_colname]] ) )
+  
+  # stability values isolated to help with generating column called: 'group'
+  my_grp = plotdf[[stability_colname]]
+  cat( "\nLength of nsSNPs:", length(my_grp)
+       , "\nLength of unique values for nsSNPs:", length(unique(my_grp)) )
+  
+  # Add col: 'group'
+  plotdf$group = paste0(plotdf[[stability_outcome_colname]], "_", my_grp, sep = "")
+  
+  # check unique values in normalised data
+  cat("\nNo. of unique values in", stability_colname, "no rounding:"
+      , length(unique(plotdf[[stability_colname]])))
+  
+  # Call the function to create the palette based on the group defined above
+  #subcols_ps
+  subcols_bp_hmap = ColourPalleteMulti(plotdf, stability_outcome_colname, stability_colname)
+  
+  cat("\nNo. of sub colours generated:", length(subcols_bp_hmap))
+  
+  #-------------------------------
+  # Generate the subcols barplot
+  #-------------------------------
+  
+  #g = ggplot(plotdf, aes(x = factor(position, ordered = T)))
+  g = ggplot(plotdf, aes_string(x = xvar_colname
+                                # , ordered = T)
+  ))
+  
+  
+  OutWidePlot = g + geom_bar(aes(fill = group)
+                             , colour = "grey") +
+    
+    scale_fill_manual( values = subcols_bp_hmap
+                       , guide = "none") +
+    
+    theme( axis.text.x = element_text(size = my_xaxls
+                                      , angle = 90
+                                      , hjust = 1
+                                      , vjust = 0.4)
+           , axis.text.y = element_text(size = my_yaxls
+                                        , angle = 0
+                                        , hjust = 1
+                                        , vjust = 0)
+           , axis.title.x = element_text(size = my_xaxts)
+           , axis.title.y = element_text(size = my_yaxts ) 
+           , plot.title = element_text(size = my_pts
+                                       , hjust = 0.5)) +
+    
+    labs(title = p_title
+         , x = my_xlab
+         , y = my_ylab)
+  
+  return(OutWidePlot)
+}
diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R
index c1ce5b2..a50b0a9 100755
--- a/scripts/plotting/get_plotting_dfs.R
+++ b/scripts/plotting/get_plotting_dfs.R
@@ -129,7 +129,7 @@ merged_df3_comp = all_plot_dfs[[4]]
 #                        Data for subcols barplot (~heatmap)
 ####################################################################
 
-source("coloured_bp_data.R")
+#source("coloured_bp_data.R")
 
 ####################################################################
 #                        Data for logoplots

From 2ac5ec410eb78d2ceeb5ea690596bba354e716ed Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 15 Sep 2021 19:33:52 +0100
Subject: [PATCH 30/51] added test_bp_subcolours.R

---
 scripts/functions/tests/test_bp_subcolours.R | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 scripts/functions/tests/test_bp_subcolours.R

diff --git a/scripts/functions/tests/test_bp_subcolours.R b/scripts/functions/tests/test_bp_subcolours.R
new file mode 100644
index 0000000..83fca05
--- /dev/null
+++ b/scripts/functions/tests/test_bp_subcolours.R
@@ -0,0 +1,16 @@
+#!/usr/bin/env Rscript      
+source("~/git/Misc/shiny/myshiny/gid_data.R")
+setwd("~/git/LSHTM_analysis/scripts/functions/")
+source("bp_subcolours.R")
+
+
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "deepddg_scaled"
+                  , stability_outcome_colname = "deepddg_outcome"
+                  , p_title = "DeepDDG" )
+
+
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "foldx_scaled"
+                  , stability_outcome_colname = "foldx_outcome"
+                  , p_title = "FoldX" )

From f0e66b2f7b3e489e13a65cad15046fb292856717 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 15 Sep 2021 19:34:24 +0100
Subject: [PATCH 31/51] added the scratch script as _v2 to play while
 repurposing bp_subcolours.R

---
 .../functions/redundant/bp_subcolours_v2.R    | 104 ++++++++++++++++++
 1 file changed, 104 insertions(+)
 create mode 100644 scripts/functions/redundant/bp_subcolours_v2.R

diff --git a/scripts/functions/redundant/bp_subcolours_v2.R b/scripts/functions/redundant/bp_subcolours_v2.R
new file mode 100644
index 0000000..a049ba2
--- /dev/null
+++ b/scripts/functions/redundant/bp_subcolours_v2.R
@@ -0,0 +1,104 @@
+#########################################################
+# 1b: Define function: coloured barplot by subgroup
+# LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar
+#########################################################
+
+ColourPalleteMulti = function(df, group, subgroup){
+  
+  # Find how many colour categories to create and the number of colours in each
+  categories <- aggregate(as.formula(paste(subgroup, group, sep="~" ))
+                          , df
+                          , function(x) length(unique(x)))
+  #  return(categories) }
+  
+  category.start <- (scales::hue_pal(l = 100)(nrow(categories))) # Set the top of the colour pallete
+  
+  category.end  <- (scales::hue_pal(l = 40)(nrow(categories))) # set the bottom
+  
+  #return(category.start); return(category.end)}
+  
+  # Build Colour pallette
+  colours <- unlist(lapply(1:nrow(categories),
+                           function(i){
+                             colorRampPalette(colors = c(category.start[i]
+                                                         , category.end[i]))(categories[i,2])}))
+  return(colours)
+}
+#########################################################################
+
+bp_stability_hmap <- function(plotdf = merged_df3
+                              , xvar_colname = "position"
+                              #, bar_col_colname = "group"
+                              , stability_colname = "duet_scaled"
+                              , stability_outcome_colname = "duet_outcome"
+                              , p_title = ""  # "Protein stability (DUET)"
+                              , my_xaxls = 12 # x-axis label size
+                              , my_yaxls = 20 # y-axis label size
+                              , my_xaxts = 18 # x-axis text size
+                              , my_yaxts = 20 # y-axis text size
+                              , my_pts  = 20  # plot-title size
+                              , my_xlab = "Position"
+                              , my_ylab = "No. of nsSNPs"
+                              )
+{
+
+  # order the df by position and ensure it is a factor
+  plotdf = plotdf[order(plotdf[[xvar_colname]]), ]
+  plotdf[[xvar_colname]] = factor(plotdf[[xvar_colname]])
+  
+  #cat("\nSneak peak:\n")
+  head(data.frame( plotdf[[xvar_colname]], plotdf[[stability_colname]] ) )
+
+  # stability values isolated to help with generating column called: 'group'
+  my_grp = plotdf[[stability_colname]]
+  cat( "\nLength of nsSNPs:", length(my_grp)
+       , "\nLength of unique values for nsSNPs:", length(unique(my_grp)) )
+  
+  # Add col: 'group'
+  plotdf$group = paste0(plotdf[[stability_outcome_colname]], "_", my_grp, sep = "")
+
+  # check unique values in normalised data
+  cat("\nNo. of unique values in", stability_colname, "no rounding:"
+      , length(unique(plotdf[[stability_colname]])))
+  
+  # Call the function to create the palette based on the group defined above
+  #subcols_ps
+  subcols_bp_hmap = ColourPalleteMulti(plotdf, stability_outcome_colname, stability_colname)
+  
+  cat("\nNo. of sub colours generated:", length(subcols_bp_hmap))
+  
+  #-------------------------------
+  # Generate the subcols barplot
+  #-------------------------------
+  
+  #g = ggplot(plotdf, aes(x = factor(position, ordered = T)))
+  g = ggplot(plotdf, aes_string(x = xvar_colname
+                               # , ordered = T)
+             ))
+  
+
+  OutWidePlot = g + geom_bar(aes(fill = group)
+                             , colour = "grey") +
+    
+  scale_fill_manual( values = subcols_bp_hmap
+                       , guide = "none") +
+    
+    theme( axis.text.x = element_text(size = my_xaxls
+                                      , angle = 90
+                                      , hjust = 1
+                                      , vjust = 0.4)
+           , axis.text.y = element_text(size = my_yaxls
+                                        , angle = 0
+                                        , hjust = 1
+                                        , vjust = 0)
+           , axis.title.x = element_text(size = my_xaxts)
+           , axis.title.y = element_text(size = my_yaxts ) 
+           , plot.title = element_text(size = my_pts
+                                       , hjust = 0.5)) +
+    
+    labs(title = p_title
+         , x = my_xlab
+         , y = my_ylab)
+
+  return(OutWidePlot)
+}
\ No newline at end of file

From 96e6e8db5da59c6da0aabe70cf9679cf7c4d7aa1 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 15 Sep 2021 19:37:39 +0100
Subject: [PATCH 32/51] saving work and tidying script

---
 scripts/functions/bp_subcolours.R | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/scripts/functions/bp_subcolours.R b/scripts/functions/bp_subcolours.R
index 3db4079..91a8914 100755
--- a/scripts/functions/bp_subcolours.R
+++ b/scripts/functions/bp_subcolours.R
@@ -26,11 +26,18 @@ ColourPalleteMulti = function(df, group, subgroup){
 }
 #########################################################################
 
+########################
+# Generate bp with
+# colour palette derived
+# from the data using 
+# above function
+#########################
+
 bp_stability_hmap <- function(plotdf = merged_df3
                               , xvar_colname = "position"
                               #, bar_col_colname = "group"
-                              , stability_colname = "duet_scaled"
-                              , stability_outcome_colname = "duet_outcome"
+                              , stability_colname = ""
+                              , stability_outcome_colname = ""
                               , p_title = ""  # "Protein stability (DUET)"
                               , my_xaxls = 12 # x-axis label size
                               , my_yaxls = 20 # y-axis label size

From 1d16c6848ec6dafd81ab26064aac7389e21dc533 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 15 Sep 2021 19:42:08 +0100
Subject: [PATCH 33/51] moved coloured_bp_data.R to redundant in light of
 updated function and reflected this in notes within get_plotting_dfs.R

---
 scripts/plotting/get_plotting_dfs.R                 | 5 +++++
 scripts/plotting/{ => redundant}/coloured_bp_data.R | 0
 2 files changed, 5 insertions(+)
 rename scripts/plotting/{ => redundant}/coloured_bp_data.R (100%)

diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R
index a50b0a9..ec67a49 100755
--- a/scripts/plotting/get_plotting_dfs.R
+++ b/scripts/plotting/get_plotting_dfs.R
@@ -124,12 +124,17 @@ merged_df3_comp = all_plot_dfs[[4]]
 ####################################################################
 
 #source("other_dfs_data.R")
+# Fixed this at source i.e python script
+# Moved: "other_dfs_data.R" to redundant/
 
 ####################################################################
 #                        Data for subcols barplot (~heatmap)
 ####################################################################
 
 #source("coloured_bp_data.R")
+# Repurposed function so that params can be passed instead to generate
+# data required for plotting.
+# Moved "coloured_bp_data.R" to redundant/
 
 ####################################################################
 #                        Data for logoplots
diff --git a/scripts/plotting/coloured_bp_data.R b/scripts/plotting/redundant/coloured_bp_data.R
similarity index 100%
rename from scripts/plotting/coloured_bp_data.R
rename to scripts/plotting/redundant/coloured_bp_data.R

From 746889b07538a23dff34761f54097752d5986ade Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 15 Sep 2021 19:48:56 +0100
Subject: [PATCH 34/51] saving work for the day after massive repurpose

---
 scripts/plotting/get_plotting_dfs.R | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R
index ec67a49..d5d1535 100755
--- a/scripts/plotting/get_plotting_dfs.R
+++ b/scripts/plotting/get_plotting_dfs.R
@@ -1,26 +1,21 @@
 #!/usr/bin/env Rscript
+
 #########################################################
 # TASK: Get formatted data for plots
-#=======================================================================
+#########################################################
 # working dir and loading libraries
 getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting")
 getwd()
 
 source("Header_TT.R")
-# source("../functions/my_pairs_panel.R") # with lower panel turned off
-# source("../functions/plotting_globals.R")
-# source("../functions/plotting_data.R")
-# source("../functions/combining_dfs_plotting.R")
-# source("../functions/bp_subcolours.R")
 
 #********************
 # cmd args passed 
 # in from other scripts
 # to call this
 #********************
-#drug = 'streptomycin'
-#gene = 'gid'
+
 #====================
 # variables for lig
 #====================

From 56600ac3f8bc9cbe0bdb98a6407665f36e75707f Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 16 Sep 2021 10:05:28 +0100
Subject: [PATCH 35/51] added config/ with drug gene names

---
 config/gid.R                                 | 2 ++
 scripts/functions/tests/test_bp_subcolours.R | 4 ++++
 2 files changed, 6 insertions(+)
 create mode 100644 config/gid.R

diff --git a/config/gid.R b/config/gid.R
new file mode 100644
index 0000000..226af91
--- /dev/null
+++ b/config/gid.R
@@ -0,0 +1,2 @@
+gene = "gid"
+drug = "streptomycin"
diff --git a/scripts/functions/tests/test_bp_subcolours.R b/scripts/functions/tests/test_bp_subcolours.R
index 83fca05..2156e49 100644
--- a/scripts/functions/tests/test_bp_subcolours.R
+++ b/scripts/functions/tests/test_bp_subcolours.R
@@ -9,6 +9,10 @@ bp_stability_hmap(plotdf =  merged_df3
                   , stability_outcome_colname = "deepddg_outcome"
                   , p_title = "DeepDDG" )
 
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "ddg_dynamut2_scaled"
+                  , stability_outcome_colname = "ddg_dynamut2_outcome"
+                  , p_title = "Dynamut2" )
 
 bp_stability_hmap(plotdf =  merged_df3
                   , stability_colname = "foldx_scaled"

From cb5d7aa5ab7bcca24f365e67d218afbc4384990d Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 16 Sep 2021 10:59:55 +0100
Subject: [PATCH 36/51] corrected foldx_outcome classification in
 combining_dfs.py as positive are Destabilising and neg as Stabilising

---
 scripts/combining_dfs.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py
index faa9677..9331edd 100755
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@@ -190,6 +190,10 @@ foldx_max = foldx_df['ddg_foldx'].max()
 foldx_min
 foldx_max
 
+# quick check
+len(foldx_df.loc[foldx_df['ddg_foldx'] >= 0])
+len(foldx_df.loc[foldx_df['ddg_foldx'] < 0])
+
 foldx_scale = lambda x : x/abs(foldx_min) if x < 0 else (x/foldx_max if x >= 0 else 'failed')
 
 foldx_df['foldx_scaled'] = foldx_df['ddg_foldx'].apply(foldx_scale)
@@ -216,13 +220,16 @@ else:
           , '\n======================================================')
 
 #-------------------------
-# foldx outcome category
+# foldx outcome category:
+# Remember, its inverse
+# +ve: Destabilising
+# -ve: Stabilising
 #--------------------------
-foldx_df['foldx_outcome'] = foldx_df['ddg_foldx'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
+foldx_df['foldx_outcome'] = foldx_df['ddg_foldx'].apply(lambda x: 'Destabilising' if x >= 0 else 'Stabilising')
 foldx_df[foldx_df['ddg_foldx']>=0].count()
 foc = foldx_df['foldx_outcome'].value_counts()
 
-if foc['Stabilising'] == foldx_pos and  foc['Stabilising'] == foldx_pos2:
+if foc['Destabilising'] == foldx_pos and foc['Destabilising'] == foldx_pos2:
     print('\nPASS: Foldx outcome category created')
 else:
     print('\nFAIL: Foldx outcome category could NOT be created'

From e8734b1c4be4b9d99184b23567337703a0a90d79 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 16 Sep 2021 12:43:36 +0100
Subject: [PATCH 37/51] sorted merged_df2 and consequently others by position
 in combining_dfs_plotting.R

---
 scripts/functions/combining_dfs_plotting.R   |  8 ++-
 scripts/functions/tests/test_bp_subcolours.R | 70 ++++++++++++++++++--
 2 files changed, 71 insertions(+), 7 deletions(-)

diff --git a/scripts/functions/combining_dfs_plotting.R b/scripts/functions/combining_dfs_plotting.R
index 848face..107c114 100644
--- a/scripts/functions/combining_dfs_plotting.R
+++ b/scripts/functions/combining_dfs_plotting.R
@@ -153,7 +153,13 @@ combining_dfs_plotting <- function(  my_df_u
     quit()
   }
 
-  # Quick formatting: pretty labels
+  # Quick formatting: ordering df and pretty labels
+  
+  #------------------------------
+  # sorting by column: position
+  #------------------------------
+  merged_df2 = merged_df2[order(merged_df2$position), ]
+  
   #-----------------------
   # mutation_info_labels
   #-----------------------
diff --git a/scripts/functions/tests/test_bp_subcolours.R b/scripts/functions/tests/test_bp_subcolours.R
index 2156e49..8866ffe 100644
--- a/scripts/functions/tests/test_bp_subcolours.R
+++ b/scripts/functions/tests/test_bp_subcolours.R
@@ -1,20 +1,78 @@
 #!/usr/bin/env Rscript      
-source("~/git/Misc/shiny/myshiny/gid_data.R")
-setwd("~/git/LSHTM_analysis/scripts/functions/")
-source("bp_subcolours.R")
+#source("~/git/Misc/shiny/myshiny/gid_data.R")
 
+source("~/git/LSHTM_analysis/config/gid.R")
+source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
+source("~/git/LSHTM_analysis/scripts/functions/bp_subcolours.R")
 
+# p1
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "duet_scaled"
+                  , stability_outcome_colname = "duet_outcome"
+                  , p_title = "DUET" )
+
+# p2
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "foldx_scaled"
+                  , stability_outcome_colname = "foldx_outcome"
+                  , p_title = "FoldX" )
+
+# p3
 bp_stability_hmap(plotdf =  merged_df3
                   , stability_colname = "deepddg_scaled"
                   , stability_outcome_colname = "deepddg_outcome"
                   , p_title = "DeepDDG" )
 
+# p4
 bp_stability_hmap(plotdf =  merged_df3
                   , stability_colname = "ddg_dynamut2_scaled"
                   , stability_outcome_colname = "ddg_dynamut2_outcome"
                   , p_title = "Dynamut2" )
 
+# p5
 bp_stability_hmap(plotdf =  merged_df3
-                  , stability_colname = "foldx_scaled"
-                  , stability_outcome_colname = "foldx_outcome"
-                  , p_title = "FoldX" )
+                  , stability_colname = "mcsm_na_scaled"
+                  , stability_outcome_colname = "mcsm_na_outcome"
+                  , p_title = "mCSM-NA" )
+
+# p6
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "ddg_dynamut_scaled"
+                  , stability_outcome_colname = "ddg_dynamut_outcome"
+                  , p_title = "Dynamut" )
+
+# p7
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "ddg_mcsm_scaled"
+                  , stability_outcome_colname = "ddg_mcsm_outcome"
+                  , p_title = "mCSM" )
+
+# p8
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "ddg_duet_scaled"
+                  , stability_outcome_colname = "ddg_duet_outcome"
+                  , p_title = "DUET-d" )
+
+# p9
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "ddg_sdm_scaled"
+                  , stability_outcome_colname = "ddg_sdm_outcome"
+                  , p_title = "SDM" )
+
+# p10
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "ddg_encom_scaled"
+                  , stability_outcome_colname = "ddg_encom_outcome"
+                  , p_title = "ENCoM-Stability" )
+
+# p11
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "dds_encom_scaled"
+                  , stability_outcome_colname = "dds_encom_outcome"
+                  , p_title = "ENCoM-Flexibility" )
+
+# p12
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "affinity_scaled"
+                  , stability_outcome_colname = "ligand_outcome"
+                  , p_title = "mCSM-lig" )

From 51aa3217928b17159a10a6ef5bf5cb33d396f8d8 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 16 Sep 2021 12:44:42 +0100
Subject: [PATCH 38/51] sorting out bp_subcolours in interaction

---
 .../functions/tests/test_bp_subcolours_i.R    | 59 +++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 scripts/functions/tests/test_bp_subcolours_i.R

diff --git a/scripts/functions/tests/test_bp_subcolours_i.R b/scripts/functions/tests/test_bp_subcolours_i.R
new file mode 100644
index 0000000..d8c1b42
--- /dev/null
+++ b/scripts/functions/tests/test_bp_subcolours_i.R
@@ -0,0 +1,59 @@
+#!/usr/bin/env Rscript      
+#source("~/git/Misc/shiny/myshiny/gid_data.R")
+
+source("~/git/LSHTM_analysis/config/gid.R")
+source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
+source("~/git/LSHTM_analysis/scripts/functions/bp_subcolours.R")
+
+# p1
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "duet_scaled"
+                  , stability_outcome_colname = "duet_outcome"
+                  , p_title = "DUET" )
+
+# p2
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "foldx_scaled"
+                  , stability_outcome_colname = "foldx_outcome"
+                  , p_title = "FoldX" )
+
+# p3
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "deepddg_scaled"
+                  , stability_outcome_colname = "deepddg_outcome"
+                  , p_title = "DeepDDG" )
+
+##################################################
+
+merged_df3_f = merged_df3
+
+setDT(merged_df3_f)[, pos_count := .N, by = position] 
+
+##################################################
+ui <- basicPage(
+  plotOutput("plot1", click = "plot_click"),
+  verbatimTextOutput("info")
+)
+
+server <- function(input, output) {
+  output$plot1 <- renderPlot({
+    
+    #plot(mtcars$wt, mtcars$mpg)
+    bp_stability_hmap(plotdf =  merged_df3_f
+                      , xvar_colname = "position"
+                      , stability_colname = "foldx_scaled"
+                      , stability_outcome_colname = "foldx_outcome"
+                      , p_title = "FoldX" )
+    
+  })
+  
+  output$info <- renderPrint({
+    # With base graphics, need to tell it what the x and y variables are.
+    nearPoints(merged_df3_f, input$plot_click
+                  , xvar = "position"
+                  , yvar = "pos_count"
+                  )
+  })
+}
+
+shinyApp(ui, server)
\ No newline at end of file

From e2d7a6567e5e0cb196deb6291d6d57fe5af513df Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 16 Sep 2021 18:59:02 +0100
Subject: [PATCH 39/51] minor bug fixes to allow i_graphs for stability to
 render correctly

---
 scripts/functions/stability_count_bp.R | 35 ++++++++++++++------------
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/scripts/functions/stability_count_bp.R b/scripts/functions/stability_count_bp.R
index 8f7d9ed..e5ed684 100644
--- a/scripts/functions/stability_count_bp.R
+++ b/scripts/functions/stability_count_bp.R
@@ -15,39 +15,42 @@ theme_set(theme_grey())
   ## ...opt args
 #==========================================================
 stability_count_bp <- function(plotdf
-         , df_colname
-         , leg_title = "Legend title"
-         , axis_text_size = 25
-         , axis_label_size = 22
-         , leg_text_size = 20
-         , leg_title_size = 22
+         , df_colname = ""
+         , leg_title = "Legend Title"
+         , ats = 25     # axis text size
+         , als = 22     # axis label size
+         , lts = 20     # legend text size
+         , ltis = 22    # legend title size
+         , geom_ls = 10 # geom_label size
          , yaxis_title = "Number of nsSNPs"
          , bp_plot_title = ""
          , label_categories = c("Destabilising", "Stabilising")
          , title_colour = "chocolate4"
          , subtitle_text = NULL
-         , subtitle_size = 20
+         , sts = 20
          , subtitle_colour = "pink"
          #, leg_position = c(0.73,0.8) # within plot area
          , leg_position = "top"){
  
-  OutPlot_count = ggplot(plotdf, aes(x = eval(parse(text = df_colname)))) + 
+#  OutPlot_count = ggplot(plotdf, aes(x = eval(parse(text = df_colname)))) +
+  OutPlot_count = ggplot(plotdf, aes_string(x = df_colname)) +     
     geom_bar(aes(fill = eval(parse(text = df_colname))), show.legend = TRUE) +
     geom_label(stat = "count"
                , aes(label = ..count..)
                , color = "black"
                , show.legend = FALSE
-               , size = 10) +
+               , size = geom_ls) +
     theme(axis.text.x = element_blank()
           , axis.title.x = element_blank()
-          , axis.title.y = element_text(size =  axis_label_size)
-          , axis.text.y = element_text(size = axis_text_size)
+          , axis.title.y = element_text(size =  als)
+          , axis.text.y = element_text(size = ats)
           , legend.position = leg_position
-          , legend.text = element_text(size = leg_text_size)
-          , legend.title = element_text(size =  leg_title_size)
-          , plot.title = element_text(size =  axis_label_size
-                                      , colour = title_colour)
-          , plot.subtitle = element_text(size = subtitle_size
+          , legend.text = element_text(size = lts)
+          , legend.title = element_text(size =  ltis)
+          , plot.title = element_text(size =  als
+                                      , colour = title_colour
+                                      , hjust = 0.5)
+          , plot.subtitle = element_text(size = sts
                                          , hjust = 0.5
                                          , colour = subtitle_colour)) + 
     labs(title      = bp_plot_title

From e115c3636c61c2f34f9dd414f92d8a6e762c3d95 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Fri, 17 Sep 2021 13:33:19 +0100
Subject: [PATCH 40/51] fixed lf_bp function with aes_string and reformulate

---
 scripts/functions/lf_bp.R | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/scripts/functions/lf_bp.R b/scripts/functions/lf_bp.R
index 608247d..71b0472 100644
--- a/scripts/functions/lf_bp.R
+++ b/scripts/functions/lf_bp.R
@@ -29,11 +29,17 @@ lf_bp <- function(lf_df
                   , stat_method = "wilcox.test"
                   , my_paired = FALSE
                   , stat_label = c("p.format", "p.signif") ){
-
-  p1 <- ggplot(lf_df, aes(x = eval(parse(text = x_grp))
-                    , y = eval(parse(text = y_var)) ))  + 
+  
+  fwv = as.formula(paste0("~", facet_var))
+  p1 <- ggplot(lf_df, aes_string(x = x_grp, y = y_var))  + 
     
-    facet_wrap(~ eval(parse(text = facet_var))
+    #fwv = eval(parse(text = facet_var))
+    # facet_wrap(~ fwv
+    #            , nrow = n_facet_row
+    #            , scales = y_scales) +
+    #   
+    # fwv = as.formula(paste0("~", facet_var))
+    facet_wrap( fwv
                , nrow = n_facet_row
                , scales = y_scales) +
     
@@ -73,7 +79,7 @@ lf_bp <- function(lf_df
                     , cex = 0.8
                     , aes(colour = factor(eval(parse(text = colour_categ))) )) 
     }
-  
+
    # Add foramtting to graph
    OutPlot = p2 + theme(axis.text.x = element_text(size = my_ats)
                    , axis.text.y = element_text(size = my_ats

From 5cd6c300a709aaefad2a6d397a1a905067868854 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Fri, 17 Sep 2021 13:35:48 +0100
Subject: [PATCH 41/51] saving minor update to function fix

---
 scripts/functions/lf_bp.R | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/scripts/functions/lf_bp.R b/scripts/functions/lf_bp.R
index 71b0472..675658e 100644
--- a/scripts/functions/lf_bp.R
+++ b/scripts/functions/lf_bp.R
@@ -31,14 +31,10 @@ lf_bp <- function(lf_df
                   , stat_label = c("p.format", "p.signif") ){
   
   fwv = as.formula(paste0("~", facet_var))
+  #fwv = reformulate(facet_var)
+  
   p1 <- ggplot(lf_df, aes_string(x = x_grp, y = y_var))  + 
     
-    #fwv = eval(parse(text = facet_var))
-    # facet_wrap(~ fwv
-    #            , nrow = n_facet_row
-    #            , scales = y_scales) +
-    #   
-    # fwv = as.formula(paste0("~", facet_var))
     facet_wrap( fwv
                , nrow = n_facet_row
                , scales = y_scales) +

From daa3556ede6414c03ab1fd0009800026daaceeb4 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Mon, 20 Sep 2021 16:12:45 +0100
Subject: [PATCH 42/51] split csv for isoniazid

---
 dynamut/split_csv.sh |   1 +
 mcsm/run_mcsm.py     |   2 +-
 my_header.R          | 107 ++++++++++++++++++++++++++++++++-----------
 3 files changed, 82 insertions(+), 28 deletions(-)

diff --git a/dynamut/split_csv.sh b/dynamut/split_csv.sh
index 17c1a03..1f7a793 100755
--- a/dynamut/split_csv.sh
+++ b/dynamut/split_csv.sh
@@ -19,5 +19,6 @@ split ../../${INFILE} -l ${CHUNK} -d snp_batch_
 #~/git/LSHTM_analysis/dynamut/split_csv.sh gid_mcsm_formatted_snps.csv snp_batches 50
 #~/git/LSHTM_analysis/dynamut/split_csv.sh embb_mcsm_formatted_snps.csv snp_batches 50
 #~/git/LSHTM_analysis/dynamut/split_csv.sh pnca_mcsm_formatted_snps.csv snp_batches 50
+#~/git/LSHTM_analysis/dynamut/split_csv.sh katg_mcsm_formatted_snps.csv snp_batches 50     #Date: 20/09/2021
 
 # add .txt to the files
diff --git a/mcsm/run_mcsm.py b/mcsm/run_mcsm.py
index 7e38543..da621f5 100755
--- a/mcsm/run_mcsm.py
+++ b/mcsm/run_mcsm.py
@@ -104,7 +104,7 @@ if mutation_filename:
     in_filename_snps = mutation_filename
 else:
     in_filename_snps = gene.lower() + '_mcsm_formatted_snps.csv'
-    
+   
 infile_snps = outdir + '/' + in_filename_snps
 
 #=======
diff --git a/my_header.R b/my_header.R
index 5009f29..2fa892c 100644
--- a/my_header.R
+++ b/my_header.R
@@ -1,21 +1,31 @@
 #########################################################
-### A) Installing and loading required packages
+# A) Installing and loading required packages
+# B) My functions
+#########################################################
+
 #########################################################
 #lib_loc = "/usr/local/lib/R/site-library")
 
-#if (!require("gplots")) {
-#  install.packages("gplots", dependencies = TRUE)
-#  library(gplots)
-#}
+require("getopt", quietly = TRUE) # cmd parse arguments
 
-#if (!require("tidyverse")) {
-#  install.packages("tidyverse", dependencies = TRUE)
-#  library(tidyverse)
-#}
+if (!require("tidyverse")) {
+  install.packages("tidyverse", dependencies = TRUE)
+  library(tidyverse)
+}
 
-if (!require("ggplot2")) {
-  install.packages("ggplot2", dependencies = TRUE)
-  library(ggplot2)
+if (!require("shiny")) {
+  install.packages("shiny", dependencies = TRUE)
+  library(shiny)
+}
+
+if (!require("shinyBS")) {
+  install.packages("shinyBS", dependencies = TRUE)
+  library(shinyBS)
+}
+
+if (!require("gridExtra")) {
+  install.packages("gridExtra", dependencies = TRUE)
+  library(gridExtra)
 }
 
 if (!require("ggridges")) {
@@ -23,6 +33,35 @@ if (!require("ggridges")) {
   library(ggridges)
 }
 
+# if (!require("ggplot2")) {
+#   install.packages("ggplot2", dependencies = TRUE)
+#   library(ggplot2)
+# }
+
+# if (!require ("dplyr")){
+#   install.packages("dplyr")
+#   library(dplyr)
+# }
+
+if (!require ("DT")){
+  install.packages("DT")
+  library(DT)
+}
+
+if (!require ("plyr")){
+   install.packages("plyr")
+   library(plyr)
+ }
+
+# Install
+#if(!require(devtools)) install.packages("devtools")
+#devtools::install_github("kassambara/ggcorrplot")
+
+if (!require ("ggbeeswarm")){
+   install.packages("ggbeeswarm")
+   library(ggbeeswarm)
+}
+
 if (!require("plotly")) {
   install.packages("plotly", dependencies = TRUE)
   library(plotly)
@@ -103,11 +142,6 @@ if (!require ("psych")){
   library(psych)
 }
 
-if (!require ("dplyr")){
-  install.packages("dplyr")
-  library(dplyr)
-}
-
 if (!require ("compare")){
   install.packages("compare")
   library(compare)
@@ -118,18 +152,37 @@ if (!require ("arsenal")){
   library(arsenal)
 }
 
+if(!require(ggseqlogo)){
+  install.packages("ggseqlogo")
+  library(ggseqlogo)
+}
 
-####TIDYVERSE
-# Install
-#if(!require(devtools)) install.packages("devtools")
-#devtools::install_github("kassambara/ggcorrplot")
-
-#library(ggcorrplot)
-
-
-###for PDB files
-#install.packages("bio3d") 
+# for PDB files
 if(!require(bio3d)){
   install.packages("bio3d")
   library(bio3d)
 }
+
+library(protr)
+if(!require(protr)){
+  install.packages("protr")
+  library(protr)
+}
+
+#if (!requireNamespace("BiocManager", quietly = TRUE))
+#  install.packages("BiocManager")
+
+#BiocManager::install("Logolas")
+library("Logolas")
+
+
+####################################
+# Load all my functions:
+# only works if tidyverse is loaded
+# hence included it here!
+####################################
+
+func_path = "~/git/LSHTM_analysis/scripts/functions/"
+source_files <- list.files(func_path, "\\.R$")  # locate all .R files
+map(paste0(func_path, source_files), source)  # source all your R scripts!
+

From d443ecea6b6471e4fcfcc31c771d4857dce58af0 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Mon, 20 Sep 2021 16:13:15 +0100
Subject: [PATCH 43/51] added separate script for splitting csv after adding
 chain ID. saves lots of post processing

---
 dynamut/split_csv_chain.sh | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100755 dynamut/split_csv_chain.sh

diff --git a/dynamut/split_csv_chain.sh b/dynamut/split_csv_chain.sh
new file mode 100755
index 0000000..2526b3f
--- /dev/null
+++ b/dynamut/split_csv_chain.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# FIXME: This is written for expediency to kickstart running dynamut, mcsm-PPI2 (batch of 50) and mCSM-NA (batch of 20)
+
+# Usage: ~/git/LSHTM_analysis/dynamut/split_csv_chain.sh <input file> <output dir> <chunk size in lines>
+# copy your snp file to split into the dynamut dir
+# use sed to add chain ID to snp file and then split to avoid post processing
+
+INFILE=$1
+OUTDIR=$2
+CHUNK=$3
+
+mkdir -p ${OUTDIR}/${CHUNK}/chain_added
+cd ${OUTDIR}/${CHUNK}/chain_added
+
+# makes the 2 dirs, hence ../..
+split ../../../${INFILE} -l ${CHUNK} -d snp_batch_
+
+# use case
+#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh katg_mcsm_formatted_snps.csv snp_batches 50     #Date: 20/09/2021
+
+# add .txt to the files

From 93a91518e1f46980fb89eb4fd5cca98547d36d50 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 29 Sep 2021 18:24:06 +0100
Subject: [PATCH 44/51] fix runFoldx so that it looks for a missing
 rotabase.txt in the process_dir and also print the foldx command that will be
 run

---
 foldx/runFoldx.py | 70 +++++++++++++++++++++++++++++++++++------------
 1 file changed, 52 insertions(+), 18 deletions(-)

diff --git a/foldx/runFoldx.py b/foldx/runFoldx.py
index 8d9358b..12e00c9 100755
--- a/foldx/runFoldx.py
+++ b/foldx/runFoldx.py
@@ -41,7 +41,7 @@ arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By
 arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assmes homedir + <drug> + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME
 
 arg_parser.add_argument('-P', '--pdb_file', help = 'PDB File to process. By default, it assmumes a file called <gene>_complex.pdb in input_dir')
-arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_snps.csv exists')
+arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_formatted_snps.csv exists')
 
 # FIXME: Doesn't work with 2 chains yet!
 arg_parser.add_argument('-c1', '--chain1',    help = 'Chain1 ID', default = 'A') # case sensitive
@@ -148,6 +148,16 @@ print('Arguments being passed:'
 , '\noutput file:', outfile_foldx
 , '\n=============================================================')
 
+
+# make sure rotabase.txt exists in the process_dir
+rotabase_file = process_dir + '/' + 'rotabase.txt'
+
+if Path(rotabase_file).is_file():
+    print(f'rotabase file: {rotabase_file} exists')
+else:
+    print(f'ERROR: rotabase file: {rotabase_file} does not exist. Please download it and put it in {process_dir}')
+    sys.exit()
+    
 #### Delay for 10 seconds to check the params ####
 print('Sleeping for 10 seconds to give you time to cancel')
 time.sleep(10)
@@ -235,6 +245,13 @@ def main():
     nmuts = len(mutlist)
     print(nmuts)
     print(mutlist)
+    print('start')
+    #subprocess.check_output(['bash','repairPDB.sh', pdbname, process_dir])
+    print('\033[95mSTAGE: repair PDB\033[0m')
+    print('EXECUTING: repairPDB.sh %s %s %s' % (indir, actual_pdb_filename, process_dir))
+    #subprocess.check_output(['bash','repairPDB.sh', indir, actual_pdb_filename, process_dir])
+    # once you decide to use the function
+    # repairPDB(pdbname)
     
     print('start')  
     # some common parameters for foldX
@@ -242,61 +259,74 @@ def main():
     
     print('\033[95mSTAGE: repair PDB (foldx subprocess) \033[0m')
     print('Running foldx RepairPDB for WT')
-    subprocess.call(['foldx' 
+
+    fold_RepairDB = ['foldx' 
     , '--command=RepairPDB'
     , foldx_common
-    , '--pdb-dir=' + os.path.dirname(pdb_filename)
+#    , '--pdb-dir=' + os.path.dirname(pdb_filename)
+    , '--pdb-dir=' + indir
     ,  '--pdb=' + actual_pdb_filename 
     , 'outPDB=true'
-    , '--output-dir=' + process_dir])
+    , '--output-dir=' + process_dir]
+    print('CMD:', fold_RepairDB)
+    subprocess.call(fold_RepairDB)
     print('\033[95mCOMPLETED STAGE: repair PDB\033[0m')
     print('\n==========================================================')
     
     
     print('\033[95mSTAGE: Foldx commands BM, PN and SD (foldx subprocess) for WT\033[0m')
     print('Running foldx BuildModel for WT')
-    subprocess.call(['foldx' 
+
+    foldx_BuildModel = ['foldx' 
     , '--command=BuildModel'
     , foldx_common
     , '--pdb-dir=' + process_dir
     ,  '--pdb=' + pdbname + '_Repair.pdb'
-    , '--mutant-file="individual_list_' + pdbname +'.txt"'
+    , '--mutant-file=' + process_dir + '/' + 'individual_list_' + pdbname +'.txt'
     , 'outPDB=true'
     , '--numberOfRuns=1'
-    , '--output-dir=' + process_dir], cwd=process_dir)
+    , '--output-dir=' + process_dir]
+    print('CMD:', foldx_BuildModel)
+    subprocess.call( foldx_BuildModel, cwd=process_dir)
 
     print('Running foldx PrintNetworks for WT')
-    subprocess.call(['foldx' 
+    foldx_PrintNetworks = ['foldx' 
     , '--command=PrintNetworks'
     , '--pdb-dir=' + process_dir
     ,  '--pdb=' + pdbname + '_Repair.pdb'
     , '--water=PREDICT'
     , '--vdwDesign=1'
-    , '--output-dir=' + process_dir], cwd=process_dir)
+    , '--output-dir=' + process_dir]
+    print('CMD:', foldx_PrintNetworks)
+    subprocess.call(foldx_PrintNetworks, cwd=process_dir)
 
     print('Running foldx SequenceDetail for WT')
-    subprocess.call(['foldx' 
+    foldx_SequenceDetail = ['foldx' 
     , '--command=SequenceDetail'
     , '--pdb-dir=' + process_dir
     ,  '--pdb=' + pdbname + '_Repair.pdb'
     , '--water=PREDICT'
     , '--vdwDesign=1'
-    , '--output-dir=' + process_dir], cwd=process_dir)
+    , '--output-dir=' + process_dir]
+    print('CMD:', foldx_SequenceDetail)
+    subprocess.call(foldx_SequenceDetail , cwd=process_dir)
+
     print('\033[95mCOMPLETED STAGE: Foldx commands BM, PN and SD\033[0m')
     print('\n==========================================================')
     
-    
     print('\033[95mSTAGE: Print Networks (foldx subprocess) for MT\033[0m')
     for n in range(1,nmuts+1):
         print('\033[95mNETWORK:\033[0m', n)
         print('Running foldx PrintNetworks for mutation', n)
-        subprocess.call(['foldx' 
+        foldx_PrintNetworksMT = ['foldx' 
         , '--command=PrintNetworks'
         , '--pdb-dir=' + process_dir
         ,  '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
         , '--water=PREDICT'
         , '--vdwDesign=1'
-        , '--output-dir=' + process_dir], cwd=process_dir) 
+        , '--output-dir=' + process_dir]
+        print('CMD:', foldx_PrintNetworksMT)
+        subprocess.call( foldx_PrintNetworksMT , cwd=process_dir) 
     print('\033[95mCOMPLETED STAGE: Print Networks (foldx subprocess) for MT\033[0m')
     print('\n==========================================================')
     
@@ -323,14 +353,16 @@ def main():
         print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for WT\033[0m')
         chain1=chainA
         chain2=chainB
-        subprocess.call(['foldx' 
+        foldx_AnalyseComplex = ['foldx' 
         , '--command=AnalyseComplex'
         , '--pdb-dir=' + process_dir
         ,  '--pdb=' + pdbname + '_Repair.pdb'
         , '--analyseComplexChains=' + chain1 + ',' + chain2
         , '--water=PREDICT'
         , '--vdwDesign=1'
-        , '--output-dir=' + process_dir], cwd=process_dir)
+        , '--output-dir=' + process_dir]
+        print('CMD:',foldx_AnalyseComplex)
+        subprocess.call(foldx_AnalyseComplex, cwd=process_dir)
 
         # FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
         ac_source = process_dir + '/Summary_' + pdbname + '_Repair_AC.fxout'
@@ -340,14 +372,16 @@ def main():
 
         for n in range(1,nmuts+1):
             print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for mutation:\033[0m', n)
-            subprocess.call(['foldx' 
+            foldx_AnalyseComplex = ['foldx' 
             , '--command=AnalyseComplex'
             , '--pdb-dir=' + process_dir
             ,  '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
             , '--analyseComplexChains=' + chain1 + ',' + chain2
             , '--water=PREDICT'
             , '--vdwDesign=1'
-            , '--output-dir=' + process_dir], cwd=process_dir)
+            , '--output-dir=' + process_dir]
+            print('CMD:', foldx_AnalyseComplex)
+            subprocess.call( foldx_AnalyseComplex , cwd=process_dir)
 
             # FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
             ac_mut_source = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.fxout'

From af227f9864a96143e9886f84631dafc9eec43875 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 30 Sep 2021 13:35:33 +0100
Subject: [PATCH 45/51] moved deepddg_format.py from ind output dir to scripts

---
 scripts/deepddg_format.py | 141 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 141 insertions(+)
 create mode 100644 scripts/deepddg_format.py

diff --git a/scripts/deepddg_format.py b/scripts/deepddg_format.py
new file mode 100644
index 0000000..aab0769
--- /dev/null
+++ b/scripts/deepddg_format.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+'''
+Created on Tue Aug  6 12:56:03 2019
+
+@author: tanu
+'''
+#=======================================================================
+# Task: format deep ddg df to allow easy merging
+
+# Input:  2 dfs
+#1) <gene>.lower()'_mcsm_formatted_snps.csv'
+#2) <gene>.lower()_complex_ddg_results.csv'
+#=======================================================================
+#%% load packages
+import sys, os
+import pandas as pd
+from pandas import DataFrame
+import numpy as np
+#from varname import nameof
+import argparse
+#=======================================================================
+#%% specify input and curr dir
+homedir = os.path.expanduser('~')
+
+# set working dir
+os.getcwd()
+os.chdir(homedir + '/git/LSHTM_analysis/scripts')
+os.getcwd()
+#=======================================================================#%% command line args: case sensitive
+arg_parser = argparse.ArgumentParser()
+arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
+arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') 
+
+arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
+arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
+arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
+
+arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')
+
+args = arg_parser.parse_args()
+#=======================================================================
+#%% variable assignment: input and output 
+drug    = args.drug
+gene    = args.gene
+datadir = args.datadir
+indir   = args.input_dir
+outdir  = args.output_dir
+#%%=======================================================================
+#==============
+# directories
+#==============
+if not datadir:
+    datadir = homedir + '/git/Data/'
+    
+if not indir:
+    indir = datadir + drug + '/input/'
+    
+if not outdir:
+    outdir = datadir + drug + '/output/'
+
+#=======
+# input
+#=======
+in_filename_mcsm_snps = gene.lower() + '_mcsm_formatted_snps.csv'
+infile_mcsm_snps = outdir + in_filename_mcsm_snps
+
+in_filename_deepddg = gene.lower() + '_complex_ddg_results.csv' 
+infile_deepddg = outdir + 'deep_ddg/' + in_filename_deepddg
+
+print('\nInput path:', indir
+      , '\nOutput path:', outdir, '\n'
+      , '\nInput filename mcsm snps', infile_mcsm_snps , '\n'
+      , '\nInput filename deepddg', infile_deepddg , '\n'
+      , '\n============================================================')
+
+#=======
+# output 
+#=======
+#out_filename_deepddg = gene.lower() + '_complex_ni_deepddg.txt'
+out_filename_deepddg = gene.lower() + '_complex_ni_deepddg.csv'
+outfile_deepddg_f =  outdir + out_filename_deepddg
+
+print('Output filename:', outfile_deepddg_f
+        , '\n===================================================================')
+# end of variable assignment for input and output files
+#%%============================================================================  
+print('==================================='
+      , '\nmcsm muts'
+      , '\n===================================')
+
+mcsm_muts_df =  pd.read_csv(infile_mcsm_snps , header = None, sep = ',', names = ['mutationinformation'])
+mcsm_muts_df.columns
+
+#%%============================================================================  
+print('==================================='
+      , '\nDeep ddg'
+      , '\n===================================')
+
+deepddg_df =  pd.read_csv(infile_deepddg, sep = ',')
+deepddg_df.columns
+
+deepddg_df.rename(columns = {'#chain'  : 'chain_id'
+                             , 'WT'    : 'wild_type_deepddg'
+                             , 'ResID' : 'position'
+                             , 'Mut'   : 'mutant_type_deepddg'}
+                             , inplace = True)
+deepddg_df.columns
+deepddg_df['mutationinformation'] = deepddg_df['wild_type_deepddg'] + deepddg_df['position'].map(str) + deepddg_df['mutant_type_deepddg']
+deepddg_df.columns
+
+# add deepddg outcome column: <0 --> Destabilising, >=0 --> Stabilising
+deepddg_df['deepddg_outcome'] = np.where(deepddg_df['deepddg'] < 0, 'Destabilising', 'Stabilising')
+deepddg_df['deepddg_outcome'].value_counts()
+
+# should be identical in count to Destabilising and stabilising respectively
+len(deepddg_df.loc[deepddg_df['deepddg'] < 0])
+len(deepddg_df.loc[deepddg_df['deepddg'] >= 0])
+               
+# drop extra columns to allow clean merging
+deepddg_short_df = deepddg_df.drop(['chain_id', 'wild_type_deepddg', 'position', 'mutant_type_deepddg'], axis = 1)
+
+# rearrange columns
+deepddg_short_df.columns
+deepddg_short_df = deepddg_short_df[["mutationinformation", "deepddg", "deepddg_outcome"]]
+
+#%% combine with mcsm snps 
+deepddg_mcsm_muts_dfs = pd.merge(deepddg_short_df
+                                 , mcsm_muts_df
+                                 , on = 'mutationinformation'
+                                 , how = 'right')
+deepddg_mcsm_muts_dfs ['deepddg_outcome'].value_counts()
+
+#%%============================================================================
+# write csv
+print('Writing file: formatted deepddg and only mcsm muts')
+deepddg_mcsm_muts_dfs.to_csv(outfile_deepddg_f, index = False)
+print('\nFinished writing file:'
+      , '\nNo. of rows:', deepddg_mcsm_muts_dfs.shape[0]
+      , '\nNo. of cols:', deepddg_mcsm_muts_dfs.shape[1])
+#%% end of script

From 98325d763f4a0d0396e3b428b7bbbd233d776d32 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 30 Sep 2021 13:37:17 +0100
Subject: [PATCH 46/51] fixed output filename in deepddg_format.py

---
 scripts/deepddg_format.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/deepddg_format.py b/scripts/deepddg_format.py
index aab0769..98c2ee1 100644
--- a/scripts/deepddg_format.py
+++ b/scripts/deepddg_format.py
@@ -77,8 +77,8 @@ print('\nInput path:', indir
 #=======
 # output 
 #=======
-#out_filename_deepddg = gene.lower() + '_complex_ni_deepddg.txt'
-out_filename_deepddg = gene.lower() + '_complex_ni_deepddg.csv'
+#out_filename_deepddg = gene.lower() + '_ni_deepddg.txt'
+out_filename_deepddg = gene.lower() + '_ni_deepddg.csv'
 outfile_deepddg_f =  outdir + out_filename_deepddg
 
 print('Output filename:', outfile_deepddg_f

From 675b222181f6aa2fca9d5fa01cce6fa6415948bf Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Mon, 18 Oct 2021 13:52:29 +0100
Subject: [PATCH 47/51] added cmd option for dynamut2 formatting results

---
 dynamut/format_results_dynamut.py     |  0
 dynamut/format_results_dynamut2.py    |  0
 dynamut/run_format_results_dynamut.py | 49 +++++++++++++++++++++++----
 dynamut/run_get_results_dynamut.py    |  6 ++--
 dynamut/split_csv_chain.sh            | 21 ++++++++++--
 scripts/data_extraction.py            |  5 +--
 scripts/deepddg_format.py             |  0
 7 files changed, 65 insertions(+), 16 deletions(-)
 mode change 100644 => 100755 dynamut/format_results_dynamut.py
 mode change 100644 => 100755 dynamut/format_results_dynamut2.py
 mode change 100644 => 100755 dynamut/run_format_results_dynamut.py
 mode change 100644 => 100755 scripts/deepddg_format.py

diff --git a/dynamut/format_results_dynamut.py b/dynamut/format_results_dynamut.py
old mode 100644
new mode 100755
diff --git a/dynamut/format_results_dynamut2.py b/dynamut/format_results_dynamut2.py
old mode 100644
new mode 100755
diff --git a/dynamut/run_format_results_dynamut.py b/dynamut/run_format_results_dynamut.py
old mode 100644
new mode 100755
index 02af524..dd9f7fb
--- a/dynamut/run_format_results_dynamut.py
+++ b/dynamut/run_format_results_dynamut.py
@@ -20,8 +20,45 @@ from format_results_dynamut2 import *
 # variables
 # TODO: add cmd line args
 
-gene = 'gid'
-drug = 'streptomycin'
+#gene = 
+#drug = 
+
+#%% command line args
+arg_parser = argparse.ArgumentParser()
+arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None)
+arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None)
+arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
+arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
+arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
+#arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and output', action='store_true') # should be handled elsewhere!
+
+arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')
+
+args = arg_parser.parse_args()
+#=======================================================================
+#%% variable assignment: input and output paths & filenames
+drug = args.drug
+gene = args.gene
+datadir = args.datadir
+indir   = args.input_dir
+outdir  = args.output_dir
+#make_dirs  = args.make_dirs
+
+#%% input and output dirs and files
+#=======
+# dirs
+#=======
+if not datadir:
+    datadir = homedir + '/' + 'git/Data'
+    
+if not indir:
+    indir = datadir + '/' + drug + '/input'
+    
+if not outdir:
+    outdir = datadir + '/' + drug + '/output'
+
+#%%=====================================================================
+
 datadir = homedir + '/git/Data'
 indir = datadir + '/' + drug + '/input'
 outdir = datadir + '/' + drug + '/output'
@@ -29,12 +66,12 @@ outdir_dynamut = outdir + '/dynamut_results/'
 outdir_dynamut2 = outdir + '/dynamut_results/dynamut2/'
 
 # Input file
-infile_dynamut =  outdir_dynamut + gene + '_dynamut_all_output_clean.csv'
+#infile_dynamut =  outdir_dynamut + gene + '_dynamut_all_output_clean.csv'
 infile_dynamut2 =  outdir_dynamut2 + gene + '_dynamut2_output_combined_clean.csv'
 
 # Formatted output filename
-outfile_dynamut_f = outdir_dynamut2 + gene + '_complex_dynamut_norm.csv'
-outfile_dynamut2_f = outdir_dynamut2 + gene + '_complex_dynamut2_norm.csv'
+#outfile_dynamut_f = outdir_dynamut2 + gene + '_dynamut_norm.csv'
+outfile_dynamut2_f = outdir_dynamut2 + gene + '_dynamut2_norm.csv'
 
 #===============================
 # CALL: format_results_dynamut
@@ -69,4 +106,4 @@ print('Finished writing file:'
        , '\nExpected no. of cols:', len(dynamut2_df_f.columns)
        , '\n=============================================================')
 
-#%%#####################################################################
\ No newline at end of file
+#%%#####################################################################
diff --git a/dynamut/run_get_results_dynamut.py b/dynamut/run_get_results_dynamut.py
index e9e82ef..029e934 100755
--- a/dynamut/run_get_results_dynamut.py
+++ b/dynamut/run_get_results_dynamut.py
@@ -17,8 +17,8 @@ my_host = 'http://biosig.unimelb.edu.au'
 #headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
 
 # TODO: add cmd line args
-#gene = 'gid'
-drug = 'streptomycin'
+# gene = 
+# drug = 
 datadir = homedir + '/git/Data/'
 indir = datadir + drug + '/input/'
 outdir = datadir + drug + '/output/'
@@ -41,4 +41,4 @@ get_results(url_file  = my_url_file
             , output_dir = outdir
             , outfile_suffix = my_suffix)
            
-########################################################################
\ No newline at end of file
+########################################################################
diff --git a/dynamut/split_csv_chain.sh b/dynamut/split_csv_chain.sh
index 2526b3f..ac60faa 100755
--- a/dynamut/split_csv_chain.sh
+++ b/dynamut/split_csv_chain.sh
@@ -13,10 +13,25 @@ CHUNK=$3
 mkdir -p ${OUTDIR}/${CHUNK}/chain_added
 cd ${OUTDIR}/${CHUNK}/chain_added
 
-# makes the 2 dirs, hence ../..
+# makes the 3 dirs, hence ../../..
 split ../../../${INFILE} -l ${CHUNK} -d snp_batch_
 
-# use case
-#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh katg_mcsm_formatted_snps.csv snp_batches 50     #Date: 20/09/2021
+########################################################################
+# use cases
+# Date: 20/09/2021
+# sed -e 's/^/A /g' katg_mcsm_formatted_snps.csv > katg_mcsm_formatted_snps_chain.csv
+#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh katg_mcsm_formatted_snps_chain.csv snp_batches 50
 
+# Date: 01/10/2021
+# sed -e 's/^/A /g' rpob_mcsm_formatted_snps.csv > rpob_mcsm_formatted_snps_chain.csv
+#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh rpob_mcsm_formatted_snps_chain.csv snp_batches 50     
+
+# Date: 02/10/2021
+# sed -e 's/^/A /g' alr_mcsm_formatted_snps.csv > alr_mcsm_formatted_snps_chain.csv
+#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 50  
+
+# Date: 05/10/2021
+#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 20    
+  
 # add .txt to the files
+########################################################################
diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py
index 5582632..31f8a27 100755
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@@ -81,9 +81,6 @@ indir   = args.input_dir
 outdir  = args.output_dir
 make_dirs  = args.make_dirs
 
-#drug = 'streptomycin'
-#gene = 'gid'
-
 #%% input and output dirs and files
 #=======
 # dirs
@@ -1373,4 +1370,4 @@ if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
 print(u'\u2698' * 50,
       '\nEnd of script: Data extraction and writing files'
       '\n' + u'\u2698' * 50 )
-#%% end of script
\ No newline at end of file
+#%% end of script
diff --git a/scripts/deepddg_format.py b/scripts/deepddg_format.py
old mode 100644
new mode 100755

From ba21188bd2f79421170f8e44e336375ac94ca220 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Mon, 18 Oct 2021 13:58:06 +0100
Subject: [PATCH 48/51] added notes

---
 dynamut/notes.txt | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 dynamut/notes.txt

diff --git a/dynamut/notes.txt b/dynamut/notes.txt
new file mode 100644
index 0000000..97e6d02
--- /dev/null
+++ b/dynamut/notes.txt
@@ -0,0 +1,11 @@
+Dynamut was painfully run for gid, part manually, part programmatically!
+
+However, it was decided to ditch that and only run Dynamut2 for future targets
+
+Dynamut2 was run through the website in batches of 50 for
+katG: 17 batches (00..16)
+rpoB: 23 batches (00..22)
+alr: 6 batches (00..05)
+
+However, the API was used for rpoB batches (09-22) from 13 Oct 2021,
+as jobs started to flake and fail through the website!

From 873fd3a121a5e23bed34ad06630287dc67c7e694 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 19 Oct 2021 11:12:34 +0100
Subject: [PATCH 49/51] added gene.lower to dynamut2 format result script

---
 dynamut/run_format_results_dynamut.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dynamut/run_format_results_dynamut.py b/dynamut/run_format_results_dynamut.py
index dd9f7fb..cb6fe70 100755
--- a/dynamut/run_format_results_dynamut.py
+++ b/dynamut/run_format_results_dynamut.py
@@ -66,12 +66,12 @@ outdir_dynamut = outdir + '/dynamut_results/'
 outdir_dynamut2 = outdir + '/dynamut_results/dynamut2/'
 
 # Input file
-#infile_dynamut =  outdir_dynamut + gene + '_dynamut_all_output_clean.csv'
-infile_dynamut2 =  outdir_dynamut2 + gene + '_dynamut2_output_combined_clean.csv'
+#infile_dynamut =  outdir_dynamut + gene.lower() + '_dynamut_all_output_clean.csv'
+infile_dynamut2 =  outdir_dynamut2 + gene.lower() + '_dynamut2_output_combined_clean.csv'
 
 # Formatted output filename
 #outfile_dynamut_f = outdir_dynamut2 + gene + '_dynamut_norm.csv'
-outfile_dynamut2_f = outdir_dynamut2 + gene + '_dynamut2_norm.csv'
+outfile_dynamut2_f = outdir_dynamut2 + gene.lower() + '_dynamut2_norm.csv'
 
 #===============================
 # CALL: format_results_dynamut

From 057291a56147f4684d783ec51316af72f20684bd Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 28 Oct 2021 10:41:43 +0100
Subject: [PATCH 50/51] much development

---
 mcsm_na/format_results_mcsm_na.py |   9 +-
 scripts/combining_dfs.py          | 331 +++++++++++++++++++++++-------
 scripts/data_extraction.py        |   1 -
 scripts/deepddg_format.py         |  12 +-
 scripts/rd_df.py                  |   2 -
 5 files changed, 266 insertions(+), 89 deletions(-)

diff --git a/mcsm_na/format_results_mcsm_na.py b/mcsm_na/format_results_mcsm_na.py
index 95cd9e8..335301c 100644
--- a/mcsm_na/format_results_mcsm_na.py
+++ b/mcsm_na/format_results_mcsm_na.py
@@ -51,7 +51,7 @@ def format_mcsm_na_output(mcsm_na_output_tsv):
     print('Assigning meaningful colnames'
             , '\n=======================================================')
     my_colnames_dict = {'PDB_FILE': 'pdb_file' # relevant info from this col will be extracted and the column discarded
-        , 'CHAIN': 'chain' # {wild_type}<position>{mutant_type}
+        , 'CHAIN': 'chain' 
         , 'WILD_RES': 'wild_type' # one letter amino acid code
         , 'RES_POS': 'position' # number
         , 'MUT_RES': 'mutant_type' # one letter amino acid code
@@ -65,8 +65,8 @@ def format_mcsm_na_output(mcsm_na_output_tsv):
     #############
     # create mutationinformation column
     #############    
-    mcsm_na_data['mutationinformation'] = mcsm_na_data['wild_type'] + mcsm_na_data.position.map(str) + mcsm_na_data['mutant_type']
-
+    #mcsm_na_data['mutationinformation'] = mcsm_na_data['wild_type'] + mcsm_na_data.position.map(str) + mcsm_na_data['mutant_type']
+    mcsm_na_data['mutationinformation'] = mcsm_na_data.loc[:,'wild_type'] + mcsm_na_data.loc[:,'position'].astype(int).apply(str) + mcsm_na_data.loc[:,'mutant_type']
 #%%===================================================================== 
     #############
     # Create col: mcsm_na_outcome
@@ -131,5 +131,4 @@ def format_mcsm_na_output(mcsm_na_output_tsv):
                                 , 'chain'
                                 , 'pdb_file']]
     return(mcsm_na_dataf)
-#%%##################################################################### 
-
+#%%##################################################################### 
\ No newline at end of file
diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py
index 9331edd..53361c7 100755
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@@ -34,6 +34,11 @@ Created on Tue Aug  6 12:56:03 2019
 # Output: single csv of all 8 dfs combined
 # useful link
 # https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
+
+#%% FIXME: let the script proceed even if files don't exist!
+# i.e example below
+# '/home/tanu/git/Data/ethambutol/output/dynamut_results/embb_complex_dynamut_norm.csv'
+
 #=======================================================================
 #%% load packages
 import sys, os
@@ -48,7 +53,7 @@ homedir = os.path.expanduser('~')
 
 # set working dir
 os.getcwd()
-os.chdir(homedir + '/git/LSHTM_analysis/scripts')
+#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
 os.getcwd()
 
 # FIXME: local imports
@@ -109,47 +114,81 @@ if not outdir:
 #=======
 # input
 #=======
-#in_filename_mcsm = gene.lower() + '_complex_mcsm_norm.csv' 
-in_filename_mcsm     = gene.lower() + '_complex_mcsm_norm_SAM.csv' # gidb
-in_filename_foldx    = gene.lower() + '_foldx.csv'
-in_filename_deepddg  = gene.lower() + '_ni_deepddg.csv' # change to decent filename and put it in the correct dir
-in_filename_dssp     = gene.lower() + '_dssp.csv'
-in_filename_kd       = gene.lower() + '_kd.csv'
-in_filename_rd       = gene.lower() + '_rd.csv'
-#in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info_f.csv' # gwas f info
-in_filename_afor     = gene.lower() + '_af_or.csv'
-#in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv'
-infilename_dynamut   = gene.lower() + '_complex_dynamut_norm.csv'
-infilename_dynamut2  = gene.lower() + '_complex_dynamut2_norm.csv'
-infilename_mcsm_na   = gene.lower() + '_complex_mcsm_na_norm.csv'
+gene_list_normal = ["pnca", "katg", "rpob", "alr"]
+
+if gene.lower() == "gid":
+    print("\nReading mCSM file for gene:", gene)
+    in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SAM.csv'
+if gene.lower() == "embb":
+    print("\nReading mCSM file for gene:", gene)
+    in_filename_mcsm = gene.lower() + '_complex_mcsm_norm1.csv'
+if gene.lower() in gene_list_normal:
+   print("\nReading mCSM file for gene:", gene)
+   in_filename_mcsm = gene.lower() + '_complex_mcsm_norm.csv' 
+    
+infile_mcsm           = outdir + in_filename_mcsm
+mcsm_df               = pd.read_csv(infile_mcsm, sep = ',')
+
+in_filename_foldx     = gene.lower() + '_foldx.csv'
+infile_foldx          = outdir + in_filename_foldx
+foldx_df              = pd.read_csv(infile_foldx , sep = ',')
+
+in_filename_deepddg   = gene.lower() + '_ni_deepddg.csv' # change to decent filename and put it in the correct dir
+infile_deepddg        = outdir + in_filename_deepddg
+deepddg_df            = pd.read_csv(infile_deepddg, sep = ',')
+
+in_filename_dssp      = gene.lower() + '_dssp.csv'
+infile_dssp           = outdir + in_filename_dssp
+dssp_df               = pd.read_csv(infile_dssp, sep = ',')
+
+in_filename_kd        = gene.lower() + '_kd.csv'
+infile_kd             = outdir + in_filename_kd
+kd_df                 = pd.read_csv(infile_kd, sep = ',')
+
+in_filename_rd        = gene.lower() + '_rd.csv'
+infile_rd             = outdir + in_filename_rd
+rd_df                 = pd.read_csv(infile_rd, sep = ',')
+
+#in_filename_snpinfo   = 'ns' + gene.lower() + '_snp_info_f.csv' # gwas f info
+#infile_snpinfo        = outdir + in_filename_snpinfo 
+
+in_filename_afor      = gene.lower() + '_af_or.csv'
+infile_afor           = outdir + in_filename_afor
+afor_df               = pd.read_csv(infile_afor, sep = ',') 
+
+#in_filename_afor_kin  = gene.lower() + '_af_or_kinship.csv'
+#infile_afor_kin       = outdir + in_filename_afor_kin
+
+infilename_dynamut2    = gene.lower() + '_dynamut2_norm.csv'
+infile_dynamut2        = outdir + 'dynamut_results/dynamut2/' + infilename_dynamut2
+dynamut2_df            = pd.read_csv(infile_dynamut2, sep = ',')
+
+#------------------------------------------------------------
+# ONLY:for gene pnca and gid: End logic should pick this up!
+geneL_dy_na = ["pnca", "gid"]
+#if gene.lower() == "pnca" or "gid" :
+if gene.lower() in geneL_dy_na :
+    print("\nGene:", gene.lower()
+          , "\nReading Dynamut and mCSM_na files")
+    infilename_dynamut    = gene.lower() + '_dynamut_norm.csv' # gid
+    infile_dynamut        = outdir + 'dynamut_results/' + infilename_dynamut
+    dynamut_df            = pd.read_csv(infile_dynamut, sep = ',')
+    
+    infilename_mcsm_na    = gene.lower() + '_complex_mcsm_na_norm.csv' # gid
+    infile_mcsm_na        = outdir + 'mcsm_na_results/' + infilename_mcsm_na 
+    mcsm_na_df            = pd.read_csv(infile_mcsm_na, sep = ',')
+
+# ONLY:for gene embb and alr: End logic should pick this up!
+geneL_ppi2 = ["embb", "alr"]
+# NB: test list membership; `in "embb" or "alr"` is always True ("alr" is truthy)
+if gene.lower() in geneL_ppi2:
+    infilename_mcsm_ppi2   = gene.lower() + '_complex_mcsm_ppi2_norm.csv'
+    infile_mcsm_ppi2       = outdir + 'mcsm_ppi2/' + infilename_mcsm_ppi2
+    mcsm_ppi2_df           = pd.read_csv(infile_mcsm_ppi2, sep = ',')
+#--------------------------------------------------------------
 infilename_mcsm_f_snps = gene.lower() + '_mcsm_formatted_snps.csv'
-
-infile_mcsm    = outdir + in_filename_mcsm
-infile_foldx   = outdir + in_filename_foldx
-infile_deepddg = outdir + in_filename_deepddg
-infile_dssp    = outdir + in_filename_dssp
-infile_kd      = outdir + in_filename_kd
-infile_rd      = outdir + in_filename_rd
-#infile_snpinfo = outdir + in_filename_snpinfo 
-infile_afor    = outdir + in_filename_afor
-#infile_afor_kin = outdir + in_filename_afor_kin
-infile_dynamut = outdir + 'dynamut_results/' + infilename_dynamut
-infile_dynamut2 = outdir + 'dynamut_results/dynamut2/' + infilename_dynamut2
-infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
-infile_mcsm_f_snps = outdir + infilename_mcsm_f_snps
-
-# read csv
-mcsm_df      = pd.read_csv(infile_mcsm, sep = ',')
-foldx_df     = pd.read_csv(infile_foldx , sep = ',')
-deepddg_df   = pd.read_csv(infile_deepddg, sep = ',')
-dssp_df      = pd.read_csv(infile_dssp, sep = ',')
-kd_df        = pd.read_csv(infile_kd, sep = ',')
-rd_df        = pd.read_csv(infile_rd, sep = ',')
-afor_df      = pd.read_csv(infile_afor, sep = ',') 
-dynamut_df   = pd.read_csv(infile_dynamut, sep = ',')
-dynamut2_df  = pd.read_csv(infile_dynamut2, sep = ',')
-mcsm_na_df   = pd.read_csv(infile_mcsm_na, sep = ',')
-mcsm_f_snps  = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None)
+infile_mcsm_f_snps     = outdir + infilename_mcsm_f_snps
+mcsm_f_snps            = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None)
 
 #=======
 # output 
@@ -158,12 +197,6 @@ out_filename_comb = gene.lower() + '_all_params.csv'
 outfile_comb =  outdir + out_filename_comb
 print('Output filename:', outfile_comb
       , '\n===================================================================')
-
-o_join = 'outer'
-l_join = 'left'
-r_join = 'right'
-i_join = 'inner'
-
 # end of variable assignment for input and output files
 #%%############################################################################  
 #=====================
@@ -292,6 +325,44 @@ else:
           , '\n======================================================')
     sys.exit()
 
+#--------------------------
+# check if >1 chain
+#--------------------------
+deepddg_df.loc[:,'chain_id'].value_counts()
+
+if len(deepddg_df.loc[:,'chain_id'].value_counts()) > 1:
+    print("\nChains detected: >1"
+          , "\nGene:", gene
+          , "\nChains:", deepddg_df.loc[:,'chain_id'].value_counts().index)
+    
+#--------------------------
+# subset chain
+#--------------------------
+if gene.lower() == "embb":
+    sel_chain = "B"
+else:
+    sel_chain = "A"
+    
+deepddg_df = deepddg_df[deepddg_df['chain_id'] == sel_chain]
+    
+#--------------------------
+# Check for duplicates (abort if any mutationinformation value repeats)
+#--------------------------
+if deepddg_df['mutationinformation'].duplicated().any():
+    print("\nFAIL: Duplicates detected in DeepDDG infile"
+          , "\nNo. of duplicates:"
+          , deepddg_df['mutationinformation'].duplicated().sum()
+          , "\nformat deepDDG infile before proceeding")
+    sys.exit()
+else:
+    print("\nPASS: No duplicates detected in DeepDDG infile")
+
+#--------------------------
+# Drop chain id col as other targets don't have it
+#--------------------------
+col_to_drop = ['chain_id']
+deepddg_df = deepddg_df.drop(col_to_drop, axis = 1)
+
 #%%=============================================================================
 # Now merges begin
 #%%=============================================================================
@@ -311,28 +382,83 @@ get_aa_3lower(df = mcsm_df
 #mcsm_df.columns = mcsm_df.columns.str.lower()
 # foldx_df.shape
 
-#mcsm_foldx_dfs = combine_dfs_with_checks(mcsm_df, foldx_df, my_join = o_join)
+#mcsm_foldx_dfs = combine_dfs_with_checks(mcsm_df, foldx_df, my_join = "outer")
 merging_cols_m1  = detect_common_cols(mcsm_df, foldx_df)
-mcsm_foldx_dfs = pd.merge(mcsm_df, foldx_df, on = merging_cols_m1,  how =  o_join)
+mcsm_foldx_dfs   = pd.merge(mcsm_df
+                          , foldx_df
+                          , on = merging_cols_m1
+                          , how =  "outer")
 ncols_m1 = len(mcsm_foldx_dfs.columns)
 
 print('\n\nResult of first merge:', mcsm_foldx_dfs.shape
       , '\n===================================================================')
 mcsm_foldx_dfs[merging_cols_m1].apply(len)
 mcsm_foldx_dfs[merging_cols_m1].apply(len) == len(mcsm_foldx_dfs)
+
+#%% for embB and any other targets where mCSM-lig hasn't run for 
+# get the empty cells to be full of meaningful info
+if mcsm_foldx_dfs.loc[:,'wild_type': 'mut_aa_3lower'].isnull().values.any():  
+    print ("NAs detected in mcsm cols after merge")
+    
+    ##############################
+    # Extract relevant col values 
+    # code to one
+    ##############################
+
+    # wt_reg = r'(^[A-Z]{1})'
+    # print('wild_type:', wt_reg)
+
+    # mut_reg = r'[0-9]+(\w{1})$'
+    # print('mut type:', mut_reg)
+    mcsm_foldx_dfs['wild_type']     = mcsm_foldx_dfs.loc[:,'mutationinformation'].str.extract(r'(^[A-Z]{1})')
+    mcsm_foldx_dfs['position']      = mcsm_foldx_dfs.loc[:,'mutationinformation'].str.extract(r'([0-9]+)')
+    mcsm_foldx_dfs['mutant_type']   = mcsm_foldx_dfs.loc[:,'mutationinformation'].str.extract(r'[0-9]+([A-Z]{1})$')
+    
+    # BEWARE: Bit of logic trap i.e if nan comes first
+    # in chain column, then nan will be populated!
+    #df['foo'] = df['chain'].unique()[0]
+    mcsm_foldx_dfs['chain'] = np.where(mcsm_foldx_dfs[['chain']].isnull().all(axis=1)
+                                        , mcsm_foldx_dfs['chain'].unique()[0]
+                                        , mcsm_foldx_dfs['chain'])
+    
+    mcsm_foldx_dfs['ligand_id'] = np.where(mcsm_foldx_dfs[['ligand_id']].isnull().all(axis=1)
+                                        , mcsm_foldx_dfs['ligand_id'].unique()[0]
+                                        , mcsm_foldx_dfs['ligand_id'])
+    #--------------------------------------------------------------------------
+    
+    mcsm_foldx_dfs['wild_pos']       = mcsm_foldx_dfs.loc[:,'wild_type'] + mcsm_foldx_dfs.loc[:,'position'].astype(int).apply(str)
+    mcsm_foldx_dfs['wild_chain_pos'] = mcsm_foldx_dfs.loc[:,'wild_type'] + mcsm_foldx_dfs.loc[:,'chain'] +  mcsm_foldx_dfs.loc[:,'position'].astype(int).apply(str)
+    
+    #############
+    # Map 1 letter
+    # code to 3-letter lower
+    #############
+    # build a lookup dict from 1-LETTER aa code to
+    # 3-LETTER (lowercase) aa code; map once, after the dict is complete
+    lookup_dict = dict()
+    for k, v in oneletter_aa_dict.items():
+        lookup_dict[k] = v['three_letter_code_lower']
+    wt = mcsm_foldx_dfs['wild_type'].squeeze() # converts to a series that map works on
+    mcsm_foldx_dfs['wt_aa_3lower'] = wt.map(lookup_dict)
+    mut = mcsm_foldx_dfs['mutant_type'].squeeze()
+    mcsm_foldx_dfs['mut_aa_3lower'] = mut.map(lookup_dict)
+    
 #%%
 print('==================================='
       , '\nSecond merge: mcsm_foldx_dfs + deepddg'
       , '\n===================================')
 
-#deepddg_df =  pd.read_csv(infile_deepddg, sep = ',')
-#deepddg_df.columns
-
 # merge with mcsm_foldx_dfs and deepddg_df
-mcsm_foldx_deepddg_dfs = pd.merge(mcsm_foldx_dfs, deepddg_df, on = 'mutationinformation',  how = l_join)
+mcsm_foldx_deepddg_dfs = pd.merge(mcsm_foldx_dfs
+                                  , deepddg_df
+                                  , on = 'mutationinformation'
+                                  , how = "left")
 mcsm_foldx_deepddg_dfs['deepddg_outcome'].value_counts()
 
 ncols_deepddg_merge = len(mcsm_foldx_deepddg_dfs.columns)
+
+mcsm_foldx_deepddg_dfs['position'] = mcsm_foldx_deepddg_dfs['position'].astype('int64')
+
 #%%============================================================================
 print('==================================='
       , '\Third merge: dssp + kd'
@@ -342,9 +468,12 @@ dssp_df.shape
 kd_df.shape
 rd_df.shape
 
-#dssp_kd_dfs = combine_dfs_with_checks(dssp_df, kd_df, my_join = o_join)
+#dssp_kd_dfs = combine_dfs_with_checks(dssp_df, kd_df, my_join = "outer")
 merging_cols_m2 = detect_common_cols(dssp_df, kd_df)
-dssp_kd_dfs = pd.merge(dssp_df, kd_df, on = merging_cols_m2,  how = o_join)
+dssp_kd_dfs = pd.merge(dssp_df
+                       , kd_df
+                       , on = merging_cols_m2
+                       , how = "outer")
 
 print('\n\nResult of third merge:', dssp_kd_dfs.shape
       , '\n===================================================================')
@@ -353,10 +482,12 @@ print('==================================='
       , '\nFourth merge: third merge + rd_df' 
       , '\ndssp_kd_dfs + rd_df'
       , '\n===================================')
-#dssp_kd_rd_dfs = combine_dfs_with_checks(dssp_kd_dfs, rd_df, my_join = o_join)
+#dssp_kd_rd_dfs = combine_dfs_with_checks(dssp_kd_dfs, rd_df, my_join = "outer")
 merging_cols_m3 = detect_common_cols(dssp_kd_dfs,  rd_df)
-dssp_kd_rd_dfs = pd.merge(dssp_kd_dfs, rd_df, on = merging_cols_m3
-                          , how = o_join)
+dssp_kd_rd_dfs = pd.merge(dssp_kd_dfs
+                          , rd_df
+                          , on = merging_cols_m3
+                          , how = "outer")
 
 ncols_m3 = len(dssp_kd_rd_dfs.columns)
 
@@ -369,24 +500,41 @@ print('======================================='
       , '\nFifth merge: Second merge + fourth merge'
       , '\nmcsm_foldx_dfs + dssp_kd_rd_dfs'
       , '\n=======================================')
-#combined_df = combine_dfs_with_checks(mcsm_foldx_dfs, dssp_kd_rd_dfs, my_join = i_join)
+
+#combined_df = combine_dfs_with_checks(mcsm_foldx_dfs, dssp_kd_rd_dfs, my_join = "inner")
 #merging_cols_m4 = detect_common_cols(mcsm_foldx_dfs, dssp_kd_rd_dfs)
-#combined_df = pd.merge(mcsm_foldx_dfs, dssp_kd_rd_dfs, on = merging_cols_m4, how  = i_join)
+#combined_df = pd.merge(mcsm_foldx_dfs, dssp_kd_rd_dfs, on = merging_cols_m4, how  = "inner")
 #combined_df_expected_cols = ncols_m1 + ncols_m3 - len(merging_cols_m4)
 
 # with deepddg values
 merging_cols_m4 = detect_common_cols(mcsm_foldx_deepddg_dfs, dssp_kd_rd_dfs)
-combined_df = pd.merge(mcsm_foldx_deepddg_dfs, dssp_kd_rd_dfs, on = merging_cols_m4, how  = i_join)
+combined_df = pd.merge(mcsm_foldx_deepddg_dfs
+                       , dssp_kd_rd_dfs
+                       , on = merging_cols_m4
+                       , how  = "inner")
 
 combined_df_expected_cols = ncols_deepddg_merge + ncols_m3 - len(merging_cols_m4)
 
-if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols:
-    print('PASS: successfully combined 5 dfs'
-          , '\nNo. of rows combined_df:', len(combined_df)
-          , '\nNo. of cols combined_df:', len(combined_df.columns))   
-else:
-    sys.exit('FAIL: check individual df merges')
-
+# FIXME: check logic, doesn't affect anything else!
+if not gene == "embB":
+    print("\nGene is:", gene)
+    if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols:
+        print('PASS: successfully combined 5 dfs'
+              , '\nNo. of rows combined_df:', len(combined_df)
+              , '\nNo. of cols combined_df:', len(combined_df.columns))   
+    else:
+        #sys.exit('FAIL: check individual df merges')
+        print("\nGene is:", gene
+              , "\ncombined_df length:", len(combined_df)
+              , "\nmcsm_df_length:", len(mcsm_df)
+              )
+        if len(combined_df.columns) == combined_df_expected_cols:
+            print('PASS: successfully combined 5 dfs'
+                  , '\nNo. of rows combined_df:', len(combined_df)
+                  , '\nNo. of cols combined_df:', len(combined_df.columns))
+        else:
+            sys.exit('FAIL: check individual merges')        
+            
 print('\nResult of Fourth merge:', combined_df.shape
       , '\n===================================================================')
 
@@ -401,7 +549,7 @@ combined_df['chain'].equals(combined_df['chain_id'])
 combined_df['wild_type'].equals(combined_df['wild_type_kd']) # has nan
 combined_df['wild_type'].equals(combined_df['wild_type_dssp'])
 
-#sanity check
+# sanity check
 foo = combined_df[['wild_type', 'wild_type_kd', 'wt_3letter_caps', 'wt_aa_3lower', 'mut_aa_3lower']] 
 
 # Drop cols
@@ -455,7 +603,11 @@ afor_df = afor_df.drop(['position'], axis = 1)
 afor_cols = afor_df.columns
 
 # merge 
-combined_stab_afor = pd.merge(combined_df_clean, afor_df, on = merging_cols_m5, how  = l_join)
+combined_stab_afor = pd.merge(combined_df_clean
+                              , afor_df
+                              , on = merging_cols_m5
+                              , how  = "left")
+
 comb_afor_df_cols = combined_stab_afor.columns
 
 comb_afor_expected_cols = len(combined_df_clean.columns) + len(afor_df.columns) - len(merging_cols_m5)
@@ -467,18 +619,26 @@ if len(combined_stab_afor) == len(combined_df_clean) and len(combined_stab_afor.
 else:
     sys.exit('\nFAIL: check individual df merges')
 
-print('\n\nResult of Fourth merge:', combined_stab_afor.shape
+print('\n\nResult of Fifth merge:', combined_stab_afor.shape
       , '\n===================================================================')
 
 combined_stab_afor[merging_cols_m5].apply(len)
 combined_stab_afor[merging_cols_m5].apply(len) == len(combined_stab_afor)
 
-if len(combined_stab_afor) -  combined_stab_afor['mutation'].isna().sum() == len(afor_df):
-     print('\nPASS: Merge successful for af and or'
-          , '\nNo. of nsSNPs with valid ORs: ', len(afor_df))
-else:
-    sys.exit('\nFAIL: merge unsuccessful for af and or')
+if (len(combined_stab_afor) - combined_stab_afor['mutation'].isna().sum()) == len(afor_df):
+     print('\nPASS: Merge successful for af and or with matched numbers')
 
+if len(combined_stab_afor) - combined_stab_afor['mutation'].isna().sum() == len(afor_df)-len(afor_df[~afor_df['mutation'].isin(combined_stab_afor['mutation'])]):
+    print("\nMismatched numbers, OR df has extra snps not found in mcsm df"
+          , "\nNo. of nsSNPs with valid ORs:", len(afor_df)
+          , "\nNo. of mcsm nsSNPs: ", len(combined_df_clean) 
+          , "\nNo. of OR nsSNPs not in mCSM df:"
+          , len(afor_df[~afor_df['mutation'].isin(combined_stab_afor['mutation'])])
+          , "\nWriting these mutations to file:")
+    orsnps_notmcsm = afor_df[~afor_df['mutation'].isin(combined_stab_afor['mutation'])]
+else:
+    sys.exit('\nFAIL: merge unsuccessful for af and or')    
+    
 #%%============================================================================
 # Output columns: when dynamut, dynamut2 and others weren't being combined
 out_filename_comb_afor = gene.lower() + '_comb_afor.csv'
@@ -486,7 +646,7 @@ outfile_comb_afor =  outdir + '/' + out_filename_comb_afor
 print('Output filename:', outfile_comb_afor
       , '\n===================================================================')
 
-# # write csv
+# write csv
 print('Writing file: combined stability and afor')
 combined_stab_afor.to_csv(outfile_comb_afor, index = False)
 print('\nFinished writing file:'
@@ -494,7 +654,20 @@ print('\nFinished writing file:'
       , '\nNo. of cols:', combined_stab_afor.shape[1])
 #%%============================================================================
 # combine dynamut, dynamut2, and mcsm_na
-dfs_list = [dynamut_df, dynamut2_df, mcsm_na_df]
+#dfs_list = [dynamut_df, dynamut2_df, mcsm_na_df] # gid
+
+if gene.lower() == "pnca":
+    dfs_list = [dynamut_df, dynamut2_df]
+if gene.lower() == "gid":
+    dfs_list = [dynamut_df, dynamut2_df, mcsm_na_df]
+if gene.lower() == "embb":
+    dfs_list = [dynamut2_df, mcsm_ppi2_df]
+if gene.lower() == "katg":
+    dfs_list = [dynamut2_df]
+if gene.lower() == "rpob":
+    dfs_list = [dynamut2_df]
+if gene.lower() == "alr":
+    dfs_list = [dynamut2_df, mcsm_ppi2_df]
 
 dfs_merged = reduce(lambda  left,right: pd.merge(left
                                                 , right
@@ -514,7 +687,7 @@ len(combined_stab_afor.columns)
 combined_all_params = pd.merge(combined_stab_afor
                                , dfs_merged_clean
                                , on = merging_cols_m6
-                               , how  = i_join)
+                               , how  = "inner")
 
 expected_ncols = len(dfs_merged_clean.columns) + len(combined_stab_afor.columns) - len(merging_cols_m6)
 expected_nrows = len(combined_stab_afor)
diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py
index 31f8a27..aac7cdb 100755
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@@ -70,7 +70,6 @@ arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and outp
 
 arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')
 
-
 args = arg_parser.parse_args()
 #=======================================================================
 #%% variable assignment: input and output paths & filenames
diff --git a/scripts/deepddg_format.py b/scripts/deepddg_format.py
index 98c2ee1..20b2dcb 100755
--- a/scripts/deepddg_format.py
+++ b/scripts/deepddg_format.py
@@ -117,12 +117,20 @@ deepddg_df['deepddg_outcome'].value_counts()
 len(deepddg_df.loc[deepddg_df['deepddg'] < 0])
 len(deepddg_df.loc[deepddg_df['deepddg'] >= 0])
                
+#----------------------------------------------
 # drop extra columns to allow clean merging
-deepddg_short_df = deepddg_df.drop(['chain_id', 'wild_type_deepddg', 'position', 'mutant_type_deepddg'], axis = 1)
+#----------------------------------------------
+#deepddg_short_df = deepddg_df.drop(['chain_id', 'wild_type_deepddg', 'position', 'mutant_type_deepddg'], axis = 1)
+
+#----------------------------------------------
+# embb (where gene-target has > 1 chain)
+# include chain else the numbering will be messed up!
+#----------------------------------------------
+deepddg_short_df = deepddg_df.drop(['wild_type_deepddg', 'position', 'mutant_type_deepddg'], axis = 1)
 
 # rearrange columns
 deepddg_short_df.columns
-deepddg_short_df = deepddg_short_df[["mutationinformation", "deepddg", "deepddg_outcome"]]
+deepddg_short_df = deepddg_short_df[["chain_id", "mutationinformation", "deepddg", "deepddg_outcome"]]
 
 #%% combine with mcsm snps 
 deepddg_mcsm_muts_dfs = pd.merge(deepddg_short_df
diff --git a/scripts/rd_df.py b/scripts/rd_df.py
index 7eab903..102530d 100755
--- a/scripts/rd_df.py
+++ b/scripts/rd_df.py
@@ -45,8 +45,6 @@ arg_parser.add_argument('--debug', action='store_true', help = 'Debug Mode')
 args = arg_parser.parse_args()
 #=======================================================================
 #%% variable assignment: input and output 
-#drug = 'pyrazinamide'
-#gene = 'pncA'
 drug = args.drug
 gene = args.gene
 gene_match = gene + '_p.'

From 9cfb32afb87cc747656d34d3125eb2c6291ad610 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 28 Oct 2021 12:43:44 +0100
Subject: [PATCH 51/51] pretending that we added the CLI arguments

---
 mcsm_na/run_format_results_mcsm_na.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mcsm_na/run_format_results_mcsm_na.py b/mcsm_na/run_format_results_mcsm_na.py
index cb7b4ca..d990368 100644
--- a/mcsm_na/run_format_results_mcsm_na.py
+++ b/mcsm_na/run_format_results_mcsm_na.py
@@ -14,7 +14,7 @@ from format_results_mcsm_na import *
 # variables
 
 # TODO: add cmd line args
-
+# Imagine we've done the work
 gene = 'gid'
 drug = 'streptomycin'
 datadir = homedir + '/git/Data'