Merge branch 'gidb_dev' (including merge conflict while after adding CLI arguments for mcsm_na/format_results_mcsm_na.py)

2021-10-28 12:45:39 +01:00 · 2021-10-28 12:45:39 +01:00 · e2bc1cdde1
commit e2bc1cdde1
parent e4661963ab 9cfb32afb8
70 changed files with 4048 additions and 819 deletions
--- a/config/gid.R
+++ b/config/gid.R
@ -0,0 +1,2 @@
+gene = "gid"
+drug = "streptomycin"
--- a/dynamut/format_results_dynamut.py
+++ b/dynamut/format_results_dynamut.py
--- a/dynamut/format_results_dynamut2.py
+++ b/dynamut/format_results_dynamut2.py
--- a/dynamut/notes.txt
+++ b/dynamut/notes.txt
@ -0,0 +1,11 @@
+Dynamut was painfully run for gid, part manually, part programatically!
+
+However, it was decided to ditch that and only run Dynamut2 for future targets
+
+Dynamut2 was run through the website in batches of 50 for
+katG: 17 batches (00..16)
+rpoB: 23 batches (00..22)
+alr: 6 batches (00..05)
+
+However, the use of API was made for rpoB batches (09-22) from 13 Oct 2021
+as jobs started to flake and fail through the website!
--- a/dynamut/run_format_results_dynamut.py
+++ b/dynamut/run_format_results_dynamut.py
@ -20,8 +20,45 @@ from format_results_dynamut2 import *
 # variables
 # TODO: add cmd line args

-gene = 'gid'
-drug = 'streptomycin'
+#gene = 
+#drug = 
+
+#%% command line args
+arg_parser = argparse.ArgumentParser()
+arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None)
+arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None)
+arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
+arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
+arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
+#arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and output', action='store_true') # should be handled elsewhere!
+
+arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')
+
+args = arg_parser.parse_args()
+#=======================================================================
+#%% variable assignment: input and output paths & filenames
+drug = args.drug
+gene = args.gene
+datadir = args.datadir
+indir   = args.input_dir
+outdir  = args.output_dir
+#make_dirs  = args.make_dirs
+
+#%% input and output dirs and files
+#=======
+# dirs
+#=======
+if not datadir:
+    datadir = homedir + '/' + 'git/Data'
+    
+if not indir:
+    indir = datadir + '/' + drug + '/input'
+    
+if not outdir:
+    outdir = datadir + '/' + drug + '/output'
+
+#%%=====================================================================
+
 datadir = homedir + '/git/Data'
 indir = datadir + '/' + drug + '/input'
 outdir = datadir + '/' + drug + '/output'
@ -29,12 +66,12 @@ outdir_dynamut = outdir + '/dynamut_results/'
 outdir_dynamut2 = outdir + '/dynamut_results/dynamut2/'

 # Input file
-infile_dynamut =  outdir_dynamut + gene + '_dynamut_all_output_clean.csv'
-infile_dynamut2 =  outdir_dynamut2 + gene + '_dynamut2_output_combined_clean.csv'
+#infile_dynamut =  outdir_dynamut + gene.lower() + '_dynamut_all_output_clean.csv'
+infile_dynamut2 =  outdir_dynamut2 + gene.lower() + '_dynamut2_output_combined_clean.csv'

 # Formatted output filename
-outfile_dynamut_f = outdir_dynamut2 + gene + '_complex_dynamut_norm.csv'
-outfile_dynamut2_f = outdir_dynamut2 + gene + '_complex_dynamut2_norm.csv'
+#outfile_dynamut_f = outdir_dynamut2 + gene + '_dynamut_norm.csv'
+outfile_dynamut2_f = outdir_dynamut2 + gene.lower() + '_dynamut2_norm.csv'

 #===============================
 # CALL: format_results_dynamut
@ -69,4 +106,4 @@ print('Finished writing file:'
       , '\nExpected no. of cols:', len(dynamut2_df_f.columns)
       , '\n=============================================================')

-#%%#####################################################################
+#%%#####################################################################
--- a/dynamut/run_get_results_dynamut.py
+++ b/dynamut/run_get_results_dynamut.py
@ -17,8 +17,8 @@ my_host = 'http://biosig.unimelb.edu.au'
 #headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}

 # TODO: add cmd line args
-#gene = 'gid'
-drug = 'streptomycin'
+# gene = 
+# drug = 
 datadir = homedir + '/git/Data/'
 indir = datadir + drug + '/input/'
 outdir = datadir + drug + '/output/'
@ -41,4 +41,4 @@ get_results(url_file  = my_url_file
            , output_dir = outdir
            , outfile_suffix = my_suffix)
           
-########################################################################
+########################################################################
--- a/dynamut/split_csv.sh
+++ b/dynamut/split_csv.sh
@ -1,6 +1,6 @@
 #!/bin/bash

-# FIXME: This is written for expediency to kickstart running dynamut and mcsm-NA
+# FIXME: This is written for expediency to kickstart running dynamut, mcsm-PPI2 (batch pf 50) and mCSM-NA (batch of 20)

 # Usage: ~/git/LSHTM_analysis/dynamut/split_csv.sh <input file> <output dir> <chunk size in lines>
 # copy your snp file to split into the dynamut dir
@ -12,8 +12,13 @@ CHUNK=$3
 mkdir -p ${OUTDIR}/${CHUNK}
 cd ${OUTDIR}/${CHUNK}

+# makes the 2 dirs, hence ../..
 split ../../${INFILE} -l ${CHUNK} -d snp_batch_

 # use case
 #~/git/LSHTM_analysis/dynamut/split_csv.sh gid_mcsm_formatted_snps.csv snp_batches 50
 #~/git/LSHTM_analysis/dynamut/split_csv.sh embb_mcsm_formatted_snps.csv snp_batches 50
+#~/git/LSHTM_analysis/dynamut/split_csv.sh pnca_mcsm_formatted_snps.csv snp_batches 50
+#~/git/LSHTM_analysis/dynamut/split_csv.sh katg_mcsm_formatted_snps.csv snp_batches 50     #Date: 20/09/2021
+
+# add .txt to the files
--- a/dynamut/split_csv_chain.sh
+++ b/dynamut/split_csv_chain.sh
@ -0,0 +1,37 @@
+#!/bin/bash
+
+# FIXME: This is written for expediency to kickstart running dynamut, mcsm-PPI2 (batch pf 50) and mCSM-NA (batch of 20)
+
+# Usage: ~/git/LSHTM_analysis/dynamut/split_csv.sh <input file> <output dir> <chunk size in lines>
+# copy your snp file to split into the dynamut dir
+# use sed to add chain ID to snp file and then split to avoid post processing
+
+INFILE=$1
+OUTDIR=$2
+CHUNK=$3
+
+mkdir -p ${OUTDIR}/${CHUNK}/chain_added
+cd ${OUTDIR}/${CHUNK}/chain_added
+
+# makes the 3 dirs, hence ../..
+split ../../../${INFILE} -l ${CHUNK} -d snp_batch_
+
+########################################################################
+# use cases
+# Date: 20/09/2021
+# sed -e 's/^/A /g' katg_mcsm_formatted_snps.csv > katg_mcsm_formatted_snps_chain.csv
+#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh katg_mcsm_formatted_snps_chain.csv snp_batches 50
+
+# Date: 01/10/2021
+# sed -e 's/^/A /g' rpob_mcsm_formatted_snps.csv > rpob_mcsm_formatted_snps_chain.csv
+#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh rpob_mcsm_formatted_snps_chain.csv snp_batches 50     
+
+# Date: 02/10/2021
+# sed -e 's/^/A /g' alr_mcsm_formatted_snps.csv > alr_mcsm_formatted_snps_chain.csv
+#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 50  
+
+# Date: 05/10/2021
+#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 20    
+  
+# add .txt to the files
+########################################################################
--- a/foldx/runFoldx.py
+++ b/foldx/runFoldx.py
@ -41,7 +41,7 @@ arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By
 arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assmes homedir + <drug> + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME

 arg_parser.add_argument('-P', '--pdb_file', help = 'PDB File to process. By default, it assmumes a file called <gene>_complex.pdb in input_dir')
-arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_snps.csv exists')
+arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_formatted_snps.csv exists')

 # FIXME: Doesn't work with 2 chains yet!
 arg_parser.add_argument('-c1', '--chain1',    help = 'Chain1 ID', default = 'A') # case sensitive
@ -148,6 +148,16 @@ print('Arguments being passed:'
 , '\noutput file:', outfile_foldx
 , '\n=============================================================')

+
+# make sure rotabase.txt exists in the process_dir
+rotabase_file = process_dir + '/' + 'rotabase.txt'
+
+if Path(rotabase_file).is_file():
+    print(f'rotabase file: {rotabase_file} exists')
+else:
+    print(f'ERROR: rotabase file: {rotabase_file} does not exist. Please download it and put it in {process_dir}')
+    sys.exit()
+    
 #### Delay for 10 seconds to check the params ####
 print('Sleeping for 10 seconds to give you time to cancel')
 time.sleep(10)
@ -235,6 +245,13 @@ def main():
    nmuts = len(mutlist)
    print(nmuts)
    print(mutlist)
+    print('start')
+    #subprocess.check_output(['bash','repairPDB.sh', pdbname, process_dir])
+    print('\033[95mSTAGE: repair PDB\033[0m')
+    print('EXECUTING: repairPDB.sh %s %s %s' % (indir, actual_pdb_filename, process_dir))
+    #subprocess.check_output(['bash','repairPDB.sh', indir, actual_pdb_filename, process_dir])
+    # once you decide to use the function
+    # repairPDB(pdbname)
    
    print('start')  
    # some common parameters for foldX
@ -242,61 +259,74 @@ def main():
    
    print('\033[95mSTAGE: repair PDB (foldx subprocess) \033[0m')
    print('Running foldx RepairPDB for WT')
-    subprocess.call(['foldx' 
+
+    fold_RepairDB = ['foldx' 
    , '--command=RepairPDB'
    , foldx_common
-    , '--pdb-dir=' + os.path.dirname(pdb_filename)
+#    , '--pdb-dir=' + os.path.dirname(pdb_filename)
+    , '--pdb-dir=' + indir
    ,  '--pdb=' + actual_pdb_filename 
    , 'outPDB=true'
-    , '--output-dir=' + process_dir])
+    , '--output-dir=' + process_dir]
+    print('CMD:', fold_RepairDB)
+    subprocess.call(fold_RepairDB)
    print('\033[95mCOMPLETED STAGE: repair PDB\033[0m')
    print('\n==========================================================')
    
    
    print('\033[95mSTAGE: Foldx commands BM, PN and SD (foldx subprocess) for WT\033[0m')
    print('Running foldx BuildModel for WT')
-    subprocess.call(['foldx' 
+
+    foldx_BuildModel = ['foldx' 
    , '--command=BuildModel'
    , foldx_common
    , '--pdb-dir=' + process_dir
    ,  '--pdb=' + pdbname + '_Repair.pdb'
-    , '--mutant-file="individual_list_' + pdbname +'.txt"'
+    , '--mutant-file=' + process_dir + '/' + 'individual_list_' + pdbname +'.txt'
    , 'outPDB=true'
    , '--numberOfRuns=1'
-    , '--output-dir=' + process_dir], cwd=process_dir)
+    , '--output-dir=' + process_dir]
+    print('CMD:', foldx_BuildModel)
+    subprocess.call( foldx_BuildModel, cwd=process_dir)

    print('Running foldx PrintNetworks for WT')
-    subprocess.call(['foldx' 
+    foldx_PrintNetworks = ['foldx' 
    , '--command=PrintNetworks'
    , '--pdb-dir=' + process_dir
    ,  '--pdb=' + pdbname + '_Repair.pdb'
    , '--water=PREDICT'
    , '--vdwDesign=1'
-    , '--output-dir=' + process_dir], cwd=process_dir)
+    , '--output-dir=' + process_dir]
+    print('CMD:', foldx_PrintNetworks)
+    subprocess.call(foldx_PrintNetworks, cwd=process_dir)

    print('Running foldx SequenceDetail for WT')
-    subprocess.call(['foldx' 
+    foldx_SequenceDetail = ['foldx' 
    , '--command=SequenceDetail'
    , '--pdb-dir=' + process_dir
    ,  '--pdb=' + pdbname + '_Repair.pdb'
    , '--water=PREDICT'
    , '--vdwDesign=1'
-    , '--output-dir=' + process_dir], cwd=process_dir)
+    , '--output-dir=' + process_dir]
+    print('CMD:', foldx_SequenceDetail)
+    subprocess.call(foldx_SequenceDetail , cwd=process_dir)
+
    print('\033[95mCOMPLETED STAGE: Foldx commands BM, PN and SD\033[0m')
    print('\n==========================================================')
    
-    
    print('\033[95mSTAGE: Print Networks (foldx subprocess) for MT\033[0m')
    for n in range(1,nmuts+1):
        print('\033[95mNETWORK:\033[0m', n)
        print('Running foldx PrintNetworks for mutation', n)
-        subprocess.call(['foldx' 
+        foldx_PrintNetworksMT = ['foldx' 
        , '--command=PrintNetworks'
        , '--pdb-dir=' + process_dir
        ,  '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
        , '--water=PREDICT'
        , '--vdwDesign=1'
-        , '--output-dir=' + process_dir], cwd=process_dir) 
+        , '--output-dir=' + process_dir]
+        print('CMD:', foldx_PrintNetworksMT)
+        subprocess.call( foldx_PrintNetworksMT , cwd=process_dir) 
    print('\033[95mCOMPLETED STAGE: Print Networks (foldx subprocess) for MT\033[0m')
    print('\n==========================================================')
    
@ -323,14 +353,16 @@ def main():
        print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for WT\033[0m')
        chain1=chainA
        chain2=chainB
-        subprocess.call(['foldx' 
+        foldx_AnalyseComplex = ['foldx' 
        , '--command=AnalyseComplex'
        , '--pdb-dir=' + process_dir
        ,  '--pdb=' + pdbname + '_Repair.pdb'
        , '--analyseComplexChains=' + chain1 + ',' + chain2
        , '--water=PREDICT'
        , '--vdwDesign=1'
-        , '--output-dir=' + process_dir], cwd=process_dir)
+        , '--output-dir=' + process_dir]
+        print('CMD:',foldx_AnalyseComplex)
+        subprocess.call(foldx_AnalyseComplex, cwd=process_dir)

        # FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
        ac_source = process_dir + '/Summary_' + pdbname + '_Repair_AC.fxout'
@ -340,14 +372,16 @@ def main():

        for n in range(1,nmuts+1):
            print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for mutation:\033[0m', n)
-            subprocess.call(['foldx' 
+            foldx_AnalyseComplex = ['foldx' 
            , '--command=AnalyseComplex'
            , '--pdb-dir=' + process_dir
            ,  '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
            , '--analyseComplexChains=' + chain1 + ',' + chain2
            , '--water=PREDICT'
            , '--vdwDesign=1'
-            , '--output-dir=' + process_dir], cwd=process_dir)
+            , '--output-dir=' + process_dir]
+            print('CMD:', foldx_AnalyseComplex)
+            subprocess.call( foldx_AnalyseComplex , cwd=process_dir)

            # FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
            ac_mut_source = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.fxout'
--- a/mcsm/run_mcsm.py
+++ b/mcsm/run_mcsm.py
@ -104,7 +104,7 @@ if mutation_filename:
    in_filename_snps = mutation_filename
 else:
    in_filename_snps = gene.lower() + '_mcsm_formatted_snps.csv'
-    
+   
 infile_snps = outdir + '/' + in_filename_snps

 #=======
--- a/mcsm_na/format_results_mcsm_na.py
+++ b/mcsm_na/format_results_mcsm_na.py
@ -51,7 +51,7 @@ def format_mcsm_na_output(mcsm_na_output_tsv):
    print('Assigning meaningful colnames'
            , '\n=======================================================')
    my_colnames_dict = {'PDB_FILE': 'pdb_file' # relevant info from this col will be extracted and the column discarded
-        , 'CHAIN': 'chain' # {wild_type}<position>{mutant_type}
+        , 'CHAIN': 'chain' 
        , 'WILD_RES': 'wild_type' # one letter amino acid code
        , 'RES_POS': 'position' # number
        , 'MUT_RES': 'mutant_type' # one letter amino acid code
@ -65,8 +65,8 @@ def format_mcsm_na_output(mcsm_na_output_tsv):
    #############
    # create mutationinformation column
    #############    
-    mcsm_na_data['mutationinformation'] = mcsm_na_data['wild_type'] + mcsm_na_data.position.map(str) + mcsm_na_data['mutant_type']
-
+    #mcsm_na_data['mutationinformation'] = mcsm_na_data['wild_type'] + mcsm_na_data.position.map(str) + mcsm_na_data['mutant_type']
+    mcsm_na_data['mutationinformation'] = mcsm_na_data.loc[:,'wild_type'] + mcsm_na_data.loc[:,'position'].astype(int).apply(str) + mcsm_na_data.loc[:,'mutant_type']
 #%%===================================================================== 
    #############
    # Create col: mcsm_na_outcome
@ -131,5 +131,4 @@ def format_mcsm_na_output(mcsm_na_output_tsv):
                                , 'chain'
                                , 'pdb_file']]
    return(mcsm_na_dataf)
-#%%##################################################################### 
-
+#%%##################################################################### 
--- a/my_header.R
+++ b/my_header.R
@ -1,21 +1,31 @@
 #########################################################
-### A) Installing and loading required packages
+# A) Installing and loading required packages
+# B) My functions
+#########################################################
+
 #########################################################
 #lib_loc = "/usr/local/lib/R/site-library")

-#if (!require("gplots")) {
-#  install.packages("gplots", dependencies = TRUE)
-#  library(gplots)
-#}
+require("getopt", quietly = TRUE) # cmd parse arguments

-#if (!require("tidyverse")) {
-#  install.packages("tidyverse", dependencies = TRUE)
-#  library(tidyverse)
-#}
+if (!require("tidyverse")) {
+  install.packages("tidyverse", dependencies = TRUE)
+  library(tidyverse)
+}

-if (!require("ggplot2")) {
-  install.packages("ggplot2", dependencies = TRUE)
-  library(ggplot2)
+if (!require("shiny")) {
+  install.packages("shiny", dependencies = TRUE)
+  library(shiny)
+}
+
+if (!require("shinyBS")) {
+  install.packages("shinyBS", dependencies = TRUE)
+  library(shinyBS)
+}
+
+if (!require("gridExtra")) {
+  install.packages("gridExtra", dependencies = TRUE)
+  library(gridExtra)
 }

 if (!require("ggridges")) {
@ -23,6 +33,35 @@ if (!require("ggridges")) {
  library(ggridges)
 }

+# if (!require("ggplot2")) {
+#   install.packages("ggplot2", dependencies = TRUE)
+#   library(ggplot2)
+# }
+
+# if (!require ("dplyr")){
+#   install.packages("dplyr")
+#   library(dplyr)
+# }
+
+if (!require ("DT")){
+  install.packages("DT")
+  library(DT)
+}
+
+if (!require ("plyr")){
+   install.packages("plyr")
+   library(plyr)
+ }
+
+# Install
+#if(!require(devtools)) install.packages("devtools")
+#devtools::install_github("kassambara/ggcorrplot")
+
+if (!require ("ggbeeswarm")){
+   install.packages("ggbeeswarm")
+   library(ggbeeswarm)
+}
+
 if (!require("plotly")) {
  install.packages("plotly", dependencies = TRUE)
  library(plotly)
@ -103,11 +142,6 @@ if (!require ("psych")){
  library(psych)
 }

-if (!require ("dplyr")){
-  install.packages("dplyr")
-  library(dplyr)
-}
-
 if (!require ("compare")){
  install.packages("compare")
  library(compare)
@ -118,18 +152,37 @@ if (!require ("arsenal")){
  library(arsenal)
 }

+if(!require(ggseqlogo)){
+  install.packages("ggseqlogo")
+  library(ggseqlogo)
+}

-####TIDYVERSE
-# Install
-#if(!require(devtools)) install.packages("devtools")
-#devtools::install_github("kassambara/ggcorrplot")
-
-#library(ggcorrplot)
-
-
-###for PDB files
-#install.packages("bio3d") 
+# for PDB files
 if(!require(bio3d)){
  install.packages("bio3d")
  library(bio3d)
 }
+
+library(protr)
+if(!require(protr)){
+  install.packages("protr")
+  library(protr)
+}
+
+#if (!requireNamespace("BiocManager", quietly = TRUE))
+#  install.packages("BiocManager")
+
+#BiocManager::install("Logolas")
+library("Logolas")
+
+
+####################################
+# Load all my functions:
+# only works if tidyverse is loaded
+# hence included it here!
+####################################
+
+func_path = "~/git/LSHTM_analysis/scripts/functions/"
+source_files <- list.files(func_path, "\\.R$")  # locate all .R files
+map(paste0(func_path, source_files), source)  # source all your R scripts!
+
--- a/scripts/functions/bp_lineage.R
+++ b/scripts/functions/bp_lineage.R
@ -0,0 +1,91 @@
+########################################
+# Lineage barplot
+# Lineage and nsSNP count barplot
+# Lineage Diversity barplot
+########################################
+
+lin_count_bp <- function( lf_data
+                          , x_categ = ""
+                          , y_count = ""
+                          , bar_fill_categ = ""
+                          , display_label_col = ""
+                          , bar_stat_stype = "identity"
+                          , x_lab_angle = 90
+                          , d_lab_size = 5
+                          , d_lab_hjust = 0.5
+                          , d_lab_vjust = 0.5
+                          , d_lab_col = "black"
+                          , my_xats = 20 # x axis text size
+                          , my_yats = 20 # y axis text size
+                          , my_xals = 22 # x axis label size
+                          , my_yals = 22 # y axis label size
+                          , my_lls   = 22 # legend label size
+                          , bar_col_labels = ""
+                          , bar_col_values = ""
+                          , bar_leg_name = ""
+                          , leg_location = "top"
+                          , y_log10 = FALSE
+                          , y_scale_percent = FALSE
+                          , y_label = c("Count", "SNP diversity")
+                          ) {
+  g = ggplot(lf_data
+             , aes(  x    = factor( eval(parse(text = x_categ)), ordered = T )
+                     , y    = eval(parse(text = y_count))
+                     , fill = eval(parse(text = bar_fill_categ)) ) )
+  
+  OutPlot = g + geom_bar( stat          = bar_stat_stype
+                          , position    = position_stack(reverse = TRUE)
+                          #, alpha    = 1
+                          #, colour   = "grey75"
+  ) + 
+    theme(axis.text.x     = element_text(size = my_xats
+                                         , angle = x_lab_angle)
+          , axis.text.y   = element_text(size = my_yats
+                                         , angle = 90
+                                         , hjust = 1
+                                         , vjust = 0)
+          , axis.title.x = element_text(size     = my_xals
+                                        , colour = "black")
+          , axis.title.y = element_text(size     = my_yals
+                                        , colour = "black")
+          , legend.position = leg_location
+          , legend.text = element_text(size = my_lls)) + 
+    
+    geom_label(aes(label = eval(parse(text = display_label_col)))
+               , size    = d_lab_size
+               , hjust   = d_lab_hjust
+               , vjust   = d_lab_vjust
+               , colour  = d_lab_col
+               , show.legend = FALSE
+               #, check_overlap = TRUE
+               , position = position_stack(reverse = T)) + 
+
+    scale_fill_manual(values   = bar_col_values
+                      , name   = bar_leg_name
+                      , labels = bar_col_labels) +
+    labs(title    = ""
+         , x      = ""
+         , y      = y_label
+         , colour = "black")
+    
+  if (y_log10){
+    
+   OutPlot = OutPlot + 
+     scale_y_continuous(trans = "log10"
+                        , labels = trans_format("log10", math_format(10^.x) ) )
+   }
+  
+  if (y_scale_percent){
+    
+    OutPlot = OutPlot +
+      scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
+      #scale_y_continuous(labels = scales::percent) +
+      
+      labs(title    = ""
+           , x      = ""
+           , y      = y_label
+           , colour = "black")
+  }
+  
+  return(OutPlot)
+}
--- a/scripts/functions/bp_subcolours.R
+++ b/scripts/functions/bp_subcolours.R
@ -3,7 +3,7 @@
 # LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar
 #########################################################

-ColourPalleteMulti <- function(df, group, subgroup){
+ColourPalleteMulti = function(df, group, subgroup){
  
  # Find how many colour categories to create and the number of colours in each
  categories <- aggregate(as.formula(paste(subgroup, group, sep="~" ))
@ -24,4 +24,88 @@ ColourPalleteMulti <- function(df, group, subgroup){
                                                         , category.end[i]))(categories[i,2])}))
  return(colours)
 }
-#########################################################
+#########################################################################
+
+########################
+# Generate bp with
+# colour palette derived
+# from the data using 
+# above function
+#########################
+
+bp_stability_hmap <- function(plotdf = merged_df3
+                              , xvar_colname = "position"
+                              #, bar_col_colname = "group"
+                              , stability_colname = ""
+                              , stability_outcome_colname = ""
+                              , p_title = ""  # "Protein stability (DUET)"
+                              , my_xaxls = 12 # x-axis label size
+                              , my_yaxls = 20 # y-axis label size
+                              , my_xaxts = 18 # x-axis text size
+                              , my_yaxts = 20 # y-axis text size
+                              , my_pts  = 20  # plot-title size
+                              , my_xlab = "Position"
+                              , my_ylab = "No. of nsSNPs"
+)
+{
+  
+  # order the df by position and ensure it is a factor
+  plotdf = plotdf[order(plotdf[[xvar_colname]]), ]
+  plotdf[[xvar_colname]] = factor(plotdf[[xvar_colname]])
+  
+  #cat("\nSneak peak:\n")
+  head(data.frame( plotdf[[xvar_colname]], plotdf[[stability_colname]] ) )
+  
+  # stability values isolated to help with generating column called: 'group'
+  my_grp = plotdf[[stability_colname]]
+  cat( "\nLength of nsSNPs:", length(my_grp)
+       , "\nLength of unique values for nsSNPs:", length(unique(my_grp)) )
+  
+  # Add col: 'group'
+  plotdf$group = paste0(plotdf[[stability_outcome_colname]], "_", my_grp, sep = "")
+  
+  # check unique values in normalised data
+  cat("\nNo. of unique values in", stability_colname, "no rounding:"
+      , length(unique(plotdf[[stability_colname]])))
+  
+  # Call the function to create the palette based on the group defined above
+  #subcols_ps
+  subcols_bp_hmap = ColourPalleteMulti(plotdf, stability_outcome_colname, stability_colname)
+  
+  cat("\nNo. of sub colours generated:", length(subcols_bp_hmap))
+  
+  #-------------------------------
+  # Generate the subcols barplot
+  #-------------------------------
+  
+  #g = ggplot(plotdf, aes(x = factor(position, ordered = T)))
+  g = ggplot(plotdf, aes_string(x = xvar_colname
+                                # , ordered = T)
+  ))
+  
+  
+  OutWidePlot = g + geom_bar(aes(fill = group)
+                             , colour = "grey") +
+    
+    scale_fill_manual( values = subcols_bp_hmap
+                       , guide = "none") +
+    
+    theme( axis.text.x = element_text(size = my_xaxls
+                                      , angle = 90
+                                      , hjust = 1
+                                      , vjust = 0.4)
+           , axis.text.y = element_text(size = my_yaxls
+                                        , angle = 0
+                                        , hjust = 1
+                                        , vjust = 0)
+           , axis.title.x = element_text(size = my_xaxts)
+           , axis.title.y = element_text(size = my_yaxts ) 
+           , plot.title = element_text(size = my_pts
+                                       , hjust = 0.5)) +
+    
+    labs(title = p_title
+         , x = my_xlab
+         , y = my_ylab)
+  
+  return(OutWidePlot)
+}
--- a/scripts/functions/combining_dfs_plotting.R
+++ b/scripts/functions/combining_dfs_plotting.R
@ -152,6 +152,46 @@ combining_dfs_plotting <- function(  my_df_u
    unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
    quit()
  }
+
+  # Quick formatting: ordering df and pretty labels
+  
+  #------------------------------
+  # sorting by column: position
+  #------------------------------
+  merged_df2 = merged_df2[order(merged_df2$position), ]
+  
+  #-----------------------
+  # mutation_info_labels
+  #-----------------------
+  merged_df2$mutation_info_labels = ifelse(merged_df2$mutation_info == dr_muts_col
+                                              , "DM", "OM")
+  merged_df2$mutation_info_labels = factor(merged_df2$mutation_info_labels)
+  #-----------------------
+  # lineage labels
+  #-----------------------
+  merged_df2$lineage_labels =  gsub("lineage", "L", merged_df2$lineage)
+  
+  merged_df2$lineage_labels =  factor(merged_df2$lineage_labels, c("L1"
+                                                                   , "L2"
+                                                                   , "L3"
+                                                                   , "L4"
+                                                                   , "L5"
+                                                                   , "L6"
+                                                                   , "L7"
+                                                                   , "LBOV"
+                                                                   , "L1;L2"
+                                                                   , "L1;L3"
+                                                                   , "L1;L4"
+                                                                   , "L2;L3"
+                                                                   , "L2;L3;L4"
+                                                                   , "L2;L4"  
+                                                                   , "L2;L6"
+                                                                   , "L2;LBOV"  
+                                                                   , "L3;L4" 
+                                                                   , "L4;L6"
+                                                                   , "L4;L7"
+                                                                   , ""))
+  
  
  #=================================================================
  # Merge 2: merged_df3
--- a/scripts/functions/lf_bp.R
+++ b/scripts/functions/lf_bp.R
@ -0,0 +1,197 @@
+#############################
+# Barplots: ggplot
+# stats +/-
+# violin +/-
+# barplot +/
+# beeswarm
+#############################
+
+lf_bp <- function(lf_df
+                  , p_title = ""
+                  , colour_categ = ""
+                  , x_grp = "mutation_info"
+                  , y_var = "param_value"
+                  , facet_var = "param_type"
+                  , n_facet_row = 1
+                  , y_scales = "free_y"
+                  , colour_bp_strip = "khaki2"
+                  , dot_size = 3
+                  , dot_transparency = 0.3
+                  , violin_quantiles = c(0.25, 0.5, 0.75) # can be NULL
+                  , my_ats = 22 # axis text size
+                  , my_als = 20 # axis label size
+                  , my_fls = 20 # facet label size
+                  , my_pts = 22 # plot title size)
+                  , make_boxplot = FALSE
+                  , bp_width = c("auto", 0.5)
+                  , add_stats = TRUE
+                  , stat_grp_comp = c("DM", "OM")
+                  , stat_method = "wilcox.test"
+                  , my_paired = FALSE
+                  , stat_label = c("p.format", "p.signif") ){
+  
+  fwv = as.formula(paste0("~", facet_var))
+  #fwv = reformulate(facet_var)
+  
+  p1 <- ggplot(lf_df, aes_string(x = x_grp, y = y_var))  + 
+    
+    facet_wrap( fwv
+               , nrow = n_facet_row
+               , scales = y_scales) +
+    
+    geom_violin(trim = T
+                , scale = "width"
+                #, position = position_dodge(width = 0.9)
+                , draw_quantiles = violin_quantiles)
+    
+    if (make_boxplot){
+      
+      if (bp_width == "auto"){
+        bp_width = 0.5/length(unique(lf_df[[x_grp]]))
+        cat("\nAutomatically calculated boxplot width, using bp_width:\n", bp_width, "\n")
+      }else{
+        cat("\nBoxplot width value provided, using:",  bp_width, "\n")
+        bp_width = bp_width} 
+    
+    p2 = p1 + geom_boxplot(fill = "white"
+                  , outlier.colour = NA
+                  #, position = position_dodge(width = 0.9)
+                  , width = bp_width) + 
+      geom_beeswarm(priority = "density"
+                    #, shape = 21
+                    , size = dot_size
+                    , alpha = dot_transparency
+                    , show.legend = FALSE
+                    , cex = 0.8
+                    , aes(colour = factor(eval(parse(text = colour_categ))) ))  
+      
+    } else {
+    # ggbeeswarm (better than geom_point)
+    p2 = p1 +  geom_beeswarm(priority = "density"
+                    #, shape = 21
+                    , size = dot_size
+                    , alpha = dot_transparency
+                    , show.legend = FALSE
+                    , cex = 0.8
+                    , aes(colour = factor(eval(parse(text = colour_categ))) )) 
+    }
+
+   # Add foramtting to graph
+   OutPlot = p2 + theme(axis.text.x = element_text(size = my_ats)
+                   , axis.text.y = element_text(size = my_ats
+                                       , angle = 0
+                                       , hjust = 1
+                                       , vjust = 0)
+                   , axis.title.x = element_text(size = my_ats)
+                   , axis.title.y = element_text(size = my_ats)
+                   , plot.title = element_text(size = my_pts
+                                      , hjust = 0.5
+                                      , colour = "black"
+                                      , face = "bold")
+                   , strip.background = element_rect(fill = colour_bp_strip)
+                   , strip.text.x = element_text(size = my_fls
+                                                  , colour = "black")
+                   , legend.title = element_text(color = "black"
+                                                  , size = my_als)
+                   , legend.text = element_text(size = my_ats)
+                   , legend.direction = "vertical") +
+    
+    labs(title = p_title
+         , x = ""
+         , y = "") 
+   
+    if (add_stats){
+      my_comparisonsL <- list( stat_grp_comp )
+      
+    OutPlot = OutPlot + stat_compare_means(comparisons = my_comparisonsL
+                       , method = stat_method
+                       , paired = my_paired
+                       , label = stat_label[2])
+    return(OutPlot)
+    }
+   
+   return(OutPlot)
+}
+
+#############################
+# Barplot NO stats: plotly
+# violin +/-
+# barplot +/
+# beeswarm
+
+# TODO: plot_ly()
+#############################
+lf_bp_plotly <- function(lf_df
+                         , p_title = ""
+                         , colour_categ = ""
+                         , x_grp = mutation_info
+                         , y_var = param_value
+                         , facet_var = param_type
+                         , n_facet_row = 1
+                         , y_scales = "free_y"
+                         , colour_bp_strip = "khaki2"
+                         , dot_size = 3
+                         , dot_transparency = 0.3
+                         , violin_quantiles = c(0.25, 0.5, 0.75) # can be NULL
+                         , my_ats = 20 # axis text size
+                         , my_als = 18 # axis label size
+                         , my_fls = 18 # facet label size
+                         , my_pts = 22 # plot title size)
+                         #, make_boxplot = FALSE
+                         , bp_width = c("auto", 0.5)
+                         #, add_stats = FALSE
+                         #, stat_grp_comp = c("DM", "OM")
+                         #, stat_method = "wilcox.test"
+                         #, my_paired = FALSE
+                         #, stat_label = c("p.format", "p.signif") 
+                         ){
+  
+  OutPlotly = ggplot(lf_df, aes(x = eval(parse(text = x_grp))
+                              , y = eval(parse(text = y_var))
+                              , label1 = x_grp
+                              , label2 = y_var
+                              , lable3 = colour_categ) )  +
+
+      facet_wrap(~ eval(parse(text = facet_var))
+               , nrow = n_facet_row
+               , scales = y_scales) +
+    
+    geom_violin(trim = T
+                , scale = "width"
+                , draw_quantiles = violin_quantiles) +
+    
+   geom_beeswarm(priority = "density"
+                  , size = dot_size
+                  , alpha = dot_transparency
+                  , show.legend = FALSE
+                  , cex = 0.8
+                  , aes(colour = factor(eval(parse(text = colour_categ) ) ) ) ) +
+    theme(axis.text.x = element_text(size = my_ats)
+          , axis.text.y = element_text(size = my_ats
+                                       , angle = 0
+                                       , hjust = 1
+                                       , vjust = 0)
+          , axis.title.x = element_text(size = my_ats)
+          , axis.title.y = element_text(size = my_ats)
+          , plot.title = element_text(size = my_pts
+                             , hjust = 0.5
+                             , colour = "black"
+                             , face = "bold")
+          , strip.background = element_rect(fill = colour_bp_strip)
+          , strip.text.x = element_text(size = my_fls
+                                         , colour = "black")
+          , legend.title = element_text(color = "black"
+                                         , size = my_als)
+          , legend.text = element_text(size = my_ats)
+          , legend.position = "none")+
+    
+    labs(title = p_title
+         , x = ""
+         , y = "") 
+  
+  OutPlotly = ggplotly(OutPlotly
+                       #, tooltip = c("label")
+                       )
+  return(OutPlotly)
+  
+}
--- a/scripts/functions/lf_unpaired_stats.R
+++ b/scripts/functions/lf_unpaired_stats.R
@ -0,0 +1,21 @@
+library(ggpubr)
+###################################################################
+
+lf_unpaired_stats <- function(lf_data
+                              , lf_stat_value = "param_value"
+                              , lf_stat_group = "mutation_info"
+                              , lf_col_statvars = "param_type"
+                              , my_paired = FALSE
+                              , stat_adj = "none"){
+  
+  stat_formula = as.formula(paste0(lf_stat_value, "~", lf_stat_group))
+  
+  my_stat_df = compare_means(stat_formula
+                             , group.by = lf_col_statvars
+                             , data = lf_data
+                             , paired = my_paired
+                             , p.adjust.method =  stat_adj)  
+  
+  
+  return(my_stat_df)
+}
--- a/scripts/functions/lineage_dist.R
+++ b/scripts/functions/lineage_dist.R
@ -0,0 +1,69 @@
+###############################
+# TASK: function to plot lineage
+# dist plots with or without facet
+# think about color palette
+# for stability
+##############################
+
+#n_colours = length(unique(lin_dist_plot$duet_scaled))
+#my_palette <- colorRampPalette(c(mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2))(n = n_colours+1)
+
+
+lineage_distP <- function(plotdf
+                          , x_axis = "duet_scaled"
+                          , y_axis = "lineage_labels"
+                          , x_lab = "DUET"
+                          , with_facet = F
+                          , facet_wrap_var = ""
+                          , fill_categ = "mutation_info_labels"
+                          , fill_categ_cols = c("#E69F00", "#999999")
+                          , my_ats = 15 # axis text size
+                          , my_als = 20 # axis label size
+                          , my_leg_ts = 16
+                          , my_leg_title = 16
+                          , my_strip_ts = 20
+                          , leg_pos = c(0.8, 0.9)
+                          , leg_pos_wf = c("top", "left", "bottom", "right")
+                          , leg_dir_wf = c("horizontal", "vertical")
+                          , leg_label = "")
+
+{
+
+LinDistP = ggplot(plotdf, aes_string(x = x_axis
+                              , y = y_axis))+
+
+  geom_density_ridges(aes_string(fill = fill_categ)
+                      , scale = 3
+                      , size = 0.3
+                      , alpha = 0.8) +
+  scale_x_continuous(expand = c(0.01, 0.01)) +
+  #coord_cartesian( xlim = c(-1, 1)) +
+  scale_fill_manual(values = fill_categ_cols) +
+  theme(axis.text.x = element_text(size = my_ats
+                                   , angle = 90
+                                   , hjust = 1
+                                   , vjust = 0.4)
+        , axis.text.y = element_text(size = my_ats)
+        , axis.title.x = element_text(size = my_ats)
+        , axis.title.y = element_blank()
+        , strip.text = element_text(size = my_strip_ts)
+        , legend.text = element_text(size = my_leg_ts)
+        , legend.title = element_text(size = my_leg_title)
+        , legend.position = c(0.8, 0.9)) +
+  labs(x = x_lab
+       , fill = leg_label)
+
+if (with_facet){
+
+  # used reformulate or make as formula
+  #fwv = reformulate(facet_wrap_var)
+  fwv = as.formula(paste0("~", facet_wrap_var))
+  
+  LinDistP = LinDistP + 
+    facet_wrap(fwv) + 
+    theme(legend.position = leg_pos_wf
+          , legend.direction = leg_dir_wf)
+}
+
+return(LinDistP)
+}
--- a/scripts/functions/my_pairs_panel.R
+++ b/scripts/functions/my_pairs_panel.R
@ -1,24 +1,40 @@
-my_corr_pairs <- function (corr_data){
+my_corr_pairs <- function (corr_data_all
+                           , corr_cols = colnames(corr_data_all)
+                           , corr_method = "spearman" # other options: "pearson" or "kendall"
+                           , colour_categ_col = "mutation_info_labels"
+                           , categ_colour =  c("#E69F00", "#999999")
+                           , density_show = F
+                           , hist_col = "coral4"
+                           , dot_size = 1.6
+                           , ats = 1.5
+                           , corr_lab_size = 3
+                           , corr_value_size = 1) 
+  {
  
-  OutPlot_corr = pairs.panels(corr_data
-                              , method = "spearman" # correlation method
-                              , hist.col = "grey" ##00AFBB
-                              , density = TRUE  # show density plots
-                              , ellipses = F # show correlation ellipses
+  corr_data_df =  corr_data_all[corr_cols]
+  my_bg = categ_colour[corr_data_all[[colour_categ_col]] ]
+
+  OutPlot_corr = pairs.panels(corr_data_df
+                              , method = corr_method
+                              , hist.col = hist_col
+                              , density = density_show  
+                              , ellipses = F 
+                              , smooth = F
                              , stars = T
                              , rug = F
                              , breaks = "Sturges"
                              , show.points = T
-                              #, bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_ps$duet_outcome))] # foldx colours are reveresed
-                              #, pch = 21 # for bg
-                              , jitter = T
+                              #, bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_data$duet_outcome))] # foldx colours are reveresed
+                              , bg = my_bg
+                              , pch = 21 
                              , alpha = 1
-                              , cex = 1.8
-                              , cex.axis = 2
-                              , cex.labels = 3.5
-                              , cex.cor = 1
-                              , smooth = F)
+                              , cex = dot_size
+                              , cex.axis = ats
+                              , cex.labels = corr_lab_size
+                              , cex.cor = corr_value_size
+                              )
  return(OutPlot_corr)
+  #return (my_bg)
  
 }

--- a/scripts/functions/plotting_data.R
+++ b/scripts/functions/plotting_data.R
@ -16,7 +16,9 @@ library(dplyr)
  ## my_df_u_lig
  ## dup_muts
 #========================================================
-plotting_data <- function(df, lig_dist_colname = 'ligand_distance', lig_dist_cutoff = 10) {
+plotting_data <- function(df
+                          , lig_dist_colname = 'ligand_distance'
+                          , lig_dist_cutoff = 10) {
 my_df       = data.frame()
 my_df_u     = data.frame()
 my_df_u_lig = data.frame()
@ -29,61 +31,6 @@ dup_muts    = data.frame()

 cat("\nInput dimensions:", dim(df)) 

-#==================================
-# add foldx outcome category
-# and foldx scaled values 
-
-# This will enable to always have these variables available
-# when calling for plots
-#==================================
-
-#------------------------------
-# adding foldx scaled values
-# scale data b/w -1 and 1
-#------------------------------
-n = which(colnames(df) == "ddg"); n 
-
-my_min = min(df[,n]); my_min 
-my_max = max(df[,n]); my_max 
-
-df$foldx_scaled = ifelse(df[,n] < 0
-                         , df[,n]/abs(my_min)
-                         , df[,n]/my_max) 
-# sanity check
-my_min = min(df$foldx_scaled); my_min 
-my_max = max(df$foldx_scaled); my_max
-
-if (my_min == -1 && my_max == 1){
-  cat("\nPASS: foldx ddg successfully scaled b/w -1 and 1"
-      , "\nProceeding with assigning foldx outcome category")
-}else{
-  cat("\nFAIL: could not scale foldx ddg values"
-      , "Aborting!\n")
-}
-
-#------------------------------
-# adding foldx outcome category
-# ddg<0 = "Stabilising" (-ve)
-#------------------------------
-c1 = table(df$ddg < 0)
-df$foldx_outcome = ifelse(df$ddg < 0, "Stabilising", "Destabilising")
-c2 = table(df$ddg < 0)
-
-if ( all(c1 == c2) ){
-  cat("\nPASS: foldx outcome successfully created")
-}else{
-  cat("\nFAIL: foldx outcome could not be created. Aborting!\n")
-  exit()
-}
-
-#------------------------------
-# renaming foldx column from 
-# "ddg" --> "ddg_foldx"
-#------------------------------
-
-# change name to foldx
-colnames(df)[n] <- "ddg_foldx"
-
 #==================================
 # extract unique mutation entries
 #==================================
--- a/scripts/functions/plotting_globals.R
+++ b/scripts/functions/plotting_globals.R
@ -32,7 +32,8 @@ import_dirs <- function(drug_name, gene_name) {
 #===============================
 # mcsm ligand distance cut off
 #===============================
-#mcsm_lig_cutoff <<- 10
+LigDist_colname <<- "ligand_distance" 
+LigDist_cutoff <<- 10

 #==================
 # Angstroms symbol
--- a/scripts/functions/position_count_bp.R
+++ b/scripts/functions/position_count_bp.R
@ -42,7 +42,9 @@ site_snp_count_bp <- function (plotdf
             , "\nNo. of cols:", ncol(plotdf)
             , "\nNow adding column: frequency of mutational positions"))
  
-  # adding snpcount for each position 
+  #-------------------------------------------
+  # adding column: snpcount for each position 
+  #-------------------------------------------
  setDT(plotdf)[, pos_count := .N, by = .(eval(parse(text = df_colname)))] 

  cat("\nCumulative nssnp count\n"
@ -64,15 +66,20 @@ site_snp_count_bp <- function (plotdf
  cat(paste0("\nrevised df dimensions:"
             , "\nNo. of rows:", nrow(plotdf)
             , "\nNo. of cols:", ncol(plotdf)))
-  
+
+  #------------------------------------------------------
+  # creating df: average count of snpcount for each position 
+  # created in earlier step
+  #-------------------------------------------------------
  # use group by on pos_count
  snpsBYpos_df <- plotdf %>%
-    group_by(eval(parse(text = df_colname))) %>%
-    summarize(snpsBYpos = mean(pos_count))
-  
-  cat("\nnssnp count\n"
-      , table(snpsBYpos_df$snpsBYpos))
+    dplyr::group_by(eval(parse(text = df_colname))) %>%
+    dplyr::summarise(snpsBYpos = mean(pos_count)) # changed from summarize!
  
+  cat("\nnssnp count per position\n"
+      , table(snpsBYpos_df$snpsBYpos)
+      , "\n")
+
  # calculating total no. of sites associated with nsSNPs
  tot_sites = sum(table(snpsBYpos_df$snpsBYpos))
  
--- a/scripts/functions/redundant/bp_subcolours_v2.R
+++ b/scripts/functions/redundant/bp_subcolours_v2.R
@ -0,0 +1,104 @@
+#########################################################
+# 1b: Define function: coloured barplot by subgroup
+# LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar
+#########################################################
+
+ColourPalleteMulti = function(df, group, subgroup){
+  
+  # Find how many colour categories to create and the number of colours in each
+  categories <- aggregate(as.formula(paste(subgroup, group, sep="~" ))
+                          , df
+                          , function(x) length(unique(x)))
+  #  return(categories) }
+  
+  category.start <- (scales::hue_pal(l = 100)(nrow(categories))) # Set the top of the colour pallete
+  
+  category.end  <- (scales::hue_pal(l = 40)(nrow(categories))) # set the bottom
+  
+  #return(category.start); return(category.end)}
+  
+  # Build Colour pallette
+  colours <- unlist(lapply(1:nrow(categories),
+                           function(i){
+                             colorRampPalette(colors = c(category.start[i]
+                                                         , category.end[i]))(categories[i,2])}))
+  return(colours)
+}
+#########################################################################
+
+bp_stability_hmap <- function(plotdf = merged_df3
+                              , xvar_colname = "position"
+                              #, bar_col_colname = "group"
+                              , stability_colname = "duet_scaled"
+                              , stability_outcome_colname = "duet_outcome"
+                              , p_title = ""  # "Protein stability (DUET)"
+                              , my_xaxls = 12 # x-axis label size
+                              , my_yaxls = 20 # y-axis label size
+                              , my_xaxts = 18 # x-axis text size
+                              , my_yaxts = 20 # y-axis text size
+                              , my_pts  = 20  # plot-title size
+                              , my_xlab = "Position"
+                              , my_ylab = "No. of nsSNPs"
+                              )
+{
+
+  # order the df by position and ensure it is a factor
+  plotdf = plotdf[order(plotdf[[xvar_colname]]), ]
+  plotdf[[xvar_colname]] = factor(plotdf[[xvar_colname]])
+  
+  #cat("\nSneak peak:\n")
+  head(data.frame( plotdf[[xvar_colname]], plotdf[[stability_colname]] ) )
+
+  # stability values isolated to help with generating column called: 'group'
+  my_grp = plotdf[[stability_colname]]
+  cat( "\nLength of nsSNPs:", length(my_grp)
+       , "\nLength of unique values for nsSNPs:", length(unique(my_grp)) )
+  
+  # Add col: 'group'
+  plotdf$group = paste0(plotdf[[stability_outcome_colname]], "_", my_grp, sep = "")
+
+  # check unique values in normalised data
+  cat("\nNo. of unique values in", stability_colname, "no rounding:"
+      , length(unique(plotdf[[stability_colname]])))
+  
+  # Call the function to create the palette based on the group defined above
+  #subcols_ps
+  subcols_bp_hmap = ColourPalleteMulti(plotdf, stability_outcome_colname, stability_colname)
+  
+  cat("\nNo. of sub colours generated:", length(subcols_bp_hmap))
+  
+  #-------------------------------
+  # Generate the subcols barplot
+  #-------------------------------
+  
+  #g = ggplot(plotdf, aes(x = factor(position, ordered = T)))
+  g = ggplot(plotdf, aes_string(x = xvar_colname
+                               # , ordered = T)
+             ))
+  
+
+  OutWidePlot = g + geom_bar(aes(fill = group)
+                             , colour = "grey") +
+    
+  scale_fill_manual( values = subcols_bp_hmap
+                       , guide = "none") +
+    
+    theme( axis.text.x = element_text(size = my_xaxls
+                                      , angle = 90
+                                      , hjust = 1
+                                      , vjust = 0.4)
+           , axis.text.y = element_text(size = my_yaxls
+                                        , angle = 0
+                                        , hjust = 1
+                                        , vjust = 0)
+           , axis.title.x = element_text(size = my_xaxts)
+           , axis.title.y = element_text(size = my_yaxts ) 
+           , plot.title = element_text(size = my_pts
+                                       , hjust = 0.5)) +
+    
+    labs(title = p_title
+         , x = my_xlab
+         , y = my_ylab)
+
+  return(OutWidePlot)
+}
--- a/scripts/functions/redundant/lf_bp_with_stats.R
+++ b/scripts/functions/redundant/lf_bp_with_stats.R
@ -0,0 +1,97 @@
+library(ggpubr)
+###################################################################
+
+####################################
+lf_bp_with_stats <- function(lf_df
+                        , x_grp = "mutation_info"
+                        , y_var = "param_value"
+                        , facet_var = "param_type"
+                        , n_facet_row = 1
+                        , y_scales = "free_y"
+                        , p_title = ""
+                        , colour_categ = ""
+                        , colour_bp_strip = "khaki2"
+                        , stat_grp_comp = c("DM", "OM")
+                        , stat_method = "wilcox.test"
+                        , my_paired = FALSE
+                        , bp_width = c("auto", 0.5)
+                        , dot_size = 3
+                        , dot_transparency = 0.3
+                        , stat_label = c("p.format", "p.signif")
+                        , my_ats = 22 # axis text size
+                        , my_als = 20 # axis label size
+                        , my_fls = 20 # facet label size
+                        , my_pts = 22 # plot title size
+) {
+  if (bp_width == "auto"){
+  bp_width = 0.5/length(unique(lf_df[[x_grp]]))
+  cat("\nAutomatically calculated boxplot width, using bp_width:\n", bp_width, "\n")
+  }else{
+  cat("\nBoxplot width value provided, using:",  bp_width, "\n")
+  bp_width = bp_width
+  }
+  
+  my_comparisonsL <- list( stat_grp_comp )
+
+  bp_statP <- ggplot(lf_df, aes(x = eval(parse(text = x_grp))
+                    , y = eval(parse(text = y_var)) ))  + 
+    
+    facet_wrap(~ eval(parse(text = facet_var))
+               , nrow = n_facet_row
+               , scales = y_scales) +
+    
+    geom_violin(trim = T
+                , scale = "width"
+                #, position = position_dodge(width = 0.9)
+                , draw_quantiles = c(0.25, 0.5, 0.75)) + 
+    
+    # geom_boxplot(fill = "white"
+    #              , outlier.colour = NA
+    #              #, position = position_dodge(width = 0.9)
+    #              , width = bp_width) +
+
+    # geom_point(position = position_jitterdodge(dodge.width = 0.5)
+    #            , alpha = 0.5
+    #            , show.legend = FALSE
+    #            , aes(colour = factor(eval(parse(text = colour_categ))) )) +
+
+    # ggbeeswarm (better than geom_point)
+      geom_beeswarm(priority = "density"
+                    #, shape = 21
+                    , size = dot_size
+                    , alpha = dot_transparency
+                    , show.legend = FALSE
+                    , cex = 0.8
+                    , aes(colour = factor(eval(parse(text = colour_categ))) )) +
+
+    theme(axis.text.x = element_text(size = my_ats)
+          , axis.text.y = element_text(size = my_ats
+                                       , angle = 0
+                                       , hjust = 1
+                                       , vjust = 0)
+          , axis.title.x = element_text(size = my_ats)
+          , axis.title.y = element_text(size = my_ats)
+          , plot.title = element_text(size = my_pts
+                                      , hjust = 0.5
+                                      , colour = "black"
+                                      , face = "bold")
+          , strip.background = element_rect(fill = colour_bp_strip)
+          , strip.text.x = element_text(size = my_fls
+                                        , colour = "black")
+          , legend.title = element_text(color = "black"
+                                        , size = my_als)
+          , legend.text = element_text(size = my_ats)
+          , legend.direction = "vertical") +
+    
+    labs(title = p_title
+         , x = ""
+         , y = "")+ 
+  
+    stat_compare_means(comparisons = my_comparisonsL
+                       , method = stat_method
+                       , paired = my_paired
+                       , label = stat_label[1])
+  
+  return(bp_statP)
+
+}
--- a/scripts/functions/redundant/test_lf_bp_with_stats.R
+++ b/scripts/functions/redundant/test_lf_bp_with_stats.R
@ -0,0 +1,83 @@
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+
+source("../functions/lf_bp_with_stats.R")
+source("../functions/lf_bp.R")
+
+######################
+# Make  plot
+######################
+# Note: Data
+# run other_plots_data.R
+# to get the long format data to test this function 
+
+lf_bp(lf_df = lf_dynamut2
+                       , p_title = "Dynamut2"
+                       , colour_categ = "ddg_dynamut2_outcome"
+                       , x_grp = "mutation_info"
+                       , y_var = "param_value"
+                       , facet_var = "param_type"
+                       , n_facet_row = 1
+                       , y_scales = "free_y"
+                       , colour_bp_strip = "khaki2"
+                       , dot_size = 3
+                       , dot_transparency = 0.3
+                       , violin_quantiles = c(0.25, 0.5, 0.75)
+                       , my_ats = 22 # axis text size
+                       , my_als = 20 # axis label size
+                       , my_fls = 20 # facet label size
+                       , my_pts = 22 # plot title size 
+                       , make_boxplot = F
+                       , bp_width = "auto"
+                       , add_stats = T
+                       , stat_grp_comp = c("DM", "OM")
+                       , stat_method = "wilcox.test"
+                       , my_paired = FALSE
+                       , stat_label = c("p.format", "p.signif") )
+ 
+# foo = lf_dynamut2 %>%
+#   group_by(mutation_info, param_type) %>%
+#   summarise( Mean = mean(param_value, na.rm = T)
+#              , SD = sd(param_value, na.rm = T)
+#              , Median = median(param_value, na.rm = T)
+#              , IQR = IQR(param_value, na.rm = T) )
+
+# Quick tests
+plotdata_sel = subset(lf_dynamut2
+             , lf_dynamut2$param_type == "ASA")
+
+plot_sum = plotdata_sel %>%
+  group_by(mutation_info, param_type) %>%
+  summarise(n = n()
+             , Mean = mean(param_value, na.rm = T)
+             , SD = sd(param_value, na.rm = T)
+             , Min = min(param_value, na.rm = T)
+             , Q1 = quantile(param_value, na.rm = T, 0.25)
+             , Median = median(param_value, na.rm = T)
+             , Q3 = quantile(param_value, na.rm = T, 0.75)
+             , Max = max(param_value, na.rm = T) ) %>%
+  rename('Mutation Class' = mutation_info
+         , Parameter = param_type)
+plot_sum = as.data.frame(plot_sum, row.names = NULL)
+plot_sum
+
+bar = compare_means(param_value ~ mutation_info
+              , group.by = "param_type"
+              , data = plotdata_sel
+              , paired = FALSE
+              , p.adjust.method = "BH")
+bar2 = bar[c("param_type"
+              , "group1"
+              , "group2"
+              , "p.format"
+              , "p.signif"
+              , "p.adj")] %>%
+  rename(Parameter = param_type
+          , Group1 = group1
+          , Group2 = group2
+          , "P-value" = p.format
+          , "P-sig" = p.signif
+          , "P-adj" = p.adj)
+bar2 = data.frame(bar2); bar2
+
+library(Hmisc)
+describe(lf_dynamut2)
--- a/scripts/functions/stability_count_bp.R
+++ b/scripts/functions/stability_count_bp.R
@ -15,39 +15,42 @@ theme_set(theme_grey())
  ## ...opt args
 #==========================================================
 stability_count_bp <- function(plotdf
-         , df_colname
-         , leg_title = "Legend title"
-         , axis_text_size = 25
-         , axis_label_size = 22
-         , leg_text_size = 20
-         , leg_title_size = 22
+         , df_colname = ""
+         , leg_title = "Legend Title"
+         , ats = 25     # axis text size
+         , als = 22     # axis label size
+         , lts = 20     # legend text size
+         , ltis = 22    # label title size
+         , geom_ls = 10 # geom_label size
         , yaxis_title = "Number of nsSNPs"
         , bp_plot_title = ""
         , label_categories = c("Destabilising", "Stabilising")
         , title_colour = "chocolate4"
         , subtitle_text = NULL
-         , subtitle_size = 20
+         , sts = 20
         , subtitle_colour = "pink"
         #, leg_position = c(0.73,0.8) # within plot area
         , leg_position = "top"){
 
-  OutPlot_count = ggplot(plotdf, aes(x = eval(parse(text = df_colname)))) + 
+#  OutPlot_count = ggplot(plotdf, aes(x = eval(parse(text = df_colname)))) +
+  OutPlot_count = ggplot(plotdf, aes_string(x = df_colname)) +     
    geom_bar(aes(fill = eval(parse(text = df_colname))), show.legend = TRUE) +
    geom_label(stat = "count"
               , aes(label = ..count..)
               , color = "black"
               , show.legend = FALSE
-               , size = 10) +
+               , size = geom_ls) +
    theme(axis.text.x = element_blank()
          , axis.title.x = element_blank()
-          , axis.title.y = element_text(size =  axis_label_size)
-          , axis.text.y = element_text(size = axis_text_size)
+          , axis.title.y = element_text(size =  als)
+          , axis.text.y = element_text(size = ats)
          , legend.position = leg_position
-          , legend.text = element_text(size = leg_text_size)
-          , legend.title = element_text(size =  leg_title_size)
-          , plot.title = element_text(size =  axis_label_size
-                                      , colour = title_colour)
-          , plot.subtitle = element_text(size = subtitle_size
+          , legend.text = element_text(size = lts)
+          , legend.title = element_text(size =  ltis)
+          , plot.title = element_text(size =  als
+                                      , colour = title_colour
+                                      , hjust = 0.5)
+          , plot.subtitle = element_text(size = sts
                                         , hjust = 0.5
                                         , colour = subtitle_colour)) + 
    labs(title      = bp_plot_title
--- a/scripts/functions/tests/test_aa_prop_bp.R
+++ b/scripts/functions/tests/test_aa_prop_bp.R
--- a/scripts/functions/tests/test_af_or_calcs.R
+++ b/scripts/functions/tests/test_af_or_calcs.R
--- a/scripts/functions/tests/test_bp.R
+++ b/scripts/functions/tests/test_bp.R
--- a/scripts/functions/tests/test_bp_lineage.R
+++ b/scripts/functions/tests/test_bp_lineage.R
@ -0,0 +1,62 @@
+setwd("~/git/LSHTM_analysis/scripts/plotting")
+
+source ('get_plotting_dfs.R')
+source("../functions/bp_lineage.R")
+
+#########################################
+# Lineage and SNP count: lineage lf data
+#########################################
+#=========================
+# Data: All lineages or
+# selected few
+#=========================
+sel_lineages = levels(lin_lf$sel_lineages_f)
+sel_lineages
+lin_lf_plot = lin_lf[lin_lf$sel_lineages_f%in%sel_lineages,]
+
+# drop unused factor levels
+lin_lf_plot$sel_lineages_f = factor(lin_lf_plot$sel_lineages_f)
+levels(lin_lf_plot$sel_lineages_f)
+#=========================
+# Lineage count plot
+#=========================
+lin_count_bp(lin_lf_plot
+             , x_categ = "sel_lineages_f"
+             , y_count = "p_count"
+             , bar_fill_categ = "count_categ"
+             , display_label_col = "p_count"
+             , bar_stat_stype = "identity"
+             , x_lab_angle = 90
+             , my_xats = 20
+             , bar_col_labels = c("Mutations", "Total Samples")
+             , bar_col_values = c("grey50", "gray75")
+             , y_scale_percent = F # T for diversity
+             , y_log10 = F
+             , y_label = "Count")
+
+###############################################
+# Lineage SNP diversity count: lineage wf data
+###############################################
+#=========================
+# Data: All lineages or
+# selected few
+#=========================
+sel_lineages = levels(lin_wf$sel_lineages_f)
+sel_lineages
+lin_wf_plot = lin_wf[lin_wf$sel_lineages_f%in%sel_lineages,]
+
+# drop unused factor levels
+lin_wf_plot$sel_lineages_f = factor(lin_wf_plot$sel_lineages_f)
+levels(lin_wf_plot$sel_lineages_f)
+#=========================
+# Lineage Diversity plot
+#=========================
+lin_count_bp(lin_wf_plot
+                 , x_categ = "sel_lineages_f"
+                 , y_count = "snp_diversity"
+                 , display_label_col = "snp_diversity_f"
+                 , bar_stat_stype = "identity"
+                 , x_lab_angle = 90
+                 , my_xats = 20
+                 , y_scale_percent = T
+                 , y_label = "SNP diversity")
--- a/scripts/functions/tests/test_bp_subcolours.R
+++ b/scripts/functions/tests/test_bp_subcolours.R
@ -0,0 +1,78 @@
+#!/usr/bin/env Rscript      
+#source("~/git/Misc/shiny/myshiny/gid_data.R")
+
+source("~/git/LSHTM_analysis/config/gid.R")
+source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
+source("~/git/LSHTM_analysis/scripts/functions/bp_subcolours.R")
+
+# p1
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "duet_scaled"
+                  , stability_outcome_colname = "duet_outcome"
+                  , p_title = "DUET" )
+
+# p2
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "foldx_scaled"
+                  , stability_outcome_colname = "foldx_outcome"
+                  , p_title = "FoldX" )
+
+# p3
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "deepddg_scaled"
+                  , stability_outcome_colname = "deepddg_outcome"
+                  , p_title = "DeepDDG" )
+
+# p4
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "ddg_dynamut2_scaled"
+                  , stability_outcome_colname = "ddg_dynamut2_outcome"
+                  , p_title = "Dynamut2" )
+
+# p5
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "mcsm_na_scaled"
+                  , stability_outcome_colname = "mcsm_na_outcome"
+                  , p_title = "mCSM-NA" )
+
+# p6
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "ddg_dynamut_scaled"
+                  , stability_outcome_colname = "ddg_dynamut_outcome"
+                  , p_title = "Dynamut" )
+
+# p7
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "ddg_mcsm_scaled"
+                  , stability_outcome_colname = "ddg_mcsm_outcome"
+                  , p_title = "mCSM" )
+
+# p8
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "ddg_duet_scaled"
+                  , stability_outcome_colname = "ddg_duet_outcome"
+                  , p_title = "DUET-d" )
+
+# p9
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "ddg_sdm_scaled"
+                  , stability_outcome_colname = "ddg_sdm_outcome"
+                  , p_title = "SDM" )
+
+# p10
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "ddg_encom_scaled"
+                  , stability_outcome_colname = "ddg_encom_outcome"
+                  , p_title = "ENCoM-Stability" )
+
+# p11
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "dds_encom_scaled"
+                  , stability_outcome_colname = "dds_encom_outcome"
+                  , p_title = "ENCoM-Flexibility" )
+
+# p12
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "affinity_scaled"
+                  , stability_outcome_colname = "ligand_outcome"
+                  , p_title = "mCSM-lig" )
--- a/scripts/functions/tests/test_bp_subcolours_i.R
+++ b/scripts/functions/tests/test_bp_subcolours_i.R
@ -0,0 +1,59 @@
+#!/usr/bin/env Rscript      
+#source("~/git/Misc/shiny/myshiny/gid_data.R")
+
+source("~/git/LSHTM_analysis/config/gid.R")
+source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
+source("~/git/LSHTM_analysis/scripts/functions/bp_subcolours.R")
+
+# p1
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "duet_scaled"
+                  , stability_outcome_colname = "duet_outcome"
+                  , p_title = "DUET" )
+
+# p2
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "foldx_scaled"
+                  , stability_outcome_colname = "foldx_outcome"
+                  , p_title = "FoldX" )
+
+# p3
+bp_stability_hmap(plotdf =  merged_df3
+                  , stability_colname = "deepddg_scaled"
+                  , stability_outcome_colname = "deepddg_outcome"
+                  , p_title = "DeepDDG" )
+
+##################################################
+
+merged_df3_f = merged_df3
+
+setDT(merged_df3_f)[, pos_count := .N, by = position] 
+
+##################################################
+ui <- basicPage(
+  plotOutput("plot1", click = "plot_click"),
+  verbatimTextOutput("info")
+)
+
+server <- function(input, output) {
+  output$plot1 <- renderPlot({
+    
+    #plot(mtcars$wt, mtcars$mpg)
+    bp_stability_hmap(plotdf =  merged_df3_f
+                      , xvar_colname = "position"
+                      , stability_colname = "foldx_scaled"
+                      , stability_outcome_colname = "foldx_outcome"
+                      , p_title = "FoldX" )
+    
+  })
+  
+  output$info <- renderPrint({
+    # With base graphics, need to tell it what the x and y variables are.
+    nearPoints(merged_df3_f, input$plot_click
+                  , xvar = "position"
+                  , yvar = "pos_count"
+                  )
+  })
+}
+
+shinyApp(ui, server)
--- a/scripts/functions/tests/test_combining_dfs_plotting.R
+++ b/scripts/functions/tests/test_combining_dfs_plotting.R
--- a/scripts/functions/tests/test_lf_bp.R
+++ b/scripts/functions/tests/test_lf_bp.R
@ -0,0 +1,58 @@
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+source("Header_TT.R")
+source("../functions/lf_bp.R")
+# ================================================
+# Data: run get_plotting_data.R
+# to get the long format data to test this function 
+# drug = "streptomycin" 
+# gene = "gid"
+# source("get_plotting_dfs.R")
+# ==================================================
+
+######################
+# Make plot: ggplot
+######################
+lf_bp(lf_df = lf_encomddg
+      , p_title = "ENCoM-DDG"
+      , colour_categ = "ddg_encom_outcome"
+      , x_grp = "mutation_info"
+      , y_var = "param_value"
+      , facet_var = "param_type"
+      , n_facet_row = 1
+      , y_scales = "free_y"
+      , colour_bp_strip = "khaki2"
+      , dot_size = 3
+      , dot_transparency = 0.3
+      , violin_quantiles = c(0.25, 0.5, 0.75)
+      , my_ats = 22 # axis text size
+      , my_als = 20 # axis label size
+      , my_fls = 20 # facet label size
+      , my_pts = 22 # plot title size 
+      , make_boxplot = F
+      , bp_width = "auto"
+      , add_stats = T
+      , stat_grp_comp = c("DM", "OM")
+      , stat_method = "wilcox.test"
+      , my_paired = FALSE
+      , stat_label = c("p.format", "p.signif") )
+
+#wilcox.test(wf_encomdds$`EnCOM ΔΔS`[wf_encomdds$mutation_info == "DM"]
+#            , wf_encomdds$`EnCOM ΔΔS`[wf_encomdds$mutation_info == "OM"])
+
+######################
+# Make plot: plotly
+######################
+# FIXME: This labels are not working as I want!
+# lf_bp_plotly(lf_df = lf_deepddg
+#       , p_title = "DeepDDG"
+#       , colour_categ = "deepddg_outcome"
+#       , x_grp = "mutation_info"
+#       , y_var = "param_value"
+#       , facet_var = "param_type"
+#       , n_facet_row = 1
+#       , y_scales = "free_y"
+#       , colour_bp_strip = "khaki2"
+#       , dot_size = 3
+#       , dot_transparency = 0.3
+#       , violin_quantiles = c(0.25, 0.5, 0.75)
+#  )
--- a/scripts/functions/tests/test_lf_unpaired_stats.R
+++ b/scripts/functions/tests/test_lf_unpaired_stats.R
@ -0,0 +1,19 @@
+setwd("~/git/LSHTM_analysis/scripts/functions")
+source("lf_unpaired_stats.R")
+
+#####################
+# call stat function()
+# a useful way to check stats
+# for any lf data
+#####################
+# Note: Data
+# run other_plots_data.R
+# to get the long format data to test this function 
+
+stat_results_df <- lf_unpaired_stats(lf_data =  lf_duet
+                  , lf_stat_value = "param_value"
+                  , lf_stat_group = "mutation_info"
+                  , lf_col_statvars = "param_type"
+                  , my_paired = FALSE
+                  , stat_adj = "none"
+)
--- a/scripts/functions/tests/test_lineage_dist.R
+++ b/scripts/functions/tests/test_lineage_dist.R
@ -0,0 +1,33 @@
+###############################
+# TEST function lineage_dist.R
+# to plot lineage
+# dist plots with or without facet
+##############################
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
+
+source("Header_TT.R")
+
+source("get_plotting_dfs.R")
+
+cat("cols imported:"
+    , mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2)
+
+
+#############################################################
+# without facet
+lineage_distP(lin_dist_plot
+              , with_facet = F
+              , leg_label = "Mutation Class"
+)
+
+# without facet
+lineage_distP(lin_dist_plot
+              , with_facet = T
+              , facet_wrap_var = "mutation_info_labels"
+              , leg_label = "Mutation Class"
+              , leg_pos_wf = "none"
+              , leg_dir_wf = "horizontal"
+              
+)
--- a/scripts/functions/tests/test_plotting_data.R
+++ b/scripts/functions/tests/test_plotting_data.R
--- a/scripts/plotting/Header_TT.R
+++ b/scripts/plotting/Header_TT.R
@ -1,14 +1,11 @@
 #########################################################
-### A) Installing and loading required packages
+# A) Installing and loading required packages
+# B) My functions
+#########################################################
+
 #########################################################
 #lib_loc = "/usr/local/lib/R/site-library")

-#if (!require("gplots")) {
-#  install.packages("gplots", dependencies = TRUE)
-#  library(gplots)
-#}
-require(extrafont)
-
 require("getopt", quietly = TRUE) # cmd parse arguments

 if (!require("tidyverse")) {
@ -16,9 +13,53 @@ if (!require("tidyverse")) {
  library(tidyverse)
 }

-if (!require("ggplot2")) {
-  install.packages("ggplot2", dependencies = TRUE)
-  library(ggplot2)
+if (!require("shiny")) {
+  install.packages("shiny", dependencies = TRUE)
+  library(shiny)
+}
+
+if (!require("shinyBS")) {
+  install.packages("shinyBS", dependencies = TRUE)
+  library(shinyBS)
+}
+
+if (!require("gridExtra")) {
+  install.packages("gridExtra", dependencies = TRUE)
+  library(gridExtra)
+}
+
+if (!require("ggridges")) {
+  install.packages("ggridges", dependencies = TRUE)
+  library(ggridges)
+}
+
+# if (!require("ggplot2")) {
+#   install.packages("ggplot2", dependencies = TRUE)
+#   library(ggplot2)
+# }
+
+# if (!require ("dplyr")){
+#   install.packages("dplyr")
+#   library(dplyr)
+# }
+
+if (!require ("DT")){
+  install.packages("DT")
+  library(DT)
+}
+
+if (!require ("plyr")){
+   install.packages("plyr")
+   library(plyr)
+ }
+
+# Install
+#if(!require(devtools)) install.packages("devtools")
+#devtools::install_github("kassambara/ggcorrplot")
+
+if (!require ("ggbeeswarm")){
+   install.packages("ggbeeswarm")
+   library(ggbeeswarm)
 }

 if (!require("plotly")) {
@ -101,11 +142,6 @@ if (!require ("psych")){
  library(psych)
 }

-if (!require ("dplyr")){
-  install.packages("dplyr")
-  library(dplyr)
-}
-
 if (!require ("compare")){
  install.packages("compare")
  library(compare)
@ -116,6 +152,22 @@ if (!require ("arsenal")){
  library(arsenal)
 }

+if(!require(ggseqlogo)){
+  install.packages("ggseqlogo")
+  library(ggseqlogo)
+}
+
+# for PDB files
+if(!require(bio3d)){
+  install.packages("bio3d")
+  library(bio3d)
+}
+
+library(protr)
+if(!require(protr)){
+  install.packages("protr")
+  library(protr)
+}

 #if (!requireNamespace("BiocManager", quietly = TRUE))
 #  install.packages("BiocManager")
@ -123,24 +175,14 @@ if (!require ("arsenal")){
 #BiocManager::install("Logolas")
 library("Logolas")

-#install.packages("ggseqlogo")
-library(ggseqlogo)

+####################################
+# Load all my functions:
+# only works if tidyverse is loaded
+# hence included it here!
+####################################

-####TIDYVERSE
-# Install
-#if(!require(devtools)) install.packages("devtools")
-#devtools::install_github("kassambara/ggcorrplot")
+func_path = "~/git/LSHTM_analysis/scripts/functions/"
+source_files <- list.files(func_path, "\\.R$")  # locate all .R files
+map(paste0(func_path, source_files), source)  # source all your R scripts!

-library(ggcorrplot)
-
-
-###for PDB files
-#install.packages("bio3d") 
-if(!require(bio3d)){
-  install.packages("bio3d")
-  library(bio3d)
-}
-
-#install.packages("protr")
-library(protr)
--- a/scripts/plotting/barplots_subcolours.R
+++ b/scripts/plotting/barplots_subcolours.R
@ -124,4 +124,4 @@ print(outPlot_bp_lig)
 dev.off()
 ######################################################################=
 #                             End of script
-######################################################################=
+######################################################################=
--- a/scripts/plotting/basic_barplots_combined.R
+++ b/scripts/plotting/basic_barplots_combined.R
@ -23,7 +23,7 @@ plot_basic_bp_combined_labelled  =  paste0(plotdir,"/", basic_bp_combined_labell

 #=======================================================================
 #=======
-# combin DUET and Ligand affinity plots
+# combine DUET and Ligand affinity plots
 #=======
 svg(plot_basic_bp_combined_labelled , width = 12, height = 12 )

--- a/scripts/plotting/corr_adjusted_PS_LIG.R
+++ b/scripts/plotting/corr_adjusted_PS_LIG.R
--- a/scripts/plotting/corr_data.R
+++ b/scripts/plotting/corr_data.R
@ -0,0 +1,120 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: Script to format data for corr plots
+#########################################################
+
+#=================================================
+#         Data for Corrplots
+#=================================================
+cat("\n=========================================="
+    , "\nCORR PLOTS data: ALL params"
+    , "\n=========================================")
+
+# use data
+#merged_df2
+
+#----------------------------
+# columns for corr plots:PS
+#----------------------------
+# NOTE: you can add mcsm_ppi column as well, and it will only select what it can find!
+big_df_colnames = data.frame(names(merged_df2))
+
+corr_cols_select <- c("mutationinformation", drug, "mutation_info_labels"
+                   , "duet_stability_change", "ligand_affinity_change", "ddg_foldx", "asa", "rsa"
+                   , "rd_values", "kd_values", "log10_or_mychisq", "neglog_pval_fisher","af"
+                   , "deepddg", "ddg_dynamut", "ddg_dynamut2", "mcsm_na_affinity"
+                   , "ddg_encom", "dds_encom", "ddg_mcsm", "ddg_sdm", "ddg_duet", "ligand_distance")
+
+#===========================
+# Corr data for plots: PS
+# big_df ps: ~ merged_df2
+#===========================
+
+corr_df_m2 = merged_df2[,colnames(merged_df2)%in%corr_cols_select]
+
+#-----------------------
+# formatting: some cols
+# Add pretty colnames
+#-----------------------
+corr_df_m2_f <- corr_df_m2 %>% 
+  rename(
+      DUET       = duet_stability_change
+    , 'mCSM-lig' = ligand_affinity_change
+    , FoldX      = ddg_foldx
+    , DeepDDG    = deepddg
+    , ASA        = asa
+    , RSA        = rsa
+    , KD         = kd_values
+    , RD         = rd_values
+    , MAF        = af
+    , 'Log (OR)' = log10_or_mychisq
+    , '-Log (P)' = neglog_pval_fisher
+    , Dynamut    = ddg_dynamut        
+    , 'ENCoM-DDG'= ddg_encom
+    , mCSM       = ddg_mcsm
+    , SDM        = ddg_sdm           
+    , 'DUET-d'   = ddg_duet
+    , 'ENCoM-DDS'= dds_encom
+    , Dynamut2   = ddg_dynamut2
+    , 'mCSM-NA'  = mcsm_na_affinity ) 
+
+#===========================
+# Corr data for plots: PS
+# short_df ps: ~merged_df3
+#===========================
+
+corr_df_m3 = corr_df_m2[!duplicated(corr_df_m2$mutationinformation),]
+
+na_or = sum(is.na(corr_df_m3$log10_or_mychisq))
+check1 = nrow(corr_df_m3) - na_or; check1
+
+if (nrow(corr_df_m3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) {
+  cat( "\nPASS: No. of rows for corr_df_m3 match"
+       , "\nPASS: No. of OR values checked: " , check1)
+} else {
+  cat("\nFAIL: Numbers  mismatch:"
+      , "\nExpected nrows: ", nrow(merged_df3)
+      , "\nGot: ", nrow(corr_df_m3)
+      , "\nExpected OR values: ", nrow(merged_df3_comp)
+      , "\nGot: ", check1)
+}
+
+#-----------------------
+# formatting: some cols
+# Add pretty colnames
+#-----------------------
+corr_df_m3_f <- corr_df_m3 %>% 
+  rename(
+      DUET       = duet_stability_change
+    , 'mCSM-lig' = ligand_affinity_change
+    , FoldX      = ddg_foldx
+    , DeepDDG    = deepddg
+    , ASA        = asa
+    , RSA        = rsa
+    , KD         = kd_values
+    , RD         = rd_values
+    , MAF        = af
+    , 'Log (OR)' = log10_or_mychisq
+    , '-Log (P)' = neglog_pval_fisher
+    , Dynamut    = ddg_dynamut        
+    , 'ENCoM-DDG'= ddg_encom
+    , mCSM       = ddg_mcsm
+    , SDM        = ddg_sdm           
+    , 'DUET-d'   = ddg_duet
+    , 'ENCoM-DDS'= dds_encom
+    , Dynamut2   = ddg_dynamut2
+    , 'mCSM-NA'  = mcsm_na_affinity ) 
+
+########################################################################
+cat("\nCorr Data created:"
+, "\n==================================="
+, "\ncorr_df_m2: created from merged_df2"
+, "\n==================================="
+, "\nnrows:", nrow(corr_df_m2)
+, "\nncols:", ncol(corr_df_m2)
+, "\n==================================="
+, "\ncorr_df_m3: created from merged_df3"
+, "\n==================================="
+, "\nnrows:", nrow(corr_df_m3)
+, "\nncols:", ncol(corr_df_m3)
+)
--- a/scripts/plotting/dirs.R
+++ b/scripts/plotting/dirs.R
--- a/scripts/plotting/dist_plots_check.R
+++ b/scripts/plotting/dist_plots_check.R
--- a/scripts/plotting/dm_om_data.R
+++ b/scripts/plotting/dm_om_data.R
@ -0,0 +1,416 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: Script to format data for dm om plots: 
+# generating LF data
+# sourced by get_plotting_dfs.R
+#########################################################
+##========================================================================
+# cols to select: 
+# THINK: whu
+
+comb_df <- merged_df3[, c("mutationinformation", "mutation"
+                               , "mutation_info","mutation_info_labels"
+                               , "position"
+                               , LigDist_colname
+                               , "duet_stability_change", "duet_scaled", "duet_outcome"
+                               , "ligand_affinity_change", "affinity_scaled", "ligand_outcome"
+                               , "ddg_foldx", "foldx_scaled", "foldx_outcome"
+                               , "deepddg", "deepddg_scaled", "deepddg_outcome"
+                               , "asa", "rsa"
+                               , "rd_values", "kd_values"
+                               , "log10_or_mychisq", "neglog_pval_fisher", "af"
+                               , "mcsm_na_affinity", "mcsm_na_scaled", "mcsm_na_outcome"
+                               , "ddg_dynamut", "ddg_dynamut_scaled","ddg_dynamut_outcome"
+                               , "ddg_encom",  "ddg_encom_scaled", "ddg_encom_outcome"
+                               , "dds_encom",  "dds_encom_scaled", "dds_encom_outcome"
+                               , "ddg_mcsm",  "ddg_mcsm_scaled",  "ddg_mcsm_outcome"    
+                               , "ddg_sdm",  "ddg_sdm_scaled",  "ddg_sdm_outcome"
+                               , "ddg_duet", "ddg_duet_scaled",  "ddg_duet_outcome"
+                               , "ddg_dynamut2","ddg_dynamut2_scaled",  "ddg_dynamut2_outcome")]
+                               
+
+comb_df_s = arrange(comb_df, position)
+    
+#=======================================================================
+fact_cols = colnames(comb_df_s)[grepl( "_outcome|_info", colnames(comb_df_s) )]
+fact_cols
+lapply(comb_df_s[, fact_cols], class)
+comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols], as.factor)
+
+if (any(lapply(comb_df_s[, fact_cols], class) == "character")){
+  cat("\nChanging cols to factor")
+  comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols],as.factor)
+  if (all(lapply(comb_df_s[, fact_cols], class) == "factor")){
+    cat("\nSuccessful: cols changed to factor")
+  }
+}
+lapply(comb_df_s[, fact_cols], class)
+
+#=======================================================================
+table(comb_df_s$mutation_info)
+
+ # further checks to make sure dr and other muts are indeed unique
+dr_muts = comb_df_s[comb_df_s$mutation_info == dr_muts_col,]
+dr_muts_names = unique(dr_muts$mutation)
+
+other_muts = comb_df_s[comb_df_s$mutation_info == other_muts_col,]
+other_muts_names = unique(other_muts$mutation)
+
+if ( table(dr_muts_names%in%other_muts_names)[[1]] == length(dr_muts_names) &&
+  table(other_muts_names%in%dr_muts_names)[[1]] == length(other_muts_names) ){
+  cat("PASS: dr and other muts are indeed unique")
+}else{
+  cat("FAIL: dr and others muts are NOT unique!")
+  quit()
+}
+
+# pretty display names i.e. labels to reduce major code duplication later
+foo_cnames = data.frame(colnames(comb_df_s))
+names(foo_cnames) <- "old_name"
+
+stability_suffix <- paste0(delta_symbol, delta_symbol, "G")
+flexibility_suffix <- paste0(delta_symbol, delta_symbol, "S")
+
+lig_dn       = paste0("Ligand distance (", angstroms_symbol, ")"); lig_dn
+duet_dn      = paste0("DUET ", stability_suffix); duet_dn
+foldx_dn     = paste0("FoldX ", stability_suffix); foldx_dn
+deepddg_dn   = paste0("Deepddg " , stability_suffix); deepddg_dn
+mcsm_na_dn   = paste0("mCSM-NA affinity ", stability_suffix); mcsm_na_dn
+dynamut_dn   = paste0("Dynamut ", stability_suffix); dynamut_dn
+dynamut2_dn  = paste0("Dynamut2 " , stability_suffix); dynamut2_dn
+encom_ddg_dn = paste0("EnCOM " , stability_suffix); encom_ddg_dn
+encom_dds_dn = paste0("EnCOM " , flexibility_suffix ); encom_dds_dn
+sdm_dn       = paste0("SDM " , stability_suffix); sdm_dn
+mcsm_dn      = paste0("mCSM " , stability_suffix ); mcsm_dn
+
+# Change colnames of some columns using datatable 
+comb_df_sl = comb_df_s
+names(comb_df_sl)
+
+setnames(comb_df_sl
+         , old = c("asa", "rsa", "rd_values", "kd_values"
+                   , "log10_or_mychisq", "neglog_pval_fisher", "af"
+                   , LigDist_colname
+                   , "duet_scaled"
+                   , "foldx_scaled"
+                   , "deepddg_scaled"
+                   , "mcsm_na_scaled"
+                   , "ddg_dynamut_scaled"
+                   , "ddg_dynamut2_scaled"
+                   , "ddg_encom_scaled"
+                   , "dds_encom_scaled"
+                   , "ddg_sdm"
+                   , "ddg_mcsm")
+                   
+         , new = c("ASA", "RSA", "RD", "KD"
+                   , "Log10 (OR)", "-Log (P)", "MAF"
+                   , lig_dn
+                   , duet_dn
+                   , foldx_dn
+                   , deepddg_dn
+                   , mcsm_na_dn
+                   , dynamut_dn
+                   , dynamut2_dn
+                   , encom_ddg_dn
+                   , encom_dds_dn
+                   , sdm_dn
+                   , mcsm_dn)
+         )
+
+foo_cnames <- cbind(foo_cnames, colnames(comb_df_sl))
+
+# some more pretty labels
+table(comb_df_sl$mutation_info)
+
+levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==dr_muts_col] <- "DM"
+levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==other_muts_col] <- "OM"
+
+table(comb_df_sl$mutation_info)
+
+#######################################################################
+#======================
+# Selecting dfs
+# with appropriate cols
+#=======================
+static_cols_start =  c("mutationinformation"
+                       , "position"
+                       , "mutation"
+                       , "mutation_info")
+
+static_cols_end = c(lig_dn
+                    , "ASA"
+                    , "RSA"
+                    , "RD"
+                    , "KD")
+
+# ordering is important!
+
+#########################################################################
+#==============
+# DUET: LF
+#==============
+cols_to_select_duet = c(static_cols_start,  c("duet_outcome", duet_dn), static_cols_end)
+wf_duet = comb_df_sl[, cols_to_select_duet]
+
+#pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps
+pivot_cols_duet = cols_to_select_duet[1: (length(static_cols_start) + 1)]; pivot_cols_duet
+
+expected_rows_lf = nrow(wf_duet) * (length(wf_duet) - length(pivot_cols_duet))
+expected_rows_lf
+
+# LF data: duet
+lf_duet = gather(wf_duet
+                  , key = param_type
+                  , value = param_value
+                  , all_of(duet_dn):tail(static_cols_end,1)
+                  , factor_key = TRUE)
+
+if (nrow(lf_duet) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", duet_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+
+############################################################################
+#==============
+# FoldX: LF
+#==============
+cols_to_select_foldx= c(static_cols_start, c("foldx_outcome", foldx_dn), static_cols_end)
+wf_foldx = comb_df_sl[, cols_to_select_foldx]
+
+pivot_cols_foldx = cols_to_select_foldx[1: (length(static_cols_start) + 1)]; pivot_cols_foldx
+
+expected_rows_lf = nrow(wf_foldx) * (length(wf_foldx) - length(pivot_cols_foldx))
+expected_rows_lf
+
+# LF data: Foldx
+lf_foldx <<- gather(wf_foldx
+                 , key = param_type
+                 , value = param_value
+                 , all_of(foldx_dn):tail(static_cols_end,1)
+                 , factor_key = TRUE)
+
+if (nrow(lf_foldx) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", foldx_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+
+############################################################################
+#==============
+# Deepddg: LF
+#==============
+cols_to_select_deepddg  = c(static_cols_start, c("deepddg_outcome", deepddg_dn), static_cols_end)
+wf_deepddg = comb_df_sl[, cols_to_select_deepddg]
+
+pivot_cols_deepddg = cols_to_select_deepddg[1: (length(static_cols_start) + 1)]; pivot_cols_deepddg
+
+expected_rows_lf = nrow(wf_deepddg) * (length(wf_deepddg) - length(pivot_cols_deepddg))
+expected_rows_lf
+
+# LF data: Deepddg
+lf_deepddg = gather(wf_deepddg
+                  , key = param_type
+                  , value = param_value
+                  , all_of(deepddg_dn):tail(static_cols_end,1)
+                  , factor_key = TRUE)
+
+if (nrow(lf_deepddg) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", deepddg_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+
+############################################################################
+#==============
+# mCSM-NA: LF
+#==============
+cols_to_select_mcsm_na = c(static_cols_start, c("mcsm_na_outcome", mcsm_na_dn), static_cols_end)
+wf_mcsm_na = comb_df_sl[, cols_to_select_mcsm_na]
+
+pivot_cols_mcsm_na = cols_to_select_mcsm_na[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_na
+
+expected_rows_lf = nrow(wf_mcsm_na) * (length(wf_mcsm_na) - length(pivot_cols_mcsm_na))
+expected_rows_lf
+
+# LF data: mcsm_na
+lf_mcsm_na = gather(wf_mcsm_na
+                    , key = param_type
+                    , value = param_value
+                    , all_of(mcsm_na_dn):tail(static_cols_end,1)
+                    , factor_key = TRUE)
+
+if (nrow(lf_mcsm_na) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", mcsm_na_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+
+############################################################################
+#==============
+# Dynamut: LF
+#==============
+cols_to_select_dynamut  = c(static_cols_start, c("ddg_dynamut_outcome", dynamut_dn), static_cols_end)
+wf_dynamut = comb_df_sl[, cols_to_select_dynamut]
+
+pivot_cols_dynamut = cols_to_select_dynamut[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut
+
+expected_rows_lf = nrow(wf_dynamut) * (length(wf_dynamut) - length(pivot_cols_dynamut))
+expected_rows_lf
+
+# LF data: dynamut
+lf_dynamut = gather(wf_dynamut
+                    , key = param_type
+                    , value = param_value
+                    , all_of(dynamut_dn):tail(static_cols_end,1)
+                    , factor_key = TRUE)
+
+if (nrow(lf_dynamut) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", dynamut_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+
+############################################################################
+#==============
+# Dynamut2: LF
+#==============
+cols_to_select_dynamut2 = c(static_cols_start, c("ddg_dynamut2_outcome", dynamut2_dn), static_cols_end)
+
+wf_dynamut2 = comb_df_sl[, cols_to_select_dynamut2]
+
+pivot_cols_dynamut2 = cols_to_select_dynamut2[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut2
+
+expected_rows_lf = nrow(wf_dynamut2) * (length(wf_dynamut2) - length(pivot_cols_dynamut2))
+expected_rows_lf
+
+# LF data: dynamut2
+lf_dynamut2 = gather(wf_dynamut2
+                    , key = param_type
+                    , value = param_value
+                    , all_of(dynamut2_dn):tail(static_cols_end,1)
+                    , factor_key = TRUE)
+
+if (nrow(lf_dynamut2) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", dynamut2_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+
+############################################################################
+#==============
+# EnCOM ddg: LF
+#==============
+cols_to_select_encomddg = c(static_cols_start, c("ddg_encom_outcome", encom_ddg_dn), static_cols_end)
+wf_encomddg = comb_df_sl[, cols_to_select_encomddg]
+
+pivot_cols_encomddg = cols_to_select_encomddg[1: (length(static_cols_start) + 1)]; pivot_cols_encomddg 
+
+expected_rows_lf = nrow(wf_encomddg ) * (length(wf_encomddg ) - length(pivot_cols_encomddg))
+expected_rows_lf
+
+# LF data: encomddg 
+lf_encomddg  = gather(wf_encomddg 
+                     , key = param_type
+                     , value = param_value
+                     , all_of(encom_ddg_dn):tail(static_cols_end,1)
+                     , factor_key = TRUE)
+
+if (nrow(lf_encomddg) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", encom_ddg_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+############################################################################
+#==============
+# EnCOM dds: LF
+#==============
+cols_to_select_encomdds = c(static_cols_start, c("dds_encom_outcome", encom_dds_dn), static_cols_end)
+wf_encomdds = comb_df_sl[, cols_to_select_encomdds]
+
+pivot_cols_encomdds = cols_to_select_encomdds[1: (length(static_cols_start) + 1)]; pivot_cols_encomdds 
+
+expected_rows_lf = nrow(wf_encomdds) * (length(wf_encomdds) - length(pivot_cols_encomdds))
+expected_rows_lf
+
+# LF data: encomdds 
+lf_encomdds  = gather(wf_encomdds
+                      , key = param_type
+                      , value = param_value
+                      , all_of(encom_dds_dn):tail(static_cols_end,1)
+                      , factor_key = TRUE)
+
+if (nrow(lf_encomdds) == expected_rows_lf){
+  cat("\nPASS: long format data created for", encom_dds_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+
+############################################################################
+#==============
+# SDM: LF
+#==============
+cols_to_select_sdm = c(static_cols_start, c("ddg_sdm_outcome", sdm_dn), static_cols_end)
+wf_sdm = comb_df_sl[, cols_to_select_sdm]
+
+pivot_cols_sdm = cols_to_select_sdm[1: (length(static_cols_start) + 1)]; pivot_cols_sdm
+
+expected_rows_lf = nrow(wf_sdm) * (length(wf_sdm) - length(pivot_cols_sdm))
+expected_rows_lf
+
+# LF data: sdm
+lf_sdm  = gather(wf_sdm
+                 , key = param_type
+                 , value = param_value
+                 , all_of(sdm_dn):tail(static_cols_end,1)
+                 , factor_key = TRUE)
+
+if (nrow(lf_sdm) == expected_rows_lf){
+  cat("\nPASS: long format data created for", sdm_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+
+############################################################################
+#==============
+# mCSM: LF
+#==============
+cols_to_select_mcsm  = c(static_cols_start, c("ddg_mcsm_outcome", mcsm_dn), static_cols_end)
+wf_mcsm = comb_df_sl[, cols_to_select_mcsm]
+
+pivot_cols_mcsm = cols_to_select_mcsm[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm
+
+expected_rows_lf = nrow(wf_mcsm) * (length(wf_mcsm) - length(pivot_cols_mcsm))
+expected_rows_lf
+
+# LF data: mcsm
+lf_mcsm  = gather(wf_mcsm
+                 , key = param_type
+                 , value = param_value
+                 , all_of(mcsm_dn):tail(static_cols_end,1)
+                 , factor_key = TRUE)
+
+if (nrow(lf_mcsm) == expected_rows_lf){
+  cat("\nPASS: long format data created for", mcsm_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+
+#==========================
+# Duet-d(from Dynamut): LF
+#===========================
+
+#Not created, redundant and chaos!
+
+############################################################################
+
--- a/scripts/plotting/extreme_muts.R
+++ b/scripts/plotting/extreme_muts.R
--- a/scripts/plotting/get_plotting_dfs.R
+++ b/scripts/plotting/get_plotting_dfs.R
@ -1,32 +1,27 @@
 #!/usr/bin/env Rscript
+
 #########################################################
 # TASK: Get formatted data for plots
-#=======================================================================
+#########################################################
 # working dir and loading libraries
 getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting")
 getwd()

 source("Header_TT.R")
-source("../functions/my_pairs_panel.R") # with lower panel turned off
-source("../functions/plotting_globals.R")
-source("../functions/plotting_data.R")
-source("../functions/combining_dfs_plotting.R")
-source("../functions/bp_subcolours.R")

 #********************
 # cmd args passed 
 # in from other scripts
 # to call this
 #********************
-#drug = 'streptomycin'
-#gene = 'gid'
+
 #====================
 # variables for lig
 #====================

-LigDist_colname = "ligand_distance"
-LigDist_cutoff = 10
+#LigDist_colname = "ligand_distance"
+#LigDist_cutoff = 10

 #===========
 # input
@ -41,8 +36,8 @@ import_dirs(drug, gene)
 #---------------------------
 if (!exists("infile_params") && exists("gene")){
 #if (!is.character(infile_params) && exists("gene")){ # when running as cmd
-  #in_filename_params = paste0(tolower(gene), "_all_params.csv") #for pncA
-  in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid
+  in_filename_params = paste0(tolower(gene), "_all_params.csv") #for pncA (and for gid finally) 10/09/21
+  #in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid
  infile_params = paste0(outdir, "/", in_filename_params)
  cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n")
 }
@ -54,10 +49,15 @@ pd_df = plotting_data(mcsm_df
                      , lig_dist_colname = LigDist_colname
                      , lig_dist_cutoff = LigDist_cutoff)

-my_df       = pd_df[[1]] 
-my_df_u     = pd_df[[2]] # this forms one of the input for combining_dfs_plotting()
-my_df_u_lig = pd_df[[3]] 
-dup_muts    = pd_df[[4]] 
+my_df   = pd_df[[1]] 
+my_df_u = pd_df[[2]] # this forms one of the input for combining_dfs_plotting()
+
+max_ang <- round(max(my_df_u[LigDist_colname]))
+min_ang <- round(min(my_df_u[LigDist_colname]))
+
+cat("\nLigand distance cut off, colname:", LigDist_colname
+    , "\nThe max distance", gene, "structure df" , ":", max_ang, "\u212b"
+    , "\nThe min distance", gene, "structure df" , ":", min_ang, "\u212b")

 #--------------------------------
 # call: combining_dfs_plotting()
@ -81,509 +81,149 @@ all_plot_dfs = combining_dfs_plotting(my_df_u
                                      , lig_dist_colname = LigDist_colname
                                      , lig_dist_cutoff = LigDist_cutoff)

-merged_df2          = all_plot_dfs[[1]]
-merged_df3          = all_plot_dfs[[2]]
-merged_df2_comp     = all_plot_dfs[[3]]
-merged_df3_comp     = all_plot_dfs[[4]]
-merged_df2_lig      = all_plot_dfs[[5]]
-merged_df3_lig      = all_plot_dfs[[6]]
-merged_df2_comp_lig = all_plot_dfs[[7]]
-merged_df3_comp_lig = all_plot_dfs[[8]]
+merged_df2      = all_plot_dfs[[1]]
+merged_df3      = all_plot_dfs[[2]]
+merged_df2_comp = all_plot_dfs[[3]]
+merged_df3_comp = all_plot_dfs[[4]]
+#======================================================================
+#TODO: Think! MOVE TO COMBINE or singular file for deepddg
+
+#============================
+# adding deepddg scaled values
+# scale data b/w -1 and 1
+#============================
+# n = which(colnames(merged_df3) == "deepddg"); n 
+# 
+# my_min = min(merged_df3[,n]); my_min 
+# my_max = max(merged_df3[,n]); my_max 
+# 
+# merged_df3$deepddg_scaled = ifelse(merged_df3[,n] < 0
+#                                    , merged_df3[,n]/abs(my_min)
+#                                    , merged_df3[,n]/my_max) 
+# # sanity check
+# my_min = min(merged_df3$deepddg_scaled); my_min 
+# my_max = max(merged_df3$deepddg_scaled); my_max
+# 
+# if (my_min == -1 && my_max == 1){
+#    cat("\nPASS: DeepDDG successfully scaled b/w -1 and 1"
+#        #, "\nProceeding with assigning deep outcome category")
+#        , "\n")
+# }else{
+#    cat("\nFAIL: could not scale DeepDDG ddg values"
+#        , "Aborting!")
+# }
+# 

 ####################################################################
-#                        Data for subcols barplot (~heatmpa)
+#                        Data for combining other dfs
 ####################################################################
-# can include: mutation, or_kin, pwald, af_kin
-cols_to_select = c("mutationinformation", "drtype"
-                   , "wild_type"
-                   , "position"
-                   , "mutant_type"
-                   , "chain", "ligand_id", "ligand_distance"
-                   , "duet_stability_change", "duet_outcome", "duet_scaled"
-                   , "ligand_affinity_change", "ligand_outcome", "affinity_scaled"
-                   , "ddg_foldx", "foldx_scaled", "foldx_outcome"
-                   , "deepddg", "deepddg_outcome" # comment out as not available for pnca
-                   , "asa", "rsa", "rd_values", "kd_values"
-                   , "af", "or_mychisq", "pval_fisher" 
-                   , "or_fisher", "or_logistic", "pval_logistic"
-                   , "wt_prop_water", "mut_prop_water", "wt_prop_polarity", "mut_prop_polarity"
-                   , "wt_calcprop", "mut_calcprop")

-#=======================
-# Data for sub colours
-# barplot: PS
-#=======================
+#source("other_dfs_data.R")
+# Fixed this at source i.e python script
+# Moved: "other_dfs_data.R" to redundant/

-cat("\nNo. of cols to select:", length(cols_to_select))
+####################################################################
+#                        Data for subcols barplot (~heatmap)
+####################################################################

-subcols_df_ps = merged_df3[, cols_to_select]
-
-cat("\nNo of unique positions for ps:"
-    , length(unique(subcols_df_ps$position)))
-
-# add count_pos col that counts the no. of nsSNPS at a position
-setDT(subcols_df_ps)[, pos_count := .N, by = .(position)]
-
-# should be a factor
-if (is.factor(subcols_df_ps$duet_outcome)){
-  cat("\nDuet_outcome is factor")
-  table(subcols_df_ps$duet_outcome)
-}else{
-  cat("\nConverting duet_outcome to factor")
-  subcols_df_ps$duet_outcome = as.factor(subcols_df_ps$duet_outcome)
-  table(subcols_df_ps$duet_outcome)
-}
-
-# should be -1 and 1
-min(subcols_df_ps$duet_scaled)
-max(subcols_df_ps$duet_scaled)
-
-tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, min)
-tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, max)
-
-# check unique values in normalised data
-cat("\nNo. of unique values in duet scaled, no rounding:"
-    , length(unique(subcols_df_ps$duet_scaled)))
-
-# No rounding    
-my_grp = subcols_df_ps$duet_scaled; length(my_grp)
-
-# Add rounding is to be used
-n = 3 
-subcols_df_ps$duet_scaledR = round(subcols_df_ps$duet_scaled, n)
-
-cat("\nNo. of unique values in duet scaled", n, "places rounding:"
-    , length(unique(subcols_df_ps$duet_scaledR)))
-
-my_grp_r = subcols_df_ps$duet_scaledR  # rounding
-
-# Add grp cols
-subcols_df_ps$group <- paste0(subcols_df_ps$duet_outcome, "_", my_grp, sep = "")
-subcols_df_ps$groupR <- paste0(subcols_df_ps$duet_outcome, "_", my_grp_r, sep = "")
-
-# Call the function to create the palette based on the group defined above
-subcols_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp")
-subcolsR_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp_r")
-
-print(paste0("Colour palette generated for my_grp: ", length(subcols_ps), " colours"))
-print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_ps), " colours"))
-
-#=======================
-# Data for sub colours
-# barplot: LIG
-#=======================
-cat("\nNo. of cols to select:", length(cols_to_select))
-
-subcols_df_lig = merged_df3_lig[, cols_to_select]
-
-cat("\nNo of unique positions for LIG:"
-    , length(unique(subcols_df_lig$position)))
-
-# should be a factor
-if (is.factor(subcols_df_lig$ligand_outcome)){
-  cat("\nLigand_outcome is factor")
-  table(subcols_df_lig$ligand_outcome)
-}else{
-  cat("\nConverting ligand_outcome to factor")
-  subcols_df_lig$ligand_outcome = as.factor(subcols_df_lig$ligand_outcome)
-  table(subcols_df_lig$ligand_outcome)
-}
-
-# should be -1 and 1
-min(subcols_df_lig$affinity_scaled)
-max(subcols_df_lig$affinity_scaled)
-
-tapply(subcols_df_lig$affinity_scaled, subcols_df_lig$ligand_outcome, min)
-tapply(subcols_df_lig$affinity_scaled, subcols_df_lig$ligand_outcome, max)
-
-# check unique values in normalised data
-cat("\nNo. of unique values in affinity scaled, no rounding:"
-    , length(unique(subcols_df_lig$affinity_scaled)))
-
-# No rounding    
-my_grp_lig = subcols_df_lig$affinity_scaled; length(my_grp_lig)
-
-# Add rounding is to be used
-n = 3 
-subcols_df_lig$affinity_scaledR = round(subcols_df_lig$affinity_scaled, n)
-
-cat("\nNo. of unique values in duet scaled", n, "places rounding:"
-    , length(unique(subcols_df_lig$affinity_scaledR)))
-
-my_grp_lig_r = subcols_df_lig$affinity_scaledR  # rounding
-
-# Add grp cols
-subcols_df_lig$group_lig <- paste0(subcols_df_lig$ligand_outcome, "_", my_grp_lig, sep = "")
-subcols_df_lig$group_ligR <- paste0(subcols_df_lig$ligand_outcome, "_", my_grp_lig_r, sep = "")
-
-# Call the function to create the palette based on the group defined above
-subcols_lig <- ColourPalleteMulti(subcols_df_lig, "ligand_outcome", "my_grp_lig")
-subcolsR_lig <- ColourPalleteMulti(subcols_df_lig, "ligand_outcome", "my_grp_lig_r")
-
-print(paste0("Colour palette generated for my_grp: ", length(subcols_lig), " colours"))
-print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_lig), " colours"))
+#source("coloured_bp_data.R")
+# Repurposed function so that params can be passed instead to generate
+# data required for plotting.
+# Moved "coloured_bp_data.R" to redundant/

 ####################################################################
 #                        Data for logoplots
 ####################################################################
-#-------------------------
-# choose df for logoplot
-#-------------------------
-logo_data = merged_df3
-#logo_data = merged_df3_comp

-# quick checks
-colnames(logo_data)
-str(logo_data)
+source("logo_data.R")

-c1 = unique(logo_data$position) 
-nrow(logo_data)
-cat("No. of rows in my_data:", nrow(logo_data)
-    , "\nDistinct positions corresponding to snps:", length(c1)
-    , "\n===========================================================")
-#=======================================================================
-#==================
-# logo data: OR
-#==================
-foo = logo_data[, c("position"
-                      , "mutant_type","duet_scaled", "or_mychisq"
-                      , "mut_prop_polarity", "mut_prop_water")] 
-
-logo_data$log10or = log10(logo_data$or_mychisq)
-logo_data_plot = logo_data[, c("position"
-                            , "mutant_type", "or_mychisq", "log10or")]
-
-logo_data_plot_or = logo_data[, c("position", "mutant_type", "or_mychisq")]
-wide_df_or <- logo_data_plot_or %>% spread(position, or_mychisq, fill = 0.0)
-
-wide_df_or = as.matrix(wide_df_or)
-rownames(wide_df_or) = wide_df_or[,1]
-dim(wide_df_or)
-wide_df_or = wide_df_or[,-1]
-str(wide_df_or)
-
-position_or = as.numeric(colnames(wide_df_or))
-
-#==================
-# logo data: logOR
-#==================
-logo_data_plot_logor = logo_data[, c("position", "mutant_type", "log10or")]
-wide_df_logor <- logo_data_plot_logor %>% spread(position, log10or, fill = 0.0)
-
-wide_df_logor = as.matrix(wide_df_logor)
-
-rownames(wide_df_logor) = wide_df_logor[,1]
-wide_df_logor = subset(wide_df_logor, select = -c(1) )
-colnames(wide_df_logor)
-wide_df_logor_m = data.matrix(wide_df_logor)
-
-rownames(wide_df_logor_m)
-colnames(wide_df_logor_m)
-
-position_logor = as.numeric(colnames(wide_df_logor_m))
-
-#===============================
-# logo data: multiple nsSNPs (>1)
-#=================================
-#require(data.table)
-
-# get freq count of positions so you can subset freq<1
-setDT(logo_data)[, mut_pos_occurrence := .N, by = .(position)] 
-
-table(logo_data$position)
-table(logo_data$mut_pos_occurrence)
-
-max_mut = max(table(logo_data$position))
-
-# extract freq_pos > 1
-my_data_snp = logo_data[logo_data$mut_pos_occurrence!=1,] 
-u = unique(my_data_snp$position)
-max_mult_mut = max(table(my_data_snp$position))
-
-if (nrow(my_data_snp) == nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]] ){
-  
-  cat("PASS: positions with  multiple muts extracted"
-      , "\nNo. of mutations:", nrow(my_data_snp)
-      , "\nNo. of positions:", length(u)
-      , "\nMax no. of muts at any position", max_mult_mut)
-}else{
-  cat("FAIL: positions with multiple muts could NOT be extracted"
-      , "\nExpected:",nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]]
-      , "\nGot:", nrow(my_data_snp) )
-}
-
-cat("\nNo. of sites with only 1 mutations:", table(logo_data$mut_pos_occurrence)[[1]])
-
-#--------------------------------------
-# matrix for_mychisq mutant type
-# frequency of mutant type by position
-#---------------------------------------
-table(my_data_snp$mutant_type, my_data_snp$position)
-tab_mt = table(my_data_snp$mutant_type, my_data_snp$position)
-class(tab_mt)
-
-# unclass to convert to matrix
-tab_mt = unclass(tab_mt)
-tab_mt = as.matrix(tab_mt, rownames = T)
-
-# should be TRUE
-is.matrix(tab_mt)
-
-rownames(tab_mt) #aa
-colnames(tab_mt) #pos
-
-#-------------------------------------
-# matrix for wild type
-# frequency of wild type by position
-#-------------------------------------
-tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt
-tab_wt = unclass(tab_wt)
-
-# remove wt duplicates
-wt = my_data_snp[, c("position", "wild_type")]
-wt = wt[!duplicated(wt),]
-
-tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1
-
-rownames(tab_wt)
-rownames(tab_wt)
-
-identical(colnames(tab_mt), colnames(tab_wt))
-identical(ncol(tab_mt), ncol(tab_wt))
-
-#----------------------------------
-# logo data OR: multiple nsSNPs (>1)
-#----------------------------------
-logo_data_or_mult = my_data_snp[, c("position", "mutant_type", "or_mychisq")]
-#wide_df_or <- logo_data_or %>% spread(position, or_mychisq, fill = 0.0)
-wide_df_or_mult <- logo_data_or_mult %>% spread(position, or_mychisq, fill = NA)
-
-wide_df_or_mult = as.matrix(wide_df_or_mult)
-rownames(wide_df_or_mult) = wide_df_or_mult[,1]
-wide_df_or_mult = wide_df_or_mult[,-1]
-str(wide_df_or_mult)
-
-position_or_mult = as.numeric(colnames(wide_df_or_mult))
+s1 = c("\nSuccessfully sourced logo_data.R")
+cat(s1)

 ####################################################################
-#                        Data for Corrplots
+#                        Data for DM OM Plots: Long format dfs
 ####################################################################
-cat("\n=========================================="
-    , "\nCORR PLOTS data: PS"
-    , "\n===========================================")

-df_ps = merged_df2
+#source("other_plots_data.R")

-#--------------------
-# adding log cols : NEW UNCOMMENT
-#--------------------
-#df_ps$log10_or_mychisq = log10(df_ps$or_mychisq)
-#df_ps$neglog_pval_fisher = -log10(df_ps$pval_fisher)
+source("dm_om_data.R")

-##df_ps$log10_or_kin = log10(df_ps$or_kin)
-##df_ps$neglog_pwald_kin = -log10(df_ps$pwald_kin)
+s2 = c("\nSuccessfully sourced other_plots_data.R")
+cat(s2)

-#df_ps$mutation_info_labels = ifelse(df_ps$mutation_info == dr_muts_col, 1, 0)
+####################################################################
+#                  Data for Lineage barplots: WF and LF dfs
+####################################################################

-#----------------------------
-# columns for corr plots:PS
-#----------------------------
-# subset data to generate pairwise correlations
-cols_to_select =  c("mutationinformation"
-                    , "duet_scaled"
-                    , "foldx_scaled"
-                    #, "mutation_info_labels"
-                    , "asa"
-                    , "rsa"
-                    , "rd_values"
-                    , "kd_values"
-                    , "log10_or_mychisq"
-                    , "neglog_pval_fisher"
-                    ##, "or_kin"
-                    ##, "neglog_pwald_kin"
-                    , "af"
-                    ##, "af_kin"
-                    , "duet_outcome"
-                    , drug)
+source("lineage_data.R")

-corr_data_ps = df_ps[cols_to_select]
+s3 = c("\nSuccessfully sourced lineage_data.R")
+cat(s3)

-dim(corr_data_ps)
+####################################################################
+#                  Data for corr plots:
+####################################################################
+# make sure the above script works because merged_df2_combined is needed
+source("corr_data.R")

-#--------------------------------------
-# assign nice colnames (for display)
-#--------------------------------------
-my_corr_colnames = c("Mutation"
-                     , "DUET"
-                     , "FoldX"
-                     #, "Mutation class"
-                     , "ASA"
-                     , "RSA"
-                     , "RD"
-                     , "KD"
-                     , "Log (OR)"
-                     , "-Log (P)"
-                     ##, "Adjusted (OR)"
-                     ##, "-Log (P wald)"
-                     , "MAF"
-                     ##, "AF_kin"
-                     , "duet_outcome"
-                     , drug)
-
-length(my_corr_colnames)
-
-colnames(corr_data_ps)
-colnames(corr_data_ps) <- my_corr_colnames
-colnames(corr_data_ps)
-
-start = 1
-end = which(colnames(corr_data_ps) == drug); end # should be the last column
-offset = 1
-
-#===========================
-# Corr data for plots: PS
-# big_df ps: ~ merged_df2
-#===========================
-
-#corr_ps_df2 = corr_data_ps[start:(end-offset)] # without drug
-corr_ps_df2 = corr_data_ps[start:end]
-head(corr_ps_df2)
-
-#===========================
-# Corr data for plots: PS
-# short_df ps: ~merged_df3
-#===========================
-corr_ps_df3 = corr_ps_df2[!duplicated(corr_ps_df2$Mutation),]
-
-na_or = sum(is.na(corr_ps_df3$`Log (OR)`))
-check1 = nrow(corr_ps_df3) - na_or
-
-##na_adj_or = sum(is.na(corr_ps_df3$`adjusted (OR)`))
-##check2 = nrow(corr_ps_df3) - na_adj_or 
-
-if (nrow(corr_ps_df3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) {
-  cat( "\nPASS: No. of rows for corr_ps_df3 match"
-       , "\nPASS: No. of OR values checked: " , check1)
-} else {
-  cat("\nFAIL: Numbers  mismatch:"
-      , "\nExpected nrows: ", nrow(merged_df3)
-      , "\nGot: ", nrow(corr_ps_df3)
-      , "\nExpected OR values: ", nrow(merged_df3_comp)
-      , "\nGot: ", check1)
-}
-
-#=================================
-# Data for Correlation plots: LIG
-#=================================
-cat("\n=========================================="
-    , "\nCORR PLOTS data: LIG"
-    , "\n===========================================")
-
-df_lig = merged_df2_lig
-
-table(df_lig$ligand_outcome)
-
-#--------------------
-# adding log cols : NEW UNCOMMENT
-#--------------------
-#df_lig$log10_or_mychisq = log10(df_lig$or_mychisq)
-#df_lig$neglog_pval_fisher = -log10(df_lig$pval_fisher)
-
-##df_lig$log10_or_kin = log10(df_lig$or_kin)
-##df_lig$neglog_pwald_kin = -log10(df_lig$pwald_kin)
-
-#----------------------------
-# columns for corr plots:PS
-#----------------------------
-# subset data to generate pairwise correlations
-cols_to_select =  c("mutationinformation"
-                    , "affinity_scaled"
-                    #, "mutation_info_labels"
-                    , "asa"
-                    , "rsa"
-                    , "rd_values"
-                    , "kd_values"
-                    , "log10_or_mychisq"
-                    , "neglog_pval_fisher"
-                    ##, "or_kin"
-                    ##, "neglog_pwald_kin"
-                    , "af"
-                    ##, "af_kin"
-                    , "ligand_outcome"
-                    , drug)
-
-corr_data_lig = df_lig[, cols_to_select]
-
-dim(corr_data_lig)
-
-#--------------------------------------
-# assign nice colnames (for display)
-#--------------------------------------
-my_corr_colnames = c("Mutation"
-                     , "Ligand Affinity"
-                     #, "Mutation class"
-                     , "ASA"
-                     , "RSA"
-                     , "RD"
-                     , "KD"
-                     , "Log (OR)"
-                     , "-Log (P)"
-                     ##, "Adjusted (OR)"
-                     ##, "-Log (P wald)"
-                     , "MAF"
-                     ##, "MAF_kin"
-                     , "ligand_outcome"
-                     , drug)
-
-length(my_corr_colnames)
-
-colnames(corr_data_lig)
-colnames(corr_data_lig) <- my_corr_colnames
-colnames(corr_data_lig)
-
-start = 1
-end = which(colnames(corr_data_lig) == drug); end # should be the last column
-offset = 1
-
-#=============================
-# Corr data for plots: LIG
-# big_df lig: ~ merged_df2_lig
-#==============================
-#corr_lig_df2 = corr_data_lig[start:(end-offset)] # without drug
-corr_lig_df2 = corr_data_lig[start:end]
-head(corr_lig_df2)
-
-#=============================
-# Corr data for plots: LIG
-# short_df lig: ~ merged_df3_lig
-#==============================
-corr_lig_df3 = corr_lig_df2[!duplicated(corr_lig_df2$Mutation),]
-
-na_or_lig = sum(is.na(corr_lig_df3$`Log (OR)`))
-check1_lig = nrow(corr_lig_df3) - na_or_lig
-
-if (nrow(corr_lig_df3) == nrow(merged_df3_lig) && nrow(merged_df3_comp_lig) == check1_lig) {
-  cat( "\nPASS: No. of rows for corr_lig_df3 match"
-       , "\nPASS: No. of OR values checked: " , check1_lig)
-} else {
-  cat("\nFAIL: Numbers  mismatch:"
-      , "\nExpected nrows: ", nrow(merged_df3_lig)
-      , "\nGot: ", nrow(corr_ps_df3_lig)
-      , "\nExpected OR values: ", nrow(merged_df3_comp_lig)
-      , "\nGot: ", check1_lig)
-}
-
-# remove unnecessary columns
-identical(corr_data_lig, corr_lig_df2)
-identical(corr_data_ps, corr_ps_df2)
-
-#rm(df_ps, df_lig, corr_data_ps, corr_data_lig)
+s4 = c("\nSuccessfully sourced corr_data.R")
+cat(s4)

 ########################################################################
 #                           End of script
 ########################################################################
-rm(foo)
+if (  all( length(s1), length(s2), length(s3), length(s4) ) >0 ){
+ cat(
+  "\n##################################################"
+ , "\nSuccessful: get_plotting_dfs.R worked!"
+ , "\n###################################################\n") 
+} else {
+ cat(
+  "\n#################################################"
+ , "\nFAIL: get_plotting_dfs.R didn't complete fully!Please check"
+ , "\n###################################################\n" )
+ }   
+ 
+########################################################################
+# clear excess variables
+rm(c1, c2, c3, c4, check1
+   , curr_count, curr_total
+   , cols_check
+   , cols_to_select
+   , cols_to_select_deepddg
+   , cols_to_select_duet
+   , cols_to_select_dynamut
+   , cols_to_select_dynamut2
+   , cols_to_select_encomddg
+   , cols_to_select_encomdds
+   , cols_to_select_mcsm
+   , cols_to_select_mcsm_na
+   , cols_to_select_sdm
+   , infile_metadata
+   , infile_params
+   #, infilename_dynamut
+   #, infilename_dynamut2
+   #, infilename_mcsm_f_snps
+   #, infilename_mcsm_na
+   )

-cat("\n===================================================\n"
-      , "\nSuccessful: get_plotting_dfs.R worked!"
-      , "\n====================================================")
+rm(pivot_cols
+, pivot_cols_deepddg
+, pivot_cols_duet
+, pivot_cols_dynamut
+, pivot_cols_dynamut2
+, pivot_cols_encomddg
+, pivot_cols_encomdds
+, pivot_cols_foldx
+, pivot_cols_mcsm
+, pivot_cols_mcsm_na
+, pivot_cols_n
+, pivot_cols_sdm)
+
+rm(expected_cols
+, expected_ncols
+, expected_rows
+, expected_rows_lf
+, fact_cols)
+
+   
--- a/scripts/plotting/get_plotting_dfs_with_lig.R
+++ b/scripts/plotting/get_plotting_dfs_with_lig.R
@ -0,0 +1,589 @@
+#!/usr/bin/env Rscript
+#########################################################
+# TASK: Get formatted data for plots
+#=======================================================================
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting")
+getwd()
+
+source("Header_TT.R")
+source("../functions/my_pairs_panel.R") # with lower panel turned off
+source("../functions/plotting_globals.R")
+source("../functions/plotting_data.R")
+source("../functions/combining_dfs_plotting.R")
+source("../functions/bp_subcolours.R")
+
+#********************
+# cmd args passed 
+# in from other scripts
+# to call this
+#********************
+#drug = 'streptomycin'
+#gene = 'gid'
+#====================
+# variables for lig
+#====================
+
+LigDist_colname = "ligand_distance"
+LigDist_cutoff = 10
+
+#===========
+# input
+#===========
+#---------------------
+# call: import_dirs()
+#---------------------
+import_dirs(drug, gene)
+
+#---------------------------
+# call: plotting_data()
+#---------------------------
+if (!exists("infile_params") && exists("gene")){
+#if (!is.character(infile_params) && exists("gene")){ # when running as cmd
+  #in_filename_params = paste0(tolower(gene), "_all_params.csv") #for pncA
+  in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid
+  infile_params = paste0(outdir, "/", in_filename_params)
+  cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n")
+}
+
+# Input 1: read <gene>_comb_afor.csv
+cat("\nReading mcsm combined data file: ", infile_params)
+mcsm_df = read.csv(infile_params, header = T)
+pd_df = plotting_data(mcsm_df
+                      , lig_dist_colname = LigDist_colname
+                      , lig_dist_cutoff = LigDist_cutoff)
+
+my_df       = pd_df[[1]] 
+my_df_u     = pd_df[[2]] # this forms one of the input for combining_dfs_plotting()
+my_df_u_lig = pd_df[[3]] 
+dup_muts    = pd_df[[4]] 
+
+#--------------------------------
+# call: combining_dfs_plotting()
+#--------------------------------
+if (!exists("infile_metadata") && exists("gene")){
+#if (!is.character(infile_metadata) && exists("gene")){ # when running as cmd
+  in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # part combined for gid
+  infile_metadata = paste0(outdir, "/", in_filename_metadata)
+  cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n")
+}
+
+# Input 2: read <gene>_meta data.csv
+cat("\nReading meta data file: ", infile_metadata)
+
+gene_metadata <- read.csv(infile_metadata
+                          , stringsAsFactors = F
+                          , header = T)
+
+all_plot_dfs = combining_dfs_plotting(my_df_u
+                                      , gene_metadata
+                                      , lig_dist_colname = LigDist_colname
+                                      , lig_dist_cutoff = LigDist_cutoff)
+
+merged_df2          = all_plot_dfs[[1]]
+merged_df3          = all_plot_dfs[[2]]
+merged_df2_comp     = all_plot_dfs[[3]]
+merged_df3_comp     = all_plot_dfs[[4]]
+merged_df2_lig      = all_plot_dfs[[5]]
+merged_df3_lig      = all_plot_dfs[[6]]
+merged_df2_comp_lig = all_plot_dfs[[7]]
+merged_df3_comp_lig = all_plot_dfs[[8]]
+
+####################################################################
+#                        Data for subcols barplot (~heatmap)
+####################################################################
+# can include: mutation, or_kin, pwald, af_kin
+cols_to_select = c("mutationinformation", "drtype"
+                   , "wild_type"
+                   , "position"
+                   , "mutant_type"
+                   , "chain", "ligand_id", "ligand_distance"
+                   , "duet_stability_change", "duet_outcome", "duet_scaled"
+                   , "ligand_affinity_change", "ligand_outcome", "affinity_scaled"
+                   , "ddg_foldx", "foldx_scaled", "foldx_outcome"
+                   , "deepddg", "deepddg_outcome" # comment out as not available for pnca
+                   , "asa", "rsa", "rd_values", "kd_values"
+                   , "af", "or_mychisq", "pval_fisher" 
+                   , "or_fisher", "or_logistic", "pval_logistic"
+                   , "wt_prop_water", "mut_prop_water", "wt_prop_polarity", "mut_prop_polarity"
+                   , "wt_calcprop", "mut_calcprop")
+
+#=======================
+# Data for sub colours
+# barplot: PS
+#=======================
+
+cat("\nNo. of cols to select:", length(cols_to_select))
+
+subcols_df_ps = merged_df3[, cols_to_select]
+
+cat("\nNo of unique positions for ps:"
+    , length(unique(subcols_df_ps$position)))
+
+# add count_pos col that counts the no. of nsSNPS at a position
+setDT(subcols_df_ps)[, pos_count := .N, by = .(position)]
+
+# should be a factor
+if (is.factor(subcols_df_ps$duet_outcome)){
+  cat("\nDuet_outcome is factor")
+  table(subcols_df_ps$duet_outcome)
+}else{
+  cat("\nConverting duet_outcome to factor")
+  subcols_df_ps$duet_outcome = as.factor(subcols_df_ps$duet_outcome)
+  table(subcols_df_ps$duet_outcome)
+}
+
+# should be -1 and 1
+min(subcols_df_ps$duet_scaled)
+max(subcols_df_ps$duet_scaled)
+
+tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, min)
+tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, max)
+
+# check unique values in normalised data
+cat("\nNo. of unique values in duet scaled, no rounding:"
+    , length(unique(subcols_df_ps$duet_scaled)))
+
+# No rounding    
+my_grp = subcols_df_ps$duet_scaled; length(my_grp)
+
+# Add rounding is to be used
+n = 3 
+subcols_df_ps$duet_scaledR = round(subcols_df_ps$duet_scaled, n)
+
+cat("\nNo. of unique values in duet scaled", n, "places rounding:"
+    , length(unique(subcols_df_ps$duet_scaledR)))
+
+my_grp_r = subcols_df_ps$duet_scaledR  # rounding
+
+# Add grp cols
+subcols_df_ps$group <- paste0(subcols_df_ps$duet_outcome, "_", my_grp, sep = "")
+subcols_df_ps$groupR <- paste0(subcols_df_ps$duet_outcome, "_", my_grp_r, sep = "")
+
+# Call the function to create the palette based on the group defined above
+subcols_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp")
+subcolsR_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp_r")
+
+print(paste0("Colour palette generated for my_grp: ", length(subcols_ps), " colours"))
+print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_ps), " colours"))
+
+#=======================
+# Data for sub colours
+# barplot: LIG
+#=======================
+cat("\nNo. of cols to select:", length(cols_to_select))
+
+subcols_df_lig = merged_df3_lig[, cols_to_select]
+
+cat("\nNo of unique positions for LIG:"
+    , length(unique(subcols_df_lig$position)))
+
+# should be a factor
+if (is.factor(subcols_df_lig$ligand_outcome)){
+  cat("\nLigand_outcome is factor")
+  table(subcols_df_lig$ligand_outcome)
+}else{
+  cat("\nConverting ligand_outcome to factor")
+  subcols_df_lig$ligand_outcome = as.factor(subcols_df_lig$ligand_outcome)
+  table(subcols_df_lig$ligand_outcome)
+}
+
+# should be -1 and 1
+min(subcols_df_lig$affinity_scaled)
+max(subcols_df_lig$affinity_scaled)
+
+tapply(subcols_df_lig$affinity_scaled, subcols_df_lig$ligand_outcome, min)
+tapply(subcols_df_lig$affinity_scaled, subcols_df_lig$ligand_outcome, max)
+
+# check unique values in normalised data
+cat("\nNo. of unique values in affinity scaled, no rounding:"
+    , length(unique(subcols_df_lig$affinity_scaled)))
+
+# No rounding    
+my_grp_lig = subcols_df_lig$affinity_scaled; length(my_grp_lig)
+
+# Add rounding is to be used
+n = 3 
+subcols_df_lig$affinity_scaledR = round(subcols_df_lig$affinity_scaled, n)
+
+cat("\nNo. of unique values in duet scaled", n, "places rounding:"
+    , length(unique(subcols_df_lig$affinity_scaledR)))
+
+my_grp_lig_r = subcols_df_lig$affinity_scaledR  # rounding
+
+# Add grp cols
+subcols_df_lig$group_lig <- paste0(subcols_df_lig$ligand_outcome, "_", my_grp_lig, sep = "")
+subcols_df_lig$group_ligR <- paste0(subcols_df_lig$ligand_outcome, "_", my_grp_lig_r, sep = "")
+
+# Call the function to create the palette based on the group defined above
+subcols_lig <- ColourPalleteMulti(subcols_df_lig, "ligand_outcome", "my_grp_lig")
+subcolsR_lig <- ColourPalleteMulti(subcols_df_lig, "ligand_outcome", "my_grp_lig_r")
+
+print(paste0("Colour palette generated for my_grp: ", length(subcols_lig), " colours"))
+print(paste0("Colour palette generated for my_grp_r: ", length(subcolsR_lig), " colours"))
+
+####################################################################
+#                        Data for logoplots
+####################################################################
+#-------------------------
+# choose df for logoplot
+#-------------------------
+logo_data = merged_df3
+#logo_data = merged_df3_comp
+
+# quick checks
+colnames(logo_data)
+str(logo_data)
+
+c1 = unique(logo_data$position) 
+nrow(logo_data)
+cat("No. of rows in my_data:", nrow(logo_data)
+    , "\nDistinct positions corresponding to snps:", length(c1)
+    , "\n===========================================================")
+#=======================================================================
+#==================
+# logo data: OR
+#==================
+foo = logo_data[, c("position"
+                      , "mutant_type","duet_scaled", "or_mychisq"
+                      , "mut_prop_polarity", "mut_prop_water")] 
+
+logo_data$log10or = log10(logo_data$or_mychisq)
+logo_data_plot = logo_data[, c("position"
+                            , "mutant_type", "or_mychisq", "log10or")]
+
+logo_data_plot_or = logo_data[, c("position", "mutant_type", "or_mychisq")]
+wide_df_or <- logo_data_plot_or %>% spread(position, or_mychisq, fill = 0.0)
+
+wide_df_or = as.matrix(wide_df_or)
+rownames(wide_df_or) = wide_df_or[,1]
+dim(wide_df_or)
+wide_df_or = wide_df_or[,-1]
+str(wide_df_or)
+
+position_or = as.numeric(colnames(wide_df_or))
+
+#==================
+# logo data: logOR
+#==================
+logo_data_plot_logor = logo_data[, c("position", "mutant_type", "log10or")]
+wide_df_logor <- logo_data_plot_logor %>% spread(position, log10or, fill = 0.0)
+
+wide_df_logor = as.matrix(wide_df_logor)
+
+rownames(wide_df_logor) = wide_df_logor[,1]
+wide_df_logor = subset(wide_df_logor, select = -c(1) )
+colnames(wide_df_logor)
+wide_df_logor_m = data.matrix(wide_df_logor)
+
+rownames(wide_df_logor_m)
+colnames(wide_df_logor_m)
+
+position_logor = as.numeric(colnames(wide_df_logor_m))
+
+#===============================
+# logo data: multiple nsSNPs (>1)
+#=================================
+#require(data.table)
+
+# get freq count of positions so you can subset freq<1
+setDT(logo_data)[, mut_pos_occurrence := .N, by = .(position)] 
+
+table(logo_data$position)
+table(logo_data$mut_pos_occurrence)
+
+max_mut = max(table(logo_data$position))
+
+# extract freq_pos > 1
+my_data_snp = logo_data[logo_data$mut_pos_occurrence!=1,] 
+u = unique(my_data_snp$position)
+max_mult_mut = max(table(my_data_snp$position))
+
+if (nrow(my_data_snp) == nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]] ){
+  
+  cat("PASS: positions with  multiple muts extracted"
+      , "\nNo. of mutations:", nrow(my_data_snp)
+      , "\nNo. of positions:", length(u)
+      , "\nMax no. of muts at any position", max_mult_mut)
+}else{
+  cat("FAIL: positions with multiple muts could NOT be extracted"
+      , "\nExpected:",nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]]
+      , "\nGot:", nrow(my_data_snp) )
+}
+
+cat("\nNo. of sites with only 1 mutations:", table(logo_data$mut_pos_occurrence)[[1]])
+
+#--------------------------------------
+# matrix for_mychisq mutant type
+# frequency of mutant type by position
+#---------------------------------------
+table(my_data_snp$mutant_type, my_data_snp$position)
+tab_mt = table(my_data_snp$mutant_type, my_data_snp$position)
+class(tab_mt)
+
+# unclass to convert to matrix
+tab_mt = unclass(tab_mt)
+tab_mt = as.matrix(tab_mt, rownames = T)
+
+# should be TRUE
+is.matrix(tab_mt)
+
+rownames(tab_mt) #aa
+colnames(tab_mt) #pos
+
+#-------------------------------------
+# matrix for wild type
+# frequency of wild type by position
+#-------------------------------------
+tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt
+tab_wt = unclass(tab_wt)
+
+# remove wt duplicates
+wt = my_data_snp[, c("position", "wild_type")]
+wt = wt[!duplicated(wt),]
+
+tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1
+
+rownames(tab_wt)
+rownames(tab_wt)
+
+identical(colnames(tab_mt), colnames(tab_wt))
+identical(ncol(tab_mt), ncol(tab_wt))
+
+#----------------------------------
+# logo data OR: multiple nsSNPs (>1)
+#----------------------------------
+logo_data_or_mult = my_data_snp[, c("position", "mutant_type", "or_mychisq")]
+#wide_df_or <- logo_data_or %>% spread(position, or_mychisq, fill = 0.0)
+wide_df_or_mult <- logo_data_or_mult %>% spread(position, or_mychisq, fill = NA)
+
+wide_df_or_mult = as.matrix(wide_df_or_mult)
+rownames(wide_df_or_mult) = wide_df_or_mult[,1]
+wide_df_or_mult = wide_df_or_mult[,-1]
+str(wide_df_or_mult)
+
+position_or_mult = as.numeric(colnames(wide_df_or_mult))
+
+####################################################################
+#                        Data for Corrplots
+####################################################################
+cat("\n=========================================="
+    , "\nCORR PLOTS data: PS"
+    , "\n===========================================")
+
+df_ps = merged_df2
+
+#--------------------
+# adding log cols : NEW UNCOMMENT
+#--------------------
+#df_ps$log10_or_mychisq = log10(df_ps$or_mychisq)
+#df_ps$neglog_pval_fisher = -log10(df_ps$pval_fisher)
+
+##df_ps$log10_or_kin = log10(df_ps$or_kin)
+##df_ps$neglog_pwald_kin = -log10(df_ps$pwald_kin)
+
+#df_ps$mutation_info_labels = ifelse(df_ps$mutation_info == dr_muts_col, 1, 0)
+
+#----------------------------
+# columns for corr plots:PS
+#----------------------------
+# subset data to generate pairwise correlations
+cols_to_select =  c("mutationinformation"
+                    , "duet_scaled"
+                    , "foldx_scaled"
+                    #, "mutation_info_labels"
+                    , "asa"
+                    , "rsa"
+                    , "rd_values"
+                    , "kd_values"
+                    , "log10_or_mychisq"
+                    , "neglog_pval_fisher"
+                    ##, "or_kin"
+                    ##, "neglog_pwald_kin"
+                    , "af"
+                    ##, "af_kin"
+                    , "duet_outcome"
+                    , drug)
+
+corr_data_ps = df_ps[cols_to_select]
+
+dim(corr_data_ps)
+
+#--------------------------------------
+# assign nice colnames (for display)
+#--------------------------------------
+my_corr_colnames = c("Mutation"
+                     , "DUET"
+                     , "FoldX"
+                     #, "Mutation class"
+                     , "ASA"
+                     , "RSA"
+                     , "RD"
+                     , "KD"
+                     , "Log (OR)"
+                     , "-Log (P)"
+                     ##, "Adjusted (OR)"
+                     ##, "-Log (P wald)"
+                     , "MAF"
+                     ##, "AF_kin"
+                     , "duet_outcome"
+                     , drug)
+
+length(my_corr_colnames)
+
+colnames(corr_data_ps)
+colnames(corr_data_ps) <- my_corr_colnames
+colnames(corr_data_ps)
+
+start = 1
+end = which(colnames(corr_data_ps) == drug); end # should be the last column
+offset = 1
+
+#===========================
+# Corr data for plots: PS
+# big_df ps: ~ merged_df2
+#===========================
+
+#corr_ps_df2 = corr_data_ps[start:(end-offset)] # without drug
+corr_ps_df2 = corr_data_ps[start:end]
+head(corr_ps_df2)
+
+#===========================
+# Corr data for plots: PS
+# short_df ps: ~merged_df3
+#===========================
+corr_ps_df3 = corr_ps_df2[!duplicated(corr_ps_df2$Mutation),]
+
+na_or = sum(is.na(corr_ps_df3$`Log (OR)`))
+check1 = nrow(corr_ps_df3) - na_or
+
+##na_adj_or = sum(is.na(corr_ps_df3$`adjusted (OR)`))
+##check2 = nrow(corr_ps_df3) - na_adj_or 
+
+if (nrow(corr_ps_df3) == nrow(merged_df3) && nrow(merged_df3_comp) == check1) {
+  cat( "\nPASS: No. of rows for corr_ps_df3 match"
+       , "\nPASS: No. of OR values checked: " , check1)
+} else {
+  cat("\nFAIL: Numbers  mismatch:"
+      , "\nExpected nrows: ", nrow(merged_df3)
+      , "\nGot: ", nrow(corr_ps_df3)
+      , "\nExpected OR values: ", nrow(merged_df3_comp)
+      , "\nGot: ", check1)
+}
+
+#=================================
+# Data for Correlation plots: LIG
+#=================================
+cat("\n=========================================="
+    , "\nCORR PLOTS data: LIG"
+    , "\n===========================================")
+
+df_lig = merged_df2_lig
+
+table(df_lig$ligand_outcome)
+
+#--------------------
+# adding log cols : NEW UNCOMMENT
+#--------------------
+#df_lig$log10_or_mychisq = log10(df_lig$or_mychisq)
+#df_lig$neglog_pval_fisher = -log10(df_lig$pval_fisher)
+
+##df_lig$log10_or_kin = log10(df_lig$or_kin)
+##df_lig$neglog_pwald_kin = -log10(df_lig$pwald_kin)
+
+#----------------------------
+# columns for corr plots:PS
+#----------------------------
+# subset data to generate pairwise correlations
+cols_to_select =  c("mutationinformation"
+                    , "affinity_scaled"
+                    #, "mutation_info_labels"
+                    , "asa"
+                    , "rsa"
+                    , "rd_values"
+                    , "kd_values"
+                    , "log10_or_mychisq"
+                    , "neglog_pval_fisher"
+                    ##, "or_kin"
+                    ##, "neglog_pwald_kin"
+                    , "af"
+                    ##, "af_kin"
+                    , "ligand_outcome"
+                    , drug)
+
+corr_data_lig = df_lig[, cols_to_select]
+
+dim(corr_data_lig)
+
+#--------------------------------------
+# assign nice colnames (for display)
+#--------------------------------------
+my_corr_colnames = c("Mutation"
+                     , "Ligand Affinity"
+                     #, "Mutation class"
+                     , "ASA"
+                     , "RSA"
+                     , "RD"
+                     , "KD"
+                     , "Log (OR)"
+                     , "-Log (P)"
+                     ##, "Adjusted (OR)"
+                     ##, "-Log (P wald)"
+                     , "MAF"
+                     ##, "MAF_kin"
+                     , "ligand_outcome"
+                     , drug)
+
+length(my_corr_colnames)
+
+colnames(corr_data_lig)
+colnames(corr_data_lig) <- my_corr_colnames
+colnames(corr_data_lig)
+
+start = 1
+end = which(colnames(corr_data_lig) == drug); end # should be the last column
+offset = 1
+
+#=============================
+# Corr data for plots: LIG
+# big_df lig: ~ merged_df2_lig
+#==============================
+#corr_lig_df2 = corr_data_lig[start:(end-offset)] # without drug
+corr_lig_df2 = corr_data_lig[start:end]
+head(corr_lig_df2)
+
+#=============================
+# Corr data for plots: LIG
+# short_df lig: ~ merged_df3_lig
+#==============================
+corr_lig_df3 = corr_lig_df2[!duplicated(corr_lig_df2$Mutation),]
+
+na_or_lig = sum(is.na(corr_lig_df3$`Log (OR)`))
+check1_lig = nrow(corr_lig_df3) - na_or_lig
+
+if (nrow(corr_lig_df3) == nrow(merged_df3_lig) && nrow(merged_df3_comp_lig) == check1_lig) {
+  cat( "\nPASS: No. of rows for corr_lig_df3 match"
+       , "\nPASS: No. of OR values checked: " , check1_lig)
+} else {
+  cat("\nFAIL: Numbers  mismatch:"
+      , "\nExpected nrows: ", nrow(merged_df3_lig)
+      , "\nGot: ", nrow(corr_ps_df3_lig)
+      , "\nExpected OR values: ", nrow(merged_df3_comp_lig)
+      , "\nGot: ", check1_lig)
+}
+
+# remove unnecessary columns
+identical(corr_data_lig, corr_lig_df2)
+identical(corr_data_ps, corr_ps_df2)
+
+#rm(df_ps, df_lig, corr_data_ps, corr_data_lig)
+
+########################################################################
+#                           End of script
+########################################################################
+rm(foo)
+
+cat("\n===================================================\n"
+      , "\nSuccessful: get_plotting_dfs.R worked!"
+      , "\n====================================================")
--- a/scripts/plotting/ggcorr_all_PS_LIG.R
+++ b/scripts/plotting/ggcorr_all_PS_LIG.R
--- a/scripts/plotting/hist_af_or_base.R
+++ b/scripts/plotting/hist_af_or_base.R
--- a/scripts/plotting/hist_af_or_combined.R
+++ b/scripts/plotting/hist_af_or_combined.R
--- a/scripts/plotting/legend_adjustment.R
+++ b/scripts/plotting/legend_adjustment.R
--- a/scripts/plotting/lineage_basic_barplots_combined.R
+++ b/scripts/plotting/lineage_basic_barplots_combined.R
@ -0,0 +1,127 @@
+#!/usr/bin/env Rscript       
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
+#########################################################
+# TASK: Basic lineage barplots showing numbers
+
+# Output: Basic barplot with lineage samples and mut count
+# + SNP diversity
+
+##########################################################
+# 				Installing and loading required packages 			 
+##########################################################
+source("Header_TT.R")
+
+#===========
+# input
+#===========
+#drug = 'streptomycin'
+#gene = 'gid'
+
+spec = matrix(c(
+    "drug"       , "d",  1, "character",
+    "gene"       , "g",  1, "character",
+    "data_file1" , "fa", 2, "character",
+    "data_file2" , "fb", 2, "character" 
+), byrow = TRUE, ncol = 4)
+
+opt = getopt(spec)
+
+drug            = opt$drug
+gene            = opt$gene
+infile_params   = opt$data_file1
+infile_metadata = opt$data_file2
+
+if(is.null(drug)|is.null(gene)) {
+    stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
+}
+
+source ('get_plotting_dfs.R')
+
+#=======
+# output
+#=======
+# plot 1
+basic_bp_lineage_cl = "basic_lineage_barplots_combined.svg"
+plot_basic_bp_lineage_cl  =  paste0(plotdir,"/", basic_bp_lineage_cl)
+plot_basic_bp_lineage_cl 
+#################################################################
+#=============================
+# PLOT 1: Lineage count plot:
+# LF data
+#=============================
+#------------------------
+# Data: All lineages or
+# selected few
+#------------------------
+lin_lf_plot = lin_lf[lin_lf$sel_lineages%in%c("L1", "L2", "L3", "L4"),]
+str(lin_lf_plot)
+
+# drop unused factor levels
+lin_lf_plot$sel_lineages = factor(lin_lf_plot$sel_lineages)
+levels(lin_lf_plot$sel_lineages)
+str(lin_lf_plot)
+
+#------------------------
+# plot from my function:
+#------------------------
+lin_countP = lin_count_bp(lin_lf_plot
+             , x_categ = "sel_lineages"
+             , y_count = "p_count"
+             , bar_fill_categ = "count_categ"
+             , display_label_col = "p_count"
+             , bar_stat_stype = "identity"
+             , x_lab_angle = 90
+             , my_xats = 20
+             , bar_col_labels = c("Mutations", "Total Samples")
+             , bar_col_values = c("grey50", "gray75")
+             , y_scale_percent = F # T for diversity
+             , y_log10 = F
+             , y_label = "Count")
+lin_countP
+#================================
+# PLOT 2: Lineage Diversity plot
+# WF data
+#================================
+#------------------------
+# Data: All lineages or
+# selected few
+#------------------------
+lin_wf_plot = lin_wf[lin_wf$sel_lineages%in%c("L1", "L2", "L3", "L4"),]
+str(lin_wf_plot)
+
+# drop unused factor levels
+lin_wf_plot$sel_lineages = factor(lin_wf_plot$sel_lineages)
+levels(lin_wf_plot$sel_lineages)
+str(lin_wf_plot)
+
+#------------------------
+# plot from my function:
+#------------------------
+lin_diversityP = lin_count_bp(lin_wf_plot
+             , x_categ = "sel_lineages"
+             , y_count = "snp_diversity"
+             , display_label_col = "snp_diversity_f"
+             , bar_stat_stype = "identity"
+             , x_lab_angle = 90
+             , my_xats = 20
+             , y_scale_percent = T
+             , y_label = "SNP diversity")
+
+lin_diversityP
+#########################################################################333
+#================================
+# Combine plots
+#================================
+
+svg(plot_basic_bp_lineage_cl , width = 8, height = 15 )
+
+lineage_bp_combined = cowplot::plot_grid(lin_countP, lin_diversityP
+                                      #, labels = c("(a)", "(b)", "(c)", "(d)")
+                                      , nrow = 2
+                                      , labels = "AUTO"
+                                      , label_size = 25)
+
+lineage_bp_combined
+dev.off()
--- a/scripts/plotting/lineage_data.R
+++ b/scripts/plotting/lineage_data.R
@ -0,0 +1,147 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: Script to format data for lineage barplots:
+# WF and LF data with lineage sample, and snp counts
+# sourced by get_plotting_dfs.R
+#########################################################
+
+#=================================================
+# Get data with lineage count, and snp diversity
+#=================================================
+table(merged_df2$lineage)
+
+if (table(merged_df2$lineage == "")[[2]]) {
+
+cat("\nMissing samples with lineage classification:", table(merged_df2$lineage == "")[[2]])
+  
+}
+
+table(merged_df2$lineage_labels)
+class(merged_df2$lineage_labels); nlevels(merged_df2$lineage_labels)
+
+#==========================================
+# WF data: lineages with 
+# snp count
+# total_samples
+# snp diversity (perc)
+#==========================================
+sel_lineages = levels(merged_df2$lineage_labels)
+
+lin_wf = data.frame(sel_lineages) #4, 1
+total_snps_u = NULL
+total_samples = NULL
+
+for (i in sel_lineages){
+  #print(i)
+  curr_total = length(unique(merged_df2$id)[merged_df2$lineage_labels==i])
+  #print(curr_total)
+  total_samples = c(total_samples, curr_total)
+  print(total_samples)
+
+  foo = merged_df2[merged_df2$lineage_labels==i,]
+  print(paste0(i, "=======\n"))
+  print(length(unique(foo$mutationinformation)))
+  curr_count = length(unique(foo$mutationinformation))
+  
+  total_snps_u = c(total_snps_u, curr_count)
+}
+lin_wf
+
+# Add these counts as columns to the df
+lin_wf$num_snps_u = total_snps_u
+lin_wf$total_samples = total_samples
+lin_wf
+
+# Add SNP diversity
+lin_wf$snp_diversity = lin_wf$num_snps_u/lin_wf$total_samples
+lin_wf
+
+#----------------------
+# Add some formatting
+#----------------------
+# SNP diversity 
+lin_wf$snp_diversity_f = round( (lin_wf$snp_diversity * 100), digits = 0)
+lin_wf$snp_diversity_f = paste0(lin_wf$snp_diversity_f, "%")
+
+lin_wf$sel_lineages
+
+# Important: Check factors so that x-axis categ appear as you want
+lin_wf$sel_lineages =  factor(lin_wf$sel_lineages, c("L1"
+                                                     , "L2"
+                                                     , "L3"
+                                                     , "L4"
+                                                     , "L5"
+                                                     , "L6"
+                                                     , "L7"
+                                                     , "LBOV"
+                                                     , "L1;L2"
+                                                     , "L1;L3"
+                                                     , "L1;L4"
+                                                     , "L2;L3"
+                                                     , "L2;L3;L4"
+                                                     , "L2;L4"  
+                                                     , "L2;L6"
+                                                     , "L2;LBOV"  
+                                                     , "L3;L4" 
+                                                     , "L4;L6"
+                                                     , "L4;L7"
+                                                     , ""))
+
+levels(lin_wf$sel_lineages)
+
+#=================================
+# LF data: lineages with 
+# snp count
+# total_samples
+# snp diversity (perc)
+#=================================
+names(lin_wf)
+tot_cols = ncol(lin_wf)
+pivot_cols = c("sel_lineages", "snp_diversity", "snp_diversity_f")
+pivot_cols_n = length(pivot_cols)
+
+expected_rows =  nrow(lin_wf) * ( length(lin_wf) - pivot_cols_n )
+  
+lin_lf <- gather(lin_wf
+                 , count_categ
+                 , p_count
+                 , num_snps_u:total_samples
+                 , factor_key = TRUE)
+lin_lf
+
+# quick checks
+if ( nrow(lin_lf )  ==  expected_rows ){
+  cat("\nPASS: Lineage LF data created"
+      , "\nnrow: ", nrow(lin_lf)
+      , "\nncol: ", ncol(lin_lf))
+} else {
+  cat("\nFAIL: numbers mismatch"
+      , "\nExpected nrow: ", expected_rows)
+}
+
+# Important: Relevel factors so that x-axis categ appear as you want
+lin_lf$sel_lineages =  factor(lin_lf$sel_lineages, c("L1"
+                                                     , "L2"
+                                                     , "L3"
+                                                     , "L4"
+                                                     , "L5"
+                                                     , "L6"
+                                                     , "L7"
+                                                     , "LBOV"
+                                                     , "L1;L2"
+                                                     , "L1;L3"
+                                                     , "L1;L4"
+                                                     , "L2;L3"
+                                                     , "L2;L3;L4"
+                                                     , "L2;L4"  
+                                                     , "L2;L6"
+                                                     , "L2;LBOV"  
+                                                     , "L3;L4" 
+                                                     , "L4;L6"
+                                                     , "L4;L7"
+                                                     , ""))
+
+levels(lin_lf$sel_lineages)
+
+################################################################
+
--- a/scripts/plotting/lineage_dist_plots.R
+++ b/scripts/plotting/lineage_dist_plots.R
@ -0,0 +1,143 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: Lineage dist plots: ggridges
+
+# Output: 1 or 2 SVGs for PS stability
+
+##########################################################
+# 				Installing and loading required packages 			 
+##########################################################
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
+
+source("Header_TT.R") # also loads all my functions
+
+#===========
+# input
+#===========
+drug = "streptomycin"
+gene = "gid"
+#source("get_plotting_dfs.R")
+
+spec = matrix(c(
+  "drug"       , "d",  1, "character",
+  "gene"       , "g",  1, "character",
+  "data_file1" , "fa", 2, "character",
+  "data_file2" , "fb", 2, "character" 
+), byrow = TRUE, ncol = 4)
+
+opt = getopt(spec)
+
+drug            = opt$drug
+gene            = opt$gene
+infile_params   = opt$data_file1
+infile_metadata = opt$data_file2
+
+if(is.null(drug)|is.null(gene)) {
+  stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
+}
+
+#=======
+# output
+#=======
+lineage_dist_dm_om_ps      = "lineage_dist_dm_om_PS.svg"
+plot_lineage_dist_dm_om_ps =  paste0(plotdir,"/", lineage_dist_dm_om_ps)
+#========================================================================
+
+###########################
+# Data for plots
+# you need merged_df2_combined or merged_df2_combined_comp
+# since this is one-many relationship 
+# i.e the same SNP can belong to multiple lineages
+# using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available, hence use df with NA
+###########################
+
+#===================
+# Data for plots
+#===================
+# quick checks
+table(merged_df2_combined$mutation_info_labels); levels(merged_df2_combined$lineage_labels)
+table(merged_df2_combined$lineage_labels); levels(merged_df2_combined$mutation_info_labels)
+
+sel_lineages = c("L1", "L2", "L3", "L4")
+
+lin_dist_plot = merged_df2_combined[merged_df2_combined$lineage_labels%in%sel_lineages,]
+table(lin_dist_plot$lineage_labels); nlevels(lin_dist_plot$lineage_labels)
+
+# refactor
+lin_dist_plot$lineage_labels = factor(lin_dist_plot$lineage_labels)
+nlevels(lin_dist_plot$lineage_labels)
+
+#-----------------------------------------------------------------------
+# IMPORTANT RESULTS to put inside table or text for interactive plots
+
+sum(table(lin_dist_plot$lineage_labels)) #{RESULT: Total number of samples for lineage}
+
+table(lin_dist_plot$lineage_labels)#{RESULT: No of samples within lineage}
+
+length(unique(lin_dist_plot$mutationinformation))#{Result: No. of unique mutations selected lineages contribute to}
+length(lin_dist_plot$mutationinformation)
+
+u2 = unique(merged_df2_combined$mutationinformation)
+u = unique(lin_dist_plot$mutationinformation)
+check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages}
+#-----------------------------------------------------------------------
+
+my_x_and_t = c("duet_scaled", "mCSM-DUET")
+my_x_and_t = c("foldx_scaled", "FoldX")
+#my_x_and_t = c("deepddg_scaled", "DeepDDG")
+
+my_x_and_t = c("ddg_dynamut2_scaled", "Dynamut2")
+my_x_and_t = c("ddg_dynamut_scaled", "Dynamut")
+
+my_x_and_t = c("ddg_mcsm_scaled", "mCSM")
+my_x_and_t = c("ddg_sdm_scaled", "SDM")
+my_x_and_t = c("ddg_duet_scaled", "DUET-d")
+
+my_x_and_t = c("ddg_encom_scaled", "EnCOM-Stability")
+my_x_and_t = c("dds_encom_scaled", "EnCOM-Flexibility")
+
+my_x_and_t = c("mcsm_na_scaled", "mCSM-NA")
+
+# TO DO
+my_x_and_t = c("affinity_scaled", "mCSM-Lig") #ligdist< 10
+
+#=====================
+# Plot: without facet
+#=====================
+
+linP_dm_om = lineage_distP(lin_dist_plot
+              , x_axis = my_x_and_t[1]
+              , x_lab = my_x_and_t[2]
+              , y_axis = "lineage_labels"
+              , leg_label = "Mutation Class"
+              , with_facet = F)
+linP_dm_om
+
+#=====================
+# Plot: with facet
+#=====================
+
+linP_dm_om_facet = lineage_distP(lin_dist_plot
+                                 , x_axis = my_x_and_t[1]
+                                 , x_lab = my_x_and_t[2]
+                                 , y_axis = "lineage_labels"
+                                 , with_facet = T
+                                 , facet_wrap_var = "mutation_info_labels"
+                                 , leg_label = "Mutation Class"
+                                 , leg_pos_wf = "none"
+                                 , leg_dir_wf = "horizontal")
+linP_dm_om_facet
+
+#=================
+# output plot:
+# without facet
+#=================
+svg(plot_lineage_dist_dm_om_ps)
+
+linP_dm_om
+
+dev.off()
--- a/scripts/plotting/logo_data.R
+++ b/scripts/plotting/logo_data.R
@ -0,0 +1,142 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: Script to format data for Logo_plots
+#########################################################
+#-------------------------
+# choose df for logoplot
+#-------------------------
+logo_data = merged_df3
+#logo_data = merged_df3_comp
+
+# quick checks
+colnames(logo_data)
+str(logo_data)
+
+c1 = unique(logo_data$position) 
+nrow(logo_data)
+cat("No. of rows in my_data:", nrow(logo_data)
+    , "\nDistinct positions corresponding to snps:", length(c1)
+    , "\n===========================================================")
+#=======================================================================
+#==================
+# logo data: OR
+#==================
+foo = logo_data[, c("position"
+                    , "mutant_type","duet_scaled", "or_mychisq"
+                    , "mut_prop_polarity", "mut_prop_water")] 
+
+logo_data$log10or = log10(logo_data$or_mychisq)
+logo_data_plot = logo_data[, c("position"
+                               , "mutant_type", "or_mychisq", "log10or")]
+
+logo_data_plot_or = logo_data[, c("position", "mutant_type", "or_mychisq")]
+wide_df_or  =  logo_data_plot_or %>% spread(position, or_mychisq, fill = 0.0)
+
+wide_df_or = as.matrix(wide_df_or)
+rownames(wide_df_or) = wide_df_or[,1]
+dim(wide_df_or)
+wide_df_or = wide_df_or[,-1]
+str(wide_df_or)
+
+position_or = as.numeric(colnames(wide_df_or))
+
+#==================
+# logo data: logOR
+#==================
+logo_data_plot_logor = logo_data[, c("position", "mutant_type", "log10or")]
+wide_df_logor <- logo_data_plot_logor %>% spread(position, log10or, fill = 0.0)
+
+wide_df_logor = as.matrix(wide_df_logor)
+
+rownames(wide_df_logor) = wide_df_logor[,1]
+wide_df_logor = subset(wide_df_logor, select = -c(1) )
+colnames(wide_df_logor)
+wide_df_logor_m = data.matrix(wide_df_logor)
+
+rownames(wide_df_logor_m)
+colnames(wide_df_logor_m)
+
+position_logor = as.numeric(colnames(wide_df_logor_m))
+
+#===============================
+# logo data: multiple nsSNPs (>1)
+#=================================
+#require(data.table)
+
+# get freq count of positions so you can subset freq<1
+setDT(logo_data)[, mut_pos_occurrence := .N, by = .(position)] 
+
+table(logo_data$position)
+table(logo_data$mut_pos_occurrence)
+
+max_mut = max(table(logo_data$position))
+
+# extract freq_pos > 1
+my_data_snp = logo_data[logo_data$mut_pos_occurrence!=1,] 
+u = unique(my_data_snp$position)
+max_mult_mut = max(table(my_data_snp$position))
+
+if (nrow(my_data_snp) == nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]] ){
+  
+  cat("PASS: positions with  multiple muts extracted"
+      , "\nNo. of mutations:", nrow(my_data_snp)
+      , "\nNo. of positions:", length(u)
+      , "\nMax no. of muts at any position", max_mult_mut)
+}else{
+  cat("FAIL: positions with multiple muts could NOT be extracted"
+      , "\nExpected:",nrow(logo_data) - table(logo_data$mut_pos_occurrence)[[1]]
+      , "\nGot:", nrow(my_data_snp) )
+}
+
+cat("\nNo. of sites with only 1 mutations:", table(logo_data$mut_pos_occurrence)[[1]])
+
+#--------------------------------------
+# matrix for_mychisq mutant type
+# frequency of mutant type by position
+#---------------------------------------
+table(my_data_snp$mutant_type, my_data_snp$position)
+tab_mt = table(my_data_snp$mutant_type, my_data_snp$position)
+class(tab_mt)
+
+# unclass to convert to matrix
+tab_mt = unclass(tab_mt)
+tab_mt = as.matrix(tab_mt, rownames = T)
+
+# should be TRUE
+is.matrix(tab_mt)
+
+rownames(tab_mt) #aa
+colnames(tab_mt) #pos
+
+#-------------------------------------
+# matrix for wild type
+# frequency of wild type by position
+#-------------------------------------
+tab_wt = table(my_data_snp$wild_type, my_data_snp$position); tab_wt
+tab_wt = unclass(tab_wt)
+
+# remove wt duplicates
+wt = my_data_snp[, c("position", "wild_type")]
+wt = wt[!duplicated(wt),]
+
+tab_wt = table(wt$wild_type, wt$position); tab_wt # should all be 1
+
+rownames(tab_wt)
+rownames(tab_wt)
+
+identical(colnames(tab_mt), colnames(tab_wt))
+identical(ncol(tab_mt), ncol(tab_wt))
+
+#----------------------------------
+# logo data OR: multiple nsSNPs (>1)
+#----------------------------------
+logo_data_or_mult = my_data_snp[, c("position", "mutant_type", "or_mychisq")]
+#wide_df_or  =  logo_data_or %>% spread(position, or_mychisq, fill = 0.0)
+wide_df_or_mult  =  logo_data_or_mult %>% spread(position, or_mychisq, fill = NA)
+
+wide_df_or_mult = as.matrix(wide_df_or_mult)
+rownames(wide_df_or_mult) = wide_df_or_mult[,1]
+wide_df_or_mult = wide_df_or_mult[,-1]
+str(wide_df_or_mult)
+
+position_or_mult = as.numeric(colnames(wide_df_or_mult))
--- a/scripts/plotting/opp_mcsm_muts.R
+++ b/scripts/plotting/opp_mcsm_muts.R
--- a/scripts/plotting/or_plots_combined.R
+++ b/scripts/plotting/or_plots_combined.R
--- a/scripts/plotting/other_plots_combined.R
+++ b/scripts/plotting/other_plots_combined.R
@ -35,7 +35,7 @@ plot_dr_other_combined_labelled  =  paste0(plotdir,"/", dr_other_combined_labell
 #my_comparisons <- list( c(dr_muts_col, other_muts_col) )
 my_comparisons <- list( c("DM", "OM") )

-my_ats = 22# axis text size
+my_ats = 22 # axis text size
 my_als = 20 # axis label size
 my_fls = 20 # facet label size
 my_pts = 22 # plot title size
@ -45,12 +45,15 @@ my_pts = 22 # plot title size
 #===========
 # Plot1: PS
 #===========
-my_stat_ps = compare_means(param_value~mutation_info, group.by = "param_type"
-                        , data = df_lf_ps,  paired = FALSE, p.adjust.method = "BH")
+# my_stat_ps = compare_means(param_value~mutation_info
+#                            , group.by = "param_type"
+#                            , data = df_lf_ps
+#                            , paired = FALSE
+#                            , p.adjust.method = "BH")

 y_value = "param_value"

-p1 = ggplot(df_lf_ps, aes(x = mutation_info
+p1 = ggplot(lf_duet, aes(x = mutation_info
                    , y = eval(parse(text=y_value)) ))  + 
  facet_wrap(~ param_type
             , nrow = 1
@ -61,7 +64,7 @@ p1 = ggplot(df_lf_ps, aes(x = mutation_info
  geom_point(position = position_jitterdodge(dodge.width=0.01)
             , alpha = 0.5
             , show.legend = FALSE
-             , aes(colour = factor(duet_outcome))) +
+             , aes(colour = duet_outcome)) +
  theme(axis.text.x = element_text(size = my_ats)
        , axis.text.y = element_text(size = my_ats
                                     , angle = 0
--- a/scripts/plotting/output_tables.R
+++ b/scripts/plotting/output_tables.R
--- a/scripts/plotting/ps_plots_combined.R
+++ b/scripts/plotting/ps_plots_combined.R
--- a/scripts/plotting/redundant/coloured_bp_data.R
+++ b/scripts/plotting/redundant/coloured_bp_data.R
@ -0,0 +1,80 @@
+#!/usr/bin/env Rscript  
+#################################################################
+# TASK: Script to add bp colours ~ barplot heatmap
+#################################################################
+
+my_df = merged_df3
+
+cols_to_select = c("mutationinformation", "drtype"
+                   , "wild_type"
+                   , "position"
+                   , "mutant_type"
+                   , "chain", "ligand_id", "ligand_distance"
+                   , "duet_stability_change", "duet_outcome", "duet_scaled"
+                   , "ligand_affinity_change", "ligand_outcome", "affinity_scaled"
+                   , "ddg_foldx", "foldx_scaled", "foldx_outcome"
+                   , "deepddg", "deepddg_outcome" # comment out as not available for pnca
+                   , "asa", "rsa", "rd_values", "kd_values"
+                   , "af", "or_mychisq", "pval_fisher" 
+                   , "or_fisher", "or_logistic", "pval_logistic"
+                   , "wt_prop_water", "mut_prop_water", "wt_prop_polarity", "mut_prop_polarity"
+                   , "wt_calcprop", "mut_calcprop")
+
+#=======================
+# Data for sub colours
+# barplot: PS
+#=======================
+
+cat("\nNo. of cols to select:", length(cols_to_select))
+
+subcols_df_ps = my_df[, cols_to_select]
+
+cat("\nNo of unique positions for ps:"
+    , length(unique(subcols_df_ps$position)))
+
+# add count_pos col that counts the no. of nsSNPS at a position
+setDT(subcols_df_ps)[, pos_count := .N, by = .(position)]
+
+# should be a factor
+if (is.factor(subcols_df_ps$duet_outcome)){
+  cat("\nDuet_outcome is factor")
+  table(subcols_df_ps$duet_outcome)
+}else{
+  cat("\nConverting duet_outcome to factor")
+  subcols_df_ps$duet_outcome = as.factor(subcols_df_ps$duet_outcome)
+  table(subcols_df_ps$duet_outcome)
+}
+
+# should be -1 and 1
+min(subcols_df_ps$duet_scaled)
+max(subcols_df_ps$duet_scaled)
+
+tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, min)
+tapply(subcols_df_ps$duet_scaled, subcols_df_ps$duet_outcome, max)
+
+# check unique values in normalised data
+cat("\nNo. of unique values in duet scaled, no rounding:"
+    , length(unique(subcols_df_ps$duet_scaled)))
+
+# No rounding    
+my_grp = subcols_df_ps$duet_scaled; length(my_grp)
+
+# Add rounding is to be used
+n = 3 
+subcols_df_ps$duet_scaledR = round(subcols_df_ps$duet_scaled, n)
+
+cat("\nNo. of unique values in duet scaled", n, "places rounding:"
+    , length(unique(subcols_df_ps$duet_scaledR)))
+
+my_grp_r = subcols_df_ps$duet_scaledR  # rounding
+
+# Add grp cols
+subcols_df_ps$group <- paste0(subcols_df_ps$duet_outcome, "_", my_grp, sep = "")
+subcols_df_ps$groupR <- paste0(subcols_df_ps$duet_outcome, "_", my_grp_r, sep = "")
+
+# Call the function to create the palette based on the group defined above
+subcols_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp")
+subcolsR_ps <- ColourPalleteMulti(subcols_df_ps, "duet_outcome", "my_grp_r")
+
+cat("Colour palette generated for my_grp: ", length(subcols_ps), " colours")
+cat("Colour palette generated for my_grp_r: ", length(subcolsR_ps), " colours")
--- a/scripts/plotting/redundant/lineage_basic_barplot.R
+++ b/scripts/plotting/redundant/lineage_basic_barplot.R
--- a/scripts/plotting/redundant/lineage_dist_combined_PS.R
+++ b/scripts/plotting/redundant/lineage_dist_combined_PS.R
--- a/scripts/plotting/redundant/lineage_dist_dm_om_combined_PS.R
+++ b/scripts/plotting/redundant/lineage_dist_dm_om_combined_PS.R
@ -15,44 +15,8 @@ setwd("~/git/LSHTM_analysis/scripts/plotting/")
 getwd()

 source("Header_TT.R")
-library(ggridges)
-library(plyr)
-source("combining_dfs_plotting.R")
-# PS combined: 
-# 1) merged_df2
-# 2) merged_df2_comp
-# 3) merged_df3
-# 4) merged_df3_comp

-# LIG combined: 
-# 5) merged_df2_lig
-# 6) merged_df2_comp_lig
-# 7) merged_df3_lig
-# 8) merged_df3_comp_lig
-
-# 9) my_df_u
-# 10) my_df_u_lig
-
-cat("Directories imported:"
-    , "\n===================="
-    , "\ndatadir:", datadir
-    , "\nindir:", indir
-    , "\noutdir:", outdir
-    , "\nplotdir:", plotdir)
-
-cat("Variables imported:"
-    , "\n====================="
-    , "\ndrug:", drug
-    , "\ngene:", gene
-    , "\ngene_match:", gene_match
-    , "\nAngstrom symbol:", angstroms_symbol
-    , "\nNo. of duplicated muts:", dup_muts_nu
-    , "\nNA count for ORs:", na_count
-    , "\nNA count in df2:", na_count_df2
-    , "\nNA count in df3:", na_count_df3
-    , "\ndr_muts_col:", dr_muts_col
-    , "\nother_muts_col:", other_muts_col
-    , "\ndrtype_col:", resistance_col)
+source("get_plotting_dfs.R")

 cat("cols imported:"
    , mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2)
@ -77,67 +41,24 @@ plot_lineage_dist_combined_dm_om_L  =  paste0(plotdir,"/", lineage_dist_combined
 # we lose some muts and at this level, we should use
 # as much info as available, hence use df with NA
 ###########################
-# REASSIGNMENT
-my_df  = merged_df2
-
-# delete variables not required
-rm(my_df_u, merged_df2, merged_df2_comp, merged_df3, merged_df3_comp
-   , merged_df2_lig, merged_df2_comp_lig, merged_df3_lig, merged_df3_comp_lig)
-
-# quick checks
-colnames(my_df)
-str(my_df)
-
-table(my_df$mutation_info)

 #===================
 # Data for plots
 #===================
-table(my_df$lineage); str(my_df$lineage)
-
-# select lineages 1-4
-sel_lineages = c("lineage1"
-                 , "lineage2"
-                 , "lineage3"
-                 , "lineage4")
-                 #, "lineage5"
-                 #, "lineage6"
-                 #, "lineage7")
-
-# works nicely with facet wrap using labeller, but not otherwise
-#my_labels = c('Lineage 1'
-#              , 'Lineage 2'
-#              , 'Lineage 3'
-#              , 'Lineage 4')
-#              #, 'Lineage 5'
-#              #, 'Lineage 6'
-#              #, 'Lineage 7')
-
-#names(my_labels) = c('lineage1'
-#                     , 'lineage2'
-#                     , 'lineage3'
-#                     , 'lineage4')
-#                     #, 'lineage5'
-#                     #, 'lineage6'
-#                     #, 'lineage7')
-
-#==========================
-# subset selected lineages
-#==========================
-df_lin = subset(my_df, subset = lineage %in% sel_lineages)
-table(df_lin$lineage)
+lin_dist_plot = merged_df2[merged_df2$lineage%in%c("lineage1", "lineage2", "lineage3", "lineage4"),]
+table(lin_dist_plot$lineage)

 #{RESULT: Total number of samples for lineage}
-sum(table(df_lin$lineage)) 
+sum(table(lin_dist_plot$lineage)) 

 #{RESULT: No of samples within lineage}
-table(df_lin$lineage) 
+table(lin_dist_plot$lineage) 

 #{Result: No. of unique mutations the 4 lineages contribute to}
-length(unique(df_lin$mutationinformation)) 
+length(unique(lin_dist_plot$mutationinformation)) 

-u2 = unique(my_df$mutationinformation)
-u = unique(df_lin$mutationinformation)
+u2 = unique(lin_dist_plot$mutationinformation)
+u = unique(lin_dist_plot$mutationinformation)

 #{Result:Muts not present within selected lineages}
 check = u2[!u2%in%u]; print(check) 
@ -148,37 +69,38 @@ check = u2[!u2%in%u]; print(check)
 # from "plyr"
 #==================
 #{Result:No of samples in selected lineages}
-table(df_lin$lineage)
+table(lin_dist_plot$lineage)

-df_lin$lineage_labels = mapvalues(df_lin$lineage
+lin_dist_plot$lineage_labels = mapvalues(lin_dist_plot$lineage
                                  , from = c("lineage1","lineage2", "lineage3", "lineage4")
                                  , to = c("Lineage 1", "Lineage 2", "Lineage 3", "Lineage 4"))
-table(df_lin$lineage_labels)
+table(lin_dist_plot$lineage_labels)

-table(df_lin$lineage_labels) == table(df_lin$lineage)
+table(lin_dist_plot$lineage_labels) == table(lin_dist_plot$lineage)
                              
 #========================
 # mutation_info: labels
 #========================
 #{Result:No of DM and OM muts in selected lineages}
-table(df_lin$mutation_info)
+table(lin_dist_plot$mutation_info)

-df_lin$mutation_info_labels = ifelse(df_lin$mutation_info == dr_muts_col, "DM", "OM")
-table(df_lin$mutation_info_labels)
-
-table(df_lin$mutation_info) == table(df_lin$mutation_info_labels)
+lin_dist_plot$mutation_info_labels = ifelse(lin_dist_plot$mutation_info == dr_muts_col
+                                            , "DM", "OM")
+table(lin_dist_plot$mutation_info_labels)

+table(lin_dist_plot$mutation_info) == table(lin_dist_plot$mutation_info_labels)

 #========================
 # duet_outcome: labels
 #========================
 #{Result: No. of D and S mutations in selected lineages}
-table(df_lin$duet_outcome)
+table(lin_dist_plot$duet_outcome)

-df_lin$duet_outcome_labels = ifelse(df_lin$duet_outcome == "Destabilising", "D", "S")
-table(df_lin$duet_outcome_labels)
+lin_dist_plot$duet_outcome_labels = ifelse(lin_dist_plot$duet_outcome == "Destabilising"
+                                           , "D", "S")
+table(lin_dist_plot$duet_outcome_labels)

-table(df_lin$duet_outcome) == table(df_lin$duet_outcome_labels)
+table(lin_dist_plot$duet_outcome) == table(lin_dist_plot$duet_outcome_labels)


 #=======================
@ -198,25 +120,14 @@ table(df_lin$duet_outcome) == table(df_lin$duet_outcome_labels)
 ########################################################################
 #               end of data extraction and cleaning for plots          #
 ########################################################################
-
-#==========================
-# Distribution plots
-#============================
-
-#%%%%%%%%%%%%%%%%%%%%%%%%%
-# REASSIGNMENT
-df <- df_lin
-#%%%%%%%%%%%%%%%%%%%%%%%%%
-
-rm(df_lin)
-
 #******************
 # generate distribution plot of lineages
 #******************
 # 2 : ggridges (good!)
 my_ats = 15 # axis text size
 my_als = 20 # axis label size
-n_colours = length(unique(df$duet_scaled))
+n_colours = length(unique(lin_dist_plot$duet_scaled))
+
 my_palette <- colorRampPalette(c(mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcsm_blue2))(n = n_colours+1)

 #=======================================
@ -232,17 +143,23 @@ my_palette <- colorRampPalette(c(mcsm_red2, mcsm_red1, mcsm_mid, mcsm_blue1, mcs
 #plot_lineage_dist_duet_f
 #svg(plot_lineage_dist_duet_f)

-p1 = ggplot(df, aes(x = duet_scaled
-                    , y = duet_outcome))+
+p1 = ggplot(lin_dist_plot, aes(x = duet_scaled
+                    #, y = duet_outcome
+                    , y = mutation_info_labels
+                    ))+
  geom_density_ridges_gradient(aes(fill = ..x..)
                               #, jittered_points = TRUE
                               , scale = 3
                               , size = 0.3 ) +
  facet_wrap( ~lineage_labels
+              #~mutation_info_labels
+             # ~mutation_info_labels
 # , scales = "free"
 # , labeller = labeller(lineage = my_labels)
   ) +                            
-  coord_cartesian( xlim = c(-1, 1)) +
+  #coord_cartesian( xlim = c(-1, 1)) +
+  scale_x_continuous(expand = c(0.01, 0)) +
+  
  scale_fill_gradientn(colours = my_palette
                       , name = "DUET"
                       #, breaks = c(-1, 0, 1)
@ -264,8 +181,8 @@ p1 = ggplot(df, aes(x = duet_scaled
        #, legend.title = element_text(size = my_als-6)
        , legend.title = element_blank()
        , legend.position = c(-0.08, 0.41) 
-        #, legend.direction = "horizontal"
-        #, legend.position = "left"
+        , legend.direction = "horizontal"
+        , legend.position = "top"
 )+
  labs(x = "DUET") 

@ -286,13 +203,14 @@ p1
 #plot_lineage_dist_duet_dm_om
 #svg(plot_lineage_dist_duet_dm_om)

-p2 = ggplot(df, aes(x = duet_scaled
+p2 = ggplot(lin_dist_plot, aes(x = duet_scaled
                    , y = lineage_labels))+
  geom_density_ridges(aes(fill = factor(mutation_info_labels))
                      , scale = 3
                      , size = 0.3
                      , alpha = 0.8) +
-  coord_cartesian( xlim = c(-1, 1)) +
+  scale_x_continuous(expand = c(0.01, 0)) +
+  #coord_cartesian( xlim = c(-1, 1)) +
  scale_fill_manual(values = c("#E69F00", "#999999")) +
  theme(axis.text.x = element_text(size = my_ats
                                   , angle = 90
@ -325,14 +243,18 @@ p2
 #plot_lineage_dist_duet_nf
 #svg(plot_lineage_dist_duet_nf)

-p3 = ggplot(df, aes(x = duet_scaled
+p3 = ggplot(lin_dist_plot, aes(x = duet_scaled
                    , y = lineage_labels))+
-  geom_density_ridges_gradient(aes(fill = ..x..)
-                               #, jittered_points = TRUE
-                               , scale = 3
-                               , size = 0.3 ) +
-  coord_cartesian( xlim = c(-1, 1)) +
-  scale_fill_gradientn(colours = my_palette, name = "DUET") + 
+  # geom_density_ridges_gradient(aes(fill = ..x..)
+  #                              #, jittered_points = TRUE
+  #                              , scale = 3
+  #                              , size = 0.3 ) +
+  geom_density_ridges()+
+  #facet_wrap (~mutation_info_labels) +
+  #coord_cartesian( xlim = c(-1, 1)) +
+  scale_x_continuous(expand = c(0.01, 0)) +
+  
+  #scale_fill_gradientn(colours = my_palette, name = "DUET") + 
  theme(axis.text.x = element_text(size = my_ats
                                   , angle = 90
                                   , hjust = 1
--- a/scripts/plotting/redundant/other_dfs_data.R
+++ b/scripts/plotting/redundant/other_dfs_data.R
@ -0,0 +1,117 @@
+#!/usr/bin/env Rscript  
+
+# Didn't end up using it: sorted it at the source
+# .py script to combine all dfs to output all_params
+
+#################################################################
+# TASK: Script to add all other dfs to merged_df2 and merged_df3
+
+#################################################################
+# Combine other dfs:
+# dynamut_df, dynamut2_df, mcsm_na_df, 
+# perhaps : deepddg and mcsm ppi (for embb)
+################################################################
+# read other files
+infilename_dynamut = paste0("~/git/Data/", drug, "/output/dynamut_results/", gene
+                            , "_complex_dynamut_norm.csv")
+
+infilename_dynamut2  = paste0("~/git/Data/", drug, "/output/dynamut_results/dynamut2/", gene
+                              , "_complex_dynamut2_norm.csv")
+
+infilename_mcsm_na = paste0("~/git/Data/", drug, "/output/mcsm_na_results/", gene
+                            , "_complex_mcsm_na_norm.csv")
+
+infilename_mcsm_f_snps <- paste0("~/git/Data/", drug, "/output/", gene
+                                 , "_mcsm_formatted_snps.csv")
+
+dynamut_df   = read.csv(infilename_dynamut)
+dynamut2_df  = read.csv(infilename_dynamut2)
+mcsm_na_df   = read.csv(infilename_mcsm_na)
+mcsm_f_snps  = read.csv(infilename_mcsm_f_snps, header = F)
+names(mcsm_f_snps) = "mutationinformation"
+
+#=================================
+# check with intersect to find the common col, but use 
+c1 = length(intersect(names(dynamut_df), names(dynamut2_df)))
+c2 = length(intersect(names(dynamut2_df), names(mcsm_na_df)))
+
+if (c1 == 1 && c2 == 1) {
+  n_common = 1
+}else{
+  cat("\nMore than one common col found, inspect before merging!")
+}
+
+# mutationinformation column to be on the safe side
+# delete chain from dynamut2_df
+#dynamut2_df = subset(dynamut2_df, select = -chain)
+
+# quick checks
+lapply(list(dynamut_df
+            , dynamut2_df
+            , mcsm_na_df), ncol)
+
+lapply(list(dynamut_df
+            , dynamut2_df
+            , mcsm_na_df), colnames)
+
+lapply(list(dynamut_df
+            , dynamut2_df
+            , mcsm_na_df), nrow)
+
+ncols_comb = lapply(list(dynamut_df
+                         , dynamut2_df
+                         , mcsm_na_df), ncol)
+
+#---------------------------------
+# Combine 1: all other params dfs
+#---------------------------------
+combined_dfs = Reduce(inner_join, list(dynamut_df
+                                       , dynamut2_df
+                                       , mcsm_na_df))
+# Reduce("+", ncols_comb)
+
+#-----------------------------------------
+# Combine 2: combine1 result + merged_df2
+#-----------------------------------------
+drop_cols = intersect(names(combined_dfs), names(merged_df2))
+drop_cols = drop_cols
+
+drop_cols = drop_cols[! drop_cols %in% c("mutationinformation")]
+
+combined_dfs_f = combined_dfs[, !colnames(combined_dfs)%in%drop_cols]
+
+nrow(combined_dfs_f); nrow(merged_df2)
+ncol(combined_dfs_f); ncol(merged_df2)
+
+#-----------------------------------------
+# Combined merged_df2
+#-----------------------------------------
+merged_df2_combined = merge(merged_df2
+                            , combined_dfs_f
+                            , by = "mutationinformation"
+)
+
+expected_ncols = ncol(combined_dfs_f)+ ncol(merged_df2) - 1
+
+if ( nrow(merged_df2_combined) == nrow(merged_df2) && ncol(merged_df2_combined) == expected_ncols ){
+  
+  cat("\nPASS: merged_df2 combined with other parameters dfs."
+      , "\nUse this for lineage distribution plots")
+}else{
+  
+  cat("\nFAIL: merged_df2 didn't combine successfully with other parameters dfs")
+  quit()
+  
+}
+
+rm(combined_dfs, combined_dfs_f)
+
+#================================
+# combined data
+# short_df ps: ~ merged_df3
+# TODO: later integrate properly
+#================================
+#-----------------------------------------
+# Combined merged_df2
+#-----------------------------------------
+merged_df3_combined = merged_df2_combined[!duplicated(merged_df2_combined$mutationinformation),]
--- a/scripts/plotting/redundant/other_plots_data.R
+++ b/scripts/plotting/redundant/other_plots_data.R
@ -0,0 +1,470 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: Script to format data for dm om plots: 
+# generating LF data
+# sourced by get_plotting_dfs.R
+#########################################################
+# working dir and loading libraries
+# getwd()
+# setwd("~/git/LSHTM_analysis/scripts/plotting")
+# getwd()
+
+# make cmd
+# globals
+# drug = "streptomycin"
+# gene = "gid"
+
+# source("get_plotting_dfs.R")
+#=======================================================================
+# MOVE TO COMBINE or singular file for deepddg
+# 
+# cols_to_select = c("mutation", "mutationinformation"
+#                    , "wild_type", "position", "mutant_type"
+#                    , "mutation_info")
+# 
+# merged_df3_short = merged_df3[, cols_to_select]
+
+# infilename_mcsm_f_snps <- paste0("~/git/Data/", drug, "/output/", gene
+#                       , "_mcsm_formatted_snps.csv")
+# 
+# mcsm_f_snps<- read.csv(infilename_mcsm_f_snps, header = F)
+# names(mcsm_f_snps) <- "mutationinformation"
+
+# write merged_df3 to generate structural figure on chimera
+#write.csv(merged_df3_short, "merged_df3_short.csv")
+#========================================================================
+
+#========================================================================
+# cols to select
+
+cols_mcsm_df <- merged_df3[, c("mutationinformation", "mutation"
+                               , "mutation_info", "position"
+                               , LigDist_colname
+                               , "duet_stability_change", "duet_scaled", "duet_outcome"
+                               , "ligand_affinity_change", "affinity_scaled", "ligand_outcome"
+                               , "ddg_foldx", "foldx_scaled", "foldx_outcome"
+                               , "deepddg", "deepddg_scaled", "deepddg_outcome"
+                               , "asa", "rsa"
+                               , "rd_values", "kd_values"
+                               , "log10_or_mychisq", "neglog_pval_fisher", "af")]
+
+cols_mcsm_na_df <- mcsm_na_df[, c("mutationinformation" 
+                                  , "mcsm_na_affinity", "mcsm_na_scaled"
+                                  , "mcsm_na_outcome")]
+# entire dynamut_df
+
+cols_dynamut2_df <- dynamut2_df[, c("mutationinformation"
+                                    , "ddg_dynamut2", "ddg_dynamut2_scaled"
+                                    , "ddg_dynamut2_outcome")]
+
+n_comb_cols = length(cols_mcsm_df) + length(cols_mcsm_na_df) + 
+  length(dynamut_df) + length(cols_dynamut2_df); n_comb_cols
+
+i1<- intersect(names(cols_mcsm_df), names(cols_mcsm_na_df))
+i2<- intersect(names(dynamut_df), names(cols_dynamut2_df))
+merging_cols <- intersect(i1, i2)
+cat("\nmerging_cols:", merging_cols)
+
+if (merging_cols == "mutationinformation") {
+  cat("\nStage 1: Found common col between dfs, checking values in it...")
+  c1 <- all(mcsm_f_snps[[merging_cols]]%in%cols_mcsm_df[[merging_cols]])
+  c2 <- all(mcsm_f_snps[[merging_cols]]%in%cols_mcsm_na_df[[merging_cols]])
+  c3 <- all(mcsm_f_snps[[merging_cols]]%in%dynamut_df[[merging_cols]])
+  c4 <- all(mcsm_f_snps[[merging_cols]]%in%cols_dynamut2_df[[merging_cols]])
+  cols_check <- c(c1, c2, c3, c4)
+  expected_cols = n_comb_cols - ( length(cols_check) - 1)
+  if (all(cols_check)){
+    cat("\nStage 2: Proceeding with merging dfs:\n")
+    comb_df <- Reduce(inner_join, list(cols_mcsm_df
+                                       , cols_mcsm_na_df
+                                       , dynamut_df
+                                       , cols_dynamut2_df))
+    comb_df_s = arrange(comb_df, position)
+    
+    # if ( nrow(comb_df_s) == nrow(mcsm_f_snps) && ncol(comb_df_s) == expected_cols) {
+    #   cat("\Stage3, PASS: dfs merged sucessfully"
+    #       , "\nnrow of merged_df: ", nrow(comb_df_s)
+    #       , "\nncol of merged_df:", ncol(comb_df_s))
+    #   }
+    
+    }
+}
+#names(comb_df_s)
+cat("\n!!!IT GOT TO HERE!!!!")
+#=======================================================================
+fact_cols = colnames(comb_df_s)[grepl( "_outcome|_info", colnames(comb_df_s) )]
+fact_cols
+lapply(comb_df_s[, fact_cols], class)
+comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols], as.factor)
+
+if (any(lapply(comb_df_s[, fact_cols], class) == "character")){
+  cat("\nChanging cols to factor")
+  comb_df_s[, fact_cols] <- lapply(comb_df_s[, fact_cols],as.factor)
+  if (all(lapply(comb_df_s[, fact_cols], class) == "factor")){
+    cat("\nSuccessful: cols changed to factor")
+  }
+}
+lapply(comb_df_s[, fact_cols], class)
+
+#=======================================================================
+table(comb_df_s$mutation_info)
+
+ # further checks to make sure dr and other muts are indeed unique
+dr_muts = comb_df_s[comb_df_s$mutation_info == dr_muts_col,]
+dr_muts_names = unique(dr_muts$mutation)
+
+other_muts = comb_df_s[comb_df_s$mutation_info == other_muts_col,]
+other_muts_names = unique(other_muts$mutation)
+
+if ( table(dr_muts_names%in%other_muts_names)[[1]] == length(dr_muts_names) &&
+  table(other_muts_names%in%dr_muts_names)[[1]] == length(other_muts_names) ){
+  cat("PASS: dr and other muts are indeed unique")
+}else{
+  cat("FAIL: dr and others muts are NOT unique!")
+  quit()
+}
+
+# pretty display names i.e. labels to reduce major code duplication later
+foo_cnames = data.frame(colnames(comb_df_s))
+names(foo_cnames) <- "old_name"
+
+stability_suffix <- paste0(delta_symbol, delta_symbol, "G")
+flexibility_suffix <- paste0(delta_symbol, delta_symbol, "S")
+
+lig_dn       = paste0("Ligand distance (", angstroms_symbol, ")"); lig_dn
+duet_dn      = paste0("DUET ", stability_suffix); duet_dn
+foldx_dn     = paste0("FoldX ", stability_suffix); foldx_dn
+deepddg_dn   = paste0("Deepddg " , stability_suffix); deepddg_dn
+mcsm_na_dn   = paste0("mCSM-NA affinity ", stability_suffix); mcsm_na_dn
+dynamut_dn   = paste0("Dynamut ", stability_suffix); dynamut_dn
+dynamut2_dn  = paste0("Dynamut2 " , stability_suffix); dynamut2_dn
+encom_ddg_dn = paste0("EnCOM " , stability_suffix); encom_ddg_dn
+encom_dds_dn = paste0("EnCOM " , flexibility_suffix ); encom_dds_dn
+sdm_dn       = paste0("SDM " , stability_suffix); sdm_dn
+mcsm_dn      = paste0("mCSM " , stability_suffix ); mcsm_dn
+
+# Change colnames of some columns using datatable 
+comb_df_sl = comb_df_s
+names(comb_df_sl)
+
+setnames(comb_df_sl
+         , old = c("asa", "rsa", "rd_values", "kd_values"
+                   , "log10_or_mychisq", "neglog_pval_fisher", "af"
+                   , LigDist_colname
+                   , "duet_scaled"
+                   , "foldx_scaled"
+                   , "deepddg_scaled"
+                   , "mcsm_na_scaled"
+                   , "ddg_dynamut_scaled"
+                   , "ddg_dynamut2_scaled"
+                   , "ddg_encom_scaled"
+                   , "dds_encom_scaled"
+                   , "ddg_sdm"
+                   , "ddg_mcsm")
+                   
+         , new = c("ASA", "RSA", "RD", "KD"
+                   , "Log10 (OR)", "-Log (P)", "MAF"
+                   , lig_dn
+                   , duet_dn
+                   , foldx_dn
+                   , deepddg_dn
+                   , mcsm_na_dn
+                   , dynamut_dn
+                   , dynamut2_dn
+                   , encom_ddg_dn
+                   , encom_dds_dn
+                   , sdm_dn
+                   , mcsm_dn)
+         )
+
+foo_cnames <- cbind(foo_cnames, colnames(comb_df_sl))
+
+# some more pretty labels
+table(comb_df_sl$mutation_info)
+
+levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==dr_muts_col] <- "DM"
+levels(comb_df_sl$mutation_info)[levels(comb_df_sl$mutation_info)==other_muts_col] <- "OM"
+
+table(comb_df_sl$mutation_info)
+
+#######################################################################
+#======================
+# Selecting dfs
+# with appropriate cols
+#=======================
+static_cols_start =  c("mutationinformation"
+                       , "position"
+                       , "mutation"
+                       , "mutation_info")
+
+static_cols_end = c(lig_dn
+                    , "ASA"
+                    , "RSA"
+                    , "RD"
+                    , "KD")
+
+# ordering is important!
+
+#########################################################################
+#==============
+# DUET: LF
+#==============
+cols_to_select_duet = c(static_cols_start,  c("duet_outcome", duet_dn), static_cols_end)
+wf_duet = comb_df_sl[, cols_to_select_duet]
+
+#pivot_cols_ps = cols_to_select_ps[1:5]; pivot_cols_ps
+pivot_cols_duet = cols_to_select_duet[1: (length(static_cols_start) + 1)]; pivot_cols_duet
+
+expected_rows_lf = nrow(wf_duet) * (length(wf_duet) - length(pivot_cols_duet))
+expected_rows_lf
+
+# LF data: duet
+lf_duet = gather(wf_duet
+                  , key = param_type
+                  , value = param_value
+                  , all_of(duet_dn):tail(static_cols_end,1)
+                  , factor_key = TRUE)
+
+if (nrow(lf_duet) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", duet_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+
+############################################################################
+#==============
+# FoldX: LF
+#==============
+cols_to_select_foldx= c(static_cols_start, c("foldx_outcome", foldx_dn), static_cols_end)
+wf_foldx = comb_df_sl[, cols_to_select_foldx]
+
+pivot_cols_foldx = cols_to_select_foldx[1: (length(static_cols_start) + 1)]; pivot_cols_foldx
+
+expected_rows_lf = nrow(wf_foldx) * (length(wf_foldx) - length(pivot_cols_foldx))
+expected_rows_lf
+
+# LF data: duet
+print("TESTXXXXXXXXXXXXXXXXXXXXX---------------------->>>>")
+lf_foldx <<- gather(wf_foldx
+                 , key = param_type
+                 , value = param_value
+                 , all_of(foldx_dn):tail(static_cols_end,1)
+                 , factor_key = TRUE)
+
+if (nrow(lf_foldx) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", foldx_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+
+############################################################################
+#==============
+# Deepddg: LF
+#==============
+cols_to_select_deepddg  = c(static_cols_start, c("deepddg_outcome", deepddg_dn), static_cols_end)
+wf_deepddg = comb_df_sl[, cols_to_select_deepddg]
+
+pivot_cols_deepddg = cols_to_select_deepddg[1: (length(static_cols_start) + 1)]; pivot_cols_deepddg
+
+expected_rows_lf = nrow(wf_deepddg) * (length(wf_deepddg) - length(pivot_cols_deepddg))
+expected_rows_lf
+
+# LF data: duet
+lf_deepddg = gather(wf_deepddg
+                  , key = param_type
+                  , value = param_value
+                  , all_of(deepddg_dn):tail(static_cols_end,1)
+                  , factor_key = TRUE)
+
+if (nrow(lf_deepddg) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", deepddg_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+
+############################################################################
+#==============
+# mCSM-NA: LF
+#==============
+cols_to_select_mcsm_na = c(static_cols_start, c("mcsm_na_outcome", mcsm_na_dn), static_cols_end)
+wf_mcsm_na = comb_df_sl[, cols_to_select_mcsm_na]
+
+pivot_cols_mcsm_na = cols_to_select_mcsm_na[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm_na
+
+expected_rows_lf = nrow(wf_mcsm_na) * (length(wf_mcsm_na) - length(pivot_cols_mcsm_na))
+expected_rows_lf
+
+# LF data: duet
+lf_mcsm_na = gather(wf_mcsm_na
+                    , key = param_type
+                    , value = param_value
+                    , all_of(mcsm_na_dn):tail(static_cols_end,1)
+                    , factor_key = TRUE)
+
+if (nrow(lf_mcsm_na) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", mcsm_na_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+
+############################################################################
+#==============
+# Dynamut: LF
+#==============
+cols_to_select_dynamut  = c(static_cols_start, c("ddg_dynamut_outcome", dynamut_dn), static_cols_end)
+wf_dynamut = comb_df_sl[, cols_to_select_dynamut]
+
+pivot_cols_dynamut = cols_to_select_dynamut[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut
+
+expected_rows_lf = nrow(wf_dynamut) * (length(wf_dynamut) - length(pivot_cols_dynamut))
+expected_rows_lf
+
+# LF data: duet
+lf_dynamut = gather(wf_dynamut
+                    , key = param_type
+                    , value = param_value
+                    , all_of(dynamut_dn):tail(static_cols_end,1)
+                    , factor_key = TRUE)
+
+if (nrow(lf_dynamut) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", dynamut_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+
+############################################################################
+#==============
+# Dynamut2: LF
+#==============
+cols_to_select_dynamut2 = c(static_cols_start, c("ddg_dynamut2_outcome", dynamut2_dn), static_cols_end)
+
+wf_dynamut2 = comb_df_sl[, cols_to_select_dynamut2]
+
+pivot_cols_dynamut2 = cols_to_select_dynamut2[1: (length(static_cols_start) + 1)]; pivot_cols_dynamut2
+
+expected_rows_lf = nrow(wf_dynamut2) * (length(wf_dynamut2) - length(pivot_cols_dynamut2))
+expected_rows_lf
+
+# LF data: duet
+lf_dynamut2 = gather(wf_dynamut2
+                    , key = param_type
+                    , value = param_value
+                    , all_of(dynamut2_dn):tail(static_cols_end,1)
+                    , factor_key = TRUE)
+
+if (nrow(lf_dynamut2) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", dynamut2_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+
+############################################################################
+#==============
+# EnCOM ddg: LF
+#==============
+cols_to_select_encomddg = c(static_cols_start, c("ddg_encom_outcome", encom_ddg_dn), static_cols_end)
+wf_encomddg = comb_df_sl[, cols_to_select_encomddg]
+
+pivot_cols_encomddg = cols_to_select_encomddg[1: (length(static_cols_start) + 1)]; pivot_cols_encomddg 
+
+expected_rows_lf = nrow(wf_encomddg ) * (length(wf_encomddg ) - length(pivot_cols_encomddg))
+expected_rows_lf
+
+# LF data: encomddg 
+lf_encomddg  = gather(wf_encomddg 
+                     , key = param_type
+                     , value = param_value
+                     , all_of(encom_ddg_dn):tail(static_cols_end,1)
+                     , factor_key = TRUE)
+
+if (nrow(lf_encomddg) == expected_rows_lf){
+  cat("\nPASS: long format data created for ", encom_ddg_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+############################################################################
+#==============
+# EnCOM dds: LF
+#==============
+cols_to_select_encomdds = c(static_cols_start, c("dds_encom_outcome", encom_dds_dn), static_cols_end)
+wf_encomdds = comb_df_sl[, cols_to_select_encomdds]
+
+pivot_cols_encomdds = cols_to_select_encomdds[1: (length(static_cols_start) + 1)]; pivot_cols_encomdds 
+
+expected_rows_lf = nrow(wf_encomdds) * (length(wf_encomdds) - length(pivot_cols_encomdds))
+expected_rows_lf
+
+# LF data: encomddg 
+lf_encomdds  = gather(wf_encomdds
+                      , key = param_type
+                      , value = param_value
+                      , all_of(encom_dds_dn):tail(static_cols_end,1)
+                      , factor_key = TRUE)
+
+if (nrow(lf_encomdds) == expected_rows_lf){
+  cat("\nPASS: long format data created for", encom_dds_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+
+############################################################################
+#==============
+# SDM: LF
+#==============
+cols_to_select_sdm = c(static_cols_start, c("ddg_sdm_outcome", sdm_dn), static_cols_end)
+wf_sdm = comb_df_sl[, cols_to_select_sdm]
+
+pivot_cols_sdm = cols_to_select_sdm[1: (length(static_cols_start) + 1)]; pivot_cols_sdm
+
+expected_rows_lf = nrow(wf_sdm) * (length(wf_sdm) - length(pivot_cols_sdm))
+expected_rows_lf
+
+# LF data: encomddg 
+lf_sdm  = gather(wf_sdm
+                 , key = param_type
+                 , value = param_value
+                 , all_of(sdm_dn):tail(static_cols_end,1)
+                 , factor_key = TRUE)
+
+if (nrow(lf_sdm) == expected_rows_lf){
+  cat("\nPASS: long format data created for", sdm_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+
+############################################################################
+#==============
+# mCSM: LF
+#==============
+cols_to_select_mcsm  = c(static_cols_start, c("ddg_mcsm_outcome", mcsm_dn), static_cols_end)
+wf_mcsm = comb_df_sl[, cols_to_select_mcsm]
+
+pivot_cols_mcsm = cols_to_select_mcsm[1: (length(static_cols_start) + 1)]; pivot_cols_mcsm
+
+expected_rows_lf = nrow(wf_mcsm) * (length(wf_mcsm) - length(pivot_cols_mcsm))
+expected_rows_lf
+
+# LF data: encomddg 
+lf_mcsm  = gather(wf_mcsm
+                 , key = param_type
+                 , value = param_value
+                 , all_of(mcsm_dn):tail(static_cols_end,1)
+                 , factor_key = TRUE)
+
+if (nrow(lf_mcsm) == expected_rows_lf){
+  cat("\nPASS: long format data created for", mcsm_dn)
+}else{
+  cat("\nFAIL: long format data could not be created for duet")
+  quit()
+}
+############################################################################
+
--- a/scripts/plotting/redundant/other_plots_data_SAFEGUARD.R
+++ b/scripts/plotting/redundant/other_plots_data_SAFEGUARD.R
--- a/scripts/plotting/resolving_ambiguous_muts.R
+++ b/scripts/plotting/resolving_ambiguous_muts.R
--- a/scripts/plotting/running_plotting_scripts.txt
+++ b/scripts/plotting/running_plotting_scripts.txt
@ -112,6 +112,30 @@ note:
 - fa flag has default if not supplied 
 - fb flag has default if not supplied
 
+ 
+#====================================
+# lineage_basic_barplots_combined.R
+#====================================
+#-----------------------------------------------------------------------
+./lineage_basic_barplots_combined.R-d streptomycin -g gid
+#-----------------------------------------------------------------------
+
+It replaces (and has an added diversity plot)
+ ## lineage_basic_barplot.R
+These have been moved to redundant/
+
+
+sources:
+ ## get_plotting_dfs.R
+ ## functions//bp_lineage.R"
+
+ outputs: 1 svg in the plotdir
+ ## basic_lineage_barplots_combined.svg
+ 
+note: 
+ - fa flag has default if not supplied 
+ - fb flag has default if not supplied
+ 
 ########################################################################
 # TODO
 Delete: dirs.R