separated plotting_thesis for generating plots

2022-08-04 18:47:18 +01:00 · 2022-08-04 18:47:18 +01:00 · ad2e538ec2
commit ad2e538ec2
parent 95131abc3c
11 changed files with 2807 additions and 0 deletions
--- a/scripts/plotting/mcsm_mean_stability.R
+++ b/scripts/plotting/mcsm_mean_stability.R
@ -0,0 +1,163 @@
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting")
+getwd()
+
+#########################################################
+# TASK: per-position mean of DUET stability and ligand affinity, each scaled b/w -1 and 1, written to csv
+
+#########################################################
+#source("~/git/LSHTM_analysis/scripts/Header_TT.R")
+#require(data.table)
+#require(dplyr)
+
+source("plotting_data.R")
+# should return
+#my_df
+#my_df_u
+#dup_muts
+
+# cmd parse arguments
+#require('getopt', quietly = TRUE)
+#========================================================
+
+
+#========================================================
+#	Read file: call script for combining df for PS
+
+#source("../combining_two_df.R")
+
+#========================================================
+
+# plotting_data.R imports all the dir names, etc
+
+#=======
+# output
+#=======
+out_filename_mean_stability = paste0(tolower(gene), "_mean_stability.csv") 
+outfile_mean_stability = paste0(outdir, "/", out_filename_mean_stability)
+print(paste0("Output file:", outfile_mean_stability))
+
+#%%===============================================================
+
+#================
+# Data for plots
+#================
+# REASSIGNMENT as necessary
+df  = my_df_u
+rm(my_df)
+
+###########################
+# Data for bfactor figure
+# PS (duet) average 
+# Ligand affinity average
+###########################
+head(df$position); head(df$mutationinformation)
+head(df$duet_stability_change)
+
+# order data frame 
+#df = df[order(df$position),] #already done
+#head(df$position); head(df$mutationinformation)
+#head(df$duet_stability_change)
+
+#***********
+# PS(duet): average by position and then scale b/w -1 and 1
+# column to average: duet_stability_change (NOT scaled!)
+#***********
+mean_duet_by_position <- df %>%
+  group_by(position) %>%
+  summarize(averaged_duet = mean(duet_stability_change))
+
+# scale b/w -1 and 1
+duet_min = min(mean_duet_by_position['averaged_duet'])
+duet_max = max(mean_duet_by_position['averaged_duet']) 
+
+# scale the averaged_duet values
+mean_duet_by_position['averaged_duet_scaled'] = lapply(mean_duet_by_position['averaged_duet']
+                                                       , function(x) ifelse(x < 0, x/abs(duet_min), x/duet_max))
+
+cat(paste0('Average duet scores:\n', head(mean_duet_by_position['averaged_duet'])
+           , '\n---------------------------------------------------------------'
+           , '\nScaled duet scores:\n', head(mean_duet_by_position['averaged_duet_scaled'])))
+
+# sanity check: after sign-wise scaling the extremes must be exactly -1 and 1
+l_bound_duet = min(mean_duet_by_position['averaged_duet_scaled'])
+u_bound_duet = max(mean_duet_by_position['averaged_duet_scaled'])
+# isTRUE() sends an NA bound to the FAIL branch; stop() raises an error instead of quit()'s silent status-0 exit
+if ( isTRUE(l_bound_duet == -1) && isTRUE(u_bound_duet == 1) ){
+  cat(paste0("PASS: duet scores averaged by position and then scaled"
+        , "\nmin averaged duet: ", l_bound_duet
+        , "\nmax averaged duet: ", u_bound_duet))
+}else{
+  stop(paste0("FAIL: avergaed duet scores could not be scaled b/w -1 and 1"
+        , "\nmin averaged duet: ", l_bound_duet
+        , "\nmax averaged duet: ", u_bound_duet)
+       , call. = FALSE)
+} 
+
+#***********
+# Lig: average by position and then scale b/w -1 and 1
+# column: ligand_affinity_change (NOT scaled!)
+#***********
+mean_affinity_by_position <- df %>%
+  group_by(position) %>%
+  summarize(averaged_affinity = mean(ligand_affinity_change))
+
+# scale b/w -1 and 1
+affinity_min = min(mean_affinity_by_position['averaged_affinity'])
+affinity_max = max(mean_affinity_by_position['averaged_affinity']) 
+
+# scale the averaged_affinity values
+mean_affinity_by_position['averaged_affinity_scaled'] = lapply(mean_affinity_by_position['averaged_affinity']
+                                                               , function(x) ifelse(x < 0, x/abs(affinity_min), x/affinity_max))
+
+cat(paste0('Average affinity scores:\n', head(mean_affinity_by_position['averaged_affinity'])
+           , '\n---------------------------------------------------------------'
+           , '\nScaled affinity scores:\n', head(mean_affinity_by_position['averaged_affinity_scaled'])))
+
+# sanity check: after sign-wise scaling the extremes must be exactly -1 and 1
+l_bound_affinity = min(mean_affinity_by_position['averaged_affinity_scaled'])
+u_bound_affinity = max(mean_affinity_by_position['averaged_affinity_scaled'])
+# isTRUE() sends an NA bound to the FAIL branch; stop() raises an error instead of quit()'s silent status-0 exit
+if ( isTRUE(l_bound_affinity == -1) && isTRUE(u_bound_affinity == 1) ){
+  cat(paste0("PASS: affinity scores averaged by position and then scaled"
+             , "\nmin averaged affintiy: ", l_bound_affinity
+             , "\nmax averaged affintiy: ", u_bound_affinity))
+}else{
+  stop(paste0("FAIL: avergaed affinity scores could not be scaled b/w -1 and 1"
+             , "\nmin averaged affintiy: ", l_bound_affinity
+             , "\nmax averaged affintiy: ", u_bound_affinity)
+       , call. = FALSE)
+} 
+
+#***********
+# merge: mean_duet_by_position and mean_affinity_by_position on their shared column(s)
+#***********
+common_cols = intersect(colnames(mean_duet_by_position), colnames(mean_affinity_by_position))
+# compare the full (rows, cols) dimension vectors; `dim(a) && dim(b)` never compared them
+if (identical(dim(mean_duet_by_position), dim(mean_affinity_by_position))){
+  print(paste0("PASS: dim's match, mering dfs by column :", common_cols))
+  # all = TRUE: keep rows from both tables even where the key has no match in the other
+  combined_df = as.data.frame(merge(mean_duet_by_position
+                                    , mean_affinity_by_position
+                                    , by = common_cols
+                                    , all = TRUE))
+  
+  cat(paste0("\nnrows combined_df:", nrow(combined_df)
+               , "\nncols combined_df:", ncol(combined_df)))
+}else{
+    stop(paste0("FAIL: dim's mismatch, aborting cbind!"
+          , "\nnrows df1:", nrow(mean_duet_by_position)
+          , "\nnrows df2:", nrow(mean_affinity_by_position))
+         , call. = FALSE)
+}
+#%%============================================================
+# output
+write.csv(combined_df, outfile_mean_stability
+          , row.names = F)
+cat("Finished writing file:\n"
+    , outfile_mean_stability
+    , "\nNo. of rows:", nrow(combined_df)
+    , "\nNo. of cols:", ncol(combined_df))
+
+# end of script
+#===============================================================
--- a/scripts/plotting/plotting_thesis/basic_barplots.R
+++ b/scripts/plotting/plotting_thesis/basic_barplots.R
@ -0,0 +1,406 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: Barplots for mCSM DUET, ligand affinity, and foldX
+# basic barplots with count of mutations
+# basic barplots with frequency of count of mutations
+
+# , df_colname = ""
+# , leg_title = ""
+# , ats = 25     # axis text size
+# , als = 22     # axis label size
+# , lts = 20     # legend text size
+# , ltis = 22    # label title size
+# , geom_ls = 10 # geom_label size
+# , yaxis_title = "Number of nsSNPs"
+# , bp_plot_title = ""
+# , label_categories = c("Destabilising", "Stabilising")
+# , title_colour = "chocolate4"
+# , subtitle_text = NULL
+# , sts = 20
+# , subtitle_colour = "pink"
+# #, leg_position = c(0.73,0.8) # within plot area
+# , leg_position = "top"
+# , bar_fill_values = c("#F8766D", "#00BFC4")
+#########################################################
+
+#=======================================================================
+#=======
+# output
+#=======
+outdir_images = paste0("~/git/Writing/thesis/images/results/"
+                       , tolower(gene), "/")
+cat("plots will output to:", outdir_images)
+
+###########################################################
+df3 = merged_df3
+# FIXME: port to a common script
+#=================
+# PREFORMATTING: for consistency
+#=================
+df3$sensitivity = ifelse(df3$dst_mode == 1, "R", "S")
+table(df3$sensitivity)
+
+# ConSurf labels: copy the colour column under a new name, as a factor, then rename its levels
+consurf_colOld = "consurf_colour_rev"
+consurf_colNew = "consurf_outcome"
+df3[[consurf_colNew]] = df3[[consurf_colOld]]
+df3[[consurf_colNew]] = as.factor(df3[[consurf_colNew]])
+df3[[consurf_colNew]]
+levels(df3$consurf_outcome) = c( "nsd", 1, 2, 3, 4, 5, 6, 7, 8, 9) # NOTE(review): renames by position in the existing level order; presumably all 10 levels are present - confirm per gene
+levels(df3$consurf_outcome)
+
+# SNAP2 labels
+snap2_colname = "snap2_outcome"
+df3[[snap2_colname]] <- str_replace(df3[[snap2_colname]], "effect", "Effect")
+df3[[snap2_colname]] <- str_replace(df3[[snap2_colname]], "neutral", "Neutral")
+
+##############################################################
+gene_all_cols = colnames(df3)[colnames(df3)%in%all_cols]
+  
+gene_outcome_cols = colnames(df3)[colnames(df3)%in%c(outcome_cols_stability
+                                                     , outcome_cols_affinity
+                                                     , outcome_cols_conservation)]
+gene_outcome_cols
+
+
+#=======================================================================
+#------------------------------
+# stability barplots:
+outcome_cols_stability
+# label_categories should be  = levels(as.factor(plot_df[[df_colname]]))
+#------------------------------
+sts = 22
+subtitle_colour = "black"
+geom_ls = 10
+
+# duetP
+duetP = stability_count_bp(plotdf = df3
+               , df_colname = "duet_outcome"
+               , leg_title = "mCSM-DUET"
+               #, label_categories = labels_duet
+               , yaxis_title = "Number of nsSNPs"
+               , leg_position = "none"
+               , subtitle_text = "mCSM-DUET"
+               , geom_ls = geom_ls
+               , bar_fill_values = c("#F8766D", "#00BFC4")
+               , sts = sts
+               , subtitle_colour= subtitle_colour)
+
+# foldx
+foldxP = stability_count_bp(plotdf = df3
+                           , df_colname = "foldx_outcome"
+                           #, leg_title = "FoldX"
+                           #, label_categories = labels_foldx
+                           , yaxis_title = ""
+                           , leg_position = "none"
+                           , subtitle_text = "FoldX"
+                           , geom_ls = geom_ls
+                           , bar_fill_values = c("#F8766D", "#00BFC4")
+                           , sts = sts
+                           , subtitle_colour= subtitle_colour)
+
+
+# deepddg
+deepddgP = stability_count_bp(plotdf = df3
+                            , df_colname = "deepddg_outcome"
+                            #, leg_title = "DeepDDG"
+                            #, label_categories = labels_deepddg
+                            , yaxis_title = "Number of nsSNPs"
+                            , leg_position = "none"
+                            , subtitle_text = "DeepDDG"
+                            , geom_ls = geom_ls
+                            , bar_fill_values = c("#F8766D", "#00BFC4")
+                            , sts = sts
+                            , subtitle_colour= subtitle_colour)
+
+
+# dynamut2
+dynamut2P = stability_count_bp(plotdf = df3
+                              , df_colname = "ddg_dynamut2_outcome"
+                              #, leg_title = "Dynamut2"
+                              #, label_categories = labels_ddg_dynamut2_outcome
+                              , yaxis_title = ""
+                              , leg_position = "none"
+                              , subtitle_text = "Dynamut2"
+                              , geom_ls = geom_ls
+                              , bar_fill_values = c("#F8766D", "#00BFC4")
+                              , sts = sts
+                              , subtitle_colour= subtitle_colour)
+
+dynamut2P
+
+# extract common legend
+common_legend = get_legend(duetP +
+    guides(color = guide_legend(nrow = 1)) +
+    theme(legend.position = "top"))
+
+#==========================
+# output: STABILITY PLOTS
+#===========================
+bp_stability_CLP = paste0(outdir_images
+                          , tolower(gene)
+                          ,"_bp_stability_CL.svg")
+
+svg(bp_stability_CLP,  width = 15, height = 12)
+print(paste0("plot filename:", bp_stability_CLP))
+# explicit print(): the grid is not auto-printed (so nothing is drawn on the svg device) when this script is source()d
+print(cowplot::plot_grid(
+  common_legend,
+  cowplot::plot_grid(duetP, foldxP
+                     , deepddgP, dynamut2P
+                     , nrow = 2
+                     , ncol = 2
+                     #, labels = c("(a)", "(b)", "(c)", "(d)")
+                     , labels = "AUTO"
+                     , label_size = 25)
+  , ncol = 1
+  , nrow = 2
+  , rel_heights = c(0.4/10,9/10)))
+
+dev.off()
+###########################################################
+#=========================
+# Affinity outcome
+# check this var: outcome_cols_affinity
+# get from preformatting or put in globals
+#==========================
+DistCutOff = 10
+LigDist_colname  # = "ligand_distance" # from globals 
+ppi2Dist_colname  = "interface_dist"
+naDist_colname    = "TBC" # placeholder name; presumably no such column exists yet, leaving df3_na empty - TODO confirm
+
+###########################################################
+# get plotting data within the distance; which() drops rows whose distance is NA instead of returning all-NA rows
+df3_lig  = df3[which(df3[[LigDist_colname]] < DistCutOff), ]
+df3_ppi2 = df3[which(df3[[ppi2Dist_colname]] < DistCutOff), ]
+df3_na   = df3[which(df3[[naDist_colname]] < DistCutOff), ]
+common_bp_title = paste0("Sites <", DistCutOff, angstroms_symbol)
+
+#------------------------------
+# barplot for ligand affinity:
+# <10 Ang of ligand
+#------------------------------
+mLigP = stability_count_bp(plotdf = df3_lig
+               , df_colname = "ligand_outcome"
+               #, leg_title  = "mCSM-lig"
+               #, label_categories = labels_lig
+               , yaxis_title = "Number of nsSNPs"
+               , leg_position = "none"
+               , subtitle_text = "mCSM-lig"
+               , geom_ls = geom_ls
+               , bar_fill_values = c("#F8766D", "#00BFC4")
+               , sts = sts
+               , subtitle_colour= subtitle_colour
+               , bp_plot_title = paste(common_bp_title, "ligand")
+               )
+
+#------------------------------
+# barplot for ligand affinity:
+# <10 Ang of ligand
+# mmCSM-lig: will be the same no. of sites but the effect will be different
+#------------------------------
+mmLigP = stability_count_bp(plotdf = df3_lig
+                   , df_colname = "mmcsm_lig_outcome"
+                   #, leg_title  = "mmCSM-lig"
+                   #, label_categories = labels_mmlig
+                   , yaxis_title = ""
+                   , leg_position = "none"
+                   , subtitle_text = "mmCSM-lig"
+                   , geom_ls = geom_ls
+                   , bar_fill_values = c("#F8766D", "#00BFC4")
+                   , sts = sts
+                   , subtitle_colour= subtitle_colour
+                   , bp_plot_title = paste(common_bp_title, "ligand")
+                   )
+
+#------------------------------
+# barplot for ppi2 affinity
+#  <10 Ang of interface
+#------------------------------
+ppi2P = stability_count_bp(plotdf = df3_ppi2
+                   , df_colname = "mcsm_ppi2_outcome"
+                   #, leg_title  = "mCSM-ppi2"
+                   #, label_categories = labels_ppi2
+                   , yaxis_title = ""
+                   , leg_position = "none"
+                   , subtitle_text = "mCSM-ppi2"
+                   , geom_ls = geom_ls
+                   , bar_fill_values = c("#F8766D", "#00BFC4")
+                   , sts = sts
+                   , subtitle_colour= subtitle_colour
+                   , bp_plot_title = paste(common_bp_title, "interface")
+                   )
+
+# extract common legend
+common_legend_aff = get_legend(mLigP +
+    guides(color = guide_legend(nrow = 1)) +
+    theme(legend.position = "top"))
+
+#==========================
+# output: AFFINITY PLOTS
+#==========================
+bp_affinity_CLP =  paste0(outdir_images
+                          ,tolower(gene)
+                          ,"_bp_affinity_CL.svg" )
+# report the affinity file (the stability filename was printed here by mistake)
+print(paste0("plot filename:", bp_affinity_CLP))
+svg(bp_affinity_CLP,  width = 15, height = 6.5)
+# explicit print(): the grid is not auto-printed (so nothing is drawn on the svg device) when this script is source()d
+print(cowplot::plot_grid(
+  common_legend_aff, # legend extracted from mLigP for this figure
+  cowplot::plot_grid(mLigP, mmLigP
+                     , ppi2P
+                     , nrow = 1
+                     , ncol = 3
+                     #, labels = c("(a)", "(b)", "(c)", "(d)")
+                     , labels = "AUTO"
+                     , label_size = 25)
+  , ncol = 1
+  , nrow = 2
+  , rel_heights = c(0.4/10,9/10)))
+  #, rel_widths = c(1,1,1))
+
+
+dev.off()
+
+################################################################
+#=========================
+# Conservation outcome
+# check this var:
+outcome_cols_conservation
+#==========================
+# provean
+proveanP = stability_count_bp(plotdf = df3
+                              , df_colname = "provean_outcome"
+                              #, leg_title = "PROVEAN"
+                              #, label_categories = labels_provean
+                              , yaxis_title = ""
+                              , leg_position = "top"
+                              , subtitle_text = "PROVEAN"
+                              , geom_ls = geom_ls
+                              , bar_fill_values = c("#F8766D", "#00BFC4")
+                              , sts = sts
+                              , subtitle_colour= subtitle_colour)
+
+
+# snap2
+snap2P = stability_count_bp(plotdf = df3
+                            , df_colname = "snap2_outcome"
+                            #, leg_title = "SNAP2"
+                            #, label_categories = labels_snap2
+                            , yaxis_title = "Number of nsSNPs"
+                            , leg_position = "top"
+                            , subtitle_text = "SNAP2"
+                            , geom_ls = geom_ls
+                            , bar_fill_values = c("#F8766D", "#00BFC4")
+                            , sts = sts
+                            , subtitle_colour= subtitle_colour)
+
+# consurf 
+consurfP = stability_count_bp(plotdf = df3
+                              , df_colname = "consurf_outcome"
+                              #, leg_title = "ConSurf"
+                              #, label_categories = labels_consurf
+                              , yaxis_title = ""
+                              , leg_position = "top"
+                              , subtitle_text = "ConSurf"
+                              , geom_ls = 5
+                              , bar_fill_values = consurf_colours # from globals
+                              , sts = sts
+                              , subtitle_colour= subtitle_colour)
+
+consurfP
+#============================
+# output: CONSERVATION PLOTS
+#============================
+bp_conservation_CLP =  paste0(outdir_images
+                              ,tolower(gene)
+                              ,"_bp_conservation_CL.svg" )
+
+print(paste0("plot filename:", bp_conservation_CLP))
+svg(bp_conservation_CLP,  width = 15, height = 6.5)
+# explicit print(): the grid is not auto-printed (so nothing is drawn on the svg device) when this script is source()d
+print(cowplot::plot_grid(proveanP, snap2P, consurfP
+                     , nrow = 1
+                     , ncol = 3
+                     #, labels = c("(a)", "(b)", "(c)", "(d)")
+                     , labels = "AUTO"
+                     , label_size = 25
+                     #, rel_heights = c(0.4/10,9/10))
+                     , rel_widths  = c(0.9, 0.9, 1.1)))
+
+
+dev.off()
+
+#####################################################################
+#===============================================================
+# ------------------------------
+# bp site site count: ALL
+# all sites in df3 (no distance filter applied)
+# ------------------------------
+posC_all = site_snp_count_bp(plotdf = df3
+                  , df_colname = "position"
+                  , xaxis_title = ""
+                  , yaxis_title = "Number of Sites"
+                  , subtitle_size = 20)
+
+
+# ------------------------------
+# bp site site count: mCSM-lig
+# < 10 Ang ligand
+# ------------------------------
+common_bp_title = paste0("Sites <", DistCutOff, angstroms_symbol)
+
+posC_lig = site_snp_count_bp(plotdf = df3_lig
+                  , df_colname = "position"
+                  , xaxis_title = "Number of nsSNPs"
+                  , yaxis_title = "" #+  annotate("text", x = 1.5, y = 2.2, label = "Text No. 1")
+
+                  , subtitle_text = paste0(common_bp_title, " ligand")
+                  , subtitle_size = 20
+                  , subtitle_colour = subtitle_colour)
+# ------------------------------
+# bp site site count: ppi2
+# < 10 Ang interface
+# ------------------------------
+
+posC_ppi2 = site_snp_count_bp(plotdf = df3_ppi2
+                  , df_colname = "position"
+                  , xaxis_title = ""
+                  , yaxis_title = ""
+                  , subtitle_text = paste0(common_bp_title, " interface")
+                  , subtitle_size = 20
+                  , subtitle_colour = subtitle_colour)
+
+# ------------------------------
+#FIXME: bp site site count: na
+# < 10 Ang TBC
+# ------------------------------
+# posC_na = site_snp_count_bp(plotdf = df3_na
+#                   , df_colname = "position"
+#                   , xaxis_title = ""
+#                   , yaxis_title = "")
+
+
+#===========================
+# output: SITE SNP count:
+# all + affinity
+#==========================
+pos_count_combined_CLP =  paste0(outdir_images
+                            ,tolower(gene)
+                            ,"_pos_count_PS_AFF.svg")
+
+
+svg(pos_count_combined_CLP, width = 15, height = 6.5)
+print(paste0("plot filename:", pos_count_combined_CLP))
+# explicit print(): the grid is not auto-printed (so nothing is drawn on the svg device) when this script is source()d
+print(cowplot::plot_grid(posC_all, posC_lig, posC_ppi2
+                     #, posC_na
+                     , nrow = 1
+                     , ncol = 3
+                     #, labels = c("(a)", "(b)", "(c)", "(d)")
+                     , labels = "AUTO"
+                     , label_size = 25))
+
+dev.off()
+#===============================================================
--- a/scripts/plotting/plotting_thesis/corr/corr_adjusted_PS_LIG.R
+++ b/scripts/plotting/plotting_thesis/corr/corr_adjusted_PS_LIG.R
@ -0,0 +1,330 @@
+#!/usr/bin/env Rscript
+#########################################################
+# TASK: Corr plots for PS and Lig 
+
+# Output: 1 svg
+
+#=======================================================================
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
+
+
+source("~/git/LSHTM_analysis/scripts/Header_TT.R")
+require(cowplot)
+source("combining_dfs_plotting.R")
+source("my_pairs_panel.R")
+# should return the following dfs, directories and variables
+
+# PS combined: 
+# 1) merged_df2
+# 2) merged_df2_comp
+# 3) merged_df3
+# 4) merged_df3_comp
+
+# LIG combined: 
+# 5) merged_df2_lig
+# 6) merged_df2_comp_lig
+# 7) merged_df3_lig
+# 8) merged_df3_comp_lig
+
+# 9) my_df_u
+# 10) my_df_u_lig
+
+cat(paste0("Directories imported:"
+           , "\ndatadir:", datadir
+           , "\nindir:", indir
+           , "\noutdir:", outdir
+           , "\nplotdir:", plotdir))
+
+cat(paste0("Variables imported:"
+           , "\ndrug:", drug
+           , "\ngene:", gene
+           , "\ngene_match:", gene_match
+           , "\nAngstrom symbol:", angstroms_symbol
+           , "\nNo. of duplicated muts:", dup_muts_nu
+           , "\nNA count for ORs:", na_count
+           , "\nNA count in df2:", na_count_df2
+           , "\nNA count in df3:", na_count_df3))     
+
+#=======
+# output
+#=======
+# can't combine by cowplot because not ggplots
+#corr_plot_combined = "corr_combined.svg"
+#plot_corr_plot_combined  =  paste0(plotdir,"/", corr_plot_combined)
+# each path is built from the *_adjusted filename defined just above it (corr_ps / corr_lig are not defined in this script)
+# PS
+corr_ps_adjusted = "corr_PS_adjusted.svg"
+plot_corr_ps_adjusted =  paste0(plotdir,"/", corr_ps_adjusted)
+
+# LIG
+corr_lig_adjusted = "corr_LIG_adjusted.svg"
+plot_corr_lig_adjusted =  paste0(plotdir,"/", corr_lig_adjusted)
+
+####################################################################
+#               end of loading libraries and functions                 #
+########################################################################
+
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+df_ps = merged_df3_comp 
+df_lig = merged_df3_comp_lig
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+
+rm( merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig, my_df_u, my_df_u_lig)
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#===========================
+# Data for Correlation plots:PS
+#===========================
+table(df_ps$duet_outcome)
+
+
+#===========================
+# Data for Correlation plots:foldx
+#===========================
+#============================
+# adding foldx scaled values
+# scale data b/w -1 and 1
+#============================
+n = which(colnames(df_ps) == "ddg"); n 
+
+my_min = min(df_ps[,n]); my_min 
+my_max = max(df_ps[,n]); my_max 
+
+df_ps$foldx_scaled = ifelse(df_ps[,n] < 0
+                            , df_ps[,n]/abs(my_min)
+                            , df_ps[,n]/my_max) 
+# sanity check: the scaled extremes must be exactly -1 and 1, otherwise stop with an error
+my_min = min(df_ps$foldx_scaled); my_min 
+my_max = max(df_ps$foldx_scaled); my_max
+# isTRUE() sends an NA extreme to the FAIL branch, which now really aborts as its message says
+if (isTRUE(my_min == -1) && isTRUE(my_max == 1)){
+  cat("PASS: foldx ddg successfully scaled b/w -1 and 1"
+      , "\nProceeding with assigning foldx outcome category")
+}else{
+  stop(paste("FAIL: could not scale foldx ddg values"
+      , "Aborting!"), call. = FALSE)
+}
+
+
+#================================
+# adding foldx outcome category
+# ddg<0 = "Stabilising" (-ve)
+#=================================
+# c1: counts from the ddg sign; c2: counts recomputed from the new column (previously c2 just repeated c1)
+c1 = table(df_ps$ddg < 0)
+df_ps$foldx_outcome = ifelse(df_ps$ddg < 0, "Stabilising", "Destabilising")
+c2 = table(df_ps$foldx_outcome == "Stabilising")
+
+if ( identical(as.vector(c1), as.vector(c2)) ){
+  cat("PASS: foldx outcome successfully created")
+}else{
+  # stop() replaces exit(), which is not an R function
+  stop("FAIL: foldx outcome could not be created. Aborting!", call. = FALSE)
+}
+
+table(df_ps$foldx_outcome)
+
+
+#======================
+# adding log cols 
+#======================
+ 
+df_ps$log10_or_mychisq = log10(df_ps$or_mychisq)
+df_ps$neglog_pval_fisher = -log10(df_ps$pval_fisher)
+
+df_ps$log10_or_kin = log10(df_ps$or_kin)
+df_ps$neglog_pwald_kin = -log10(df_ps$pwald_kin)
+
+# subset data to generate pairwise correlations
+cols_to_select =  c("duet_scaled"
+                    
+                    , "foldx_scaled"
+                    
+                    #, "log10_or_mychisq"
+                    #, "neglog_pval_fisher"
+                    
+                    , "or_kin"
+                    , "neglog_pwald_kin"
+                    
+                    , "af"
+                    
+                    , "asa"
+                    , "rsa"
+                    , "kd_values"
+                    , "rd_values"
+                    
+                    , "duet_outcome"
+                    , drug)
+
+corr_data_ps = df_ps[, cols_to_select]
+
+dim(corr_data_ps)
+
+#p_italic = substitute(paste("-Log(", italic('P'), ")"));p_italic 
+#p_adjusted_italic = substitute(paste("-Log(", italic('P adjusted'), ")"));p_adjusted_italic
+
+# assign nice colnames (for display)
+my_corr_colnames = c("DUET"
+                     
+                     , "Foldx"
+                     #, "Log(OR)"
+                     #, "-Log(P)"
+                     
+                     , "OR adjusted"
+                     , "-Log(P wald)"
+                     
+                     , "AF"
+                     
+                     , "ASA"
+                     , "RSA"
+                     , "KD"
+                     , "RD"
+                     
+                     , "duet_outcome"
+                     , drug)
+
+length(my_corr_colnames)
+
+colnames(corr_data_ps)
+colnames(corr_data_ps) <- my_corr_colnames
+colnames(corr_data_ps)
+
+#-----------------
+# generate corr PS plot
+#-----------------
+start = 1
+end = which(colnames(corr_data_ps) == drug); end # should be the last column
+offset = 1
+
+my_corr_ps = corr_data_ps[start:(end-offset)]
+head(my_corr_ps)
+
+#my_cols = c("#f8766d", "#00bfc4")
+# deep blue :#007d85
+# deep red: #ae301e
+
+cat("Corr plot PS:", plot_corr_ps_adjusted)
+svg(plot_corr_ps_adjusted, width = 15, height = 15)
+
+OutPlot1 = pairs.panels(my_corr_ps[1:(length(my_corr_ps)-1)]
+             , method = "spearman" # correlation method
+             , hist.col = "grey" ##00AFBB
+             , density = TRUE  # show density plots
+             , ellipses = F # show correlation ellipses
+             , stars = T
+             , rug = F
+             , breaks = "Sturges"
+             , show.points = T
+             , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr_ps$duet_outcome))]
+             , pch = 21
+             , jitter = T
+             #, alpha = .05
+             #, points(pch = 19, col = c("#f8766d", "#00bfc4"))
+             , cex = 2
+             , cex.axis = 1.5
+             , cex.labels = 1.5
+             , cex.cor = 1
+             , smooth = F
+)
+
+print(OutPlot1)
+dev.off()
+
+#===========================
+# Data for Correlation plots: LIG
+#===========================
+table(df_lig$ligand_outcome)
+
+df_lig$log10_or_mychisq = log10(df_lig$or_mychisq)
+df_lig$neglog_pval_fisher = -log10(df_lig$pval_fisher)
+
+
+df_lig$log10_or_kin = log10(df_lig$or_kin)
+df_lig$neglog_pwald_kin = -log10(df_lig$pwald_kin)
+
+
+# subset data to generate pairwise correlations
+cols_to_select =  c("affinity_scaled"
+                    
+                    , "log10_or_mychisq"
+                    , "neglog_pval_fisher"
+                    
+                    #, "or_kin"
+                    #, "neglog_pwald_kin"
+                    
+                    , "af"
+                    
+                    , "ligand_outcome"
+                    , drug)
+
+corr_data_lig = df_lig[, cols_to_select]
+
+
+dim(corr_data_lig)
+
+# assign nice colnames (for display)
+my_corr_colnames = c("Ligand Affinity"
+                     
+                     , "Log(OR)"
+                     , "-Log(P)"
+                     
+                     #, "OR adjusted"
+                     #, "-Log(P wald)"
+                     
+                     , "AF"
+                     
+                     , "ligand_outcome"
+                     , drug)
+
+length(my_corr_colnames)
+
+colnames(corr_data_lig)
+colnames(corr_data_lig) <- my_corr_colnames
+colnames(corr_data_lig)
+
+#-----------------
+# generate corr LIG plot
+#-----------------
+
+start = 1
+end = which(colnames(corr_data_lig) == drug); end # should be the last column
+offset = 1
+
+my_corr_lig = corr_data_lig[start:(end-offset)]
+head(my_corr_lig)
+
+cat("Corr LIG plot:", plot_corr_lig_adjusted)
+svg(plot_corr_lig_adjusted, width = 15, height = 15)
+
+OutPlot2  = pairs.panels(my_corr_lig[1:(length(my_corr_lig)-1)]
+                  , method = "spearman" # correlation method
+                  , hist.col = "grey" ##00AFBB
+                  , density = TRUE  # show density plots
+                  , ellipses = F # show correlation ellipses
+                  , stars = T
+                  , rug = F
+                  , breaks = "Sturges"
+                  , show.points = T
+                  , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr_lig$ligand_outcome))]
+                  , pch = 21
+                  , jitter = T
+                  #, alpha = .05
+                  #, points(pch = 19, col = c("#f8766d", "#00bfc4"))
+                  , cex = 3
+                  , cex.axis = 2.5
+                  , cex.labels = 2.1
+                  , cex.cor = 1
+                  , smooth = F
+)
+
+print(OutPlot2)
+dev.off()
+#######################################################
+
+
--- a/scripts/plotting/plotting_thesis/corr/corr_plots.R
+++ b/scripts/plotting/plotting_thesis/corr/corr_plots.R
@ -0,0 +1,242 @@
+#!/usr/bin/env Rscript
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting")
+getwd()
+source("~/git/LSHTM_analysis/scripts/Header_TT.R")
+
+# Command-line interface. getopt spec columns: long name, short flag,
+# argument mask (1 = required argument, 2 = optional argument), type.
+# NOTE(review): the getopt documentation describes the short flag as a single
+# character; "fa"/"fb" are left unchanged here, but the long forms
+# --data_file1 / --data_file2 are the safe way to pass these options.
+spec = matrix(c(
+  "drug"       , "d",  1, "character",
+  "gene"       , "g",  1, "character",
+  "data_file1" , "fa", 2, "character",
+  "data_file2" , "fb", 2, "character"
+), byrow = TRUE, ncol = 4)
+
+opt = getopt(spec)
+
+drug            = opt$drug
+gene            = opt$gene
+infile_params   = opt$data_file1  # optional: NULL when not supplied
+infile_metadata = opt$data_file2  # optional: NULL when not supplied
+
+# Scalar condition, so use the short-circuiting `||` instead of the
+# element-wise `|`.
+if (is.null(drug) || is.null(gene)) {
+  stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
+}
+
+#===========
+# Input
+#===========
+
+source("get_plotting_dfs.R")
+
+#===========
+# output
+#===========
+# PS
+# File names of the four pairs plots (short and full versions)
+corr_ps      = "corr_PS.svg"
+corr_ps_all  = "corr_PS_all.svg"
+corr_lig     = "corr_LIG.svg"
+corr_lig_all = "corr_LIG_all.svg"
+
+# PS: full output paths inside plotdir
+plot_corr_ps     = paste(plotdir, corr_ps, sep = "/")
+plot_corr_ps_all = paste(plotdir, corr_ps_all, sep = "/")
+
+# LIG: full output paths inside plotdir
+plot_corr_lig     = paste(plotdir, corr_lig, sep = "/")
+plot_corr_lig_all = paste(plotdir, corr_lig_all, sep = "/")
+
+##############################################################################
+foo = corr_ps_df3
+#foo2 = corr_ps_df2
+
+bar = corr_lig_df3
+#bar2 = corr_lig_df2
+
+#================================
+# Data for Correlation plots: PS
+#================================
+# subset data to generate pairwise correlations
+# Columns wanted for the short PS (stability) pairs plot. The resulting
+# column order follows `foo`, not this vector; the code below assumes that the
+# drug column ends up last and the outcome column directly before it.
+cols_to_select = c("DUET"
+                     , "Foldx"
+                     , "Log (OR)"
+                     , "-Log (P)"
+                     , "MAF"
+                     , "duet_outcome"
+                     , drug)
+
+# `%in%` subsetting silently skips names that are absent from `foo`; report
+# them so a plot with fewer panels than expected does not go unnoticed.
+missing_ps_cols = setdiff(cols_to_select, names(foo))
+if (length(missing_ps_cols) > 0) {
+  warning("Columns not found in PS correlation data and therefore skipped: "
+          , paste(missing_ps_cols, collapse = ", "))
+}
+
+corr_data_ps = foo[names(foo)%in%cols_to_select]
+length(cols_to_select)
+
+colnames(corr_data_ps)
+
+start = 1
+end = which(colnames(corr_data_ps) == drug); end # should be the last column
+# without the drug column `start:(end - offset)` fails with an obscure
+# "argument of length 0" error, so fail early with a clear message
+if (length(end) == 0) {
+  stop("Drug column '", drug, "' not found in PS correlation data")
+}
+offset = 1
+
+# everything up to, but excluding, the drug column
+my_corr_ps = corr_data_ps[start:(end - offset)]
+head(my_corr_ps)
+
+#---------------------
+# Corr plot PS: short
+# data: corr_ps_df3
+# cols: 7
+#---------------------
+cat("Corr plot PS DUET with coloured dots:", plot_corr_ps)
+svg(plot_corr_ps, width = 15, height = 15)
+
+# Columns that are plotted and tested: everything except the trailing
+# duet_outcome column, which is only used to colour the points.
+my_corr_ps_num = my_corr_ps[1:(length(my_corr_ps)-1)]
+
+pairs.panels(my_corr_ps_num
+             , method = "spearman" # correlation method
+             , hist.col = "grey" ##00AFBB
+             , density = TRUE  # show density plots
+             , ellipses = FALSE # show correlation ellipses
+             , stars = TRUE
+             , rug = FALSE
+             , breaks = "Sturges"
+             , show.points = TRUE
+             , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr_ps$duet_outcome))] # foldx colours are reversed
+             , pch = 21 # for bg
+             , jitter = TRUE
+             , alpha = 1
+             , cex = 1.8
+             , cex.axis = 2
+             , cex.labels = 4
+             , cex.cor = 1
+             , smooth = FALSE
+)
+dev.off()
+
+# Run the Spearman test once, on exactly the plotted columns (instead of a
+# hard-coded 1:5), and reuse the result for both rho and p.
+corr_ps_test = corr.test(my_corr_ps_num, method = "spearman")
+corr_ps_rho = corr_ps_test$r
+corr_ps_p = corr_ps_test$p
+
+#---------------------
+# Corr plot PS: ALL
+# data: corr_ps_df3
+# cols: 10
+#---------------------
+# index of the drug column in the full PS data (expected to be the last one)
+end_ps_all = which(colnames(foo) == drug); end_ps_all
+
+# keep everything before the drug column, then remove the mutation labels
+cols_to_drop = "Mutation"
+my_corr_ps_all = foo[start:(end_ps_all - offset)]
+my_corr_ps_all = my_corr_ps_all[, !(names(my_corr_ps_all)%in%cols_to_drop)]
+head(my_corr_ps_all)
+length(colnames(my_corr_ps_all))
+
+cat("Corr plot PS DUET with coloured dots:", plot_corr_ps_all)
+svg(plot_corr_ps_all, width = 15, height = 15)
+
+# the last column (duet_outcome) only supplies the point colours
+pairs.panels(
+  my_corr_ps_all[1:(length(my_corr_ps_all)-1)],
+  method      = "spearman",
+  hist.col    = "grey",
+  density     = TRUE,
+  ellipses    = FALSE,
+  stars       = TRUE,
+  rug         = FALSE,
+  breaks      = "Sturges",
+  show.points = TRUE,
+  bg          = c("#f8766d", "#00bfc4")[unclass(factor(my_corr_ps_all$duet_outcome))],
+  pch         = 21,
+  jitter      = TRUE,
+  alpha       = 1,
+  cex         = 1.5,
+  cex.axis    = 2,
+  cex.labels  = 2.5,
+  cex.cor     = 1,
+  smooth      = FALSE
+)
+dev.off()
+
+#==================================
+# Data for Correlation plots: LIG
+#==================================
+# Columns wanted for the short LIG (ligand affinity) pairs plot. The resulting
+# column order follows `bar`, not this vector; the code below assumes that the
+# drug column ends up last and the outcome column directly before it.
+cols_to_select_lig = c("Ligand Affinity"
+                     , "Log (OR)"
+                     , "-Log (P)"
+                     , "MAF"
+                     , "ligand_outcome"
+                     , drug)
+
+# `%in%` subsetting silently skips names that are absent from `bar`; report
+# them so a plot with fewer panels than expected does not go unnoticed.
+missing_lig_cols = setdiff(cols_to_select_lig, names(bar))
+if (length(missing_lig_cols) > 0) {
+  warning("Columns not found in LIG correlation data and therefore skipped: "
+          , paste(missing_lig_cols, collapse = ", "))
+}
+
+corr_data_lig = bar[names(bar)%in%cols_to_select_lig]
+length(cols_to_select_lig)
+
+colnames(corr_data_lig)
+
+start_lig = 1
+end_lig = which(colnames(corr_data_lig) == drug); end_lig # should be the last column
+# without the drug column the range below fails with an obscure
+# "argument of length 0" error, so fail early with a clear message
+if (length(end_lig) == 0) {
+  stop("Drug column '", drug, "' not found in LIG correlation data")
+}
+offset_lig = 1
+
+# everything up to, but excluding, the drug column
+my_corr_lig = corr_data_lig[start_lig:(end_lig-offset_lig)]
+head(my_corr_lig)
+
+#---------------------
+# Corr plot LIG: short
+# data: corr_lig_df3
+# cols: 7
+#---------------------
+cat("Corr LIG plot with coloured dots:", plot_corr_lig)
+svg(plot_corr_lig, width = 15, height = 15)
+
+# Columns that are plotted and tested: everything except the trailing
+# ligand_outcome column, which is only used to colour the points.
+my_corr_lig_num = my_corr_lig[1:(length(my_corr_lig)-1)]
+
+pairs.panels(my_corr_lig_num
+             , method = "spearman" # correlation method
+             , hist.col = "grey" ##00AFBB
+             , density = TRUE  # show density plots
+             , ellipses = FALSE # show correlation ellipses
+             , stars = TRUE
+             , rug = FALSE
+             , breaks = "Sturges"
+             , show.points = TRUE
+             , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr_lig$ligand_outcome))]
+             , pch = 21 # for bg
+             , jitter = TRUE
+             , cex = 2
+             , cex.axis = 2
+             , cex.labels = 4
+             , cex.cor = 1
+             , smooth = FALSE
+)
+
+dev.off()
+
+# Run the Spearman test once, on exactly the plotted columns (instead of a
+# hard-coded 1:4), and reuse the result for both rho and p.
+corr_lig_test = corr.test(my_corr_lig_num, method = "spearman")
+corr_lig_rho = corr_lig_test$r
+corr_lig_p = corr_lig_test$p
+
+#---------------------
+# Corr plot LIG: ALL
+# data: corr_lig_df3
+# cols: 9
+#---------------------
+# index of the drug column in the full LIG data (expected to be the last one)
+end_lig_all = which(colnames(bar) == drug); end_lig_all
+
+# keep everything before the drug column, then remove the mutation labels
+cols_to_drop = "Mutation"
+my_corr_lig_all = bar[start_lig:(end_lig_all - offset_lig)]
+my_corr_lig_all = my_corr_lig_all[, !(names(my_corr_lig_all)%in%cols_to_drop)]
+head(my_corr_lig_all)
+length(colnames(my_corr_lig_all))
+
+cat("Corr plot LIG with coloured dots:", plot_corr_lig_all)
+svg(plot_corr_lig_all, width = 15, height = 15)
+
+# the last column (ligand_outcome) only supplies the point colours
+pairs.panels(
+  my_corr_lig_all[1:(length(my_corr_lig_all)-1)],
+  method      = "spearman",
+  hist.col    = "grey",
+  density     = TRUE,
+  ellipses    = FALSE,
+  stars       = TRUE,
+  rug         = FALSE,
+  breaks      = "Sturges",
+  show.points = TRUE,
+  bg          = c("#f8766d", "#00bfc4")[unclass(factor(my_corr_lig_all$ligand_outcome))],
+  pch         = 21,
+  jitter      = TRUE,
+  alpha       = 1,
+  cex         = 1.5,
+  cex.axis    = 2,
+  cex.labels  = 2.2,
+  cex.cor     = 1,
+  smooth      = FALSE
+)
+dev.off()
+
+
+######################################################################=
+#                             End of script
+######################################################################=
--- a/scripts/plotting/plotting_thesis/corr/corr_plots_gc_i.R
+++ b/scripts/plotting/plotting_thesis/corr/corr_plots_gc_i.R
@ -0,0 +1,276 @@
+#!/usr/bin/env Rscript       
+source("~/git/LSHTM_analysis/config/gid.R")
+source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
+
+#===================================================================
+corr_data = corr_data_extract(merged_df3, drug_name = drug)
+#corr_data = corr_data_extract(merged_df2, drug_name = drug)
+
+# Gene groups deciding which gene-specific columns are added to the core set.
+# A gene may belong to more than one group (rpob is in both NA and PPI2).
+geneL_normal = c("pnca")
+geneL_na_dy = c("gid")
+geneL_na = c("rpob")
+geneL_ppi2 = c("alr", "embb", "katg", "rpob")
+
+core_cols <- c( "Log (OR)" , "MAF", "-Log (P)"
+                    , "DUET", "FoldX"
+                    , "DeepDDG", "Dynamut2"
+                    , "ASA", "RSA", "RD", "KD"
+                    , "Consurf", "SNAP2"
+                    #, "mutation_info_labels"
+)
+
+gene_lc = tolower(gene)
+
+# An unconfigured gene previously left corrplot_cols undefined and failed
+# later with "object not found"; fail here with an explicit message instead.
+if (!(gene_lc %in% c(geneL_normal, geneL_na_dy, geneL_na, geneL_ppi2))) {
+  stop("Gene '", gene, "' is not configured for correlation plots")
+}
+
+# Accumulate the gene-specific columns across ALL matching groups. Assigning
+# corrplot_cols separately in each `if` made a later group overwrite an
+# earlier one, so rpob lost "mCSM-NA" when the PPI2 branch ran.
+additional_cols = character(0)
+
+if (gene_lc %in% geneL_na_dy) {
+  additional_cols = c(additional_cols
+                    , "mCSM-NA"
+                    , "Dynamut"
+                    , "ENCoM-DDG"
+                    , "ENCoM-DDS"
+                    , "mCSM"
+                    , "SDM"
+                    , "DUET-d")
+}
+
+if (gene_lc %in% geneL_na) {
+  additional_cols = c(additional_cols, "mCSM-NA")
+}
+
+if (gene_lc %in% geneL_ppi2) {
+  additional_cols = c(additional_cols, "mCSM-PPI2")
+}
+
+# As before, the label column is appended (last) only for genes that have
+# gene-specific columns; genes in geneL_normal get the core columns only.
+if (length(additional_cols) > 0) {
+  additional_cols = c(unique(additional_cols), "mutation_info_labels")
+}
+
+corrplot_cols = c(core_cols, additional_cols)
+
+#========================================
+# corrplot_cols <- c( "Log (OR)"
+#                     , "MAF"
+#                     , "-Log (P)"
+#                     , "DUET"
+#                     , "FoldX"
+#                     , "DeepDDG"
+#                     , "Dynamut2"
+#                     , "mCSM-NA"
+#                     , "Dynamut"
+#                     , "ENCoM-DDG"
+#                     , "ENCoM-DDS"
+#                     , "mCSM"
+#                     , "SDM"
+#                     , "DUET-d"
+#                     , "ASA"
+#                     , "RSA"
+#                     , "RD"
+#                     , "KD"
+#                     , "mutation_info_labels"
+#                    )
+
+# Validate BEFORE subsetting: for a plain data.frame, `[` already stops with
+# "undefined columns selected" when a name is missing, so a check placed only
+# after the subset can never report which columns were absent.
+missing_corr_cols = setdiff(corrplot_cols, names(corr_data))
+if (length(missing_corr_cols) > 0) {
+  stop("Columns missing from corr_data: "
+       , paste(missing_corr_cols, collapse = ", ")
+       , "\nExpected cols: ", length(corrplot_cols)
+       , "\nFound: ", length(corrplot_cols) - length(missing_corr_cols))
+}
+
+corr_df <- corr_data[, corrplot_cols] # col order is according to corrplot_cols
+head(corr_df); names(corr_df)
+
+# Post-check kept from the original script as a guard on the subset result.
+if ( all( corrplot_cols%in%names(corr_df) ) ){
+  cat("\nPASS: Successfully selected"
+      , length(corrplot_cols)
+      , "columns for building correlation df")
+} else {
+  cat("\nFAIl: Something went wrong, numbers mismatch"
+      , "\nExpected cols:", length(corrplot_cols)
+      , "\nGot:", length(corr_df) )
+}
+
+#=====================================================
+corrplot_df <- corr_df 
+
+# stat_df = corrplot_df[, c("Log (OR)"
+#                           , "MAF" 
+#                           , "-Log (P)")]
+
+plot_title <- "Correlation plots (stability)"
+
+# Checkbox Names
+# FIXME: select columns conditionally based on gene and grey out the ones that are not present!
+
+# Display labels for the checkboxes; must stay parallel to cBCorrVals below
+# (same length, same order).
+cBCorrNames = c( "Odds Ratio"
+                , "Allele Frequency"
+                , "P-value"
+                , "DUET"
+                , "FoldX"
+                , "DeepDDG"
+                , "Dynamut2"
+                , "ASA"
+                , "RSA"
+                , "RD"
+                , "KD"
+                , "Consurf"
+                , "SNAP2"
+                , "Nucleic Acid affinity"
+                , "PPi2 affinity"
+                
+                #, "Dynamut"
+                #, "ENCoM-Stability"
+                #, "ENCoM-Flexibility"
+                #, "mCSM"
+                #, "SDM"
+                #, "DUET-d"
+)
+
+# Checkbox Values (aka Column Names that are in corrplot_df)
+cBCorrVals = c("Log (OR)"
+              , "MAF"
+              , "-Log (P)"
+              , "DUET"
+              , "FoldX"
+              , "DeepDDG"
+              , "Dynamut2"
+              , "ASA"
+              , "RSA"
+              , "RD"
+              , "KD"
+              , "Consurf"
+              , "SNAP2"
+              , "mCSM-NA"
+              , "mCSM-PPI2"
+              # , "Dynamut"
+              # , "ENCoM-DDG"
+              # , "ENCoM-DDS"
+              # , "mCSM"
+              # , "SDM"
+              # , "DUET-d"
+ )
+
+stopifnot(length(cBCorrNames) == length(cBCorrVals))
+
+# Offer only the parameters that exist for the current gene. Without this,
+# "Select All" followed by "Render" requests columns (e.g. "mCSM-PPI2") that
+# are not in corrplot_df. Note: corr_lab_size in the server is scaled by
+# length(cBCorrNames), so dropping choices slightly reduces the label size.
+cb_available = cBCorrVals %in% names(corrplot_df)
+cBCorrNames = cBCorrNames[cb_available]
+cBCorrVals = cBCorrVals[cb_available]
+
+# Pre-selected checkboxes
+cBCorrSelected = c("Log (OR)"
+                   , "MAF"
+                   , "-Log (P)")
+
+#################
+# Define UI
+#################
+u_corr <- fluidPage(
+  
+  headerPanel(plot_title),
+  
+  sidebarLayout(position = "left"
+                , sidebarPanel(
+                 checkboxGroupInput("variable", "Choose parameter:"
+                                       , choiceNames  = cBCorrNames
+                                       , choiceValues = cBCorrVals
+                                       , selected     = cBCorrSelected
+                  )
+                  
+                  # could be a fluid Row
+                  , actionButton("add_col"     , "Render")
+                  , actionButton("reset_graph" , "Reset Graphs")
+                  , actionButton("select_all"  , "Select All")
+                 
+                  )
+                
+                # output/display
+                , mainPanel(plotOutput(outputId = 'corrplot'
+                                    , height = "1200px"
+                                    , width  = "1500px")
+#                           , height = "800px"
+#                          , width  = "600px")
+                          , textOutput("txt")
+                )
+  )
+)
+
+#################
+# Define server
+#################
+s_corr <- shinyServer(function(input, output, session)
+  
+{
+  
+  # Builds the plot output for a fixed set of columns. All three renders
+  # (initial, interactive, reset) go through this helper so they share the
+  # same settings: previously only the initial render passed corr_method and
+  # the reset render used a different dot size from the initial one.
+  render_corr_plot <- function(corr_cols) {
+    force(corr_cols) # capture the selection made at call time
+    renderPlot({
+      c_plot <- my_corr_pairs(corr_data_all     = corrplot_df
+                              , corr_cols       = corr_cols
+                              , corr_method     = "spearman"
+                              , dot_size        = 2
+                              , ats             = 1.5
+                              , corr_lab_size   = length(cBCorrNames)/length(corr_cols) * 1.3
+                              , corr_value_size = 1)
+    })
+  }
+  
+ #================
+ # Initial render
+ #================
+  output$corrplot <- render_corr_plot(cBCorrSelected)
+  
+ #====================
+ # Interactive render
+ #====================
+  observeEvent(
+    input$add_col, {
+     
+      # select cols for corrplot
+      corr_cols_s <- c(input$variable)
+    
+      # render plot
+      if (length(corr_cols_s) >= 2) {
+        # clear any earlier "select >=2" message, which otherwise stays
+        # on screen after a valid selection has been rendered
+        output$txt <- renderText({""})
+        output$corrplot <- render_corr_plot(corr_cols_s)
+      } else{ output$txt = renderText({"Argh, common! It's a correlation plot. Select >=2 vars!"})
+        
+      }
+      
+    })
+  
+ #==================================
+ # Add button: Select All checkbox
+ #==================================
+  observeEvent(
+    input$select_all,{
+
+      updateCheckboxGroupInput(session, "variable", selected = cBCorrVals)
+    }
+)
+
+ #================
+ # Reset render
+ #================
+  observeEvent(
+    input$reset_graph,{
+
+      # reset checkboxes to default selection
+      updateCheckboxGroupInput(session, "variable", selected = cBCorrSelected)
+
+      # reset message and plot to the initial state
+      output$txt <- renderText({""})
+      output$corrplot <- render_corr_plot(cBCorrSelected)
+    }
+  )
+}
+)
+
+shinyApp(ui = u_corr, server = s_corr)
--- a/scripts/plotting/plotting_thesis/corr/corr_plots_gc_lig_i.R
+++ b/scripts/plotting/plotting_thesis/corr/corr_plots_gc_lig_i.R
@ -0,0 +1,220 @@
+#!/usr/bin/env Rscript       
+
+source("~/git/LSHTM_analysis/config/gid.R")
+source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
+
+#===================================================================
+corr_data = corr_data_extract(merged_df3, drug_name = drug)
+#corr_data = corr_data_extract(merged_df2, drug_name = drug)
+#================================================================
+#other globals
+dist_colname <- LigDist_colname # ligand_distance (from globals)
+dist_cutoff <- LigDist_cutoff # 10 (from globals)
+
+cat("\nLigand distance cut off, colname:", dist_colname
+    , "\nThe max distance", gene, "structure df" , ":", max_ang, "\u212b"
+    , "\nThe min distance", gene, "structure df" , ":", min_ang, "\u212b")
+
+########################################################################
+
+#==========================================
+#####################
+# Correlation plot
+#####################
+colnames(corr_df_m3_f)
+
+corrplot_cols_lig <- c( "Log (OR)"
+                    ,  "MAF"
+                    , "-Log (P)"
+                    , "mCSM-lig"
+                    , "mCSM-NA"
+                    , "ASA"
+                    , "RSA"
+                    , "RD"
+                    , "KD" 
+                    , dist_colname
+                   , "mutation_info_labels"
+                   )
+
+corr_df_lig <- corr_df_m3_f[, corrplot_cols_lig]
+head(corr_df_lig)
+
+corrplot_df_lig <- corr_df_lig 
+
+# static df
+# stat_df = corrplot_df_lig[, c("Log (OR)"
+#                           , "MAF"
+#                           , "-Log (P)"
+#                           )]
+
+plot_title_lig <- "Correlation plots (ligand affinity)"
+
+# Checkbox Names
+cCorrNames = c( "Odds Ratio"
+             , "Allele Frequency"
+             , "P-value"
+             , "Ligand affinity"
+             , "Nucleic Acid affinity"
+             , "ASA"
+             , "RSA"
+             , "RD"
+             , "KD" 
+             , "Ligand Distance")
+
+# Checkbox Values (aka Column Names that are in corrplot_df_lig)
+cCorrVals = c("Log (OR)"
+          , "MAF"
+          , "-Log (P)"
+          , "mCSM-lig"
+          , "mCSM-NA"
+          , "ASA"
+          , "RSA"
+          , "RD"
+          , "KD" 
+          , dist_colname)
+
+# Pre-selected checkboxes
+cCorrSelected = c("Log (OR)"
+                   , "MAF"
+                   , "-Log (P)")
+#============
+# Define UI 
+#============
+u_corr_lig<- fluidPage(
+  headerPanel(plot_title_lig),
+  sidebarLayout(position = "left"
+                , sidebarPanel("Correlations: Filtered data data"
+                  , numericInput(inputId = "lig_dist"
+                                 , label = "Ligand distance cutoff"
+                                 , value =  dist_cutoff # 10 default from globals
+                                 , min = min_ang
+                                 , max = max_ang)
+                  , checkboxGroupInput("variable", "Choose parameter:"
+                                       , choiceNames  = cCorrNames
+                                       , choiceValues = cCorrVals
+                                       , selected     = cCorrSelected
+                  )
+                  # could be a fluid Row
+                  , actionButton("add_col"     , "Render")
+                  , actionButton("reset_graph" , "Reset Graphs")
+                  , actionButton("select_all"  , "Select All")
+                  
+                )
+                
+                # output/display
+                , mainPanel(plotOutput(outputId = 'corrplot'
+                                     , height = "1000px"
+                                     , width  = "1200px")
+                          # , height = "800px"
+                          # , width  = "600px")
+                , textOutput("txt")
+                )
+  )
+)
+
+#===============
+# Define server
+#===============
+s_corr_lig <- shinyServer(function(input, output, session)
+  
+{ 
+
+  # Rows of corrplot_df_lig closer to the ligand than `cutoff`.
+  # which() drops rows whose distance is NA; plain logical indexing would
+  # turn each of them into a row consisting entirely of NA.
+  filter_by_dist <- function(cutoff) {
+    corrplot_df_lig[which(corrplot_df_lig[[dist_colname]] < cutoff), ]
+  }
+
+  # Draws the pairs plot; shared by the initial, interactive and reset renders.
+  draw_lig_corr <- function(plot_df, corr_cols, dot_size) {
+    c_plot <- my_corr_pairs(corr_data_all = plot_df
+                            , corr_cols = corr_cols
+                            , dot_size = dot_size
+                            , ats = 1.5
+                            , corr_lab_size = length(cCorrNames)/length(corr_cols) * 1.3
+                            , corr_value_size = 1)
+  }
+
+ #================
+ # Initial render
+ #================
+  output$corrplot <- renderPlot({ 
+
+    # an emptied numeric input yields NA: wait for a usable cutoff
+    req(input$lig_dist)
+
+    # subset data with the user-specified cutoff and plot the default columns
+    draw_lig_corr(filter_by_dist(input$lig_dist), cCorrSelected, dot_size = 2)
+    
+  })
+  
+ #====================
+ # Interactive render
+ #====================
+  observeEvent(
+    input$add_col, {
+      
+      # an emptied numeric input yields NA: ignore the click until it is set
+      req(input$lig_dist)
+
+      # subset data for plot (fixed at the time "Render" is clicked)
+      corrplot_df_lig_s = filter_by_dist(input$lig_dist)
+      
+      # select cols for corrplot
+      corr_cols_s = c(input$variable)
+    
+      # render plot
+      if (length(corr_cols_s) >= 2) {
+      
+        # clear any earlier "select >=2" message
+        output$txt <- renderText({""})
+
+        output$corrplot <- renderPlot({ 
+          draw_lig_corr(corrplot_df_lig_s, corr_cols_s, dot_size = 1.6)
+        })
+      } else { output$txt = renderText({"Fuddu! It's a correlation plot. Select >=2 vars bewakoof!"})}
+      
+    })
+    
+ #==================================
+ # Add button: Select All checkbox
+ #==================================
+  observeEvent(
+    input$select_all,{
+
+      updateCheckboxGroupInput(session, "variable", selected = cCorrVals)
+    }
+  )
+  
+ #================
+ # Reset render
+ #================
+  observeEvent(
+    input$reset_graph,{
+      
+      # reset checkboxes and the distance cutoff to their defaults
+      updateCheckboxGroupInput(session, "variable", selected = cCorrSelected)
+      updateNumericInput(session, "lig_dist", value = dist_cutoff)
+      output$txt <- renderText({""})
+
+      # Render with the default cutoff applied. The plot was previously drawn
+      # from the unfiltered data while the cutoff input kept its old value,
+      # so the plot and the displayed cutoff disagreed after a reset.
+      output$corrplot <- renderPlot({ 
+        draw_lig_corr(filter_by_dist(dist_cutoff), cCorrSelected, dot_size = 2)
+      })
+    }
+  )
+}
+)
+
+shinyApp(ui = u_corr_lig, server = s_corr_lig)
+
--- a/scripts/plotting/plotting_thesis/corr/ggcorr_all_PS_LIG.R
+++ b/scripts/plotting/plotting_thesis/corr/ggcorr_all_PS_LIG.R
@ -0,0 +1,323 @@
+#!/usr/bin/env Rscript
+#########################################################
+# TASK: Corr plots for PS and Lig 
+
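+# Output: intended PNGs (ggcorr_all_PS.png, ggcorr_all_LIG.png,
+# ggcorr_all_combined_labelled.png); the png() calls below are commented out,
+# so the plots are currently only displayed, not written to disk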
+
+#=======================================================================
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
+
+source("~/git/LSHTM_analysis/scripts/Header_TT.R")
+require(cowplot)
+source("combining_dfs_plotting.R")
+#source("my_pairs_panel.R")
+# should return the following dfs, directories and variables
+
+# FIXME: Can't output from here
+
+# PS combined: 
+# 1) merged_df2
+# 2) merged_df2_comp
+# 3) merged_df3
+# 4) merged_df3_comp
+
+# LIG combined: 
+# 5) merged_df2_lig
+# 6) merged_df2_comp_lig
+# 7) merged_df3_lig
+# 8) merged_df3_comp_lig
+
+# 9) my_df_u
+# 10) my_df_u_lig
+
+cat(paste0("Directories imported:"
+           , "\ndatadir:", datadir
+           , "\nindir:", indir
+           , "\noutdir:", outdir
+           , "\nplotdir:", plotdir))
+
+cat(paste0("Variables imported:"
+           , "\ndrug:", drug
+           , "\ngene:", gene
+           , "\ngene_match:", gene_match
+           , "\nAngstrom symbol:", angstroms_symbol
+           , "\nNo. of duplicated muts:", dup_muts_nu
+           , "\nNA count for ORs:", na_count
+           , "\nNA count in df2:", na_count_df2
+           , "\nNA count in df3:", na_count_df3))     
+
+#=======
+# output
+#=======
+# can't combine by cowplot because not ggplots
+#corr_plot_combined = "corr_combined.svg"
+#plot_corr_plot_combined  =  paste0(plotdir,"/", corr_plot_combined)
+
+# PS
+#ggcorr_all_ps = "ggcorr_all_PS.svg"
+ggcorr_all_ps = "ggcorr_all_PS.png"
+plot_ggcorr_all_ps =  paste0(plotdir,"/", ggcorr_all_ps)
+
+# LIG
+#ggcorr_all_lig = "ggcorr_all_LIG.svg"
+ggcorr_all_lig = "ggcorr_all_LIG.png"
+plot_ggcorr_all_lig =  paste0(plotdir,"/", ggcorr_all_lig )
+
+# combined
+ggcorr_all_combined_labelled = "ggcorr_all_combined_labelled.png"
+plot_ggcorr_all_combined_labelled  =  paste0(plotdir,"/", ggcorr_all_combined_labelled)
+
+####################################################################
+#               end of loading libraries and functions                 #
+########################################################################
+
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+#df_ps = merged_df3_comp 
+#df_lig = merged_df3_comp_lig
+merged_df3 = as.data.frame(merged_df3)
+df_ps = merged_df3 
+df_lig = merged_df3_lig
+
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+
+rm( merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig, my_df_u, my_df_u_lig)
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+
+#======================
+# adding log cols 
+#======================
+# subset data to generate pairwise correlations
+cols_to_select =  c("duet_scaled"
+                    
+                    , "foldx_scaled"
+                    
+                    , "log10_or_mychisq"
+                    , "neglog_pval_fisher"
+                    
+                    #, "or_kin"
+                    #, "neglog_pwald_kin"
+                    
+                    , "af"
+                    
+                    , "asa"
+                    , "rsa"
+                    , "kd_values"
+                    , "rd_values"
+                    
+                    , "duet_outcome"
+                    , drug)
+
+corr_data_ps = df_ps[, cols_to_select]
+
+dim(corr_data_ps)
+
+#p_italic = substitute(paste("-Log(", italic('P'), ")"));p_italic 
+#p_adjusted_italic = substitute(paste("-Log(", italic('P adjusted'), ")"));p_adjusted_italic
+
+# assign nice colnames (for display)
+my_corr_colnames = c("DUET"
+                     
+                     , "Foldx"
+                     
+                     , "Log (OR)"
+                     , "-Log (P)"
+                     
+                     #, "OR (adjusted)"
+                     #, "-Log (P wald)"
+                     
+                     , "AF"
+                     
+                     , "ASA"
+                     , "RSA"
+                     , "KD"
+                     , "RD"
+                     
+                     , "duet_outcome"
+                     , drug)
+
+length(my_corr_colnames)
+
+colnames(corr_data_ps)
+colnames(corr_data_ps) <- my_corr_colnames
+colnames(corr_data_ps)
+
+#------------------------
+# Data for ggcorr PS plot
+#------------------------
+start = 1
+# duet_outcome comes right after the numeric columns (the drug column follows
+# it); both are excluded from the correlation matrix
+end_ggcorr = which(colnames(corr_data_ps) == "duet_outcome"); end_ggcorr
+offset = 1
+
+my_ggcorr_ps = corr_data_ps[start:(end_ggcorr - offset)]
+head(my_ggcorr_ps)
+
+# correlation matrix
+corr1 <- round(cor(my_ggcorr_ps, method = "spearman", use = "pairwise.complete.obs"), 1)
+
+# p-value matrix
+pmat1 <- cor_pmat(my_ggcorr_ps, method = "spearman", use = "pairwise.complete.obs"
+                  ,   conf.level = 0.99)
+
+# Cross-check with psych: one call provides both r and the p-values
+# (adjust = "none" only affects $p, not $r)
+corr_test_ps = psych::corr.test(my_ggcorr_ps
+                                , method = "spearman"
+                                , adjust = "none"
+                                , use =  "pairwise.complete.obs")
+corr2 = round(corr_test_ps$r, 1)
+pmat2 = corr_test_ps$p
+
+# Compare with a numeric tolerance instead of element-wise `==` on floating
+# point matrices; prints TRUE or a description of the differences
+all.equal(corr1, corr2, check.attributes = FALSE)
+all.equal(pmat1, pmat2, check.attributes = FALSE)
+
+#------------------------
+# Generate ggcorr PS plot
+#------------------------
+cat("ggCorr plot PS:", plot_ggcorr_all_ps)
+#png(filename = plot_ggcorr_all_ps, width = 1024, height = 768, units = "px", pointsize = 20)
+ggcorr_ps = ggcorrplot(corr1
+                       , p.mat = pmat1
+                       , hc.order = TRUE
+                       , outline.col = "black"
+                       , ggtheme = ggplot2::theme_gray
+                       , colors = c("#6D9EC1", "white", "#E46726")
+                       , title = "DUET and Foldx stability")    
+
+
+ggcorr_ps
+#dev.off()
+
+#===========================
+# Data for Correlation plots: LIG
+#===========================
+table(df_lig$ligand_outcome)
+
+# derived columns: log10 of the odds ratios and -log10 of the p-values
+df_lig$log10_or_mychisq = log10(df_lig$or_mychisq)
+df_lig$neglog_pval_fisher = -log10(df_lig$pval_fisher)
+
+
+df_lig$log10_or_kin = log10(df_lig$or_kin)
+df_lig$neglog_pwald_kin = -log10(df_lig$pwald_kin)
+
+# subset data to generate pairwise correlations
+# NOTE(review): the raw "or_kin" is selected although log10_or_kin is computed
+# above and the unadjusted OR is used on the log10 scale: confirm which one
+# is intended for the "OR (adjusted)" column.
+cols_to_select_lig =  c("affinity_scaled"
+                    
+                    , "log10_or_mychisq"
+                    , "neglog_pval_fisher"
+                    
+                    , "or_kin"
+                    , "neglog_pwald_kin"
+                    
+                    , "af"
+                    
+                    , "asa"
+                    , "rsa"
+                    , "kd_values"
+                    , "rd_values"
+                    
+                    , "ligand_outcome"
+                    , drug)
+
+corr_data_lig = df_lig[, cols_to_select_lig]
+
+dim(corr_data_lig)
+
+# assign nice colnames (for display); same order as cols_to_select_lig
+my_corr_colnames_lig = c("Ligand Affinity"
+                     
+                     , "Log (OR)"
+                     , "-Log (P)"
+                     
+                     , "OR (adjusted)"
+                     , "-Log(P wald)"
+                     
+                     , "AF"
+                     
+                     , "ASA"
+                     , "RSA"
+                     , "KD"
+                     , "RD"
+                     
+                     , "ligand_outcome"
+                     , drug)
+
+# report the LIG label vector (the PS vector my_corr_colnames was printed
+# here by mistake) and make sure there is one label per selected column
+length(my_corr_colnames_lig)
+stopifnot(length(my_corr_colnames_lig) == ncol(corr_data_lig))
+
+colnames(corr_data_lig)
+colnames(corr_data_lig) <- my_corr_colnames_lig
+colnames(corr_data_lig)
+
+#------------------------
+# Data for ggcorr LIG plot
+#------------------------
+
+start = 1
+# ligand_outcome comes right after the numeric columns (the drug column
+# follows it); both are excluded from the correlation matrix
+end_ggcorr_lig = which(colnames(corr_data_lig) == "ligand_outcome"); end_ggcorr_lig
+offset = 1
+
+my_ggcorr_lig = corr_data_lig[start:(end_ggcorr_lig - offset)]
+head(my_ggcorr_lig); str(my_ggcorr_lig)
+
+# correlation matrix
+corr1_lig <- round(cor(my_ggcorr_lig, method = "spearman", use = "pairwise.complete.obs"), 1)
+
+# p-value matrix
+pmat1_lig <- cor_pmat(my_ggcorr_lig, method = "spearman", use = "pairwise.complete.obs")
+
+# Cross-check with psych: one call provides both r and the p-values
+# (adjust = "none" only affects $p, not $r)
+corr_test_lig = psych::corr.test(my_ggcorr_lig
+                                 , method = "spearman"
+                                 , adjust = "none"
+                                 , use =  "pairwise.complete.obs")
+corr2_lig = round(corr_test_lig$r, 1)
+pmat2_lig = corr_test_lig$p
+
+# Compare with a numeric tolerance instead of element-wise `==` on floating
+# point matrices; prints TRUE or a description of the differences
+all.equal(corr1_lig, corr2_lig, check.attributes = FALSE)
+all.equal(pmat1_lig, pmat2_lig, check.attributes = FALSE)
+
+
+# for display order columns by hc order of ps 
+
+#col_order = levels(ggcorr_ps$data[2])
+
+#col_order <- c("Species", "Petal.Width", "Sepal.Length",
+               #"Sepal.Width", "Petal.Length")
+#my_data2 <- my_data[, col_order]
+#my_data2
+
+#------------------------
+# Generate ggcorr LIG plot
+#------------------------
+cat("ggCorr LIG plot:", plot_ggcorr_all_lig)
+#svg(plot_ggcorr_all_lig, width = 15, height = 15)
+#png(plot_ggcorr_all_lig, width = 1024, height = 768, units = "px", pointsize = 20)
+
+# Correlation heatmap for the ligand-affinity data, using the rounded
+# Spearman matrix (corr1_lig) and the p-value matrix (pmat1_lig);
+# hc.order = TRUE reorders the variables by hierarchical clustering.
+# Title typo fixed ("affinty" -> "affinity") because it is printed on the figure.
+ggcorr_lig = ggcorrplot(corr1_lig
+                        , p.mat = pmat1_lig
+                        , hc.order = TRUE
+                        , outline.col = "black"
+                        
+                        , ggtheme = ggplot2::theme_gray
+                        , colors = c("#6D9EC1", "white", "#E46726")
+                        , title = "Ligand affinity")    
+
+
+ggcorr_lig
+#dev.off()
+#dev.off()
+
+#######################################################
+#=============================
+# combine plots for output
+#=============================
+
--- a/scripts/plotting/plotting_thesis/corr_plots_thesis.R
+++ b/scripts/plotting/plotting_thesis/corr_plots_thesis.R
@ -0,0 +1,141 @@
+# Plain data.frame copy of merged_df3, then pull the correlation columns
+# (unscaled) with the project helper corr_data_extract().
+merged_df3 <- as.data.frame(merged_df3)
+corr_plotdf <- corr_data_extract(merged_df3, extract_scaled_cols = FALSE)
+
+#================
+# stability
+#================
+# Columns for the stability pairs plot; the last two (dst_mode and the
+# drug column) are kept in the data but excluded from the correlations.
+corr_ps_colnames <- c(
+  "DUET", "FoldX", "DeepDDG", "Dynamut2",
+  "MAF", "Log (OR)", "-Log (P)",
+  #"ligand_distance",
+  "dst_mode",
+  drug
+)
+
+corr_df_ps <- corr_plotdf[, corr_ps_colnames]
+
+# index of the colouring column and of the drug column
+color_coln <- which(colnames(corr_df_ps) == "dst_mode")
+end <- which(colnames(corr_df_ps) == drug)
+ncol_omit <- 2
+corr_end <- end - ncol_omit
+
+#------------------------
+# Output: stability corrP
+#------------------------
+corr_psP <- paste0(outdir_images, tolower(gene), "_corr_stability.svg")
+
+cat("Corr plot stability with coloured dots:", corr_psP)
+svg(corr_psP, width = 15, height = 15)
+
+my_corr_pairs(
+  corr_data_all = corr_df_ps,
+  corr_cols = colnames(corr_df_ps)[1:corr_end],
+  corr_method = "spearman", # other options: "pearson" or "kendall"
+  colour_categ_col = colnames(corr_df_ps)[color_coln], # "dst_mode"
+  categ_colour = c("red", "blue"),
+  density_show = FALSE,
+  hist_col = "coral4",
+  dot_size = 1.6,
+  ats = 1.5,
+  corr_lab_size = 3,
+  corr_value_size = 1
+)
+
+dev.off()
+#####################################################
+# Distance cut-off and the distance column names used to subset the
+# affinity plots.
+DistCutOff <- 10
+LigDist_colname  # = "ligand_distance" # from globals; evaluated (printed) here
+ppi2Dist_colname <- "interface_dist"
+naDist_colname <- "TBC"
+#####################################################
+
+#================
+# ligand affinity
+#================
+# Columns for the ligand pairs plot; the last three (distance, dst_mode,
+# drug) are kept in the data but excluded from the correlations.
+corr_lig_colnames <- c(
+  "mCSM-lig",
+  "MAF", "Log (OR)", "-Log (P)",
+  "ligand_distance",
+  "dst_mode",
+  drug
+)
+
+# keep only rows whose ligand distance is below the cut-off
+corr_df_lig <- corr_plotdf[, corr_lig_colnames]
+corr_df_lig <- corr_df_lig[corr_df_lig[[LigDist_colname]] < DistCutOff, ]
+
+color_coln <- which(colnames(corr_df_lig) == "dst_mode")
+end <- which(colnames(corr_df_lig) == drug)
+ncol_omit <- 3  # omit dist col
+corr_end <- end - ncol_omit
+
+#------------------------
+# Output: ligand corrP
+#------------------------
+corr_ligP <- paste0(outdir_images, tolower(gene), "_corr_lig.svg")
+
+cat("Corr plot affinity with coloured dots:", corr_ligP)
+svg(corr_ligP, width = 10, height = 10)
+
+my_corr_pairs(
+  corr_data_all = corr_df_lig,
+  corr_cols = colnames(corr_df_lig)[1:corr_end],
+  corr_method = "spearman", # other options: "pearson" or "kendall"
+  colour_categ_col = colnames(corr_df_lig)[color_coln], # "dst_mode"
+  categ_colour = c("red", "blue"),
+  density_show = FALSE,
+  hist_col = "coral4",
+  dot_size = 2,
+  ats = 1.5,
+  corr_lab_size = 3,
+  corr_value_size = 1
+)
+dev.off()
+####################################################
+#================
+# ppi2 affinity
+#================
+# Columns for the PPI2 pairs plot; the last three (interface distance,
+# dst_mode, drug) are kept in the data but excluded from the correlations.
+corr_ppi2_colnames = c("mCSM-PPI2"
+                       , "MAF"
+                       , "Log (OR)"
+                       , "-Log (P)"
+                       , "interface_dist"
+                       , "dst_mode"
+                       , drug)
+
+# keep only rows whose interface distance is below the cut-off
+corr_df_ppi2 = corr_plotdf[, corr_ppi2_colnames]
+corr_df_ppi2 = corr_df_ppi2[corr_df_ppi2[[ppi2Dist_colname]]<DistCutOff,]
+
+color_coln = which(colnames(corr_df_ppi2) == "dst_mode")
+end = which(colnames(corr_df_ppi2) == drug)
+ncol_omit = 3 #omit dist col
+corr_end = end-ncol_omit
+
+#------------------------
+# Output: ppi2 corrP
+#------------------------
+corr_ppi2P =  paste0(outdir_images
+                    ,tolower(gene)
+                    ,"_corr_ppi2.svg" )
+
+cat("Corr plot ppi2 with coloured dots:", corr_ppi2P)
+svg(corr_ppi2P, width = 10, height = 10)
+
+my_corr_pairs(corr_data_all = corr_df_ppi2
+              , corr_cols = colnames(corr_df_ppi2[1:corr_end])
+              , corr_method = "spearman" # other options: "pearson" or "kendall"
+              , colour_categ_col = colnames(corr_df_ppi2[color_coln]) #"dst_mode"
+              , categ_colour =  c("red", "blue")
+              , density_show = F
+              , hist_col = "coral4"
+              , dot_size = 2
+              , ats = 1.5
+              , corr_lab_size = 3
+              , corr_value_size = 1)
+
+# Fix: close the svg device, as the stability and ligand sections do;
+# it was left open here, so the file was never explicitly finalised.
+dev.off()
+
+#==================
+# mCSM-NA affinity
+#==================
--- a/scripts/plotting/plotting_thesis/linage_dist_ens_stability.R
+++ b/scripts/plotting/plotting_thesis/linage_dist_ens_stability.R
@ -0,0 +1,138 @@
+#!/usr/bin/env Rscript  
+
+#########################################################
+# TASK: Lineage dist plots for stability:
+# average the four tools
+
+# func from : lineage_dist.R
+# plotdf
+# , x_axis = "duet_scaled"
+# , y_axis = "lineage_labels"
+# , x_lab = "DUET"
+# , all_lineages = F
+# , use_lineages = c("L1", "L2", "L3", "L4")
+# , with_facet = F
+# , facet_wrap_var = "" # FIXME: document what this is for
+# , fill_categ = "mutation_info_labels"
+# , fill_categ_cols = c("#E69F00", "#999999")
+# , my_ats = 15 # axis text size
+# , my_als = 20 # axis label size
+# , my_leg_ts = 16
+# , my_leg_title = 16
+# , my_strip_ts = 20
+# , leg_pos = c(0.8, 0.9)
+# , leg_pos_wf = c("top", "left", "bottom", "right")
+# , leg_dir_wf = c("horizontal", "vertical")
+# , leg_label = ""
+#########################################################
+
+#=======
+# output
+#=======
+outdir_images = paste0("~/git/Writing/thesis/images/results/"
+                       , tolower(gene), "/")
+cat("plots will output to:", outdir_images)
+#########################################################
+#=======
+# Data
+#=======
+# Working copy of merged_df2 (not defined in this script; it must already
+# exist in the session).
+df2 = merged_df2
+
+#==================================
+# PREFORMATTING: for consistency
+# IMPORTANT for calculating effects
+#==================================
+# Add derived FoldX columns alongside the originals.
+# NOTE(review): abs() makes every value non-negative; it does not reverse
+# the sign as the comment below says. Confirm whether abs() or negation
+# (-x) is intended before relying on the ensemble average.
+head(df2$ddg_foldx)
+df2['ddg_foldxC'] = abs(df2$ddg_foldx)
+head(df2['ddg_foldxC'])
+
+# reverse signs for foldx scaled values for consistency with other tools
+df2['foldx_scaled_signC'] = abs(df2$foldx_scaled)
+
+# Build revised column-name vectors: drop the original FoldX names and
+# append the derived ones (raw_cols_stability and scaled_cols_stability
+# are expected from the session, not defined in this script).
+rm_foldx_cols = c("ddg_foldx","foldx_scaled")
+raw_cols_stab_revised    = raw_cols_stability[!raw_cols_stability%in%rm_foldx_cols]
+raw_cols_stab_revised    = c(raw_cols_stab_revised,"ddg_foldxC")
+
+scaled_cols_stab_revised = scaled_cols_stability[!scaled_cols_stability%in%rm_foldx_cols]
+scaled_cols_stab_revised = c(scaled_cols_stab_revised, "foldx_scaled_signC")
+
+
+#=================
+# PREFORMATTING: for consistency
+#=================
+# Label each row "R" when dst_mode is 1, otherwise "S", and tabulate.
+df2$sensitivity <- ifelse(df2$dst_mode == 1, "R", "S")
+table(df2$sensitivity)
+
+# Keep only the columns of df2 that appear in any of the column sets below
+# (order follows df2's own column order).
+cols_to_extract <- colnames(df2)[
+  colnames(df2) %in% c(
+    common_cols,
+    outcome_cols_stability,
+    raw_cols_stability,
+    scaled_cols_stability,
+    raw_cols_stab_revised,
+    scaled_cols_stab_revised,
+    "lineage", "lineage_labels"
+  )
+]
+
+df2_plot <- df2[, cols_to_extract]
+
+# sanity check: lineage and lineage_labels should have matching counts
+all(table(df2_plot$lineage) == table(df2_plot$lineage_labels))
+
+# Find which stability columns to average; the set must include the
+# derived FoldX column ("foldx_scaled_signC"), otherwise abort.
+if ("foldx_scaled_signC"%in%colnames(df2_plot)){
+  cat("\nPASS: finding stability cols to average")
+  cols2avg_new = which(colnames(df2_plot)%in%scaled_cols_stab_revised)
+}else{
+  stop("\nAbort: Foldx column has opposing sign. Can't proceed to average.")
+}
+
+# ensemble average across predictors (row-wise mean of the selected columns)
+df2_plot['ens_stab_new'] = rowMeans(df2_plot[, cols2avg_new])
+
+head(df2_plot$position); head(df2_plot$mutationinformation)
+table(df2_plot['ens_stab_new'])
+
+# Rescale the averaged values to [-1, 1] with 0 kept as the midpoint,
+# using the observed min/max of the average as the input range.
+df2_plot["ens_stab_new_scaled"] = lapply(df2_plot["ens_stab_new"]
+                                         , function(x) {
+                                           scales::rescale_mid(x
+                                                           , to  = c(-1,1)
+                                                           , from = c( min(df2_plot["ens_stab_new"])
+                                                                       , max(df2_plot["ens_stab_new"]))
+                                                           , mid = 0
+                                                           #, from = c(0,1))
+                                           )})
+
+min(df2_plot['ens_stab_new']); max(df2_plot['ens_stab_new'])
+# Fix: the original indexed by "cols2avg_new", which is a variable holding
+# column indices, not a column of df2_plot, so the script stopped with
+# "undefined columns selected". Show raw vs scaled average instead.
+foo = df2_plot[c("ens_stab_new", "ens_stab_new_scaled")]
+min(df2_plot['ens_stab_new_scaled']); max(df2_plot['ens_stab_new_scaled'])
+
+###########################################################
+#====================
+# Output Lineage plot
+#====================
+# Output path for the lineage distribution plot of the ensemble stability
+linD_ens_stabP <- paste0(outdir_images, tolower(gene), "_linD_ens_stabP.svg")
+
+cat("\nOutput plot:", linD_ens_stabP)
+svg(linD_ens_stabP, width = 10, height = 10)
+
+# Scaled ensemble stability on x, lineage labels on y, filled by
+# sensitivity; drawn with the project helper lineage_distP().
+linP_dm_om <- lineage_distP(
+  df2_plot,
+  with_facet = FALSE,
+  x_axis = "ens_stab_new_scaled",
+  y_axis = "lineage_labels",
+  x_lab = "Average stability",
+  #fill_categ = "mutation_info_orig", fill_categ_cols = c("#E69F00", "#999999"),
+  fill_categ = "sensitivity",
+  fill_categ_cols = c("red", "blue"),
+  label_categories = c("Resistant", "Sensitive"),
+  leg_label = "",
+  my_ats = 22,       # axis text size
+  my_als = 22,       # axis label size
+  my_leg_ts = 22,
+  my_leg_title = 22,
+  my_strip_ts = 22,
+  alpha = 0.56
+)
+
+# print to the open svg device, then close it
+linP_dm_om
+dev.off()
--- a/scripts/plotting/plotting_thesis/preformatting.R
+++ b/scripts/plotting/plotting_thesis/preformatting.R
@ -0,0 +1,236 @@
+#!/usr/bin/env Rscript       
+#source("~/git/LSHTM_analysis/config/alr.R")
+source("~/git/LSHTM_analysis/config/embb.R")
+#source("~/git/LSHTM_analysis/config/katg.R")
+#source("~/git/LSHTM_analysis/config/gid.R")
+#source("~/git/LSHTM_analysis/config/pnca.R")
+#source("~/git/LSHTM_analysis/config/rpob.R")
+
+# get plottting dfs 
+source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
+###################################################################
+# FIXME: ADD distance to NA when SP replies
+# Distance columns and the cut-off applied to them
+dist_columns <- c("ligand_distance", "interface_dist")
+DistCutOff <- 10
+
+# Columns carried into every plotting data frame
+common_cols <- c(
+  "mutationinformation",
+  "X5uhc_position",
+  "X5uhc_offset",
+  "position",
+  "dst_mode",
+  "mutation_info_labels",
+  "sensitivity",
+  dist_columns
+)
+
+#===================
+# stability cols
+#===================
+raw_cols_stability <- c(
+  "duet_stability_change", "deepddg", "ddg_dynamut2", "ddg_foldx"
+)
+scaled_cols_stability <- c(
+  "duet_scaled", "deepddg_scaled", "ddg_dynamut2_scaled", "foldx_scaled"
+)
+outcome_cols_stability <- c(
+  "duet_outcome", "deepddg_outcome", "ddg_dynamut2_outcome", "foldx_outcome"
+)
+
+#===================
+# affinity cols
+#===================
+raw_cols_affinity <- c(
+  "ligand_affinity_change", "mmcsm_lig", "mcsm_ppi2_affinity", "mcsm_na_affinity"
+)
+scaled_cols_affinity <- c(
+  "affinity_scaled", "mmcsm_lig_scaled", "mcsm_ppi2_scaled", "mcsm_na_scaled"
+)
+outcome_cols_affinity <- c(
+  "ligand_outcome", "mmcsm_lig_outcome", "mcsm_ppi2_outcome", "mcsm_na_outcome"
+)
+
+#===================
+# conservation cols
+#===================
+raw_cols_conservation <- c("consurf_score", "snap2_score", "provean_score")
+scaled_cols_conservation <- c("consurf_scaled", "snap2_scaled", "provean_scaled")
+
+# CANNOT strictly be used, as categories are not identical with consurf missing altogether
+outcome_cols_conservation <- c(
+  "provean_outcome",
+  "snap2_outcome",
+  "consurf_colour_rev",
+  "consurf_colour" # doesn't exist, use this mapping
+)
+
+# Union (in declaration order) of every column set above
+all_cols <- c(
+  common_cols,
+  raw_cols_stability, scaled_cols_stability, outcome_cols_stability,
+  raw_cols_affinity, scaled_cols_affinity, outcome_cols_affinity,
+  raw_cols_conservation, scaled_cols_conservation, outcome_cols_conservation
+)
+
+
+#=======
+# output
+#=======
+# Directory for thesis images of this gene.
+# Fix: end with "/" because the plotting scripts build file names as
+# paste0(outdir_images, tolower(gene), "<suffix>") with no separator;
+# without it the gene name is fused onto the directory name.
+outdir_images = paste0("~/git/Writing/thesis/images/results/", tolower(gene), "/")
+
+####################################
+# merged_df3: NECESSARY pre-processing
+###################################
+# Working copy of merged_df3
+df3 <- merged_df3
+
+#=================
+# PREFORMATTING: for consistency
+#=================
+# "R" when dst_mode is 1, otherwise "S"
+df3$sensitivity <- ifelse(df3$dst_mode == 1, "R", "S")
+table(df3$sensitivity)
+
+# ConSurf labels: copy the colour column into a new factor column and
+# relabel its levels.
+# NOTE(review): the relabelling assumes the factor has exactly 10 levels
+# in the order nsd, 1..9 - confirm against the data.
+consurf_colOld <- "consurf_colour_rev"
+consurf_colNew <- "consurf_outcome"
+df3[[consurf_colNew]] <- df3[[consurf_colOld]]
+df3[[consurf_colNew]] <- as.factor(df3[[consurf_colNew]])
+df3[[consurf_colNew]]
+levels(df3$consurf_outcome) <- c("nsd", as.character(1:9))
+levels(df3$consurf_outcome)
+
+# SNAP2 labels: capitalise the first occurrence of each label per value
+snap2_colname <- "snap2_outcome"
+df3[[snap2_colname]] <- str_replace(
+  df3[[snap2_colname]], "effect", "Effect"
+)
+df3[[snap2_colname]] <- str_replace(
+  df3[[snap2_colname]], "neutral", "Neutral"
+)
+
+#  for ref: not needed perse as function already does this and assigns labels for barplots
+# labels_duet = levels(as.factor(df3$duet_outcome))
+# labels_foldx = levels(as.factor(df3$foldx_outcome))
+# labels_deepddg = levels(as.factor(df3$deepddg_outcome))
+# labels_ddg_dynamut2_outcome = levels(as.factor(df3$ddg_dynamut2_outcome))
+# 
+# labels_lig = levels(as.factor(df3_lig$ligand_outcome))
+# labels_mmlig = levels(as.factor(df3_lig$mmcsm_lig_outcome))
+# labels_ppi2 = levels(as.factor(df3_ppi2$mcsm_ppi2_outcome))
+# 
+# labels_provean = levels(as.factor(df3$provean_outcome))
+# labels_snap2   = levels(as.factor(df3$snap2_outcome))
+# labels_consurf = levels(as.factor(df3$consurf_colour_rev))
+# df3$consurf_colour_rev = as.factor(df3$consurf_colour_rev )
+##############################################################################
+#######################################
+# merged_df2: NECESSARY pre-processing
+######################################
+# Working copy of merged_df2
+df2 = merged_df2
+
+#=================
+# PREFORMATTING: for consistency
+#=================
+# "R" when dst_mode is 1, otherwise "S"
+df2$sensitivity = ifelse(df2$dst_mode == 1, "R", "S")
+table(df2$sensitivity)
+
+#----------------------------------------------------
+# Create dst2: fill na in dst with value of dst_mode
+# for epistasis
+#----------------------------------------------------
+# Fix: the fallback branch referenced the undefined object `df2f`, which
+# made this line fail; it must read from df2 itself.
+df2$dst2 = ifelse(is.na(df2$dst), df2$dst_mode, df2$dst)
+
+#----------------------------------------------------
+# Derived FoldX columns, to allow averaging with the
+# other tools
+# NOTE(review): abs() makes every value non-negative; it does not
+# reverse the sign. Confirm whether abs() or negation is intended.
+#----------------------------------------------------
+head(df2['ddg_foldx'])
+df2['ddg_foldxC'] = abs(df2$ddg_foldx)
+head(df2['ddg_foldxC'])
+
+head(df2['foldx_scaled'])
+df2['foldx_scaled_signC'] = abs(df2$foldx_scaled)
+head(df2['foldx_scaled_signC'])
+
+# Revised column-name vectors: drop the original FoldX names and append
+# the derived ones.
+rm_foldx_cols = c("ddg_foldx","foldx_scaled")
+raw_cols_stab_revised    = raw_cols_stability[!raw_cols_stability%in%rm_foldx_cols]
+raw_cols_stab_revised    = c(raw_cols_stab_revised,"ddg_foldxC")
+
+scaled_cols_stab_revised = scaled_cols_stability[!scaled_cols_stability%in%rm_foldx_cols]
+scaled_cols_stab_revised = c(scaled_cols_stab_revised, "foldx_scaled_signC")
+
+######################################################
+# Affinity related variables
+# Distance cut-off (same value as assigned near the top of this script)
+DistCutOff = 10
+# Evaluated, not assigned: prints the value and errors if the global is
+# not already defined.
+LigDist_colname  # = "ligand_distance" # from globals 
+ppi2Dist_colname  = "interface_dist"
+# placeholder until the NA distance column is available (see FIXME above)
+naDist_colname    = "TBC"
+
+######################################################
+# corr colnames
+# drug
+# "dst_mode"
+# "ligand_distance"
+# "DUET"
+# "mCSM-lig"       
+# "FoldX"
+# "DeepDDG"
+# "ASA"
+# "RSA"
+# "KD"             
+# "RD"
+# "Consurf"
+# "SNAP2"
+# "MAF"             
+# "Log (OR)"       
+# "-Log (P)"
+# "Dynamut2"
+# "mCSM-PPI2"       
+# "interface_dist" 
+
+# Columns for the stability correlation plot; commented entries are
+# available in the corr data but deliberately left out.
+corr_ps_colnames <- c(
+  "DUET", "FoldX", "DeepDDG", "Dynamut2",
+  "MAF", "Log (OR)", "-Log (P)",
+  # "ASA", "RSA", "KD", "RD", "Consurf", "SNAP2",
+  # "mCSM-lig", "ligand_distance", "mCSM-PPI2", "interface_dist",
+  "dst_mode",
+  drug
+)
+
+# Columns for the ligand affinity correlation plot
+corr_lig_colnames <- c(
+  "mCSM-lig",
+  "MAF", "Log (OR)", "-Log (P)",
+  "ligand_distance",
+  "dst_mode",
+  drug
+)
+
+# Columns for the PPI2 affinity correlation plot
+corr_ppi2_colnames <- c(
+  "mCSM-PPI2",
+  "MAF", "Log (OR)", "-Log (P)",
+  "interface_dist",
+  "dst_mode",
+  drug
+)
--- a/scripts/plotting/replaceBfactor_pdb.R
+++ b/scripts/plotting/replaceBfactor_pdb.R
@ -0,0 +1,332 @@
+#!/usr/bin/env Rscript                                                  
+
+#########################################################
+# TASK: Replace B-factors in the pdb file with the mean
+# normalised stability values.
+
+# read pdb file
+# make two copies so you can replace B factors for 1)duet
+# 2)affinity values and output 2 separate pdbs for
+# rendering on chimera
+
+# read mcsm mean stability value files
+# extract the respective mean values and assign to the
+# b-factor column within their respective pdbs
+
+# generate some distribution plots for inspection
+
+#########################################################
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting")
+cat(c(getwd(),"\n"))
+
+#source("~/git/LSHTM_analysis/scripts/Header_TT.R")
+library(bio3d)
+require("getopt", quietly = TRUE) # cmd parse arguments
+#========================================================
+#drug = "pyrazinamide"
+#gene = "pncA"
+
+# command line args
+# getopt specification, one row per option:
+# long name, short flag, argument code, type
+spec <- rbind(
+  c("drug", "d", 1, "character"),
+  c("gene", "g", 1, "character")
+)
+
+opt <- getopt(spec)
+
+drug <- opt$drug
+gene <- opt$gene
+
+# both options are mandatory
+if (is.null(drug) || is.null(gene)) {
+  stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
+}
+#========================================================
+gene_match <- paste0(gene, "_p.")
+cat(gene_match)
+
+#=============
+# directories
+#=============
+datadir <- "~/git/Data"
+indir <- file.path(datadir, drug, "input")
+outdir <- file.path("~/git/Data", drug, "output")
+#outdir_plots = paste0("~/git/Data", "/", drug, "/output/plots")
+outdir_plots <- paste0("~/git/Writing/thesis/images/results/", tolower(gene))
+
+#======
+# input
+#======
+# structure file
+in_filename_pdb <- paste0(tolower(gene), "_complex.pdb")
+infile_pdb <- file.path(indir, in_filename_pdb)
+cat(paste0("Input file:", infile_pdb))
+
+# per-position mean values; read from outdir_plots
+# (an earlier version read <gene>_mean_stability.csv from outdir)
+in_filename_mean_stability <- paste0(tolower(gene), "_mean_ens_stab_aff.csv")
+infile_mean_stability <- file.path(outdir_plots, in_filename_mean_stability)
+cat(paste0("Input file:", infile_mean_stability))
+
+#=======
+# output
+#=======
+# pdb with stability values in the B-factor column
+# (earlier name: <gene>_complex_bduet_ms.pdb)
+out_filename_duet_mspdb <- paste0(tolower(gene), "_complex_b_stab_ms.pdb")
+outfile_duet_mspdb <- file.path(outdir_plots, out_filename_duet_mspdb)
+print(paste0("Output file:", outfile_duet_mspdb))
+
+# pdb with affinity values in the B-factor column
+out_filename_lig_mspdb <- paste0(tolower(gene), "_complex_blig_ms.pdb")
+outfile_lig_mspdb <- file.path(outdir_plots, out_filename_lig_mspdb)
+print(paste0("Output file:", outfile_lig_mspdb))
+
+#%%===============================================================
+#NOTE: duet here refers to the ensemble stability values
+
+###########################
+# Read file: average stability values
+# or mcsm_normalised file
+###########################
+# per-position mean values
+my_df <- read.csv(infile_mean_stability, header = TRUE)
+str(my_df)
+
+#############
+# Read pdb
+#############
+my_pdb <- read.pdb(
+  infile_pdb,
+  maxlines = -1,
+  multi = FALSE,
+  rm.insert = FALSE,
+  rm.alt = TRUE,
+  ATOM.only = FALSE,
+  hex = FALSE,
+  verbose = TRUE
+)
+
+# file-name helpers are no longer needed
+rm(in_filename_mean_stability, in_filename_pdb)
+
+# one copy of the structure per output (stability, ligand affinity)
+my_pdb_duet <- my_pdb
+my_pdb_lig <- my_pdb
+
+#=========================================================
+# Replacing B factor with mean stability scores
+# within the respective dfs
+#==========================================================
+# atom records of each copy as a data frame
+df_duet <- my_pdb_duet[["atom"]]
+df_lig <- my_pdb_lig[["atom"]]
+
+# untouched copies, used by the dimension check further down
+d2_duet <- df_duet
+d2_lig <- df_lig
+
+# sanity checks: range of the original B factors
+max(df_duet$b)
+min(df_duet$b)
+max(df_lig$b)
+min(df_lig$b)
+
+#*******************************************
+# histograms and density plots for inspection
+# 1: original B-factors
+# 2: original mean stability values
+# 3: replaced B-factors with mean stability values
+#*********************************************
+# Set the margin on all sides
+# 3 x 4 grid of panels with outer margins for the shared titles
+par(
+  oma = c(3, 2, 3, 0),
+  mar = c(1, 3, 5, 2),
+  mfrow = c(3, 4)
+)
+
+#=============
+# Row 1 plots: original B-factors
+# duet and affinity
+#=============
+# histogram followed by density, once per series, in panel order
+row1 <- list(
+  "Bfactor stability" = df_duet$b,
+  "Bfactor affinity" = df_lig$b
+)
+for (ttl in names(row1)) {
+  hist(row1[[ttl]], xlab = "", main = ttl)
+  plot(density(row1[[ttl]]), xlab = "", main = ttl)
+}
+
+#=============
+# Row 2 plots: original mean stability values
+# duet and affinity
+#=============
+# (earlier versions plotted averaged_duet / averaged_affinity)
+row2 <- list(
+  "mean stability values" = my_df$avg_ens_stability_scaled,
+  "mean affinity values" = my_df$avg_ens_affinity_scaled
+)
+for (ttl in names(row2)) {
+  hist(row2[[ttl]], xlab = "", main = ttl)
+  plot(density(row2[[ttl]]), xlab = "", main = ttl)
+}
+
+#==============
+# Row 3 plots: replaced B-factors with mean stability values
+# After actual replacement in the b factor column
+#===============
+################################################################
+#=========
+# step 0_P1: DONT RUN once you have double checked the matched output
+#=========
+# sanity check:  match and assign to a separate column to double check
+# colnames(my_df)
+# df_duet$duet_scaled = my_df$averge_duet_scaled[match(df_duet$resno, my_df$position)]
+
+#=========
+# step 1_P1
+#=========
+# Be brave and replace in place now (don"t run sanity check)
+# this makes all the B-factor values in the non-matched positions as NA
+
+#df_duet$b = my_df$averaged_duet_scaled[match(df_duet$resno, my_df$position)]
+#df_lig$b = my_df$averaged_affinity_scaled[match(df_lig$resno, my_df$position)]
+
+# Overwrite each atom's B factor with the mean value of the row in my_df
+# whose position equals the atom's residue number; atoms with no matching
+# position get NA.
+df_duet$b = my_df$avg_ens_stability_scaled[match(df_duet$resno, my_df$position)]
+df_lig$b  = my_df$avg_ens_affinity_scaled[match(df_lig$resno, my_df$position)]
+
+#=========
+# step 2_P1
+#=========
+# count NA in Bfactor
+b_na_duet = sum(is.na(df_duet$b)) ; b_na_duet
+b_na_lig  = sum(is.na(df_lig$b)) ; b_na_lig 
+
+# count number of 0's in Bfactor
+# Fix: the column still contains NA at this point, so the count needs
+# na.rm = TRUE; without it the result is NA whenever any atom is unmatched.
+sum(df_duet$b == 0, na.rm = TRUE)
+sum(df_lig$b  == 0, na.rm = TRUE)
+
+# replace all NA in b factor with the fill value na_rep
+# (2 here, not 0 as the older comments in this script say)
+na_rep = 2
+df_duet$b[is.na(df_duet$b)] = na_rep
+df_lig$b[is.na(df_lig$b)] = na_rep
+
+# # sanity check: should be 0 and True
+# # duet and lig
+# if ( (sum(df_duet$b == na_rep) == b_na_duet) && (sum(df_lig$b == na_rep) == b_na_lig) ) {
+#   print ("PASS: NA's replaced with 0s successfully in df_duet and df_lig")
+# } else {
+#   print("FAIL: NA replacement in df_duet NOT successful")
+#   quit()
+# }
+# 
+# max(df_duet$b); min(df_duet$b)
+# 
+# # sanity checks: should be True
+# if( (max(df_duet$b) == max(my_df$avg_ens_stability_scaled)) & (min(df_duet$b) == min(my_df$avg_ens_stability_scaled)) ){
+#   print("PASS: B-factors replaced correctly in df_duet")
+# } else {
+#   print ("FAIL: To replace B-factors in df_duet")
+#   quit()
+# }
+
+# if( (max(df_lig$b) == max(my_df$avg_ens_affinity_scaled)) & (min(df_lig$b) == min(my_df$avg_ens_affinity_scaled)) ){
+#   print("PASS: B-factors replaced correctly in df_lig")
+# } else {
+#   print ("FAIL: To replace B-factors in df_lig")
+#   quit()
+# }
+
+#=========
+# step 3_P1
+#=========
+# sanity check: dim should be same before reassignment
+# The replacement must not change the shape of either atom table: compare
+# rows and columns against the untouched copies.
+# Fix: use stop() instead of print() + quit(), so a mismatch ends the
+# script with an error (non-zero status under Rscript) rather than a
+# silent successful exit; use scalar && in the condition.
+if (identical(dim(df_duet), dim(d2_duet)) &&
+    identical(dim(df_lig), dim(d2_lig))) {
+  print("PASS: Dims of both dfs as expected")
+} else {
+  stop("FAIL: Dims mismatch")
+}
+
+#=========
+# step 4_P1:
+# VERY important
+#=========
+# assign it back to the pdb file
+# put the edited atom table back into the stability structure and inspect
+my_pdb_duet[["atom"]] <- df_duet
+max(df_duet$b)
+min(df_duet$b)
+table(df_duet$b)
+sum(is.na(df_duet$b))
+
+# same for the ligand structure
+my_pdb_lig[["atom"]] <- df_lig
+max(df_lig$b)
+min(df_lig$b)
+
+#=========
+# step 5_P1
+#=========
+# write both structures
+cat("output file duet mean stability pdb:", outfile_duet_mspdb, sep = "")
+write.pdb(my_pdb_duet, outfile_duet_mspdb)
+
+cat("output file ligand mean stability pdb:", outfile_lig_mspdb, sep = "")
+write.pdb(my_pdb_lig, outfile_lig_mspdb)
+
+#============================
+# Add the 3rd histogram and density plots for comparisons
+#============================
+# Plots continued...
+# Row 3 plots: hist and density of replaced B-factors with stability values
+# Row 3: histogram and density of the B factors after replacement.
+# Fix: histogram titles were misspelled ("repalcedB"); they now match the
+# density plot titles.
+hist(df_duet$b
+     , xlab = ""
+     , main = "replacedB duet")
+
+plot(density(df_duet$b)
+     , xlab = ""
+     , main = "replacedB duet")
+
+
+hist(df_lig$b
+     , xlab = ""
+     , main = "replacedB affinity")
+
+plot(density(df_lig$b)
+     , xlab = ""
+     , main = "replacedB affinity")
+
+# shared outer titles for the whole panel grid
+mtext(text = "Frequency"
+      , side = 2
+      , line = 0
+      , outer = TRUE)
+
+mtext(text = paste0(tolower(gene), ": Stability Distribution")
+      , side = 3
+      , line = 0
+      , outer = TRUE)
+#============================================
+
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+# NOTE: This replaced B-factor distribution has the same
+# x-axis as the PredAff normalised values, but the distribution
+# is affected since the fill value is overinflated/or has an additional blip because
+# of the positions not associated with resistance. This is because all the positions
+# where there are no SNPs have been assigned the fill value na_rep (set to 2 above; this note originally said 0).
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+