separated plotting_thesis for generating plots

2022-08-04 18:47:18 +01:00 · 2022-08-04 18:47:18 +01:00 · ad2e538ec2
commit ad2e538ec2
parent 95131abc3c
11 changed files with 2807 additions and 0 deletions
--- a/scripts/plotting/mcsm_mean_stability.R
+++ b/scripts/plotting/mcsm_mean_stability.R
@ -0,0 +1,163 @@
 # Report the working directory, switch to the plotting scripts directory
 # (the relative source() call below is resolved from there), report again.
 getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting")
 getwd()
 #########################################################
 # TASK: per-position mean of the DUET stability and ligand affinity
 # scores, rescaled and written out as one csv file
 #########################################################
 # Alternatives used previously, kept for reference:
 #source("~/git/LSHTM_analysis/scripts/Header_TT.R")
 #require(data.table)
 #require(dplyr)
 #require('getopt', quietly = TRUE)
 #source("../combining_two_df.R")
 #========================================================
 # plotting_data.R is noted to return my_df, my_df_u and dup_muts and to
 # import the directory names etc.; gene and outdir are used below but are
 # not defined in this script, so they presumably come from it as well
 source("plotting_data.R")
 #=======
 # output
 #=======
 out_filename_mean_stability <- paste0(tolower(gene), "_mean_stability.csv")
 outfile_mean_stability <- paste0(outdir, "/", out_filename_mean_stability)
 print(paste0("Output file:", outfile_mean_stability))
 #%%===============================================================
 #================
 # Data for plots
 #================
 # Work on my_df_u under the name df; my_df is removed from the workspace
 df <- my_df_u
 rm(my_df)
 ###########################
 # Data for bfactor figure
 # PS (duet) average 
 # Ligand affinity average
 ###########################
 # Quick look at the columns used below
 head(df$position); head(df$mutationinformation)
 head(df$duet_stability_change)
 # order data frame
 #df = df[order(df$position),] #already done
 #***********
 # PS(duet): average by position and then scale b/w -1 and 1
 # column to average: duet_stability_change (NOT scaled!)
 #***********
 mean_duet_by_position <- df %>%
   group_by(position) %>%
   summarize(averaged_duet = mean(duet_stability_change))
 # extremes of the per-position averages, used as scaling denominators
 duet_min <- min(mean_duet_by_position[["averaged_duet"]])
 duet_max <- max(mean_duet_by_position[["averaged_duet"]])
 # scale the averaged_duet values: negatives are divided by |min| and
 # everything else by max, so the extremes land on -1 and 1
 mean_duet_by_position[["averaged_duet_scaled"]] <- ifelse(
   mean_duet_by_position[["averaged_duet"]] < 0
   , mean_duet_by_position[["averaged_duet"]]/abs(duet_min)
   , mean_duet_by_position[["averaged_duet"]]/duet_max)
 # collapse the first few values into one string so the log is readable
 cat(paste0('Average duet scores:\n'
            , paste(head(mean_duet_by_position[["averaged_duet"]]), collapse = ", ")
            , '\n---------------------------------------------------------------'
            , '\nScaled duet scores:\n'
            , paste(head(mean_duet_by_position[["averaged_duet_scaled"]]), collapse = ", ")
            , '\n'))
 # sanity checks
 l_bound_duet <- min(mean_duet_by_position[["averaged_duet_scaled"]])
 u_bound_duet <- max(mean_duet_by_position[["averaged_duet_scaled"]])
 # isTRUE(all.equal()) tolerates floating point noise and evaluates to FALSE
 # (instead of raising an error in if) when a bound is NA
 if ( isTRUE(all.equal(l_bound_duet, -1)) && isTRUE(all.equal(u_bound_duet, 1)) ){
   cat(paste0("PASS: duet scores averaged by position and then scaled"
              , "\nmin averaged duet: ", l_bound_duet
              , "\nmax averaged duet: ", u_bound_duet
              , "\n"))
 }else{
   cat(paste0("FAIL: avergaed duet scores could not be scaled b/w -1 and 1"
              , "\nmin averaged duet: ", l_bound_duet
              , "\nmax averaged duet: ", u_bound_duet
              , "\n"))
   # non-zero status so that a calling shell/pipeline can detect the failure
   quit(status = 1)
 }
 #***********
 # Lig: average by position and then scale b/w -1 and 1
 # column: ligand_affinity_change (NOT scaled!)
 #***********
 # per-position mean of the unscaled ligand_affinity_change column
 mean_affinity_by_position <- df %>%
   group_by(position) %>%
   summarize(averaged_affinity = mean(ligand_affinity_change))
 # extremes of the per-position averages, used as scaling denominators
 affinity_min <- min(mean_affinity_by_position[["averaged_affinity"]])
 affinity_max <- max(mean_affinity_by_position[["averaged_affinity"]])
 # scale the averaged_affinity values: negatives are divided by |min| and
 # everything else by max, so the extremes land on -1 and 1
 mean_affinity_by_position[["averaged_affinity_scaled"]] <- ifelse(
   mean_affinity_by_position[["averaged_affinity"]] < 0
   , mean_affinity_by_position[["averaged_affinity"]]/abs(affinity_min)
   , mean_affinity_by_position[["averaged_affinity"]]/affinity_max)
 # collapse the first few values into one string so the log is readable
 cat(paste0('Average affinity scores:\n'
            , paste(head(mean_affinity_by_position[["averaged_affinity"]]), collapse = ", ")
            , '\n---------------------------------------------------------------'
            , '\nScaled affinity scores:\n'
            , paste(head(mean_affinity_by_position[["averaged_affinity_scaled"]]), collapse = ", ")
            , '\n'))
 # sanity checks
 l_bound_affinity <- min(mean_affinity_by_position[["averaged_affinity_scaled"]])
 u_bound_affinity <- max(mean_affinity_by_position[["averaged_affinity_scaled"]])
 # isTRUE(all.equal()) tolerates floating point noise and evaluates to FALSE
 # (instead of raising an error in if) when a bound is NA
 if ( isTRUE(all.equal(l_bound_affinity, -1)) && isTRUE(all.equal(u_bound_affinity, 1)) ){
   cat(paste0("PASS: affinity scores averaged by position and then scaled"
              , "\nmin averaged affintiy: ", l_bound_affinity
              , "\nmax averaged affintiy: ", u_bound_affinity
              , "\n"))
 }else{
   cat(paste0("FAIL: avergaed affinity scores could not be scaled b/w -1 and 1"
              , "\nmin averaged affintiy: ", l_bound_affinity
              , "\nmax averaged affintiy: ", u_bound_affinity
              , "\n"))
   # non-zero status so that a calling shell/pipeline can detect the failure
   quit(status = 1)
 }
 #***********
 # merge: mean_duet_by_position and mean_affinity_by_position
 #***********
 # columns present in both summaries: these are the merge keys
 common_cols <- intersect(colnames(mean_duet_by_position), colnames(mean_affinity_by_position))
 # Compare the two dimension vectors as a whole. The previous
 # `dim(a) && dim(b)` only tested the values for truthiness and fails
 # outright on R >= 4.3, where && rejects operands of length > 1.
 if (identical(dim(mean_duet_by_position), dim(mean_affinity_by_position))){
   print(paste0("PASS: dim's match, mering dfs by column :", common_cols))
   #combined = as.data.frame(cbind(mean_duet_by_position, mean_affinity_by_position ))
   # all = TRUE keeps rows that appear in only one of the two summaries
   combined_df <- as.data.frame(merge(mean_duet_by_position
                                      , mean_affinity_by_position
                                      , by = common_cols
                                      , all = TRUE))
   cat(paste0("\nnrows combined_df:", nrow(combined_df)
              , "\nncols combined_df:", ncol(combined_df)
              , "\n"))
 }else{
   cat(paste0("FAIL: dim's mismatch, aborting cbind!"
              , "\nnrows df1:", nrow(mean_duet_by_position)
              , "\nnrows df2:", nrow(mean_affinity_by_position)
              , "\n"))
   # non-zero status so that a calling shell/pipeline can detect the failure
   quit(status = 1)
 }
 #%%============================================================
 # output
 # Write the merged per-position table without row names, then report the
 # destination and the size of what was written
 write.csv(combined_df, file = outfile_mean_stability, row.names = FALSE)
 cat("Finished writing file:\n",
     outfile_mean_stability,
     "\nNo. of rows:", nrow(combined_df),
     "\nNo. of cols:", ncol(combined_df))
 # end of script
 #===============================================================
--- a/scripts/plotting/plotting_thesis/basic_barplots.R
+++ b/scripts/plotting/plotting_thesis/basic_barplots.R
@ -0,0 +1,406 @@
 #!/usr/bin/env Rscript  
 #########################################################
 # TASK: Barplots for mCSM DUET, ligand affinity, and foldX
 # basic barplots with count of mutations
 # basic barplots with frequency of count of mutations
 # , df_colname = ""
 # , leg_title = ""
 # , ats = 25     # axis text size
 # , als = 22     # axis label size
 # , lts = 20     # legend text size
 # , ltis = 22    # label title size
 # , geom_ls = 10 # geom_label size
 # , yaxis_title = "Number of nsSNPs"
 # , bp_plot_title = ""
 # , label_categories = c("Destabilising", "Stabilising")
 # , title_colour = "chocolate4"
 # , subtitle_text = NULL
 # , sts = 20
 # , subtitle_colour = "pink"
 # #, leg_position = c(0.73,0.8) # within plot area
 # , leg_position = "top"
 # , bar_fill_values = c("#F8766D", "#00BFC4")
 #########################################################
 #=======================================================================
 #=======
 # output
 #=======
 # Image output directory for this gene (gene is not defined in this script)
 outdir_images <- paste0("~/git/Writing/thesis/images/results/", tolower(gene), "/")
 cat("plots will output to:", outdir_images)
 ###########################################################
 # Working copy of merged_df3 (expected to exist before this script runs)
 df3 <- merged_df3
 # FIXME: port to a common script
 #=================
 # PREFORMATTING: for consistency
 #=================
 # dst_mode == 1 is labelled "R", anything else "S"
 df3$sensitivity <- ifelse(df3$dst_mode == 1, "R", "S")
 table(df3$sensitivity)
 # ConSurf labels: copy the source column into a factor column
 consurf_colOld <- "consurf_colour_rev"
 consurf_colNew <- "consurf_outcome"
 df3[[consurf_colNew]] <- as.factor(df3[[consurf_colOld]])
 df3[[consurf_colNew]]
 # NOTE(review): the relabelling below is positional and needs exactly ten
 # levels in the order printed above; verify against the data
 levels(df3$consurf_outcome) <- c("nsd", 1:9)
 levels(df3$consurf_outcome)
 # SNAP2 labels: capitalise the two outcome categories
 snap2_colname <- "snap2_outcome"
 df3[[snap2_colname]] <- str_replace(df3[[snap2_colname]], "effect", "Effect")
 df3[[snap2_colname]] <- str_replace(df3[[snap2_colname]], "neutral", "Neutral")
 ##############################################################
 # Columns of df3 that appear in the externally defined column lists
 gene_all_cols <- colnames(df3)[colnames(df3) %in% all_cols]
 gene_outcome_cols <- colnames(df3)[colnames(df3) %in% c(outcome_cols_stability,
                                                         outcome_cols_affinity,
                                                         outcome_cols_conservation)]
 gene_outcome_cols
 #=======================================================================
 #------------------------------
 # stability barplots:
 # show the stability outcome columns (defined outside this script)
 outcome_cols_stability
 # label_categories should be  = levels(as.factor(plot_df[[df_colname]]))
 #------------------------------
 # settings shared by the barplots below
 sts <- 22
 subtitle_colour <- "black"
 geom_ls <- 10
 # mCSM-DUET (the only call that passes leg_title)
 duetP <- stability_count_bp(
   plotdf = df3,
   df_colname = "duet_outcome",
   leg_title = "mCSM-DUET",
   #label_categories = labels_duet,
   yaxis_title = "Number of nsSNPs",
   leg_position = "none",
   subtitle_text = "mCSM-DUET",
   geom_ls = geom_ls,
   bar_fill_values = c("#F8766D", "#00BFC4"),
   sts = sts,
   subtitle_colour = subtitle_colour
 )
 # FoldX
 foldxP <- stability_count_bp(
   plotdf = df3,
   df_colname = "foldx_outcome",
   #leg_title = "FoldX",
   #label_categories = labels_foldx,
   yaxis_title = "",
   leg_position = "none",
   subtitle_text = "FoldX",
   geom_ls = geom_ls,
   bar_fill_values = c("#F8766D", "#00BFC4"),
   sts = sts,
   subtitle_colour = subtitle_colour
 )
 # DeepDDG
 deepddgP <- stability_count_bp(
   plotdf = df3,
   df_colname = "deepddg_outcome",
   #leg_title = "DeepDDG",
   #label_categories = labels_deepddg,
   yaxis_title = "Number of nsSNPs",
   leg_position = "none",
   subtitle_text = "DeepDDG",
   geom_ls = geom_ls,
   bar_fill_values = c("#F8766D", "#00BFC4"),
   sts = sts,
   subtitle_colour = subtitle_colour
 )
 # Dynamut2
 dynamut2P <- stability_count_bp(
   plotdf = df3,
   df_colname = "ddg_dynamut2_outcome",
   #leg_title = "Dynamut2",
   #label_categories = labels_ddg_dynamut2_outcome,
   yaxis_title = "",
   leg_position = "none",
   subtitle_text = "Dynamut2",
   geom_ls = geom_ls,
   bar_fill_values = c("#F8766D", "#00BFC4"),
   sts = sts,
   subtitle_colour = subtitle_colour
 )
 dynamut2P
 # legend taken from the DUET plot, re-enabled at the top in a single row,
 # for use as the common legend of the combined figure
 common_legend <- get_legend(
   duetP +
     guides(color = guide_legend(nrow = 1)) +
     theme(legend.position = "top")
 )
 #==========================
 # output: STABILITY PLOTS
 #===========================
 # File name of the combined stability figure
 bp_stability_CLP <- paste0(outdir_images
                            , tolower(gene)
                            , "_bp_stability_CL.svg")
 svg(bp_stability_CLP,  width = 15, height = 12)
 print(paste0("plot filename:", bp_stability_CLP))
 # Explicit print(): a bare top-level plot_grid() call is only drawn by
 # auto-printing, which does not happen when this file is run via source(),
 # leaving an empty svg. print() draws the grid in both cases.
 # Layout: thin legend strip on top, 2 x 2 grid of barplots underneath.
 print(
   cowplot::plot_grid(
     common_legend,
     cowplot::plot_grid(duetP, foldxP
                        , deepddgP, dynamut2P
                        , nrow = 2
                        , ncol = 2
                        #, labels = c("(a)", "(b)", "(c)", "(d)")
                        , labels = "AUTO"
                        , label_size = 25)
     , ncol = 1
     , nrow = 2
     , rel_heights = c(0.4/10, 9/10))
 )
 dev.off()
 ###########################################################
 #=========================
 # Affinity outcome
 # check this var: outcome_cols_affinity
 # get from preformatting or put in globals
 #==========================
 # Distance threshold and the names of the distance columns
 DistCutOff <- 10
 LigDist_colname  # = "ligand_distance" # from globals
 ppi2Dist_colname <- "interface_dist"
 naDist_colname <- "TBC"  # placeholder, see the FIXME on the na site count plot
 ###########################################################
 # Rows whose distance is below the cut-off, one subset per distance column
 df3_lig <- df3[df3[[LigDist_colname]] < DistCutOff, ]
 df3_ppi2 <- df3[df3[[ppi2Dist_colname]] < DistCutOff, ]
 df3_na <- df3[df3[[naDist_colname]] < DistCutOff, ]
 # Title prefix shared by the distance-restricted plots
 common_bp_title <- paste0("Sites <", DistCutOff, angstroms_symbol)
 #------------------------------
 # barplot for ligand affinity:
 # <10 Ang of ligand
 #------------------------------
 # mCSM-lig outcome counts on the ligand-distance subset
 mLigP <- stability_count_bp(
   plotdf = df3_lig,
   df_colname = "ligand_outcome",
   #leg_title = "mCSM-lig",
   #label_categories = labels_lig,
   yaxis_title = "Number of nsSNPs",
   leg_position = "none",
   subtitle_text = "mCSM-lig",
   geom_ls = geom_ls,
   bar_fill_values = c("#F8766D", "#00BFC4"),
   sts = sts,
   subtitle_colour = subtitle_colour,
   bp_plot_title = paste(common_bp_title, "ligand")
 )
 #------------------------------
 # barplot for ligand affinity:
 # <10 Ang of ligand
 # mmCSM-lig: will be the same no. of sites but the effect will be different
 #------------------------------
 mmLigP <- stability_count_bp(
   plotdf = df3_lig,
   df_colname = "mmcsm_lig_outcome",
   #leg_title = "mmCSM-lig",
   #label_categories = labels_mmlig,
   yaxis_title = "",
   leg_position = "none",
   subtitle_text = "mmCSM-lig",
   geom_ls = geom_ls,
   bar_fill_values = c("#F8766D", "#00BFC4"),
   sts = sts,
   subtitle_colour = subtitle_colour,
   bp_plot_title = paste(common_bp_title, "ligand")
 )
 #------------------------------
 # barplot for ppi2 affinity
 #  <10 Ang of interface
 #------------------------------
 ppi2P <- stability_count_bp(
   plotdf = df3_ppi2,
   df_colname = "mcsm_ppi2_outcome",
   #leg_title = "mCSM-ppi2",
   #label_categories = labels_ppi2,
   yaxis_title = "",
   leg_position = "none",
   subtitle_text = "mCSM-ppi2",
   geom_ls = geom_ls,
   bar_fill_values = c("#F8766D", "#00BFC4"),
   sts = sts,
   subtitle_colour = subtitle_colour,
   bp_plot_title = paste(common_bp_title, "interface")
 )
 # legend taken from the mCSM-lig plot, re-enabled at the top in a single row
 common_legend_aff <- get_legend(
   mLigP +
     guides(color = guide_legend(nrow = 1)) +
     theme(legend.position = "top")
 )
 #==========================
 # output: AFFINITY PLOTS
 #==========================
 # File name of the combined affinity figure
 bp_affinity_CLP <- paste0(outdir_images
                           , tolower(gene)
                           , "_bp_affinity_CL.svg")
 # report the affinity file (the stability file name was logged here before)
 print(paste0("plot filename:", bp_affinity_CLP))
 svg(bp_affinity_CLP,  width = 15, height = 6.5)
 # Explicit print(): a bare top-level plot_grid() call is only drawn by
 # auto-printing, which does not happen when this file is run via source().
 # The legend strip uses common_legend_aff, the legend extracted from the
 # affinity plots, rather than the one taken from the stability plots.
 print(
   cowplot::plot_grid(
     common_legend_aff,
     cowplot::plot_grid(mLigP, mmLigP
                        , ppi2P
                        , nrow = 1
                        , ncol = 3
                        #, labels = c("(a)", "(b)", "(c)", "(d)")
                        , labels = "AUTO"
                        , label_size = 25)
     , ncol = 1
     , nrow = 2
     , rel_heights = c(0.4/10, 9/10))
     #, rel_widths = c(1,1,1))
 )
 dev.off()
 ################################################################
 #=========================
 # Conservation outcome
 # check this var:
 outcome_cols_conservation
 #==========================
 # PROVEAN
 proveanP <- stability_count_bp(
   plotdf = df3,
   df_colname = "provean_outcome",
   #leg_title = "PROVEAN",
   #label_categories = labels_provean,
   yaxis_title = "",
   leg_position = "top",
   subtitle_text = "PROVEAN",
   geom_ls = geom_ls,
   bar_fill_values = c("#F8766D", "#00BFC4"),
   sts = sts,
   subtitle_colour = subtitle_colour
 )
 # SNAP2
 snap2P <- stability_count_bp(
   plotdf = df3,
   df_colname = "snap2_outcome",
   #leg_title = "SNAP2",
   #label_categories = labels_snap2,
   yaxis_title = "Number of nsSNPs",
   leg_position = "top",
   subtitle_text = "SNAP2",
   geom_ls = geom_ls,
   bar_fill_values = c("#F8766D", "#00BFC4"),
   sts = sts,
   subtitle_colour = subtitle_colour
 )
 # ConSurf: smaller bar labels and its own fill colours
 consurfP <- stability_count_bp(
   plotdf = df3,
   df_colname = "consurf_outcome",
   #leg_title = "ConSurf",
   #label_categories = labels_consurf,
   yaxis_title = "",
   leg_position = "top",
   subtitle_text = "ConSurf",
   geom_ls = 5,
   bar_fill_values = consurf_colours, # from globals
   sts = sts,
   subtitle_colour = subtitle_colour
 )
 consurfP
 #============================
 # output: CONSERVATION PLOTS
 #============================
 # File name of the combined conservation figure
 bp_conservation_CLP <- paste0(outdir_images
                               , tolower(gene)
                               , "_bp_conservation_CL.svg")
 print(paste0("plot filename:", bp_conservation_CLP))
 svg(bp_conservation_CLP,  width = 15, height = 6.5)
 # Explicit print(): a bare top-level plot_grid() call is only drawn by
 # auto-printing, which does not happen when this file is run via source(),
 # leaving an empty svg. print() draws the grid in both cases.
 print(
   cowplot::plot_grid(proveanP, snap2P, consurfP
                      , nrow = 1
                      , ncol = 3
                      #, labels = c("(a)", "(b)", "(c)", "(d)")
                      , labels = "AUTO"
                      , label_size = 25
                      #, rel_heights = c(0.4/10,9/10))
                      # the third panel (ConSurf) gets a slightly wider column
                      , rel_widths  = c(0.9, 0.9, 1.1))
 )
 dev.off()
 #####################################################################
 #===============================================================
 # ------------------------------
 # bp site site count: ALL
 # <10 Ang ligand
 # ------------------------------
 # site count plot on the full data
 posC_all <- site_snp_count_bp(
   plotdf = df3,
   df_colname = "position",
   xaxis_title = "",
   yaxis_title = "Number of Sites",
   subtitle_size = 20
 )
 # ------------------------------
 # bp site site count: mCSM-lig
 # < 10 Ang ligand
 # ------------------------------
 common_bp_title <- paste0("Sites <", DistCutOff, angstroms_symbol)
 posC_lig <- site_snp_count_bp(
   plotdf = df3_lig,
   df_colname = "position",
   xaxis_title = "Number of nsSNPs",
   yaxis_title = "", #+  annotate("text", x = 1.5, y = 2.2, label = "Text No. 1")
   subtitle_text = paste0(common_bp_title, " ligand"),
   subtitle_size = 20,
   subtitle_colour = subtitle_colour
 )
 # ------------------------------
 # bp site site count: ppi2
 # < 10 Ang interface
 # ------------------------------
 posC_ppi2 <- site_snp_count_bp(
   plotdf = df3_ppi2,
   df_colname = "position",
   xaxis_title = "",
   yaxis_title = "",
   subtitle_text = paste0(common_bp_title, " interface"),
   subtitle_size = 20,
   subtitle_colour = subtitle_colour
 )
 # ------------------------------
 #FIXME: bp site site count: na
 # < 10 Ang TBC
 # ------------------------------
 # posC_na = site_snp_count_bp(plotdf = df3_na
 #                   , df_colname = "position"
 #                   , xaxis_title = ""
 #                   , yaxis_title = "")
 #===========================
 # output: SITE SNP count:
 # all + affinity
 #==========================
 # File name of the combined site count figure
 pos_count_combined_CLP <- paste0(outdir_images
                                  , tolower(gene)
                                  , "_pos_count_PS_AFF.svg")
 svg(pos_count_combined_CLP, width = 15, height = 6.5)
 print(paste0("plot filename:", pos_count_combined_CLP))
 # Explicit print(): a bare top-level plot_grid() call is only drawn by
 # auto-printing, which does not happen when this file is run via source(),
 # leaving an empty svg. print() draws the grid in both cases.
 print(
   cowplot::plot_grid(posC_all, posC_lig, posC_ppi2
                      #, posC_na
                      , nrow = 1
                      , ncol = 3
                      #, labels = c("(a)", "(b)", "(c)", "(d)")
                      , labels = "AUTO"
                      , label_size = 25)
 )
 dev.off()
 #===============================================================
--- a/scripts/plotting/plotting_thesis/corr/corr_adjusted_PS_LIG.R
+++ b/scripts/plotting/plotting_thesis/corr/corr_adjusted_PS_LIG.R
@ -0,0 +1,330 @@
 #!/usr/bin/env Rscript
 #########################################################
 # TASK: Corr plots for PS and Lig 
 # Output: 1 svg
 #=======================================================================
 # working dir and loading libraries
 # Report the working directory, switch to the plotting scripts directory
 # (the relative source() calls below are resolved from there), report again
 getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting/")
 getwd()
 source("~/git/LSHTM_analysis/scripts/Header_TT.R")
 require(cowplot)
 source("combining_dfs_plotting.R")
 source("my_pairs_panel.R")
 # According to the original notes, the sourced scripts should return the
 # following dfs, directories and variables:
 # PS combined:
 #   1) merged_df2           2) merged_df2_comp
 #   3) merged_df3           4) merged_df3_comp
 # LIG combined:
 #   5) merged_df2_lig       6) merged_df2_comp_lig
 #   7) merged_df3_lig       8) merged_df3_comp_lig
 #   9) my_df_u             10) my_df_u_lig
 # Echo the imported directories ...
 cat(paste0("Directories imported:",
            "\ndatadir:", datadir,
            "\nindir:", indir,
            "\noutdir:", outdir,
            "\nplotdir:", plotdir))
 # ... and the imported variables
 cat(paste0("Variables imported:",
            "\ndrug:", drug,
            "\ngene:", gene,
            "\ngene_match:", gene_match,
            "\nAngstrom symbol:", angstroms_symbol,
            "\nNo. of duplicated muts:", dup_muts_nu,
            "\nNA count for ORs:", na_count,
            "\nNA count in df2:", na_count_df2,
            "\nNA count in df3:", na_count_df3))
 #=======
 # output
 #=======
 # can't combine by cowplot because not ggplots
 #corr_plot_combined = "corr_combined.svg"
 #plot_corr_plot_combined  =  paste0(plotdir,"/", corr_plot_combined)
 # PS
 # Output file names for the adjusted correlation plots. The full paths are
 # built from the *_adjusted names defined right here; corr_ps and corr_lig
 # are not defined anywhere in this script.
 corr_ps_adjusted <- "corr_PS_adjusted.svg"
 plot_corr_ps_adjusted <- paste0(plotdir, "/", corr_ps_adjusted)
 # LIG
 corr_lig_adjusted <- "corr_LIG_adjusted.svg"
 plot_corr_lig_adjusted <- paste0(plotdir, "/", corr_lig_adjusted)
 ####################################################################
 #               end of loading libraries and functions                 #
 ########################################################################
 #%%%%%%%%%%%%%%%%%%%%%%%%%
 # Data frames used for the PS and LIG correlation plots
 df_ps <- merged_df3_comp
 df_lig <- merged_df3_comp_lig
 #%%%%%%%%%%%%%%%%%%%%%%%%%
 # Remove the imported objects that are not used below
 rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig,
    my_df_u, my_df_u_lig)
 ########################################################################
 #               end of data extraction and cleaning for plots          #
 ########################################################################
 #===========================
 # Data for Correlation plots:PS
 #===========================
 # counts per duet outcome category
 table(df_ps$duet_outcome)
 #===========================
 # Data for Correlation plots:foldx
 #===========================
 #============================
 # adding foldx scaled values
 # scale data b/w -1 and 1
 #============================
 # Scale the foldx ddg column b/w -1 and 1: negative values are divided by
 # |min| and all other values by max, so the extremes land on -1 and 1.
 # n is the index of the ddg column; [[n]] always yields a plain vector.
 n <- which(colnames(df_ps) == "ddg"); n
 my_min <- min(df_ps[[n]]); my_min
 my_max <- max(df_ps[[n]]); my_max
 df_ps$foldx_scaled <- ifelse(df_ps[[n]] < 0
                              , df_ps[[n]]/abs(my_min)
                              , df_ps[[n]]/my_max)
 # sanity check
 my_min <- min(df_ps$foldx_scaled); my_min
 my_max <- max(df_ps$foldx_scaled); my_max
 # isTRUE(all.equal()) tolerates floating point noise and evaluates to FALSE
 # (instead of raising an error in if) when a bound is NA
 if (isTRUE(all.equal(my_min, -1)) && isTRUE(all.equal(my_max, 1))){
   cat("PASS: foldx ddg successfully scaled b/w -1 and 1"
       , "\nProceeding with assigning foldx outcome category")
 }else{
   cat("FAIL: could not scale foldx ddg values"
       , "Aborting!")
   # actually stop, as the message announces; non-zero status flags failure
   quit(status = 1)
 }
 #================================
 # adding foldx outcome category
 # ddg<0 = "Stabilising" (-ve)
 #=================================
 # Label each mutation by the sign of ddg, then verify that the new column
 # reproduces the sign counts. c1 counts ddg < 0 directly; c2 counts the
 # "Stabilising" label. (Both tables used to be built from ddg < 0, so the
 # check could never fail.)
 c1 <- table(df_ps$ddg < 0)
 df_ps$foldx_outcome <- ifelse(df_ps$ddg < 0, "Stabilising", "Destabilising")
 c2 <- table(df_ps$foldx_outcome == "Stabilising")
 if ( all(c1 == c2) ){
   cat("PASS: foldx outcome successfully created")
 }else{
   cat("FAIL: foldx outcome could not be created. Aborting!")
   # exit() is not an R function; quit() ends the session, and the
   # non-zero status flags the failure to the caller
   quit(status = 1)
 }
 table(df_ps$foldx_outcome)
 #======================
 # adding log cols 
 #======================
 # log10 of the odds ratio columns and -log10 of the p-value columns
 df_ps$log10_or_mychisq <- log10(df_ps$or_mychisq)
 df_ps$neglog_pval_fisher <- -log10(df_ps$pval_fisher)
 df_ps$log10_or_kin <- log10(df_ps$or_kin)
 df_ps$neglog_pwald_kin <- -log10(df_ps$pwald_kin)
 # Columns used for the pairwise correlations; the outcome column and the
 # drug column come last
 cols_to_select <- c(
   "duet_scaled",
   "foldx_scaled",
   #"log10_or_mychisq",
   #"neglog_pval_fisher",
   "or_kin",
   "neglog_pwald_kin",
   "af",
   "asa",
   "rsa",
   "kd_values",
   "rd_values",
   "duet_outcome",
   drug
 )
 corr_data_ps <- df_ps[, cols_to_select]
 dim(corr_data_ps)
 #p_italic = substitute(paste("-Log(", italic('P'), ")"));p_italic
 #p_adjusted_italic = substitute(paste("-Log(", italic('P adjusted'), ")"));p_adjusted_italic
 # Display names, in the same order as cols_to_select
 my_corr_colnames <- c(
   "DUET",
   "Foldx",
   #"Log(OR)",
   #"-Log(P)",
   "OR adjusted",
   "-Log(P wald)",
   "AF",
   "ASA",
   "RSA",
   "KD",
   "RD",
   "duet_outcome",
   drug
 )
 length(my_corr_colnames)
 # column names before and after renaming
 colnames(corr_data_ps)
 colnames(corr_data_ps) <- my_corr_colnames
 colnames(corr_data_ps)
 #-----------------
 # generate corr PS plot
 #-----------------
 # Keep everything up to, but not including, the drug column
 start <- 1
 end <- which(colnames(corr_data_ps) == drug); end # should be the last column
 offset <- 1
 my_corr_ps <- corr_data_ps[start:(end - offset)]
 head(my_corr_ps)
 #my_cols = c("#f8766d", "#00bfc4")
 # deep blue :#007d85
 # deep red: #ae301e
 cat("Corr plot PS:", plot_corr_ps_adjusted)
 svg(plot_corr_ps_adjusted, width = 15, height = 15)
 # The last column (duet_outcome) is left out of the panels and only
 # selects the point fill colour
 OutPlot1 <- pairs.panels(
   my_corr_ps[1:(length(my_corr_ps) - 1)],
   method = "spearman", # correlation method
   hist.col = "grey", ##00AFBB
   density = TRUE, # show density plots
   ellipses = FALSE, # show correlation ellipses
   stars = TRUE,
   rug = FALSE,
   breaks = "Sturges",
   show.points = TRUE,
   bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr_ps$duet_outcome))],
   pch = 21,
   jitter = TRUE,
   #alpha = .05,
   #points(pch = 19, col = c("#f8766d", "#00bfc4")),
   cex = 2,
   cex.axis = 1.5,
   cex.labels = 1.5,
   cex.cor = 1,
   smooth = FALSE
 )
 print(OutPlot1)
 dev.off()
 #===========================
 # Data for Correlation plots: LIG
 #===========================
 # counts per ligand outcome category
 table(df_lig$ligand_outcome)
 # log10 of the odds ratio columns and -log10 of the p-value columns
 df_lig$log10_or_mychisq <- log10(df_lig$or_mychisq)
 df_lig$neglog_pval_fisher <- -log10(df_lig$pval_fisher)
 df_lig$log10_or_kin <- log10(df_lig$or_kin)
 df_lig$neglog_pwald_kin <- -log10(df_lig$pwald_kin)
 # Columns used for the pairwise correlations; the outcome column and the
 # drug column come last
 cols_to_select <- c(
   "affinity_scaled",
   "log10_or_mychisq",
   "neglog_pval_fisher",
   #"or_kin",
   #"neglog_pwald_kin",
   "af",
   "ligand_outcome",
   drug
 )
 corr_data_lig <- df_lig[, cols_to_select]
 dim(corr_data_lig)
 # Display names, in the same order as cols_to_select
 my_corr_colnames <- c(
   "Ligand Affinity",
   "Log(OR)",
   "-Log(P)",
   #"OR adjusted",
   #"-Log(P wald)",
   "AF",
   "ligand_outcome",
   drug
 )
 length(my_corr_colnames)
 # column names before and after renaming
 colnames(corr_data_lig)
 colnames(corr_data_lig) <- my_corr_colnames
 colnames(corr_data_lig)
 #-----------------
 # generate corr LIG plot
 #-----------------
 # Keep everything up to, but not including, the drug column
 start <- 1
 end <- which(colnames(corr_data_lig) == drug); end # should be the last column
 offset <- 1
 my_corr_lig <- corr_data_lig[start:(end - offset)]
 head(my_corr_lig)
 cat("Corr LIG plot:", plot_corr_lig_adjusted)
 svg(plot_corr_lig_adjusted, width = 15, height = 15)
 # The last column (ligand_outcome) is left out of the panels and only
 # selects the point fill colour
 OutPlot2 <- pairs.panels(
   my_corr_lig[1:(length(my_corr_lig) - 1)],
   method = "spearman", # correlation method
   hist.col = "grey", ##00AFBB
   density = TRUE, # show density plots
   ellipses = FALSE, # show correlation ellipses
   stars = TRUE,
   rug = FALSE,
   breaks = "Sturges",
   show.points = TRUE,
   bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr_lig$ligand_outcome))],
   pch = 21,
   jitter = TRUE,
   #alpha = .05,
   #points(pch = 19, col = c("#f8766d", "#00bfc4")),
   cex = 3,
   cex.axis = 2.5,
   cex.labels = 2.1,
   cex.cor = 1,
   smooth = FALSE
 )
 print(OutPlot2)
 dev.off()
 #######################################################
--- a/scripts/plotting/plotting_thesis/corr/corr_plots.R
+++ b/scripts/plotting/plotting_thesis/corr/corr_plots.R
@ -0,0 +1,242 @@
 #!/usr/bin/env Rscript
 getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting")
 getwd()
 source("~/git/LSHTM_analysis/scripts/Header_TT.R")
 # Command-line interface: one row per option, handed to getopt() as a
 # 4-column character matrix (long name, short flag, argument mask, type)
 spec <- matrix(
   c(
     "drug",       "d",  1, "character",
     "gene",       "g",  1, "character",
     "data_file1", "fa", 2, "character",
     "data_file2", "fb", 2, "character"
   ),
   byrow = TRUE,
   ncol = 4
 )
 opt <- getopt(spec)
 drug            <- opt$drug
 gene            <- opt$gene
 infile_params   <- opt$data_file1
 infile_metadata <- opt$data_file2
 # drug and gene are mandatory; the two data files may be left unset
 if (is.null(drug) || is.null(gene)) {
   stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
 }
 #===========
 # Input
 #===========
 # get_plotting_dfs.R is expected to provide plotdir and the corr_*_df3
 # data frames used below
 source("get_plotting_dfs.R")
 #===========
 # output
 #===========
 # File name and full path (under plotdir) for each of the four svg plots
 # PS
 corr_ps <- "corr_PS.svg"
 plot_corr_ps <- paste(plotdir, corr_ps, sep = "/")
 corr_ps_all <- "corr_PS_all.svg"
 plot_corr_ps_all <- paste(plotdir, corr_ps_all, sep = "/")
 # LIG
 corr_lig <- "corr_LIG.svg"
 plot_corr_lig <- paste(plotdir, corr_lig, sep = "/")
 corr_lig_all <- "corr_LIG_all.svg"
 plot_corr_lig_all <- paste(plotdir, corr_lig_all, sep = "/")
 ##############################################################################
 # Short working aliases for the PS and LIG correlation data frames
 foo <- corr_ps_df3
 #foo2 = corr_ps_df2
 bar <- corr_lig_df3
 #bar2 = corr_lig_df2
 #================================
 # Data for Correlation plots: PS
 #================================
 # Columns for the short PS plot: the numeric parameters, the categorical
 # duet_outcome (used only for point colours) and the drug column.
 cols_to_select = c("DUET"
                     , "Foldx"
                     , "Log (OR)"
                     , "-Log (P)"
                     , "MAF"
                     , "duet_outcome"
                     , drug)
 # The %in% subset below silently skips names that are absent from foo, which
 # would shift every positional index that follows. Flag that up front.
 missing_ps_cols = setdiff(cols_to_select, names(foo))
 if (length(missing_ps_cols) > 0) {
   warning("Columns not found in corr_ps_df3: "
           , paste(missing_ps_cols, collapse = ", "))
 }
 corr_data_ps = foo[names(foo)%in%cols_to_select]
 length(cols_to_select)
 colnames(corr_data_ps)
 # drop the drug column: keep columns 1 .. (position of drug - 1)
 start = 1
 end = which(colnames(corr_data_ps) == drug); end # should be the last column
 offset = 1
 my_corr_ps = corr_data_ps[start:(end - offset)]
 head(my_corr_ps)
 # Columns that are plotted AND tested: everything except the last remaining
 # column (expected to be duet_outcome). Using one subset for both keeps the
 # correlation tables in step with the figure.
 my_corr_ps_num = my_corr_ps[1:(length(my_corr_ps)-1)]
 #---------------------
 # Corr plot PS: short
 # data: corr_ps_df3
 # cols: 7
 #---------------------
 cat("Corr plot PS DUET with coloured dots:", plot_corr_ps)
 svg(plot_corr_ps, width = 15, height = 15)
 pairs.panels(my_corr_ps_num
             , method = "spearman" # correlation method
             , hist.col = "grey" ##00AFBB
             , density = TRUE  # show density plots
             , ellipses = FALSE # show correlation ellipses
             , stars = TRUE
             , rug = FALSE
             , breaks = "Sturges"
             , show.points = TRUE
             , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr_ps$duet_outcome))] # foldx colours are reversed
             , pch = 21 # for bg
             , jitter = TRUE
             , alpha = 1
             , cex = 1.8
             , cex.axis = 2
             , cex.labels = 4
             , cex.cor = 1
             , smooth = FALSE
 )
 dev.off()
 # Run the Spearman test once and read both the coefficients and the p-values
 # from the same result object
 corr_ps_test = corr.test(my_corr_ps_num, method = "spearman")
 corr_ps_rho = corr_ps_test$r
 corr_ps_p = corr_ps_test$p
 #---------------------
 # Corr plot PS: ALL
 # data: corr_ps_df3
 # cols: 10
 #---------------------
 # All columns of foo that precede the drug column, minus "Mutation"
 end_ps_all <- which(colnames(foo) == drug)
 end_ps_all # should be the last column
 my_corr_ps_all <- foo[start:(end_ps_all - offset)]
 cols_to_drop <- "Mutation"
 ps_all_keep <- !(names(my_corr_ps_all) %in% cols_to_drop)
 my_corr_ps_all <- my_corr_ps_all[, ps_all_keep]
 head(my_corr_ps_all)
 length(colnames(my_corr_ps_all))
 cat("Corr plot PS DUET with coloured dots:", plot_corr_ps_all)
 # point fill follows duet_outcome; the last column is left out of the panels
 ps_all_fill <- c("#f8766d", "#00bfc4")[unclass(factor(my_corr_ps_all$duet_outcome))] # foldx colours are reversed
 ps_all_panel_cols <- 1:(length(my_corr_ps_all) - 1)
 svg(plot_corr_ps_all, width = 15, height = 15)
 pairs.panels(
   my_corr_ps_all[ps_all_panel_cols],
   method = "spearman",  # correlation method
   hist.col = "grey",    ##00AFBB
   density = TRUE,       # show density plots
   ellipses = FALSE,     # show correlation ellipses
   stars = TRUE,
   rug = FALSE,
   breaks = "Sturges",
   show.points = TRUE,
   bg = ps_all_fill,
   pch = 21,             # for bg
   jitter = TRUE,
   alpha = 1,
   cex = 1.5,
   cex.axis = 2,
   cex.labels = 2.5,
   cex.cor = 1,
   smooth = FALSE
 )
 dev.off()
 #==================================
 # Data for Correlation plots: LIG
 #==================================
 # Columns for the short LIG plot: the numeric parameters, the categorical
 # ligand_outcome (used only for point colours) and the drug column.
 cols_to_select_lig = c("Ligand Affinity"
                     , "Log (OR)"
                     , "-Log (P)"
                     , "MAF"
                     , "ligand_outcome"
                     , drug)
 # The %in% subset below silently skips names that are absent from bar, which
 # would shift every positional index that follows. Flag that up front.
 missing_lig_cols = setdiff(cols_to_select_lig, names(bar))
 if (length(missing_lig_cols) > 0) {
   warning("Columns not found in corr_lig_df3: "
           , paste(missing_lig_cols, collapse = ", "))
 }
 corr_data_lig = bar[names(bar)%in%cols_to_select_lig]
 length(cols_to_select_lig)
 colnames(corr_data_lig)
 # drop the drug column: keep columns 1 .. (position of drug - 1)
 start_lig = 1
 end_lig = which(colnames(corr_data_lig) == drug); end_lig # should be the last column
 offset_lig = 1
 my_corr_lig = corr_data_lig[start_lig:(end_lig-offset_lig)]
 head(my_corr_lig)
 # Columns that are plotted AND tested: everything except the last remaining
 # column (expected to be ligand_outcome). Using one subset for both keeps the
 # correlation tables in step with the figure.
 my_corr_lig_num = my_corr_lig[1:(length(my_corr_lig)-1)]
 #---------------------
 # Corr plot LIG: short
 # data: corr_lig_df3
 # cols: 6
 #---------------------
 cat("Corr LIG plot with coloured dots:", plot_corr_lig)
 svg(plot_corr_lig, width = 15, height = 15)
 pairs.panels(my_corr_lig_num
             , method = "spearman" # correlation method
             , hist.col = "grey" ##00AFBB
             , density = TRUE  # show density plots
             , ellipses = FALSE # show correlation ellipses
             , stars = TRUE
             , rug = FALSE
             , breaks = "Sturges"
             , show.points = TRUE
             , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr_lig$ligand_outcome))] 
             , pch = 21 # for bg
             , jitter = TRUE
             , cex = 2
             , cex.axis = 2
             , cex.labels = 4
             , cex.cor = 1
             , smooth = FALSE
 )
 dev.off()
 # Run the Spearman test once and read both the coefficients and the p-values
 # from the same result object
 corr_lig_test = corr.test(my_corr_lig_num, method = "spearman")
 corr_lig_rho = corr_lig_test$r
 corr_lig_p = corr_lig_test$p
 #---------------------
 # Corr plot LIG: ALL
 # data: corr_lig_df3
 # cols: 9
 #---------------------
 # All columns of bar that precede the drug column, minus "Mutation"
 end_lig_all <- which(colnames(bar) == drug)
 end_lig_all # should be the last column
 my_corr_lig_all <- bar[start_lig:(end_lig_all - offset_lig)]
 cols_to_drop <- "Mutation"
 lig_all_keep <- !(names(my_corr_lig_all) %in% cols_to_drop)
 my_corr_lig_all <- my_corr_lig_all[, lig_all_keep]
 head(my_corr_lig_all)
 length(colnames(my_corr_lig_all))
 cat("Corr plot LIG with coloured dots:", plot_corr_lig_all)
 # point fill follows ligand_outcome; the last column is left out of the panels
 lig_all_fill <- c("#f8766d", "#00bfc4")[unclass(factor(my_corr_lig_all$ligand_outcome))]
 lig_all_panel_cols <- 1:(length(my_corr_lig_all) - 1)
 svg(plot_corr_lig_all, width = 15, height = 15)
 pairs.panels(
   my_corr_lig_all[lig_all_panel_cols],
   method = "spearman",  # correlation method
   hist.col = "grey",    ##00AFBB
   density = TRUE,       # show density plots
   ellipses = FALSE,     # show correlation ellipses
   stars = TRUE,
   rug = FALSE,
   breaks = "Sturges",
   show.points = TRUE,
   bg = lig_all_fill,
   pch = 21,             # for bg
   jitter = TRUE,
   alpha = 1,
   cex = 1.5,
   cex.axis = 2,
   cex.labels = 2.2,
   cex.cor = 1,
   smooth = FALSE
 )
 dev.off()
 ######################################################################=
 #                             End of script
 ######################################################################=
--- a/scripts/plotting/plotting_thesis/corr/corr_plots_gc_i.R
+++ b/scripts/plotting/plotting_thesis/corr/corr_plots_gc_i.R
@ -0,0 +1,276 @@
 #!/usr/bin/env Rscript       
 source("~/git/LSHTM_analysis/config/gid.R")
 source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
 #===================================================================
 corr_data = corr_data_extract(merged_df3, drug_name = drug)
 #corr_data = corr_data_extract(merged_df2, drug_name = drug)
 # Gene groups deciding which extra columns join the core set
 geneL_normal = c("pnca")
 geneL_na_dy = c("gid")
 geneL_na = c("rpob")
 geneL_ppi2 = c("alr", "embb", "katg", "rpob")
 # Columns used for every gene
 core_cols <- c( "Log (OR)" , "MAF", "-Log (P)"
                    , "DUET", "FoldX"
                    , "DeepDDG", "Dynamut2"
                    , "ASA", "RSA", "RD", "KD"
                    , "Consurf", "SNAP2"
                    #, "mutation_info_labels"
 )
 # Collect the gene-specific columns ADDITIVELY. A gene may sit in more than
 # one group (rpob is in both geneL_na and geneL_ppi2); assigning corrplot_cols
 # afresh in each branch made the last matching group win and dropped the
 # columns contributed by the earlier ones.
 gene_lc = tolower(gene)
 additional_cols = character(0)
 if (gene_lc %in% geneL_na_dy){
  additional_cols = c(additional_cols
                    , "mCSM-NA"
                    , "Dynamut"
                    , "ENCoM-DDG"
                    , "ENCoM-DDS"
                    , "mCSM"
                    , "SDM"
                    , "DUET-d")
 }
 if (gene_lc %in% geneL_na){
  additional_cols = c(additional_cols, "mCSM-NA")
 }
 if (gene_lc %in% geneL_ppi2){
  additional_cols = c(additional_cols, "mCSM-PPI2")
 }
 # as before, the label column is only appended when extra columns are present
 if (length(additional_cols) > 0){
  additional_cols = c(additional_cols, "mutation_info_labels")
 }
 # A gene outside every group used to leave corrplot_cols undefined; fall back
 # to the core columns and say so.
 if (!(gene_lc %in% c(geneL_normal, geneL_na_dy, geneL_na, geneL_ppi2))){
  warning("Gene '", gene, "' is not in any known gene list: using core columns only")
 }
 corrplot_cols = unique(c(core_cols, additional_cols))
 #========================================
 # corrplot_cols <- c( "Log (OR)"
 #                     , "MAF"
 #                     , "-Log (P)"
 #                     , "DUET"
 #                     , "FoldX"
 #                     , "DeepDDG"
 #                     , "Dynamut2"
 #                     , "mCSM-NA"
 #                     , "Dynamut"
 #                     , "ENCoM-DDG"
 #                     , "ENCoM-DDS"
 #                     , "mCSM"
 #                     , "SDM"
 #                     , "DUET-d"
 #                     , "ASA"
 #                     , "RSA"
 #                     , "RD"
 #                     , "KD"
 #                     , "mutation_info_labels"
 #                    )
 # Check the requested columns BEFORE subsetting: indexing a data frame with an
 # unknown column name aborts with "undefined columns selected", so a check
 # placed after the subset could never report a failure.
 missing_corr_cols = setdiff(corrplot_cols, names(corr_data))
 if (length(missing_corr_cols) > 0) {
  stop("\nFAIL: Something went wrong, columns missing from corr_data"
      , "\nExpected cols: ", length(corrplot_cols)
      , "\nMissing: ", paste(missing_corr_cols, collapse = ", "))
 }
 corr_df <- corr_data[, corrplot_cols, drop = FALSE] # col order is according to corrplot_cols
 head(corr_df); names(corr_df)
 cat("\nPASS: Successfully selected"
     , length(corrplot_cols)
     , "columns for building correlation df")
 #=====================================================
 corrplot_df <- corr_df 
 # stat_df = corrplot_df[, c("Log (OR)"
 #                           , "MAF" 
 #                           , "-Log (P)")]
 plot_title <- "Correlation plots (stability)"
 # Checkbox Names (display labels, positionally paired with cBCorrVals)
 cBCorrNames = c( "Odds Ratio"
                , "Allele Frequency"
                , "P-value"
                , "DUET"
                , "FoldX"
                , "DeepDDG"
                , "Dynamut2"
                , "ASA"
                , "RSA"
                , "RD"
                , "KD"
                , "Consurf"
                , "SNAP2"
                , "Nucleic Acid affinity"
                , "PPi2 affinity"
                #, "Dynamut"
                #, "ENCoM-Stability"
                #, "ENCoM-Flexibility"
                #, "mCSM"
                #, "SDM"
                #, "DUET-d"
 )
 # Checkbox Values (aka Column Names that are in corrplot_df)
 cBCorrVals = c("Log (OR)"
              , "MAF"
              , "-Log (P)"
              , "DUET"
              , "FoldX"
              , "DeepDDG"
              , "Dynamut2"
              , "ASA"
              , "RSA"
              , "RD"
              , "KD"
              , "Consurf"
              , "SNAP2"
              , "mCSM-NA"
              , "mCSM-PPI2"
              # , "Dynamut"
              # , "ENCoM-DDG"
              # , "ENCoM-DDS"
              # , "mCSM"
              # , "SDM"
              # , "DUET-d"
 )
 # Only offer checkboxes whose column exists for this gene. Otherwise ticking
 # such a box (or pressing "Select All") asks the plot for a column that
 # corrplot_df does not have, e.g. "mCSM-NA" / "mCSM-PPI2".
 # FIXME: absent parameters are hidden here, not greyed out.
 cb_available = cBCorrVals %in% names(corrplot_df)
 cBCorrNames = cBCorrNames[cb_available]
 cBCorrVals = cBCorrVals[cb_available]
 # Pre-selected checkboxes
 cBCorrSelected = c("Log (OR)"
                   , "MAF"
                   , "-Log (P)")
 #################
 # Define UI
 #################
 # Sidebar: parameter checkboxes plus the three action buttons
 stab_sidebar <- sidebarPanel(
   checkboxGroupInput(
     "variable", "Choose parameter:",
     choiceNames  = cBCorrNames,
     choiceValues = cBCorrVals,
     selected     = cBCorrSelected
   ),
   # could be a fluid Row
   actionButton("add_col", "Render"),
   actionButton("reset_graph", "Reset Graphs"),
   actionButton("select_all", "Select All")
 )
 # output/display: the plot plus a text slot for messages
 stab_main <- mainPanel(
   plotOutput(outputId = "corrplot", height = "1200px", width = "1500px"),
   # , height = "800px"
   # , width  = "600px")
   textOutput("txt")
 )
 u_corr <- fluidPage(
   headerPanel(plot_title),
   sidebarLayout(stab_sidebar, stab_main, position = "left")
 )
 #################
 # Define server
 #################
 # Server for the stability correlation app:
 #   - draws the default plot (cBCorrSelected columns) on start-up
 #   - "Render" redraws with the ticked columns (at least 2 are needed)
 #   - "Select All" ticks every checkbox value in cBCorrVals
 #   - "Reset Graphs" restores the default selection and the default plot
 s_corr <- shinyServer(function(input, output, session)
 {
 #================
 # Initial render
 #================
  output$corrplot <- renderPlot({ 
    #---------------------
    # My correlation plot: initial plot
    #---------------------
    c_plot <- my_corr_pairs(corr_data_all = corrplot_df
                            , corr_cols = cBCorrSelected
                            , corr_method = "spearman"
                            , dot_size = 2
                            , ats = 1.5
                            , corr_lab_size = length(cBCorrNames)/length(cBCorrSelected) * 1.3
                            , corr_value_size = 1)
  })
 #====================
 # Interactive render
 #====================
  observeEvent(
    input$add_col, {
      # select cols for corrplot
      corr_cols_s <- c(input$variable)
      # render plot
      if (length(corr_cols_s) >= 2) {
        # a valid selection clears any message left by an earlier bad one
        output$txt = renderText({""})
        output$corrplot <- renderPlot({
          #---------------------
          # My correlation plot: user selects columns
          #---------------------
          c_plot <- my_corr_pairs(corr_data_all     = corrplot_df
                                  , corr_cols       = corr_cols_s
                                  , corr_method     = "spearman" # same method as the initial render
                                  , dot_size        = 2
                                  , ats             = 1.5
                                  , corr_lab_size   = length(cBCorrNames)/length(corr_cols_s) * 1.3
                                  , corr_value_size = 1)
        })
      } else {
        output$txt = renderText({"Argh, common! It's a correlation plot. Select >=2 vars!"})
      }
    })
 #==================================
 # Add button: Select All checkbox
 #==================================
  observeEvent(
    input$select_all,{
      updateCheckboxGroupInput(session, "variable", selected = cBCorrVals)
    }
  )
 #================
 # Reset render
 #================
  observeEvent(
    input$reset_graph,{
      # reset checkboxes to default selection
      updateCheckboxGroupInput(session, "variable", selected = cBCorrSelected)
      # drop any stale "select >= 2" message
      output$txt = renderText({""})
      # render plot with the same settings as the initial render, so that a
      # reset really gives back the start-up figure
      output$corrplot <- renderPlot({
        #---------------------
        # My correlation plot: reset plot
        #---------------------
        c_plot <- my_corr_pairs(corr_data_all     = corrplot_df
                                , corr_cols       = cBCorrSelected
                                , corr_method     = "spearman"
                                , dot_size        = 2
                                , ats             = 1.5
                                , corr_lab_size   = length(cBCorrNames)/length(cBCorrSelected) * 1.3
                                , corr_value_size = 1)
      })
    }
  )
 }
 )
 shinyApp(ui = u_corr, server = s_corr)
--- a/scripts/plotting/plotting_thesis/corr/corr_plots_gc_lig_i.R
+++ b/scripts/plotting/plotting_thesis/corr/corr_plots_gc_lig_i.R
@ -0,0 +1,220 @@
 #!/usr/bin/env Rscript       
 source("~/git/LSHTM_analysis/config/gid.R")
 source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
 #===================================================================
 # NOTE(review): corr_data is not referenced again in the visible part of this
 # script (the plot data below comes from corr_df_m3_f) - confirm it is needed
 corr_data <- corr_data_extract(merged_df3, drug_name = drug)
 #corr_data = corr_data_extract(merged_df2, drug_name = drug)
 #================================================================
 #other globals
 dist_colname <- LigDist_colname # ligand_distance (from globals)
 dist_cutoff <- LigDist_cutoff # 10 (from globals)
 # report the distance column in use and the min/max distances
 cat(
   "\nLigand distance cut off, colname:", dist_colname,
   "\nThe max distance", gene, "structure df", ":", max_ang, "\u212b",
   "\nThe min distance", gene, "structure df", ":", min_ang, "\u212b"
 )
 ########################################################################
 #==========================================
 #####################
 # Correlation plot
 #####################
 colnames(corr_df_m3_f)
 # Columns pulled from corr_df_m3_f for the ligand-affinity app
 corrplot_cols_lig <- c(
   "Log (OR)",
   "MAF",
   "-Log (P)",
   "mCSM-lig",
   "mCSM-NA",
   "ASA",
   "RSA",
   "RD",
   "KD",
   dist_colname,
   "mutation_info_labels"
 )
 corr_df_lig <- corr_df_m3_f[, corrplot_cols_lig]
 head(corr_df_lig)
 corrplot_df_lig <- corr_df_lig
 # static df
 # stat_df = corrplot_df_lig[, c("Log (OR)"
 #                           , "MAF"
 #                           , "-Log (P)"
 #                           )]
 plot_title_lig <- "Correlation plots (ligand affinity)"
 # Checkbox Names (display labels, positionally paired with cCorrVals)
 cCorrNames <- c(
   "Odds Ratio",
   "Allele Frequency",
   "P-value",
   "Ligand affinity",
   "Nucleic Acid affinity",
   "ASA",
   "RSA",
   "RD",
   "KD",
   "Ligand Distance"
 )
 # Checkbox Values (aka Column Names that are in corrplot_df_lig)
 cCorrVals <- c(
   "Log (OR)",
   "MAF",
   "-Log (P)",
   "mCSM-lig",
   "mCSM-NA",
   "ASA",
   "RSA",
   "RD",
   "KD",
   dist_colname
 )
 # Pre-selected checkboxes
 cCorrSelected <- c("Log (OR)", "MAF", "-Log (P)")
 #============
 # Define UI 
 #============
 # Sidebar: distance cutoff, parameter checkboxes and the three action buttons
 lig_sidebar <- sidebarPanel(
   "Correlations: Filtered data data",
   numericInput(
     inputId = "lig_dist",
     label = "Ligand distance cutoff",
     value = dist_cutoff, # 10 default from globals
     min = min_ang,
     max = max_ang
   ),
   checkboxGroupInput(
     "variable", "Choose parameter:",
     choiceNames  = cCorrNames,
     choiceValues = cCorrVals,
     selected     = cCorrSelected
   ),
   # could be a fluid Row
   actionButton("add_col", "Render"),
   actionButton("reset_graph", "Reset Graphs"),
   actionButton("select_all", "Select All")
 )
 # output/display: the plot plus a text slot for messages
 lig_main <- mainPanel(
   plotOutput(outputId = "corrplot", height = "1000px", width = "1200px"),
   # , height = "800px"
   # , width  = "600px")
   textOutput("txt")
 )
 u_corr_lig <- fluidPage(
   headerPanel(plot_title_lig),
   sidebarLayout(lig_sidebar, lig_main, position = "left")
 )
 #===============
 # Define server
 #===============
 # Server for the ligand-affinity correlation app. Every plot is drawn from the
 # rows of corrplot_df_lig whose distance column (dist_colname) is below a
 # cutoff:
 #   - start-up: cutoff from the numeric input, cCorrSelected columns
 #   - "Render": cutoff from the numeric input, ticked columns (>= 2 needed)
 #   - "Select All": ticks every checkbox value in cCorrVals
 #   - "Reset Graphs": default columns AND default cutoff (dist_cutoff)
 s_corr_lig <- shinyServer(function(input, output, session)
 { 
  # Rows strictly below the cutoff. which() leaves out rows whose distance is
  # NA; plain logical indexing would return an all-NA row for each of them.
  rows_within_dist <- function(cutoff) {
    corrplot_df_lig[which(corrplot_df_lig[[dist_colname]] < cutoff), ]
  }
 #================
 # Initial render
 #================
  output$corrplot <- renderPlot({ 
    # an emptied numeric input arrives as NA: wait for a usable value
    req(input$lig_dist)
    # get the user-specified cutoff
    dist_cutoff_ini = input$lig_dist
    # subset data for plot
    corrplot_df_lig_ini = rows_within_dist(dist_cutoff_ini)
    #---------------------
    # My correlation plot: initial plot
    #---------------------
    c_plot <- my_corr_pairs(
                            #corr_data_all = corrplot_df_lig
                            corr_data_all = corrplot_df_lig_ini
                            , corr_cols = cCorrSelected
                            , dot_size = 2
                            , ats = 1.5
                            , corr_lab_size = length(cCorrNames)/length(cCorrSelected) * 1.3
                            , corr_value_size = 1)
  })
 #====================
 # Interactive render
 #====================
  observeEvent(
    input$add_col, {
      # nothing to draw without a cutoff
      req(input$lig_dist)
      # get the user-specified cutoff
      dist_cutoff_user = input$lig_dist
      # subset data for plot
      corrplot_df_lig_s = rows_within_dist(dist_cutoff_user)
      # select cols for corrplot
      corr_cols_s = c(input$variable)
      # render plot
      if (length(corr_cols_s) >= 2) {
        # a valid selection clears any message left by an earlier bad one
        output$txt = renderText({""})
        output$corrplot <- renderPlot({ 
          #---------------------
          # My correlation plot: user selects columns
          #---------------------
          c_plot <- my_corr_pairs(corr_data_all = corrplot_df_lig_s
                                  , corr_cols = corr_cols_s
                                  , dot_size = 1.6
                                  , ats = 1.5
                                  , corr_lab_size = length(cCorrNames)/length(corr_cols_s) * 1.3
                                  , corr_value_size = 1)
        })
      } else { output$txt = renderText({"Fuddu! It's a correlation plot. Select >=2 vars bewakoof!"})}
    })
 #==================================
 # Add button: Select All checkbox
 #==================================
  observeEvent(
    input$select_all,{
      updateCheckboxGroupInput(session, "variable", selected = cCorrVals)
    }
  )
 #================
 # Reset render
 #================
  observeEvent(
    input$reset_graph,{
      # reset checkboxes
      updateCheckboxGroupInput(session, "variable", selected = cCorrSelected)
      # reset the cutoff box as well, so the control matches the plot
      updateNumericInput(session, "lig_dist", value = dist_cutoff)
      # drop any stale "select >= 2" message
      output$txt = renderText({""})
      # The reset plot previously used ALL rows of corrplot_df_lig, unlike the
      # start-up plot; filter at the default cutoff so reset gives back the
      # start-up figure.
      corrplot_df_lig_reset = rows_within_dist(dist_cutoff)
      # render plot
      output$corrplot <- renderPlot({ 
        #---------------------
        # My correlation plot: reset plot
        #---------------------
        c_plot <- my_corr_pairs(corr_data_all = corrplot_df_lig_reset
                                , corr_cols = cCorrSelected
                                , dot_size = 2
                                , ats = 1.5
                                , corr_lab_size = length(cCorrNames)/length(cCorrSelected) * 1.3
                                , corr_value_size = 1)
      })
    }
  )
 }
 )
 shinyApp(ui = u_corr_lig, server = s_corr_lig)
--- a/scripts/plotting/plotting_thesis/corr/ggcorr_all_PS_LIG.R
+++ b/scripts/plotting/plotting_thesis/corr/ggcorr_all_PS_LIG.R
@ -0,0 +1,323 @@
 #!/usr/bin/env Rscript
 #########################################################
 # TASK: Corr plots for PS and Lig 
 # Output: 1 svg
 #=======================================================================
 # working dir and loading libraries
 getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting/")
 getwd()
 source("~/git/LSHTM_analysis/scripts/Header_TT.R")
 require(cowplot)
 source("combining_dfs_plotting.R")
 #source("my_pairs_panel.R")
 # should return the following dfs, directories and variables
 # FIXME: Can't output from here
 # PS combined: 
 # 1) merged_df2
 # 2) merged_df2_comp
 # 3) merged_df3
 # 4) merged_df3_comp
 # LIG combined: 
 # 5) merged_df2_lig
 # 6) merged_df2_comp_lig
 # 7) merged_df3_lig
 # 8) merged_df3_comp_lig
 # 9) my_df_u
 # 10) my_df_u_lig
 # Echo the directories picked up from the sourced scripts
 cat(paste0(
   "Directories imported:",
   "\ndatadir:", datadir,
   "\nindir:", indir,
   "\noutdir:", outdir,
   "\nplotdir:", plotdir
 ))
 # Echo the variables picked up from the sourced scripts
 cat(paste0(
   "Variables imported:",
   "\ndrug:", drug,
   "\ngene:", gene,
   "\ngene_match:", gene_match,
   "\nAngstrom symbol:", angstroms_symbol,
   "\nNo. of duplicated muts:", dup_muts_nu,
   "\nNA count for ORs:", na_count,
   "\nNA count in df2:", na_count_df2,
   "\nNA count in df3:", na_count_df3
 ))
 #=======
 # output
 #=======
 # File name and full path (under plotdir) of each output figure
 # can't combine by cowplot because not ggplots
 #corr_plot_combined = "corr_combined.svg"
 #plot_corr_plot_combined  =  paste0(plotdir,"/", corr_plot_combined)
 # PS
 #ggcorr_all_ps = "ggcorr_all_PS.svg"
 ggcorr_all_ps <- "ggcorr_all_PS.png"
 plot_ggcorr_all_ps <- paste(plotdir, ggcorr_all_ps, sep = "/")
 # LIG
 #ggcorr_all_lig = "ggcorr_all_LIG.svg"
 ggcorr_all_lig <- "ggcorr_all_LIG.png"
 plot_ggcorr_all_lig <- paste(plotdir, ggcorr_all_lig, sep = "/")
 # combined
 ggcorr_all_combined_labelled <- "ggcorr_all_combined_labelled.png"
 plot_ggcorr_all_combined_labelled <- paste(plotdir, ggcorr_all_combined_labelled, sep = "/")
 ####################################################################
 #               end of loading libraries and functions                 #
 ########################################################################
 #%%%%%%%%%%%%%%%%%%%%%%%%%
 # Working copies: PS from merged_df3 (coerced to a plain data frame),
 # LIG from merged_df3_lig
 #df_ps = merged_df3_comp 
 #df_lig = merged_df3_comp_lig
 merged_df3 <- as.data.frame(merged_df3)
 df_ps <- merged_df3
 df_lig <- merged_df3_lig
 #%%%%%%%%%%%%%%%%%%%%%%%%%
 # remove the combined data frames that this script does not use
 rm(
   merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig,
   my_df_u, my_df_u_lig
 )
 ########################################################################
 #               end of data extraction and cleaning for plots          #
 ########################################################################
 #======================
 # adding log cols 
 #======================
 # NOTE(review): nothing is computed here despite the heading; log10_or_mychisq
 # and neglog_pval_fisher are presumably already present in df_ps - confirm
 # subset data to generate pairwise correlations
 cols_to_select <- c(
   "duet_scaled",
   "foldx_scaled",
   "log10_or_mychisq",
   "neglog_pval_fisher",
   # "or_kin",
   # "neglog_pwald_kin",
   "af",
   "asa",
   "rsa",
   "kd_values",
   "rd_values",
   "duet_outcome",
   drug
 )
 corr_data_ps <- df_ps[, cols_to_select]
 dim(corr_data_ps)
 #p_italic = substitute(paste("-Log(", italic('P'), ")"));p_italic 
 #p_adjusted_italic = substitute(paste("-Log(", italic('P adjusted'), ")"));p_adjusted_italic
 # assign nice colnames (for display); matched to cols_to_select by position
 my_corr_colnames <- c(
   "DUET",
   "Foldx",
   "Log (OR)",
   "-Log (P)",
   # "OR (adjusted)",
   # "-Log (P wald)",
   "AF",
   "ASA",
   "RSA",
   "KD",
   "RD",
   "duet_outcome",
   drug
 )
 length(my_corr_colnames)
 # show the column names before and after relabelling
 colnames(corr_data_ps)
 colnames(corr_data_ps) <- my_corr_colnames
 colnames(corr_data_ps)
 #------------------------
 # Data for ggcorr PS plot
 #------------------------
 # Keep the columns that precede "duet_outcome". In corr_data_ps that column is
 # followed only by the drug column, so this keeps the nine parameter columns.
 start = 1
 end_ggcorr = which(colnames(corr_data_ps) == "duet_outcome"); end_ggcorr # second to last column (drug is last)
 offset = 1
 my_ggcorr_ps = corr_data_ps[start:(end_ggcorr-1)]
 head(my_ggcorr_ps)
 # correlation matrix
 corr1 <- round(cor(my_ggcorr_ps, method = "spearman", use = "pairwise.complete.obs"), 1)
 # p-value matrix
 pmat1 <- cor_pmat(my_ggcorr_ps, method = "spearman", use = "pairwise.complete.obs"
                  ,   conf.level = 0.99)
 # the same two quantities via psych, as a cross-check
 corr2 = psych::corr.test(my_ggcorr_ps
                        , method = "spearman"
                        , use =  "pairwise.complete.obs")$r
 corr2 = round(corr2, 1)
 pmat2 =  psych::corr.test(my_ggcorr_ps
                            , method = "spearman"
                            , adjust = "none"
                            , use =  "pairwise.complete.obs")$p
 # Cross-check with a numeric tolerance: elementwise == on floating-point
 # matrices reports FALSE for differences at machine precision and prints a
 # whole logical matrix. all.equal() prints TRUE or a summary of the mismatch.
 all.equal(as.matrix(corr1), as.matrix(corr2), check.attributes = FALSE)
 all.equal(as.matrix(pmat1), as.matrix(pmat2), check.attributes = FALSE)
 #------------------------
 # Generate ggcorr PS plot
 #------------------------
 cat("ggCorr plot PS:", plot_ggcorr_all_ps)
 #png(filename = plot_ggcorr_all_ps, width = 1024, height = 768, units = "px", pointsize = 20)
 # Build the plot from the rounded coefficients (corr1) and the p-values
 # (pmat1); hc.order = TRUE is passed through to ggcorrplot
 ggcorr_ps <- ggcorrplot(
   corr1,
   p.mat = pmat1,
   hc.order = TRUE,
   outline.col = "black",
   ggtheme = ggplot2::theme_gray,
   colors = c("#6D9EC1", "white", "#E46726"),
   title = "DUET and Foldx stability"
 )
 ggcorr_ps
 #dev.off()
 #===========================
 # Data for Correlation plots: LIG
 #===========================
 table(df_lig$ligand_outcome)
 # log-scale versions of the odds ratios and p-values
 df_lig$log10_or_mychisq = log10(df_lig$or_mychisq)
 df_lig$neglog_pval_fisher = -log10(df_lig$pval_fisher)
 df_lig$log10_or_kin = log10(df_lig$or_kin)
 df_lig$neglog_pwald_kin = -log10(df_lig$pwald_kin)
 # subset data to generate pairwise correlations
 # NOTE(review): the raw "or_kin" is selected below, not the log10_or_kin
 # computed above - confirm that is intended
 cols_to_select_lig =  c("affinity_scaled"
                    , "log10_or_mychisq"
                    , "neglog_pval_fisher"
                    , "or_kin"
                    , "neglog_pwald_kin"
                    , "af"
                    , "asa"
                    , "rsa"
                    , "kd_values"
                    , "rd_values"
                    , "ligand_outcome"
                    , drug)
 corr_data_lig = df_lig[, cols_to_select_lig]
 dim(corr_data_lig)
 # assign nice colnames (for display); matched to cols_to_select_lig by position
 my_corr_colnames_lig = c("Ligand Affinity"
                     , "Log (OR)"
                     , "-Log (P)"
                     , "OR (adjusted)"
                     , "-Log(P wald)"
                     , "AF"
                     , "ASA"
                     , "RSA"
                     , "KD"
                     , "RD"
                     , "ligand_outcome"
                     , drug)
 # report the LIG label vector (this previously printed the length of the PS
 # vector my_corr_colnames) and make the one-label-per-column requirement explicit
 length(my_corr_colnames_lig)
 stopifnot(length(my_corr_colnames_lig) == length(cols_to_select_lig))
 colnames(corr_data_lig)
 colnames(corr_data_lig) <- my_corr_colnames_lig
 colnames(corr_data_lig)
 #------------------------
 # Data for ggcorr LIG plot
 #------------------------
 # Keep the columns that precede "ligand_outcome". In corr_data_lig that column
 # is followed only by the drug column, so this keeps the ten parameter columns.
 start = 1
 end_ggcorr_lig = which(colnames(corr_data_lig) == "ligand_outcome"); end_ggcorr_lig # second to last column (drug is last)
 offset = 1
 my_ggcorr_lig = corr_data_lig[start:(end_ggcorr_lig-1)]
 head(my_ggcorr_lig); str(my_ggcorr_lig)
 # correlation matrix
 corr1_lig <- round(cor(my_ggcorr_lig, method = "spearman", use = "pairwise.complete.obs"), 1)
 # p-value matrix
 pmat1_lig <- cor_pmat(my_ggcorr_lig, method = "spearman", use = "pairwise.complete.obs")
 # the same two quantities via psych, as a cross-check
 corr2_lig = psych::corr.test(my_ggcorr_lig
                             , method = "spearman"
                             , use =  "pairwise.complete.obs")$r
 corr2_lig = round(corr2_lig, 1)
 pmat2_lig =  psych::corr.test(my_ggcorr_lig
                              , method = "spearman"
                              , adjust = "none"
                              , use =  "pairwise.complete.obs")$p
 # Cross-check with a numeric tolerance: elementwise == on floating-point
 # matrices reports FALSE for differences at machine precision and prints a
 # whole logical matrix. all.equal() prints TRUE or a summary of the mismatch.
 all.equal(as.matrix(corr1_lig), as.matrix(corr2_lig), check.attributes = FALSE)
 all.equal(as.matrix(pmat1_lig), as.matrix(pmat2_lig), check.attributes = FALSE)
 # for display order columns by hc order of ps 
 #col_order = levels(ggcorr_ps$data[2])
 #col_order <- c("Species", "Petal.Width", "Sepal.Length",
               #"Sepal.Width", "Petal.Length")
 #my_data2 <- my_data[, col_order]
 #my_data2
 #------------------------
 # Generate ggcorr LIG plot
 #------------------------
 cat("ggCorr LIG plot:", plot_ggcorr_all_lig)
 #svg(plot_ggcorr_all_lig, width = 15, height = 15)
 #png(plot_ggcorr_all_lig, width = 1024, height = 768, units = "px", pointsize = 20)
 # Build the plot from the rounded coefficients (corr1_lig) and the p-values
 # (pmat1_lig). The title is printed on the figure, so its spelling is
 # corrected here ("affinty" -> "affinity").
 ggcorr_lig = ggcorrplot(corr1_lig
                        , p.mat = pmat1_lig
                        , hc.order = TRUE
                        , outline.col = "black"
                        , ggtheme = ggplot2::theme_gray
                        , colors = c("#6D9EC1", "white", "#E46726")
                        , title = "Ligand affinity")    
 ggcorr_lig
 #dev.off()
 #######################################################
 #=============================
 # combine plots for output
 #=============================
 +
--- a/scripts/plotting/plotting_thesis/corr_plots_thesis.R
+++ b/scripts/plotting/plotting_thesis/corr_plots_thesis.R
@ -0,0 +1,141 @@
 # Pull the (unscaled) correlation columns out of merged_df3
 merged_df3 <- as.data.frame(merged_df3)
 corr_plotdf <- corr_data_extract(merged_df3, extract_scaled_cols = FALSE)
 #================
 # stability
 #================
 # Columns for the stability pairs plot. The last two ("dst_mode" and the drug
 # column) are kept in the data frame but excluded from the correlations.
 corr_ps_colnames <- c(
   "DUET",
   "FoldX",
   "DeepDDG",
   "Dynamut2",
   "MAF",
   "Log (OR)",
   "-Log (P)",
   # "ligand_distance",
   "dst_mode",
   drug
 )
 corr_df_ps <- corr_plotdf[, corr_ps_colnames]
 color_coln <- which(colnames(corr_df_ps) == "dst_mode")
 end <- which(colnames(corr_df_ps) == drug)
 ncol_omit <- 2
 corr_end <- end - ncol_omit
 #------------------------
 # Output: stability corrP
 #------------------------
 corr_psP <- paste0(outdir_images, tolower(gene), "_corr_stability.svg")
 cat("Corr plot stability with coloured dots:", corr_psP)
 svg(corr_psP, width = 15, height = 15)
 my_corr_pairs(
   corr_data_all = corr_df_ps,
   corr_cols = colnames(corr_df_ps)[1:corr_end],
   corr_method = "spearman", # other options: "pearson" or "kendall"
   colour_categ_col = colnames(corr_df_ps)[color_coln], # "dst_mode"
   categ_colour = c("red", "blue"),
   density_show = FALSE,
   hist_col = "coral4",
   dot_size = 1.6,
   ats = 1.5,
   corr_lab_size = 3,
   corr_value_size = 1
 )
 dev.off()
 #####################################################
 # Distance cut-off and the distance column names used for the affinity plots
 DistCutOff <- 10
 LigDist_colname  # = "ligand_distance" # from globals
 ppi2Dist_colname <- "interface_dist"
 naDist_colname <- "TBC"
 #####################################################
 #================
 # ligand affinity
 #================
 # Columns for the ligand-affinity pairs plot. The last three (distance,
 # "dst_mode" and the drug column) are kept but not correlated.
 corr_lig_colnames <- c(
   "mCSM-lig",
   "MAF",
   "Log (OR)",
   "-Log (P)",
   "ligand_distance",
   "dst_mode",
   drug
 )
 corr_df_lig <- corr_plotdf[, corr_lig_colnames]
 # Keep only rows closer to the ligand than the cut-off. which() drops rows
 # whose distance is NA; a bare logical index would turn them into all-NA rows.
 corr_df_lig <- corr_df_lig[which(corr_df_lig[[LigDist_colname]] < DistCutOff), ]
 color_coln <- which(colnames(corr_df_lig) == "dst_mode")
 end <- which(colnames(corr_df_lig) == drug)
 ncol_omit <- 3 # omit dist col
 corr_end <- end - ncol_omit
 #------------------------
 # Output: ligand corrP
 #------------------------
 corr_ligP <- paste0(outdir_images, tolower(gene), "_corr_lig.svg")
 cat("Corr plot affinity with coloured dots:", corr_ligP)
 svg(corr_ligP, width = 10, height = 10)
 my_corr_pairs(
   corr_data_all = corr_df_lig,
   corr_cols = colnames(corr_df_lig)[1:corr_end],
   corr_method = "spearman", # other options: "pearson" or "kendall"
   colour_categ_col = colnames(corr_df_lig)[color_coln], # "dst_mode"
   categ_colour = c("red", "blue"),
   density_show = FALSE,
   hist_col = "coral4",
   dot_size = 2,
   ats = 1.5,
   corr_lab_size = 3,
   corr_value_size = 1
 )
 dev.off()
 ####################################################
 #================
 # ppi2 affinity
 #================
 # Columns for the PPI2-affinity pairs plot. The last three (interface
 # distance, "dst_mode" and the drug column) are kept but not correlated.
 corr_ppi2_colnames <- c(
   "mCSM-PPI2",
   "MAF",
   "Log (OR)",
   "-Log (P)",
   "interface_dist",
   "dst_mode",
   drug
 )
 corr_df_ppi2 <- corr_plotdf[, corr_ppi2_colnames]
 # Keep only rows closer to the interface than the cut-off. which() drops rows
 # whose distance is NA; a bare logical index would turn them into all-NA rows.
 corr_df_ppi2 <- corr_df_ppi2[which(corr_df_ppi2[[ppi2Dist_colname]] < DistCutOff), ]
 color_coln <- which(colnames(corr_df_ppi2) == "dst_mode")
 end <- which(colnames(corr_df_ppi2) == drug)
 ncol_omit <- 3 # omit dist col
 corr_end <- end - ncol_omit
 #------------------------
 # Output: ppi2 corrP
 #------------------------
 corr_ppi2P <- paste0(outdir_images, tolower(gene), "_corr_ppi2.svg")
 cat("Corr plot ppi2 with coloured dots:", corr_ppi2P)
 svg(corr_ppi2P, width = 10, height = 10)
 my_corr_pairs(
   corr_data_all = corr_df_ppi2,
   corr_cols = colnames(corr_df_ppi2)[1:corr_end],
   corr_method = "spearman", # other options: "pearson" or "kendall"
   colour_categ_col = colnames(corr_df_ppi2)[color_coln], # "dst_mode"
   categ_colour = c("red", "blue"),
   density_show = FALSE,
   hist_col = "coral4",
   dot_size = 2,
   ats = 1.5,
   corr_lab_size = 3,
   corr_value_size = 1
 )
 # Close the svg device so the file is flushed to disk (the stability and
 # ligand sections do the same after their plots).
 dev.off()
 #==================
 # mCSM-NA affinity
 #==================
--- a/scripts/plotting/plotting_thesis/linage_dist_ens_stability.R
+++ b/scripts/plotting/plotting_thesis/linage_dist_ens_stability.R
@ -0,0 +1,138 @@
 #!/usr/bin/env Rscript  
 #########################################################
 # TASK: Lineage dist plots for stability:
 # average the four tools
 # func from : lineage_dist.R
 # plotdf
 # , x_axis = "duet_scaled"
 # , y_axis = "lineage_labels"
 # , x_lab = "DUET"
 # , all_lineages = F
 # , use_lineages = c("L1", "L2", "L3", "L4")
 # , with_facet = F
 # , facet_wrap_var = "" # FIXME: document what this is for
 # , fill_categ = "mutation_info_labels"
 # , fill_categ_cols = c("#E69F00", "#999999")
 # , my_ats = 15 # axis text size
 # , my_als = 20 # axis label size
 # , my_leg_ts = 16
 # , my_leg_title = 16
 # , my_strip_ts = 20
 # , leg_pos = c(0.8, 0.9)
 # , leg_pos_wf = c("top", "left", "bottom", "right")
 # , leg_dir_wf = c("horizontal", "vertical")
 # , leg_label = ""
 #########################################################
 #=======
 # output
 #=======
 # Per-gene image directory; the trailing "/" lets file names be pasted on directly
 outdir_images <- paste0("~/git/Writing/thesis/images/results/", tolower(gene), "/")
 cat("plots will output to:", outdir_images)
 #########################################################
 #=======
 # Data
 #=======
 df2 <- merged_df2
 #==================================
 # PREFORMATTING: for consistency
 # IMPORTANT for calculating effects
 #==================================
 # Reverse the sign of the FoldX values so they can be averaged with the other
 # tools. Negation is used rather than abs(): abs() would make every value
 # positive and lose the direction of the effect instead of flipping it.
 # NOTE(review): assumes FoldX uses the opposite sign convention to the other
 # predictors, as the comments in this script state - confirm.
 head(df2$ddg_foldx)
 df2["ddg_foldxC"] <- -df2$ddg_foldx
 head(df2["ddg_foldxC"])
 df2["foldx_scaled_signC"] <- -df2$foldx_scaled
 # Swap the original FoldX columns for the sign-corrected ones in the lists of
 # raw and scaled stability columns
 rm_foldx_cols <- c("ddg_foldx", "foldx_scaled")
 raw_cols_stab_revised <- c(
   raw_cols_stability[!raw_cols_stability %in% rm_foldx_cols],
   "ddg_foldxC"
 )
 scaled_cols_stab_revised <- c(
   scaled_cols_stability[!scaled_cols_stability %in% rm_foldx_cols],
   "foldx_scaled_signC"
 )
 #=================
 # PREFORMATTING: for consistency
 #=================
 # Label each row "R" when dst_mode is 1 and "S" otherwise
 df2$sensitivity <- ifelse(df2$dst_mode == 1, "R", "S")
 table(df2$sensitivity)
 # Keep only those columns of df2 that appear in one of the column groups
 cols_to_extract <- colnames(df2)[
   colnames(df2) %in% c(
     common_cols,
     outcome_cols_stability,
     raw_cols_stability,
     scaled_cols_stability,
     raw_cols_stab_revised,
     scaled_cols_stab_revised,
     "lineage",
     "lineage_labels"
   )
 ]
 df2_plot <- df2[, cols_to_extract]
 # The lineage and lineage_labels columns should tabulate to the same counts
 all(table(df2_plot$lineage) == table(df2_plot$lineage_labels))
 # find which stability cols to average: should contain revised foldx
 # The ensemble average needs the sign-corrected FoldX column, so refuse to
 # continue without it.
 if (!("foldx_scaled_signC" %in% colnames(df2_plot))) {
   stop("\nAbort: Foldx column has opposing sign. Can't proceed to avergae.")
 }
 cat("\nPASS: finding stability cols to average")
 # Positions of the scaled stability columns that go into the average
 cols2avg_new <- which(colnames(df2_plot) %in% scaled_cols_stab_revised)
 # Ensemble average across predictors; drop = FALSE keeps a data frame even if
 # only one column matched, which rowMeans() requires.
 df2_plot["ens_stab_new"] <- rowMeans(df2_plot[, cols2avg_new, drop = FALSE])
 head(df2_plot$position)
 head(df2_plot$mutationinformation)
 table(df2_plot["ens_stab_new"])
 # Rescale the averages to [-1, 1] with 0 as the midpoint
 df2_plot[["ens_stab_new_scaled"]] <- scales::rescale_mid(
   df2_plot[["ens_stab_new"]],
   to = c(-1, 1),
   from = c(min(df2_plot[["ens_stab_new"]]), max(df2_plot[["ens_stab_new"]])),
   mid = 0
 )
 min(df2_plot["ens_stab_new"])
 max(df2_plot["ens_stab_new"])
 # Inspection copy: the averaged columns next to the scaled result.
 # cols2avg_new holds column positions, so translate them to names here;
 # quoting the variable name would look for a column called "cols2avg_new".
 foo <- df2_plot[c(colnames(df2_plot)[cols2avg_new], "ens_stab_new_scaled")]
 min(df2_plot["ens_stab_new_scaled"])
 max(df2_plot["ens_stab_new_scaled"])
 ###########################################################
 #====================
 # Output Lineage plot
 #====================
 # Lineage distribution plot of the scaled ensemble stability, written to svg
 linD_ens_stabP <- paste0(outdir_images, tolower(gene), "_linD_ens_stabP.svg")
 cat("\nOutput plot:", linD_ens_stabP)
 svg(linD_ens_stabP, width = 10, height = 10)
 linP_dm_om <- lineage_distP(
   df2_plot,
   with_facet = FALSE,
   x_axis = "ens_stab_new_scaled",
   y_axis = "lineage_labels",
   x_lab = "Average stability",
   # alternative fill:
   # fill_categ = "mutation_info_orig", fill_categ_cols = c("#E69F00", "#999999"),
   fill_categ = "sensitivity",
   fill_categ_cols = c("red", "blue"),
   label_categories = c("Resistant", "Sensitive"),
   leg_label = "",
   my_ats = 22, # axis text size
   my_als = 22, # axis label size
   my_leg_ts = 22,
   my_leg_title = 22,
   my_strip_ts = 22,
   alpha = 0.56
 )
 linP_dm_om
 dev.off()
--- a/scripts/plotting/plotting_thesis/preformatting.R
+++ b/scripts/plotting/plotting_thesis/preformatting.R
@ -0,0 +1,236 @@
 #!/usr/bin/env Rscript       
 #source("~/git/LSHTM_analysis/config/alr.R")
 source("~/git/LSHTM_analysis/config/embb.R")
 #source("~/git/LSHTM_analysis/config/katg.R")
 #source("~/git/LSHTM_analysis/config/gid.R")
 #source("~/git/LSHTM_analysis/config/pnca.R")
 #source("~/git/LSHTM_analysis/config/rpob.R")
 # get plottting dfs 
 source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
 ###################################################################
 # FIXME: ADD distance to NA when SP replies
 # Distance columns and the cut-off applied to them
 dist_columns <- c("ligand_distance", "interface_dist")
 DistCutOff <- 10
 # Columns shared by every plotting data frame
 common_cols <- c(
   "mutationinformation",
   "X5uhc_position",
   "X5uhc_offset",
   "position",
   "dst_mode",
   "mutation_info_labels",
   "sensitivity",
   dist_columns
 )
 #===================
 # stability cols
 #===================
 raw_cols_stability <- c(
   "duet_stability_change",
   "deepddg",
   "ddg_dynamut2",
   "ddg_foldx"
 )
 scaled_cols_stability <- c(
   "duet_scaled",
   "deepddg_scaled",
   "ddg_dynamut2_scaled",
   "foldx_scaled"
 )
 outcome_cols_stability <- c(
   "duet_outcome",
   "deepddg_outcome",
   "ddg_dynamut2_outcome",
   "foldx_outcome"
 )
 #===================
 # affinity cols
 #===================
 raw_cols_affinity <- c(
   "ligand_affinity_change",
   "mmcsm_lig",
   "mcsm_ppi2_affinity",
   "mcsm_na_affinity"
 )
 scaled_cols_affinity <- c(
   "affinity_scaled",
   "mmcsm_lig_scaled",
   "mcsm_ppi2_scaled",
   "mcsm_na_scaled"
 )
 outcome_cols_affinity <- c(
   "ligand_outcome",
   "mmcsm_lig_outcome",
   "mcsm_ppi2_outcome",
   "mcsm_na_outcome"
 )
 #===================
 # conservation cols
 #===================
 raw_cols_conservation <- c(
   "consurf_score",
   "snap2_score",
   "provean_score"
 )
 scaled_cols_conservation <- c(
   "consurf_scaled",
   "snap2_scaled",
   "provean_scaled"
 )
 # CANNOT strictly be used, as the categories are not identical across tools
 # and the ConSurf ones are missing altogether
 outcome_cols_conservation <- c(
   "provean_outcome",
   "snap2_outcome",
   "consurf_colour_rev",
   "consurf_colour" # doesn't exist, use this mapping
 )
 # Every column group above, concatenated in declaration order
 all_cols <- c(
   common_cols,
   raw_cols_stability,
   scaled_cols_stability,
   outcome_cols_stability,
   raw_cols_affinity,
   scaled_cols_affinity,
   outcome_cols_affinity,
   raw_cols_conservation,
   scaled_cols_conservation,
   outcome_cols_conservation
 )
 #=======
 # output
 #=======
 # Per-gene image directory. The trailing "/" is required: the plotting scripts
 # build file names with paste0(outdir_images, tolower(gene), "_....svg"), so
 # without it the gene name is glued onto the directory name.
 outdir_images <- paste0("~/git/Writing/thesis/images/results/", tolower(gene), "/")
 ####################################
 # merged_df3: NECESSARY pre-processing
 ###################################
 df3 <- merged_df3
 #=================
 # PREFORMATTING: for consistency
 #=================
 # Label each row "R" when dst_mode is 1 and "S" otherwise
 df3$sensitivity <- ifelse(df3$dst_mode == 1, "R", "S")
 table(df3$sensitivity)
 # ConSurf labels: copy the colour column into a factor and relabel its levels
 consurf_colOld <- "consurf_colour_rev"
 consurf_colNew <- "consurf_outcome"
 df3[[consurf_colNew]] <- as.factor(df3[[consurf_colOld]])
 df3[[consurf_colNew]]
 # NOTE(review): this relabelling assumes the factor has exactly ten levels in
 # this order - confirm against the data.
 levels(df3$consurf_outcome) <- c("nsd", 1:9)
 levels(df3$consurf_outcome)
 # SNAP2 labels: capitalise the two outcome categories
 snap2_colname <- "snap2_outcome"
 df3[[snap2_colname]] <- str_replace(
   str_replace(df3[[snap2_colname]], "effect", "Effect"),
   "neutral",
   "Neutral"
 )
 #  for ref: not needed perse as function already does this and assigns labels for barplots
 # labels_duet = levels(as.factor(df3$duet_outcome))
 # labels_foldx = levels(as.factor(df3$foldx_outcome))
 # labels_deepddg = levels(as.factor(df3$deepddg_outcome))
 # labels_ddg_dynamut2_outcome = levels(as.factor(df3$ddg_dynamut2_outcome))
 # 
 # labels_lig = levels(as.factor(df3_lig$ligand_outcome))
 # labels_mmlig = levels(as.factor(df3_lig$mmcsm_lig_outcome))
 # labels_ppi2 = levels(as.factor(df3_ppi2$mcsm_ppi2_outcome))
 # 
 # labels_provean = levels(as.factor(df3$provean_outcome))
 # labels_snap2   = levels(as.factor(df3$snap2_outcome))
 # labels_consurf = levels(as.factor(df3$consurf_colour_rev))
 # df3$consurf_colour_rev = as.factor(df3$consurf_colour_rev )
 ##############################################################################
 #######################################
 # merged_df2: NECESSARY pre-processing
 ######################################
 df2 <- merged_df2
 #=================
 # PREFORMATTING: for consistency
 #=================
 # Label each row "R" when dst_mode is 1 and "S" otherwise
 df2$sensitivity <- ifelse(df2$dst_mode == 1, "R", "S")
 table(df2$sensitivity)
 #----------------------------------------------------
 # Create dst2: fill NA in dst with the value of dst_mode
 # for epistasis
 #----------------------------------------------------
 # The fallback branch must read from df2 itself; the previous `df2f$dst`
 # referred to an object that does not exist.
 df2$dst2 <- ifelse(is.na(df2$dst), df2$dst_mode, df2$dst)
 #----------------------------------------------------
 # reverse signs for foldx scaled values for
 # to allow average with other tools
 #----------------------------------------------------
 # Reverse the sign of the FoldX values so they can be averaged with the other
 # tools. Negation is used rather than abs(): abs() would make every value
 # positive and lose the direction of the effect instead of flipping it.
 # NOTE(review): assumes FoldX uses the opposite sign convention to the other
 # predictors, as the comments in this script state - confirm.
 head(df2["ddg_foldx"])
 df2["ddg_foldxC"] <- -df2$ddg_foldx
 head(df2["ddg_foldxC"])
 head(df2["foldx_scaled"])
 df2["foldx_scaled_signC"] <- -df2$foldx_scaled
 head(df2["foldx_scaled_signC"])
 # Swap the original FoldX columns for the sign-corrected ones in the lists of
 # raw and scaled stability columns
 rm_foldx_cols <- c("ddg_foldx", "foldx_scaled")
 raw_cols_stab_revised <- c(
   raw_cols_stability[!raw_cols_stability %in% rm_foldx_cols],
   "ddg_foldxC"
 )
 scaled_cols_stab_revised <- c(
   scaled_cols_stability[!scaled_cols_stability %in% rm_foldx_cols],
   "foldx_scaled_signC"
 )
 ######################################################
 # Affinity related variables
 # Distance cut-off and the distance column name for each affinity type
 DistCutOff <- 10
 LigDist_colname  # = "ligand_distance" # from globals
 ppi2Dist_colname <- "interface_dist"
 naDist_colname <- "TBC"
 ######################################################
 # corr colnames
 # drug
 # "dst_mode"
 # "ligand_distance"
 # "DUET"
 # "mCSM-lig"       
 # "FoldX"
 # "DeepDDG"
 # "ASA"
 # "RSA"
 # "KD"             
 # "RD"
 # "Consurf"
 # "SNAP2"
 # "MAF"             
 # "Log (OR)"       
 # "-Log (P)"
 # "Dynamut2"
 # "mCSM-PPI2"       
 # "interface_dist" 
 # Columns for the stability correlation plot. Currently left out:
 # "ASA", "RSA", "KD", "RD", "Consurf", "SNAP2", "mCSM-lig",
 # "ligand_distance", "mCSM-PPI2", "interface_dist"
 corr_ps_colnames <- c(
   "DUET",
   "FoldX",
   "DeepDDG",
   "Dynamut2",
   "MAF",
   "Log (OR)",
   "-Log (P)",
   "dst_mode",
   drug
 )
 # Columns for the ligand-affinity correlation plot
 corr_lig_colnames <- c(
   "mCSM-lig",
   "MAF",
   "Log (OR)",
   "-Log (P)",
   "ligand_distance",
   "dst_mode",
   drug
 )
 # Columns for the PPI2-affinity correlation plot
 corr_ppi2_colnames <- c(
   "mCSM-PPI2",
   "MAF",
   "Log (OR)",
   "-Log (P)",
   "interface_dist",
   "dst_mode",
   drug
 )
--- a/scripts/plotting/replaceBfactor_pdb.R
+++ b/scripts/plotting/replaceBfactor_pdb.R
@ -0,0 +1,332 @@
 #!/usr/bin/env Rscript                                                  
 #########################################################
 # TASK: Replace B-factors in the pdb file with the mean
 # normalised stability values.
 # read pdb file
 # make two copies so you can replace B factors for 1)duet
 # 2)affinity values and output 2 separate pdbs for
 # rendering on chimera
 # read mcsm mean stability value files
 # extract the respective mean values and assign to the
 # b-factor column within their respective pdbs
 # generate some distribution plots for inspection
 #########################################################
 # working dir and loading libraries
 getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting")
 cat(c(getwd(),"\n"))
 #source("~/git/LSHTM_analysis/scripts/Header_TT.R")
 library(bio3d)
 # library() fails immediately if getopt is not installed; require() would only
 # return FALSE and let the script break later at the getopt() call.
 library("getopt", quietly = TRUE) # cmd parse arguments
 #========================================================
 #drug = "pyrazinamide"
 #gene = "pncA"
 # Command line args: --drug/-d and --gene/-g, each taking a character value
 spec <- matrix(c(
   "drug", "d", 1, "character",
   "gene", "g", 1, "character"
 ), byrow = TRUE, ncol = 4)
 opt <- getopt(spec)
 drug <- opt$drug
 gene <- opt$gene
 # Both arguments are mandatory; `||` is the scalar operator intended for `if`
 if (is.null(drug) || is.null(gene)) {
   stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
 }
 #========================================================
 gene_match <- paste0(gene, "_p.")
 cat(gene_match)
 #=============
 # directories
 #=============
 datadir <- "~/git/Data"
 indir <- file.path(datadir, drug, "input")
 outdir <- file.path("~/git/Data", drug, "output")
 #outdir_plots = paste0("~/git/Data", "/", drug, "/output/plots")
 outdir_plots <- paste0("~/git/Writing/thesis/images/results/", tolower(gene))
 #======
 # input
 #======
 # Structure file to be recoloured
 in_filename_pdb <- paste0(tolower(gene), "_complex.pdb")
 infile_pdb <- file.path(indir, in_filename_pdb)
 cat(paste0("Input file:", infile_pdb))
 # Averaged ensemble stability/affinity values, read from the plots directory
 #in_filename_mean_stability = paste0(tolower(gene), "_mean_stability.csv")
 #infile_mean_stability = paste0(outdir, "/", in_filename_mean_stability)
 in_filename_mean_stability <- paste0(tolower(gene), "_mean_ens_stab_aff.csv")
 infile_mean_stability <- file.path(outdir_plots, in_filename_mean_stability)
 cat(paste0("Input file:", infile_mean_stability))
 #=======
 # output
 #=======
 # One pdb per measure: stability ("duet" naming kept) and ligand affinity
 #out_filename_duet_mspdb = paste0(tolower(gene), "_complex_bduet_ms.pdb")
 out_filename_duet_mspdb <- paste0(tolower(gene), "_complex_b_stab_ms.pdb")
 outfile_duet_mspdb <- file.path(outdir_plots, out_filename_duet_mspdb)
 print(paste0("Output file:", outfile_duet_mspdb))
 out_filename_lig_mspdb <- paste0(tolower(gene), "_complex_blig_ms.pdb")
 outfile_lig_mspdb <- file.path(outdir_plots, out_filename_lig_mspdb)
 print(paste0("Output file:", outfile_lig_mspdb))
 #%%===============================================================
 #NOTE: duet here refers to the ensemble stability values
 ###########################
 # Read file: average stability values
 # or mcsm_normalised file
 ###########################
 # Averaged stability/affinity values, one row per position
 my_df <- read.csv(infile_mean_stability, header = TRUE)
 str(my_df)
 #############
 # Read pdb
 #############
 my_pdb <- read.pdb(
   infile_pdb,
   maxlines = -1,
   multi = FALSE,
   rm.insert = FALSE,
   rm.alt = TRUE,
   ATOM.only = FALSE,
   hex = FALSE,
   verbose = TRUE
 )
 rm(in_filename_mean_stability, in_filename_pdb)
 # Two independent copies of the structure: one for stability ("duet"), one
 # for ligand affinity
 my_pdb_duet <- my_pdb
 my_pdb_lig <- my_pdb
 #=========================================================
 # Replacing B factor with mean stability scores
 # within the respective dfs
 #==========================================================
 # The "atom" element of each pdb object is a data frame; work on that
 #df_duet = my_pdb_duet[[1]]
 df_duet <- my_pdb_duet[["atom"]]
 df_lig <- my_pdb_lig[["atom"]]
 # Untouched copies, needed for the downstream dimension checks
 d2_duet <- df_duet
 d2_lig <- df_lig
 # Range of the original B-factors, for inspection
 max(df_duet$b)
 min(df_duet$b)
 max(df_lig$b)
 min(df_lig$b)
 #*******************************************
 # histograms and density plots for inspection
 # 1: original B-factors
 # 2: original mean stability values
 # 3: replaced B-factors with mean stability values
 #*********************************************
 # Set the margin on all sides
 # 3 x 4 grid of panels with outer margins reserved for the shared titles
 par(
   oma = c(3, 2, 3, 0),
   mar = c(1, 3, 5, 2),
   #mfrow = c(3, 2),
   mfrow = c(3, 4)
 )
 #=============
 # Row 1 plots: original B-factors
 # duet and affinity
 #=============
 hist(df_duet$b, xlab = "", main = "Bfactor stability")
 plot(density(df_duet$b), xlab = "", main = "Bfactor stability")
 hist(df_lig$b, xlab = "", main = "Bfactor affinity")
 plot(density(df_lig$b), xlab = "", main = "Bfactor affinity")
 #=============
 # Row 2 plots: original mean stability values
 # duet and affinity
 #=============
 # (previously plotted my_df$averaged_duet and my_df$averaged_affinity)
 hist(my_df$avg_ens_stability_scaled, xlab = "", main = "mean stability values")
 plot(
   density(my_df$avg_ens_stability_scaled),
   xlab = "",
   main = "mean stability values"
 )
 hist(my_df$avg_ens_affinity_scaled, xlab = "", main = "mean affinity values")
 plot(
   density(my_df$avg_ens_affinity_scaled),
   xlab = "",
   main = "mean affinity values"
 )
 #==============
 # Row 3 plots: replaced B-factors with mean stability values
 # After actual replacement in the b factor column
 #===============
 ################################################################
 #=========
 # step 0_P1: DONT RUN once you have double checked the matched output
 #=========
 # sanity check:  match and assign to a separate column to double check
 # colnames(my_df)
 # df_duet$duet_scaled = my_df$averge_duet_scaled[match(df_duet$resno, my_df$position)]
 #=========
 # step 1_P1
 #=========
 # Be brave and replace in place now (don"t run sanity check)
 # this makes all the B-factor values in the non-matched positions as NA
 #df_duet$b = my_df$averaged_duet_scaled[match(df_duet$resno, my_df$position)]
 #df_lig$b = my_df$averaged_affinity_scaled[match(df_lig$resno, my_df$position)]
 # Look up each atom's residue number among the positions in my_df and take the
 # matching averaged value as its new B-factor; unmatched residues become NA.
 df_duet$b <- my_df$avg_ens_stability_scaled[match(df_duet$resno, my_df$position)]
 df_lig$b <- my_df$avg_ens_affinity_scaled[match(df_lig$resno, my_df$position)]
 #=========
 # step 2_P1
 #=========
 # count NA in Bfactor
 b_na_duet <- sum(is.na(df_duet$b))
 b_na_duet
 b_na_lig <- sum(is.na(df_lig$b))
 b_na_lig
 # Count the number of 0's in the B-factor. na.rm = TRUE is needed because the
 # NAs have not been replaced yet; without it the count itself would be NA.
 sum(df_duet$b == 0, na.rm = TRUE)
 sum(df_lig$b == 0, na.rm = TRUE)
 # Replace every NA in the B-factor with the placeholder na_rep (2, not 0).
 # NOTE(review): presumably chosen to lie outside the range of the scaled
 # values so unmatched positions stay distinguishable - confirm.
 na_rep <- 2
 df_duet$b[is.na(df_duet$b)] <- na_rep
 df_lig$b[is.na(df_lig$b)] <- na_rep
 # # sanity check: should be 0 and True
 # # duet and lig
 # if ( (sum(df_duet$b == na_rep) == b_na_duet) && (sum(df_lig$b == na_rep) == b_na_lig) ) {
 #   print ("PASS: NA's replaced with 0s successfully in df_duet and df_lig")
 # } else {
 #   print("FAIL: NA replacement in df_duet NOT successful")
 #   quit()
 # }
 # 
 # max(df_duet$b); min(df_duet$b)
 # 
 # # sanity checks: should be True
 # if( (max(df_duet$b) == max(my_df$avg_ens_stability_scaled)) & (min(df_duet$b) == min(my_df$avg_ens_stability_scaled)) ){
 #   print("PASS: B-factors replaced correctly in df_duet")
 # } else {
 #   print ("FAIL: To replace B-factors in df_duet")
 #   quit()
 # }
 # if( (max(df_lig$b) == max(my_df$avg_ens_affinity_scaled)) & (min(df_lig$b) == min(my_df$avg_ens_affinity_scaled)) ){
 #   print("PASS: B-factors replaced correctly in df_lig")
 # } else {
 #   print ("FAIL: To replace B-factors in df_lig")
 #   quit()
 # }
 #=========
 # step 3_P1
 #=========
 # sanity check: dim should be same before reassignment
 # Replacing the B-factor column must not change the shape of either atom
 # table. Compare rows and columns with the scalar `&&`, and abort with an error
 # on mismatch so the script exits with a non-zero status; the previous bare
 # quit() ended the run with status 0.
 if (identical(dim(df_duet), dim(d2_duet)) &&
     identical(dim(df_lig), dim(d2_lig))) {
   print("PASS: Dims of both dfs as expected")
 } else {
   stop("FAIL: Dims mismatch")
 }
 #=========
 # step 4_P1:
 # VERY important
 #=========
 # assign it back to the pdb file
 # Put the modified atom tables back into their pdb objects
 my_pdb_duet[["atom"]] <- df_duet
 max(df_duet$b)
 min(df_duet$b)
 table(df_duet$b)
 sum(is.na(df_duet$b))
 my_pdb_lig[["atom"]] <- df_lig
 max(df_lig$b)
 min(df_lig$b)
 #=========
 # step 5_P1
 #=========
 # Write one pdb per measure
 cat(paste0("output file duet mean stability pdb:", outfile_duet_mspdb))
 write.pdb(my_pdb_duet, outfile_duet_mspdb)
 cat(paste0("output file ligand mean stability pdb:", outfile_lig_mspdb))
 write.pdb(my_pdb_lig, outfile_lig_mspdb)
 #============================
 # Add the 3rd histogram and density plots for comparisons
 #============================
 # Plots continued...
 # Row 3 plots: hist and density of replaced B-factors with stability values
 # Row 3: histogram and density of the B-factors after replacement
 hist(df_duet$b, xlab = "", main = "repalcedB duet")
 plot(density(df_duet$b), xlab = "", main = "replacedB duet")
 hist(df_lig$b, xlab = "", main = "repalcedB affinity")
 plot(density(df_lig$b), xlab = "", main = "replacedB affinity")
 # Shared titles drawn in the outer margins: y label on the left, heading on top
 mtext(text = "Frequency", side = 2, line = 0, outer = TRUE)
 mtext(
   text = paste0(tolower(gene), ": Stability Distribution"),
   side = 3,
   line = 0,
   outer = TRUE
 )
 #============================================
 #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 # NOTE: This replaced B-factor distribution has the same
 # x-axis as the PredAff normalised values, but the distribution
 # is affected because one value is over-inflated (it has an additional blip):
 # the positions not associated with resistance, i.e. all the positions
 # where there are no SNPs, have been assigned the placeholder na_rep (not 0).
 #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!