import commit

2020-01-08 16:15:33 +00:00 · 2020-01-08 16:15:33 +00:00 · bccfe68192
commit bccfe68192
39 changed files with 6837 additions and 0 deletions
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/.RData
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/.RData
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/.Rhistory
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/.Rhistory
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/OR_PS_Ligand_combined_plot.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/OR_PS_Ligand_combined_plot.R
@ -0,0 +1,250 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+#source("barplot_colour_function.R")
+require(cowplot)
+
+########################################################################
+#		 Read file: call script for combining df for PS			   	   #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for OR and stability plots
+# you need merged_df3_comp
+# since these are matched 
+# to allow pairwise corr
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3_comp
+#my_df = merged_df3
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# sanity check
+# Ensure correct data type in columns to plot: OR needs to be numeric
+is.numeric(my_df$OR)
+#[1] TRUE
+
+#<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+# FOR PS Plots
+#<<<<<<<<<<<<<<<<<<<
+
+PS_df  = my_df
+
+rm(my_df)
+#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
+
+########################################################################
+#		 Read file: call script for combining df for lig		   	   #
+########################################################################
+
+getwd()
+
+source("combining_two_df_lig.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for OR and stability plots
+# you need merged_df3_comp
+# since these are matched 
+# to allow pairwise corr
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df2  = merged_df3_comp
+#my_df2 = merged_df3
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df2)
+str(my_df2)
+
+# sanity check
+# Ensure correct data type in columns to plot: OR needs to be numeric
+is.numeric(my_df2$OR)
+#[1] TRUE
+
+# sanity check: should be <10
+if (max(my_df2$Dis_lig_Ang) < 10){
+  print ("Sanity check passed: lig data is <10Ang")
+}else{
+  print ("Error: data should be filtered to be within 10Ang")
+}
+
+#<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+# FOR Lig Plots
+#<<<<<<<<<<<<<<<<
+
+Lig_df  = my_df2
+
+rm(my_df2)
+
+#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
+
+#############
+# Plots: Bubble plot
+# x = Position, Y = stability
+# size of dots = OR
+# col: stability
+#############
+
+#=================
+# generate plot 1: DUET vs OR by position as geom_points
+#=================  
+
+my_ats = 20 # axis text size
+my_als = 22 # axis label size
+
+# Spelling Correction: made redundant as already corrected at the source
+
+#PS_df$DUET_outcome[PS_df$DUET_outcome=='Stabilizing'] <- 'Stabilising'
+#PS_df$DUET_outcome[PS_df$DUET_outcome=='Destabilizing'] <- 'Destabilising'
+
+table(PS_df$DUET_outcome) ; sum(table(PS_df$DUET_outcome))
+
+g = ggplot(PS_df, aes(x = factor(Position)
+                   , y = ratioDUET))
+
+p1 = g + 
+  geom_point(aes(col = DUET_outcome
+                 , size = OR)) +
+  theme(axis.text.x = element_text(size = my_ats
+                                   , angle = 90
+                                   , hjust = 1
+                                   , vjust = 0.4)
+        , axis.text.y = element_text(size = my_ats
+                                     , angle = 0
+                                     , hjust = 1
+                                     , vjust = 0)
+        , axis.title.x = element_text(size = my_als)
+        , axis.title.y = element_text(size = my_als) 
+        , legend.text = element_text(size = my_als)
+        , legend.title = element_text(size = my_als) ) +
+  #, legend.key.size = unit(1, "cm")) +
+  labs(title = ""
+       , x = "Position"
+       , y = "DUET(PS)"
+       , size = "Odds Ratio"
+       , colour = "DUET Outcome") +
+  guides(colour = guide_legend(override.aes = list(size=4))) 
+
+p1 
+
+#=================
+# generate plot 2: Lig vs OR by position as geom_points
+#=================  
+my_ats = 20 # axis text size
+my_als = 22 # axis label size
+
+# Spelling Correction: made redundant as already corrected at the source
+
+#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Stabilizing'] <- 'Stabilising'
+#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Destabilizing'] <- 'Destabilising'
+
+table(Lig_df$Lig_outcome)
+
+g = ggplot(Lig_df, aes(x = factor(Position)
+                   , y = ratioPredAff))
+
+p2 = g + 
+  geom_point(aes(col = Lig_outcome
+                   , size = OR))+
+  theme(axis.text.x = element_text(size = my_ats
+                                   , angle = 90
+                                   , hjust = 1
+                                   , vjust = 0.4)
+        , axis.text.y = element_text(size = my_ats
+                                     , angle = 0
+                                     , hjust = 1
+                                     , vjust = 0)
+        , axis.title.x = element_text(size = my_als)
+        , axis.title.y = element_text(size = my_als) 
+        , legend.text = element_text(size = my_als)
+        , legend.title = element_text(size = my_als) ) +
+  #, legend.key.size = unit(1, "cm")) +
+  labs(title = ""
+       , x = "Position"
+       , y = "Ligand Affinity"
+       , size = "Odds Ratio"
+       , colour = "Ligand Outcome"
+       ) +
+  guides(colour = guide_legend(override.aes = list(size=4))) 
+
+p2
+
+#======================
+#combine using cowplot
+#======================
+# set output dir for plots: svg() below writes relative to the working dir
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+svg('PS_Lig_OR_combined.svg', width = 32, height = 12) #inches
+#png('PS_Lig_OR_combined.png', width = 2800, height = 1080) #300dpi
+theme_set(theme_gray()) # to preserve default theme
+
+printFile = cowplot::plot_grid(plot_grid(p1, p2
+                             , ncol = 1
+                             , align = 'v'
+                             , labels = c("A", "B")
+                             , label_size = my_als+5))
+print(printFile)
+dev.off()
+
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R
@ -0,0 +1,154 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+
+########################################################################
+#		 Read file: call script for combining df for lig		   	   #
+########################################################################
+
+source("../combining_two_df_lig.R") 
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for Lig plots
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+my_df  = merged_df3
+#my_df = merged_df3_comp
+#%%%%%%%%%%%%%%%%%%%%%%%%
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+#############################
+# Extra sanity check:
+# for mcsm_lig ONLY
+# Dis_lig_Ang should be <10
+#############################
+
+if (max(my_df$Dis_lig_Ang) < 10){
+  print ("Sanity check passed: lig data is <10Ang")
+}else{
+  print ("Error: data should be filtered to be within 10Ang")
+}
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Plot: Barplot with scores (unordered)
+# corresponds to Lig_outcome
+# Stacked Barplot with colours: Lig_outcome @ position coloured by 
+# Lig_outcome. This is a barplot where each bar corresponds 
+# to a SNP and is coloured by its corresponding Lig_outcome.
+#============================
+
+#===================
+# Data for plots
+#===================
+
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT: df is the working copy used by every check and plot below
+df  = my_df
+#%%%%%%%%%%%%%%%%%%%%%%%%
+
+rm(my_df)
+
+# sanity checks: my_df was removed above, so read the positions from df
+upos = unique(df$Position)
+
+# should be a factor
+is.factor(df$Lig_outcome)
+#TRUE
+
+table(df$Lig_outcome)
+
+# should be -1 and 1: may not be in this case because you have filtered the data
+# FIXME: normalisation before or after filtering?
+min(df$ratioPredAff) #
+max(df$ratioPredAff) #
+
+# sanity checks
+tapply(df$ratioPredAff, df$Lig_outcome, min)
+tapply(df$ratioPredAff, df$Lig_outcome, max)
+
+#******************
+# generate plot
+#******************
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+my_title = "Ligand affinity"
+
+# axis label size
+my_xaxls = 13
+my_yaxls = 15
+
+# axes text size
+my_xaxts = 15
+my_yaxts = 15
+
+# no ordering of x-axis
+g = ggplot(df, aes(factor(Position, ordered = T)))
+g + 
+  geom_bar(aes(fill = Lig_outcome), colour = "grey") +
+  theme( axis.text.x = element_text(size = my_xaxls
+                                    , angle = 90
+                                    , hjust = 1
+                                    , vjust = 0.4)
+         , axis.text.y = element_text(size = my_yaxls 
+                                      , angle = 0
+                                      , hjust = 1
+                                      , vjust = 0)
+         , axis.title.x = element_text(size = my_xaxts)
+         , axis.title.y = element_text(size = my_yaxts ) ) +
+  labs(title = my_title
+       , x = "Position"
+       , y = "Frequency")
+
+# for sanity and good practice
+rm(df)
+#======================= end of plot
+# axis colours labels
+# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
+# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R
@ -0,0 +1,149 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 		Installing and loading required packages and functions		   #	
+########################################################################
+
+source("../Header_TT.R")
+
+########################################################################
+#		 Read file: call script for combining df for PS			   	   #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#==========================
+
+###########################
+# Data for DUET plots
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3 
+#my_df  = merged_df3_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+# sanity check
+is.factor(my_df$DUET_outcome)
+my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
+is.factor(my_df$DUET_outcome)
+#[1] TRUE
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Plot 2: Barplot with scores (unordered)
+# corresponds to DUET_outcome
+# Stacked Barplot with colours: DUET_outcome @ position coloured by 
+# DUET outcome. This is a barplot where each bar corresponds 
+# to a SNP and is coloured by its corresponding DUET_outcome
+#============================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT: df is the working copy used by every check and plot below
+df  = my_df
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(my_df)
+
+# sanity checks: my_df was removed above, so all checks must use df
+upos = unique(df$Position)
+
+# should be a factor
+is.factor(df$DUET_outcome)
+#[1] TRUE
+
+table(df$DUET_outcome)
+
+# should be -1 and 1
+min(df$ratioDUET)
+max(df$ratioDUET)
+
+tapply(df$ratioDUET, df$DUET_outcome, min)
+tapply(df$ratioDUET, df$DUET_outcome, max)
+
+#******************
+# generate plot
+#******************
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+my_title = "Protein stability (DUET)"
+
+# axis label size
+my_xaxls = 13
+my_yaxls = 15
+
+# axes text size
+my_xaxts = 15
+my_yaxts = 15
+
+# no ordering of x-axis
+g = ggplot(df, aes(factor(Position, ordered = T)))
+g + 
+  geom_bar(aes(fill = DUET_outcome), colour = "grey") +
+  
+  theme( axis.text.x = element_text(size = my_xaxls
+                                    , angle = 90
+                                    , hjust = 1
+                                    , vjust = 0.4)
+         , axis.text.y = element_text(size = my_yaxls 
+                                      , angle = 0
+                                      , hjust = 1
+                                      , vjust = 0)
+         , axis.title.x = element_text(size = my_xaxts)
+         , axis.title.y = element_text(size = my_yaxts ) ) +
+  labs(title = my_title
+       , x = "Position"
+       , y = "Frequency")
+
+# for sanity and good practice
+rm(df)
+#======================= end of plot
+# axis colours labels
+# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
+# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R
@ -0,0 +1,202 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") 
+getwd()
+
+########################################################################
+# 		Installing and loading required packages and functions		   #	
+########################################################################
+
+source("../Header_TT.R")
+source("../barplot_colour_function.R")
+
+########################################################################
+#		 Read file: call script for combining df for lig		   	   #
+########################################################################
+
+source("../combining_two_df_lig.R") 
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for Lig plots
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3
+#my_df = merged_df3_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+# sanity check (column names are case-sensitive: the column is Lig_outcome)
+is.factor(my_df$Lig_outcome)
+my_df$Lig_outcome = as.factor(my_df$Lig_outcome)
+is.factor(my_df$Lig_outcome)
+#[1] TRUE
+
+#############################
+# Extra sanity check:
+# for mcsm_lig ONLY
+# Dis_lig_Ang should be <10
+#############################
+
+if (max(my_df$Dis_lig_Ang) < 10){
+  print ("Sanity check passed: lig data is <10Ang")
+}else{
+  print ("Error: data should be filtered to be within 10Ang")
+}
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Plot: Barplot with scores (unordered)
+# corresponds to Lig_outcome
+# Stacked Barplot with colours: Lig_outcome @ position coloured by 
+# stability scores. This is a barplot where each bar corresponds 
+# to a SNP and is coloured by its corresponding Lig stability value.
+# Normalised values (range between -1 and 1 ) to aid visualisation
+# NOTE: since barplot plots discrete values, colour = score, so number of
+# colours will be equal to the no. of unique normalised scores 
+# rather than a continuous scale
+# will require generating the colour scale separately.
+#============================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+df  = my_df 
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(my_df)
+
+# sanity checks
+table(df$Lig_outcome)
+
+# should be -1 and 1: may not be in this case because you have filtered the data
+# FIXME: normalisation before or after filtering?
+min(df$ratioPredAff) #
+max(df$ratioPredAff) #
+
+# sanity checks
+# very important!!!!
+tapply(df$ratioPredAff, df$Lig_outcome, min)
+
+tapply(df$ratioPredAff, df$Lig_outcome, max)
+
+
+#******************
+# generate plot
+#******************
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+# My colour FUNCTION: based on group and subgroup
+# in my case;
+# df = df
+# group = Lig_outcome
+# subgroup = normalised score i.e ratioPredAff
+
+# Prepare data: round off ratioLig scores
+# round off to 3 significant digits:
+# 165 if no rounding is performed: used to generate the original graph
+# 156 if rounded to 3 places
+# FIXME: check if reducing precision creates any ML prob
+
+# check unique values in normalised data
+u = unique(df$ratioPredAff) 
+
+# <<<<< -------------------------------------------
+# Run this section if rounding is to be used
+# specify number of decimal places for rounding
+n = 3
+df$ratioLigR = round(df$ratioPredAff, n)
+u = unique(df$ratioLigR) # 156
+# create an extra column called group which contains the "gp name and score"
+# so colours can be generated for each unique value in this column
+my_grp = df$ratioLigR
+df$group <- paste0(df$Lig_outcome, "_", my_grp) # paste0 takes no sep argument
+
+# else
+# uncomment the below if rounding is not required
+
+#my_grp = df$ratioPredAff
+#df$group <- paste0(df$Lig_outcome, "_", my_grp)
+
+# <<<<< -----------------------------------------------
+# NOTE(review): "my_grp" is a global vector, presumably not a column of df - confirm ColourPalleteMulti resolves it
+# Call the function to create the palette based on the group defined above
+colours <- ColourPalleteMulti(df, "Lig_outcome", "my_grp")
+my_title = "Ligand affinity"
+
+# axis label size
+my_xaxls = 13
+my_yaxls = 15
+
+# axes text size
+my_xaxts = 15
+my_yaxts = 15
+
+# no ordering of x-axis
+g = ggplot(df, aes(factor(Position, ordered = T)))
+g + 
+  geom_bar(aes(fill = group), colour = "grey") +
+  scale_fill_manual( values = colours
+                     , guide = 'none') +
+  theme( axis.text.x = element_text(size = my_xaxls
+                                    , angle = 90
+                                    , hjust = 1
+                                    , vjust = 0.4)
+         , axis.text.y = element_text(size = my_yaxls 
+                                      , angle = 0
+                                      , hjust = 1
+                                      , vjust = 0)
+         , axis.title.x = element_text(size = my_xaxts)
+         , axis.title.y = element_text(size = my_yaxts ) ) +
+  labs(title = my_title
+       , x = "Position"
+       , y = "Frequency")
+
+# for sanity and good practice
+rm(df)
+#======================= end of plot
+# axis colours labels
+# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
+# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R
@ -0,0 +1,192 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 		Installing and loading required packages and functions		   #	
+########################################################################
+
+source("../Header_TT.R")
+source("../barplot_colour_function.R")
+
+########################################################################
+#		 Read file: call script for combining df for PS			   	   #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for DUET plots
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3 
+#my_df  = merged_df3_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+# sanity check
+is.factor(my_df$DUET_outcome)
+my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
+is.factor(my_df$DUET_outcome)
+#[1] TRUE
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Barplot with scores (unordered)
+# corresponds to DUET_outcome
+# Stacked Barplot with colours: DUET_outcome @ position coloured by 
+# stability scores. This is a barplot where each bar corresponds 
+# to a SNP and is coloured by its corresponding DUET stability value.
+# Normalised values (range between -1 and 1 ) to aid visualisation
+# NOTE: since barplot plots discrete values, colour = score, so number of
+# colours will be equal to the no. of unique normalised scores 
+# rather than a continuous scale
+# will require generating the colour scale separately.
+#============================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT: df is the working copy used by every check and plot below
+df  = my_df
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(my_df)
+
+# sanity checks: my_df was removed above, so all checks must use df
+upos = unique(df$Position)
+
+# should be a factor
+is.factor(df$DUET_outcome)
+#[1] TRUE
+
+table(df$DUET_outcome)
+
+# should be -1 and 1
+min(df$ratioDUET)
+max(df$ratioDUET)
+
+tapply(df$ratioDUET, df$DUET_outcome, min)
+tapply(df$ratioDUET, df$DUET_outcome, max)
+
+#******************
+# generate plot
+#******************
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+# My colour FUNCTION: based on group and subgroup
+# in my case;
+# df = df
+# group = DUET_outcome
+# subgroup = normalised score i.e ratioDUET
+
+# Prepare data: round off ratioDUET scores
+# round off to 3 significant digits:
+# 323 if no rounding is performed: used to generate the original graph
+# 287 if rounded to 3 places
+# FIXME: check if reducing precision creates any ML prob
+
+# check unique values in normalised data
+u = unique(df$ratioDUET) 
+
+# <<<<< -------------------------------------------
+# Run this section if rounding is to be used
+# specify number of decimal places for rounding
+n = 3
+df$ratioDUETR = round(df$ratioDUET, n)
+u = unique(df$ratioDUETR)
+# create an extra column called group which contains the "gp name and score"
+# so colours can be generated for each unique value in this column
+my_grp = df$ratioDUETR
+df$group <- paste0(df$DUET_outcome, "_", my_grp) # paste0 takes no sep argument
+
+# else
+# uncomment the below if rounding is not required
+
+#my_grp = df$ratioDUET
+#df$group <- paste0(df$DUET_outcome, "_", my_grp)
+
+# <<<<< -----------------------------------------------
+# NOTE(review): "my_grp" is a global vector, presumably not a column of df - confirm ColourPalleteMulti resolves it
+# Call the function to create the palette based on the group defined above
+colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp")
+my_title = "Protein stability (DUET)"
+
+# axis label size
+my_xaxls = 13
+my_yaxls = 15
+
+# axes text size
+my_xaxts = 15
+my_yaxts = 15
+
+# no ordering of x-axis
+g = ggplot(df, aes(factor(Position, ordered = T)))
+g + 
+  geom_bar(aes(fill = group), colour = "grey") +
+  scale_fill_manual( values = colours
+                     , guide = 'none') +
+  theme( axis.text.x = element_text(size = my_xaxls
+                                    , angle = 90
+                                    , hjust = 1
+                                    , vjust = 0.4)
+         , axis.text.y = element_text(size = my_yaxls 
+                                      , angle = 0
+                                      , hjust = 1
+                                      , vjust = 0)
+         , axis.title.x = element_text(size = my_xaxts)
+         , axis.title.y = element_text(size = my_yaxts ) ) +
+  labs(title = my_title
+       , x = "Position"
+       , y = "Frequency")
+
+# for sanity and good practice
+rm(df)
+#======================= end of plot
+# axis colours labels
+# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
+# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R
@ -0,0 +1,215 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+
+#require(data.table)
+#require(dplyr)
+
+########################################################################
+#		 Read file: call script for combining df for lig		   	   #
+########################################################################
+
+source("../combining_two_df_lig.R") 
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for Lig plots
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3
+#my_df = merged_df3_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+# sanity check (column names are case-sensitive: the column is Lig_outcome)
+is.factor(my_df$Lig_outcome)
+my_df$Lig_outcome = as.factor(my_df$Lig_outcome)
+is.factor(my_df$Lig_outcome)
+#[1] TRUE
+
+#############################
+# Extra sanity check:
+# for mcsm_lig ONLY
+# Dis_lig_Ang should be <10
+#############################
+
+if (max(my_df$Dis_lig_Ang) < 10){
+  print ("Sanity check passed: lig data is <10Ang")
+}else{
+  print ("Error: data should be filtered to be within 10Ang")
+}
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#===========================
+# Plot: Basic barplots 
+#===========================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT 
+df  = my_df 
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+rm(my_df)
+
+# sanity checks
+str(df)
+
+if (identical(df$Position, df$position)){
+  print("Sanity check passed: Columns 'Position' and 'position' are identical")
+} else{
+  print("Error!: Check column names and info contained")
+}
+
+#****************
+# generate plot: No of stabilising and destabilising muts
+#****************
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+svg('basic_barplots_LIG.svg')
+
+my_ats = 25 # axis text size
+my_als = 22 # axis label size
+
+# The plot is assigned to prinfFile because it is print()ed into the svg
+# device below; swap the next two lines to draw on the screen instead
+g = ggplot(df, aes(x = Lig_outcome))
+prinfFile = g + geom_bar(
+  #g + geom_bar(
+  aes(fill = Lig_outcome)
+  , show.legend = TRUE
+) + geom_label(
+  stat = "count"
+  , aes(label = ..count..)
+  , color = "black"
+  , show.legend = FALSE
+  , size = 10) + theme(
+    axis.text.x = element_blank()
+    , axis.title.x = element_blank()
+    , axis.title.y = element_text(size=my_als)
+    , axis.text.y = element_text(size = my_ats)
+    , legend.position = c(0.73,0.8)
+    , legend.text = element_text(size=my_als-2)
+    , legend.title = element_text(size=my_als)
+    , plot.title = element_blank()
+  ) + labs(
+    title = ""
+    , y = "Number of SNPs"
+    #, fill='Ligand Outcome'
+  )  + scale_fill_discrete(name = "Ligand Outcome"
+                           , labels = c("Destabilising", "Stabilising"))
+print(prinfFile)
+dev.off()
+
+#****************
+# generate plot: No of positions
+#****************
+#get freq count of positions so you can subset freq<1
+#require(data.table)
+setDT(df)[, pos_count := .N, by = .(Position)] #169, 36
+
+head(df$pos_count)
+table(df$pos_count)
+# this is cumulative
+#1  2  3  4  5  6 
+#5 24 36 56 30 18 
+
+# use group by on this
+snpsBYpos_df <- df %>%
+  group_by(Position) %>%
+  summarize(snpsBYpos = mean(pos_count)) 
+
+table(snpsBYpos_df$snpsBYpos)
+#1  2  3  4  5  6 
+#5 12 12 14  6  3
+# this is what will get plotted
+
+svg('position_count_LIG.svg')
+
+my_ats = 25 # axis text size
+my_als = 22 # axis label size
+
+g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
+prinfFile = g + geom_bar(
+  #g + geom_bar(
+  aes (alpha = 0.5)
+  , show.legend = FALSE
+) +
+  geom_label(
+    stat = "count", aes(label = ..count..)
+    , color = "black"
+    , size = 10
+  ) +
+  theme( 
+    axis.text.x = element_text(
+      size = my_ats
+      , angle = 0
+    )
+    , axis.text.y = element_text(
+      size = my_ats
+      , angle = 0
+      , hjust = 1
+    )
+    , axis.title.x = element_text(size = my_als)
+    , axis.title.y = element_text(size = my_als)
+    , plot.title = element_blank()
+  ) +
+  labs(
+    x = "Number of SNPs"
+    , y = "Number of Sites"
+  )
+print(prinfFile)
+dev.off()
+########################################################################
+#               			end of Lig barplots         			   #
+########################################################################
+
+
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R
@ -0,0 +1,211 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 		Installing and loading required packages and functions		   #	
+########################################################################
+
+source("../Header_TT.R")
+
+########################################################################
+#		 Read file: call script for combining df for PS			   	   #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#==========================
+
+###########################
+# Data for DUET plots
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3 
+#my_df  = merged_df3_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+# sanity check
+is.factor(my_df$DUET_outcome)
+my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
+is.factor(my_df$DUET_outcome)
+#[1] TRUE
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#===========================
+# Plot: Basic barplots 
+#===========================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT 
+df  = my_df
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(my_df)
+
+# sanity checks
+str(df)
+
+if (identical(df$Position, df$position)){
+  print("Sanity check passed: Columns 'Position' and 'position' are identical")
+} else{
+  print("Error!: Check column names and info contained")
+  }
+
+#****************
+# generate plot: No of stabilising and destabilising muts
+#****************
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+svg('basic_barplots_DUET.svg')
+
+my_ats = 25 # axis text size
+my_als = 22 # axis label size
+
+theme_set(theme_grey())
+
+# uncomment as necessary for either directly outputting results or 
+# printing on the screen
+g = ggplot(df, aes(x = DUET_outcome))
+prinfFile = g + geom_bar(
+#g + geom_bar(
+  aes(fill = DUET_outcome)
+  , show.legend = TRUE
+  ) + geom_label(
+    stat = "count"
+    , aes(label = ..count..)
+    , color = "black"
+    , show.legend = FALSE
+    , size = 10) + theme(
+      axis.text.x = element_blank()
+      , axis.title.x = element_blank()
+      , axis.title.y = element_text(size=my_als)
+      , axis.text.y = element_text(size = my_ats)
+    , legend.position = c(0.73,0.8)
+    , legend.text = element_text(size=my_als-2)
+    , legend.title = element_text(size=my_als)
+    , plot.title = element_blank()
+    ) + labs(
+      title = ""
+      , y = "Number of SNPs"
+      #, fill='DUET Outcome'
+      ) + scale_fill_discrete(name = "DUET Outcome"
+                              , labels = c("Destabilising", "Stabilising"))
+
+print(prinfFile)
+dev.off()
+
+#****************
+# generate plot: No of positions
+#****************
+#get freq count of positions so you can subset freq<1
+#setDT(df)[, .(Freq := .N), by = .(Position)] #189, 36
+
+setDT(df)[, pos_count := .N, by = .(Position)] #335, 36
+table(df$pos_count)
+# this is cummulative
+#1   2   3   4   5   6 
+#34  76  63 104  40  18 
+
+# use group by on this
+snpsBYpos_df <- df %>%
+  group_by(Position) %>%
+  summarize(snpsBYpos = mean(pos_count))
+
+table(snpsBYpos_df$snpsBYpos)
+#1  2  3  4  5  6 
+#34 38 21 26  8  3 
+
+# Keep the mutation-identifying columns plus the per-position SNP count
+# (pos_count) and export them as a CSV.
+foo = select(df, Mutationinformation
+             , WildPos
+             , wild_type
+             , mutant_type
+             , mutation_info
+             , position
+             , pos_count) # 7 columns are selected here (an earlier note read "335, 5")
+
+getwd()
+# NOTE(review): the path is relative to the current working dir, which this
+# script set to the plots output dir earlier -- confirm that "../Data" exists
+# relative to that location, otherwise write.csv will fail.
+write.csv(foo, "../Data/pos_count_freq.csv")
+
+svg('position_count_DUET.svg')
+my_ats = 25 # axis text size
+my_als = 22 # axis label size
+
+g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
+prinfFile = g + geom_bar(
+#g + geom_bar(
+  aes (alpha = 0.5)
+  , show.legend = FALSE
+  ) +
+  geom_label(
+    stat = "count", aes(label = ..count..)
+    , color = "black"
+    , size = 10
+    ) +
+  theme( 
+    axis.text.x = element_text(
+      size = my_ats
+      , angle = 0
+      )
+    , axis.text.y = element_text(
+      size = my_ats
+      , angle = 0
+      , hjust = 1
+      )
+  , axis.title.x = element_text(size = my_als)
+  , axis.title.y = element_text(size = my_als)
+  , plot.title = element_blank()
+  ) +
+  labs(
+    x = "Number of SNPs"
+    , y = "Number of Sites"
+    )
+print(prinfFile)
+dev.off()
+########################################################################
+#               			end of DUET barplots         			   #
+########################################################################
+
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R
@ -0,0 +1,175 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 		Installing and loading required packages and functions		   #	
+########################################################################
+
+source("../Header_TT.R")
+
+#source("barplot_colour_function.R")
+
+########################################################################
+#		 Read file: call script for combining df for PS			   	   #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#==========================
+
+###########################
+# Data for PS Corr plots
+# you need merged_df3_comp
+# since these are matched 
+# to allow pairwise corr
+###########################
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3_comp 
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#===========================
+# Plot: Correlation plots
+#===========================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+df  = my_df
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(my_df)
+
+# sanity checks
+str(df)
+
+table(df$DUET_outcome)
+
+# unique positions
+length(unique(df$Position)) #{RESULT: unique positions for comp data}
+
+
+# subset data to generate pairwise correlations
+corr_data = df[, c("ratioDUET"
+#                  , "ratioPredAff"
+#                  , "DUETStability_Kcalpermol"
+#                  , "PredAffLog"
+#                  , "OR"
+                   , "logor"
+#                  , "pvalue"
+                   , "neglog10pvalue"
+                   , "AF"
+                   , "DUET_outcome"
+#                  , "Lig_outcome"
+                   , "pyrazinamide"
+                   )]
+dim(corr_data)
+rm(df)
+
+# assign nice colnames (for display)
+my_corr_colnames = c("DUET"
+#                    , "Ligand Affinity"
+#                    , "DUET_raw"
+#                    , "Lig_raw"
+#                    , "OR"
+                     , "Log(Odds Ratio)"
+#                    , "P-value"
+                     , "-LogP"
+                     , "Allele Frequency"
+                     , "DUET_outcome"
+#                    , "Lig_outcome"
+                     , "pyrazinamide")
+
+# sanity check
+if (length(my_corr_colnames) == length(corr_data)){
+  print("Sanity check passed: corr_data and corr_names match in length")
+}else{
+  print("Error: length mismatch!")
+}
+
+colnames(corr_data)
+colnames(corr_data) <- my_corr_colnames
+colnames(corr_data)
+
+###############
+# PLOTS: corr
+# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
+###############
+#default pairs plot
+start = 1
+end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column
+offset = 1
+
+my_corr = corr_data[start:(end-offset)]
+head(my_corr)
+
+#my_cols = c("#f8766d", "#00bfc4")
+# deep blue :#007d85
+# deep red: #ae301e
+
+#==========
+# psych: informative since it draws the ellipsoid
+# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
+# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
+#==========
+
+# set output dir for plots: the svg() call below uses a relative file name,
+# so the figure is written into this directory
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+svg('DUET_corr.svg', width = 15, height = 15)
+printFile = pairs.panels(my_corr[1:4]
+             , method = "spearman" # correlation method
+             , hist.col = "grey" ##00AFBB
+             , density = TRUE  # show density plots
+             , ellipses = F # show correlation ellipses
+             , stars = T
+             , rug = F
+             , breaks = "Sturges"
+             , show.points = T
+             , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$DUET_outcome))]
+             , pch = 21
+             , jitter = T
+             #, alpha = .05
+             #, points(pch = 19, col = c("#f8766d", "#00bfc4"))
+             , cex = 3
+             , cex.axis = 2.5
+             , cex.labels = 3
+             , cex.cor = 1
+             , smooth = F
+)
+
+print(printFile)
+dev.off()
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R
@ -0,0 +1,187 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages			   #	
+########################################################################
+
+source("../Header_TT.R")
+
+#source("barplot_colour_function.R")
+
+########################################################################
+#		 Read file: call script for combining df for lig		   	   #
+########################################################################
+
+source("../combining_two_df_lig.R") 
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for Lig Corr plots
+# you need merged_df3_comp
+# since these are matched 
+# to allow pairwise corr
+###########################
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3_comp 
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+#############################
+# Extra sanity check:
+# for mcsm_lig ONLY
+# Dis_lig_Ang should be <10
+#############################
+
+if (max(my_df$Dis_lig_Ang) < 10){
+  print ("Sanity check passed: lig data is <10Ang")
+}else{
+  print ("Error: data should be filtered to be within 10Ang")
+}
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#===========================
+# Plot: Correlation plots
+#===========================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT 
+df  = my_df 
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(my_df)
+
+# sanity checks
+str(df)
+
+table(df$Lig_outcome)
+
+# unique positions
+length(unique(df$Position)) #{RESULT: unique positions for comp data}
+
+# subset data to generate pairwise correlations
+corr_data = df[, c(#"ratioDUET",
+                  "ratioPredAff"
+#                  , "DUETStability_Kcalpermol"
+#                  , "PredAffLog"
+#                  , "OR"
+                   , "logor"
+#                  , "pvalue"
+                   , "neglog10pvalue"
+                   , "AF"
+#                  , "DUET_outcome"
+                   , "Lig_outcome"
+                   , "pyrazinamide"
+                   )] 
+dim(corr_data)
+rm(df)
+
+# assign nice colnames (for display)
+my_corr_colnames = c(#"DUET",
+                     "Ligand Affinity"
+#                    ,"DUET_raw" 
+#                    , "Lig_raw"
+#                    , "OR"
+                     , "Log(Odds Ratio)"
+#                    , "P-value"
+                     , "-LogP"
+                     , "Allele Frequency"
+#                    , "DUET_outcome"
+                     , "Lig_outcome"
+                     , "pyrazinamide")
+                     
+# sanity check
+if (length(my_corr_colnames) == length(corr_data)){
+  print("Sanity check passed: corr_data and corr_names match in length")
+}else{
+  print("Error: length mismatch!")
+}
+
+colnames(corr_data)
+colnames(corr_data) <- my_corr_colnames
+colnames(corr_data)
+
+###############
+# PLOTS: corr
+# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
+###############
+
+# default pairs plot
+start = 1
+end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column
+offset = 1
+
+my_corr = corr_data[start:(end-offset)]
+head(my_corr)
+
+#my_cols = c("#f8766d", "#00bfc4")
+# deep blue :#007d85
+# deep red: #ae301e
+
+#==========
+# psych: informative since it draws the ellipsoid
+# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
+# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
+#==========
+
+# set output dir for plots: the svg() call below uses a relative file name,
+# so the figure is written into this directory
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+svg('Lig_corr.svg', width = 15, height = 15)
+printFile = pairs.panels(my_corr[1:4]
+             , method = "spearman" # correlation method
+             , hist.col = "grey" ##00AFBB
+             , density = TRUE  # show density plots
+             , ellipses = F # show correlation ellipses
+             , stars = T
+             , rug = F
+             , breaks = "Sturges"
+             , show.points = T
+             , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$Lig_outcome))]
+             , pch = 21
+             , jitter = T
+#            , alpha = .05
+#            , points(pch = 19, col = c("#f8766d", "#00bfc4"))
+             , cex = 3
+             , cex.axis = 2.5
+             , cex.labels = 3
+             , cex.cor = 1
+             , smooth = F
+)
+print(printFile)
+dev.off()
+
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R
@ -0,0 +1,227 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") 
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+#source("barplot_colour_function.R")
+
+require(data.table)
+
+########################################################################
+#		 Read file: call script for combining df		   	  		   #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#==========================
+
+###########################
+# Data for plots
+# you need merged_df2, comprehensive one
+# since this has one-many relationship
+# i.e the same SNP can belong to multiple lineages
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df2
+#my_df  = merged_df2_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+is.factor(my_df$lineage)
+my_df$lineage = as.factor(my_df$lineage)
+is.factor(my_df$lineage)
+
+#==========================
+# Plot: Lineage barplot
+# x = lineage y = No. of samples
+# col = Lineage
+# fill = lineage
+#============================
+table(my_df$lineage)
+
+#        lineage1   lineage2   lineage3   lineage4   lineage5   lineage6 lineageBOV 
+#3        104       1293        264       1311          6          6        105 
+
+#===========================
+# Plot: Lineage Barplots
+#===========================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+df <- my_df
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+rm(my_df)
+
+# get freq count of positions so you can subset freq<1
+#setDT(df)[, lineage_count := .N, by = .(lineage)]
+
+#******************
+# generate plot: barplot of mutation by lineage
+#******************
+sel_lineages = c("lineage1"
+                 , "lineage2"
+                 , "lineage3"
+                 , "lineage4")
+
+df_lin = subset(df, subset = lineage %in% sel_lineages )
+
+#FIXME; add sanity check for numbers.
+# Done this manually
+
+############################################################
+
+#########
+# Data for barplot: Lineage barplot
+# to show total samples and number of unique mutations 
+# within each lineage
+##########
+
+# Create df with lineage inform & no. of unique mutations
+# per lineage and total samples within lineage
+# this is essentially barplot with two y axis
+
+# One row per selected lineage; the loop below collects the two count
+# columns (unique mutations and total samples) that are attached afterwards.
+bar = as.data.frame(sel_lineages) #4, 1
+total_snps_u = NULL
+total_samples = NULL
+
+for (i in sel_lineages){
+  #print(i)
+  # rows belonging to the current lineage
+  foo = df[df$lineage==i,]
+
+  # total samples = distinct ids *within* this lineage.
+  # Subset first, then de-duplicate: indexing unique(df$id) with a
+  # row-length logical mask returns one element per matching row instead.
+  curr_total = length(unique(foo$id))
+  total_samples = c(total_samples, curr_total)
+  print(total_samples)
+
+  # unique mutations observed in this lineage
+  curr_count = length(unique(foo$Mutationinformation))
+  print(paste0(i, "======="))
+  print(curr_count)
+
+  total_snps_u = c(total_snps_u, curr_count)
+}
+
+print(total_snps_u)
+# vectors are in sel_lineages order, matching the rows of bar
+bar$num_snps_u = total_snps_u
+bar$total_samples = total_samples
+bar
+
+#*****************
+# generate plot: lineage barplot with two y-axis
+#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2
+#*****************
+
+# Pull the plotting vectors out of bar. The direction of assignment matters:
+# x, y1 and y2 are not assigned anywhere earlier in this script, so they must
+# be the targets here, not the sources.
+y1 = bar$num_snps_u     # unique mutations per lineage
+y2 = bar$total_samples  # total samples per lineage
+x = sel_lineages        # lineage names for the x-axis
+
+# wide table (one row per lineage) that is melted to long form below
+to_plot = data.frame(x = x
+                      , y1 = y1
+                      , y2 = y2)
+to_plot
+
+melted = melt(to_plot, id = "x")
+melted
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+svg('lineage_basic_barplot.svg')
+
+my_ats = 20 # axis text size
+my_als = 22 # axis label size
+
+g = ggplot(melted
+           , aes(x = x
+                 , y = value
+                 , fill = variable)
+           )
+
+
+printFile = g + geom_bar(
+  
+#g + geom_bar(
+  stat = "identity"
+  , position = position_stack(reverse = TRUE)
+  , alpha=.75
+  , colour='grey75'
+    ) + theme(
+    axis.text.x = element_text(
+      size = my_ats
+#      , angle= 30
+    )
+  , axis.text.y = element_text(size = my_ats
+  #, angle = 30
+  , hjust = 1
+  , vjust = 0)
+  , axis.title.x = element_text(
+    size = my_als
+    , colour = 'black'
+    )
+  , axis.title.y = element_text(
+    size = my_als
+    , colour = 'black'
+  )
+  , legend.position = "top"
+  , legend.text = element_text(size = my_als)
+  
+  #) + geom_text(
+  ) + geom_label(
+    aes(label = value)
+    , size = 5
+    , hjust = 0.5
+    , vjust = 0.5
+    , colour = 'black'
+    , show.legend = FALSE
+    #, check_overlap = TRUE
+    , position = position_stack(reverse = T)
+    #, position = ('
+
+  ) + labs(
+    title = ''
+    , x = ''
+    , y = "Number"
+    , fill = 'Variable'
+    , colour = 'black'
+  ) + scale_fill_manual(
+      values = c('grey50', 'gray75')
+      , name=''
+      , labels=c('Mutations', 'Total Samples')
+    ) + scale_x_discrete(
+      breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
+      , labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
+    )
+print(printFile)
+dev.off()
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R
@ -0,0 +1,233 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+#source("barplot_colour_function.R")
+#require(data.table)
+
+########################################################################
+#		 Read file: call script for combining df for Lig		   	   #
+########################################################################
+
+source("../combining_two_df_lig.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+###########################
+# Data for plots
+# you need merged_df2 or merged_df2_comp
+# since this is one-many relationship 
+# i.e the same SNP can belong to multiple lineages
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df2
+#my_df  = merged_df2_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+is.factor(my_df$lineage)
+my_df$lineage = as.factor(my_df$lineage)
+is.factor(my_df$lineage)
+
+table(my_df$mutation_info)
+
+#############################
+# Extra sanity check:
+# for mcsm_lig ONLY
+# Dis_lig_Ang should be <10
+#############################
+
+if (max(my_df$Dis_lig_Ang) < 10){
+  print ("Sanity check passed: lig data is <10Ang")
+}else{
+  print ("Error: data should be filtered to be within 10Ang")
+}
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Plot: Lineage Distribution
+# x = mcsm_values, y = dist
+# fill = stability
+#============================
+
+#===================
+# Data for plots
+#===================
+
+# subset only lineages1-4
+sel_lineages = c("lineage1"
+                 , "lineage2"
+                 , "lineage3"
+                 , "lineage4")
+
+# uncomment as necessary
+df_lin = subset(my_df, subset = lineage %in% sel_lineages ) #2037 35
+
+# refactor
+df_lin$lineage = factor(df_lin$lineage)
+
+table(df_lin$lineage) #{RESULT: No of samples within lineage}
+#lineage1 lineage2 lineage3 lineage4 
+#78     961      195     803 
+
+# when merged_df2_comp is used
+#lineage1 lineage2 lineage3 lineage4 
+#77     955      194     770
+
+length(unique(df_lin$Mutationinformation))
+#{Result: No. of unique mutations the 4 lineages contribute to}
+
+# sanity check: the number of rows kept in df_lin must equal the summed
+# counts of the selected lineage levels in my_df.
+# Levels are looked up by name rather than by position, so the check is
+# valid whether or not my_df has extra levels (e.g. a blank one) ahead of
+# lineage1; a selected lineage that is absent counts as 0.
+n_expected = sum(table(my_df$lineage)[sel_lineages], na.rm = TRUE)
+if(n_expected == nrow(df_lin)) {
+  print ("sanity check passed: numbers match")
+} else{
+  print("Error!: check your numbers")
+} 
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+df <- df_lin
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(df_lin)
+
+#******************
+# generate distribution plot of lineages
+#******************
+# basic: could improve this!
+library(plotly)
+library(ggridges)
+
+fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
+names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
+
+# Kernel density of ratioPredAff per lineage, filled by Lig_outcome and
+# rendered interactively with plotly.
+# Every layer must be chained with `+`; a dangling call after the closing
+# parenthesis is evaluated on its own and never becomes part of g.
+g <- ggplot(df, aes(x = ratioPredAff)) + 
+  geom_density(aes(fill = Lig_outcome)
+               , alpha = 0.5) + 
+  facet_wrap( ~ lineage
+             , scales = "free"
+             , labeller = labeller(lineage = fooNames) ) +
+  coord_cartesian(xlim = c(-1, 1)
+#                  , ylim = c(0, 6)
+#                  , clip = "off"
+) +
+  ggtitle("Kernel Density estimates of Ligand affinity by lineage")
+
+ggplotly(g)
+
+# 2 : ggridges (good!)
+
+my_ats = 15 # axis text size
+my_als = 20 # axis label size
+
+fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
+names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+svg('lineage_dist_LIG.svg')
+
+printFile = ggplot( df, aes(x = ratioPredAff
+                          , y = Lig_outcome) ) +
+  
+  geom_density_ridges_gradient( aes(fill = ..x..)
+                                , scale = 3
+                                , size = 0.3 ) +
+  facet_wrap( ~lineage
+              , scales = "free"
+#              , switch = 'x'
+              , labeller = labeller(lineage = fooNames) ) +
+  coord_cartesian( xlim = c(-1, 1)
+#                  , ylim = c(0, 6)
+#                  , clip = "off"
+                  ) +
+
+  scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4")
+                        , name = "Ligand Affinity" ) +
+  theme( axis.text.x = element_text( size = my_ats
+                                     , angle = 90
+                                     , hjust = 1
+                                     , vjust = 0.4)
+#         , axis.text.y = element_text( size = my_ats
+#                                       , angle = 0
+#                                       , hjust = 1
+#                                       , vjust = 0)
+         , axis.text.y = element_blank()
+         , axis.title.x = element_blank()
+         , axis.title.y = element_blank()
+         , axis.ticks.y = element_blank()
+         , plot.title = element_blank()
+         , strip.text = element_text(size = my_als)
+         , legend.text = element_text(size = 10)
+         , legend.title = element_text(size = my_als)
+#         , legend.position = c(0.3, 0.8)
+#         , legend.key.height = unit(1, 'mm')
+      ) 
+
+print(printFile)
+dev.off()
+
+#=!=!=!=!=!=!
+# COMMENT: When you look at all mutations, the lineage differences disappear...
+# The pattern we are interested in is possibly only for dr_mutations
+#=!=!=!=!=!=!
+
+#===================================================
+
+# COMPARING DISTRIBUTIONS
+head(df$lineage)
+df$lineage = as.character(df$lineage)
+
+lin1 = df[df$lineage == "lineage1",]$ratioPredAff
+lin2 = df[df$lineage == "lineage2",]$ratioPredAff
+lin3 = df[df$lineage == "lineage3",]$ratioPredAff
+lin4 = df[df$lineage == "lineage4",]$ratioPredAff
+
+# ks test
+ks.test(lin1,lin2) 
+ks.test(lin1,lin3) 
+ks.test(lin1,lin4) 
+
+ks.test(lin2,lin3) 
+ks.test(lin2,lin4) 
+
+ks.test(lin3,lin4) 
+
+
+
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
@ -0,0 +1,212 @@
+getwd() # show the working directory the script starts in
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") # thinkpad: machine-specific path; the relative source() calls below resolve against it
+getwd() # confirm the change
+
+########################################################################
+#              Installing and loading required packages               #
+########################################################################
+
+source("../Header_TT.R")
+#source("barplot_colour_function.R")
+#require(data.table)
+
+########################################################################
+#            Read file: call script for combining df for PS            #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# sourcing combining_two_df.R changes the working dir; afterwards getwd() gives:
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for plots
+# you need merged_df2 or merged_df2_comp
+# since this is a one-to-many relationship
+# i.e. the same SNP can belong to multiple lineages
+###########################
+
+# choose the input data frame: swap which assignment is commented out as needed
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df <- merged_df2
+#my_df <- merged_df2_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# drop the merged data frames now that my_df holds the working copy
+rm(list = c("merged_df2", "merged_df2_comp", "merged_df3", "merged_df3_comp"))
+
+# quick look at the column names and structure
+colnames(my_df)
+str(my_df)
+
+# lineage is the faceting variable in the plots below: coerce it to a factor
+is.factor(my_df$lineage)
+my_df$lineage <- as.factor(my_df$lineage)
+is.factor(my_df$lineage)
+# tabulate the values of the mutation_info column
+table(my_df$mutation_info)
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Plot: Lineage Distribution
+# x = mcsm_values, y = dist
+# fill = stability
+#============================
+
+#===================
+# Data for plots
+#===================
+
+# keep only the rows belonging to lineages 1-4
+sel_lineages <- paste0("lineage", 1:4)
+
+# row filter on lineage membership
+df_lin <- subset(my_df, lineage %in% sel_lineages)
+
+# rebuild the factor so that only the lineages still present remain as levels
+df_lin$lineage <- factor(df_lin$lineage)
+
+# per-lineage row counts; the author's recorded results follow
+table(df_lin$lineage) #{RESULT: No of samples within lineage}
+# with merged_df2:
+#lineage1 lineage2 lineage3 lineage4 
+#104     1293      264     1311 
+
+# with merged_df2_comp:
+#lineage1 lineage2 lineage3 lineage4 
+#99     1275      263     1255
+
+# number of distinct mutations across the four selected lineages
+length(unique(df_lin$Mutationinformation))
+#{Result: No. of unique mutations the 4 lineages contribute to}
+
+# sanity check: the rows kept in df_lin must equal the summed counts of the selected lineages in my_df
+# the table is indexed by lineage NAME (not by position 2:5), so the check holds for merged_df2 and merged_df2_comp alike
+if(sum(table(my_df$lineage)[sel_lineages], na.rm = TRUE) == nrow(df_lin)) {
+  print ("sanity check passed: numbers match")
+} else{
+  print("Error!: check your numbers")
+} 
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT: the plotting and testing code below reads `df`
+df <- df_lin
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# df_lin is not referenced again once copied into df
+rm(df_lin)
+
+#******************
+# generate distribution plot of lineages
+#******************
+# 1 : basic density plot, viewed interactively through plotly (could improve this!)
+library(plotly)
+library(ggridges)
+
+# one density panel per lineage, each with free axis scales, filled by DUET_outcome
+g <- ggplot(data = df, mapping = aes(x = ratioDUET)) +
+  geom_density(mapping = aes(fill = DUET_outcome), alpha = 0.5) +
+  facet_wrap(facets = ~ lineage, scales = "free") +
+  ggtitle(label = "Kernel Density estimates of Protein stability by lineage")
+
+ggplotly(g)
+
+# 2 : ggridges (good!)
+
+my_ats <- 15 # axis text size
+my_als <- 20 # axis label size
+
+# facet strip labels, keyed by the lineage level each one replaces
+fooNames <- c(lineage1 = 'Lineage 1', lineage2 = 'Lineage 2', lineage3 = 'Lineage 3', lineage4 = 'Lineage 4')
+
+# switch to the output dir for plots (printed before and after)
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+svg('lineage_dist_PS.svg') # open an SVG device; the relative file name resolves against the output dir set by setwd() above
+# ridge-density plot: x = ratioDUET, one ridge per DUET_outcome, one facet per lineage
+printFile = ggplot( df, aes(x = ratioDUET
+                            , y = DUET_outcome) )+
+  # fill = ..x.. ties the ridge fill to the x value; the colour ramp is set in scale_fill_gradientn() below
+  # NOTE(review): recent ggplot2 releases deprecate the ..x.. notation in favour of after_stat(x) - confirm against the installed version
+  geom_density_ridges_gradient( aes(fill = ..x..)
+                                , scale = 3
+                                , size = 0.3 ) +
+  facet_wrap( ~lineage
+              , scales = "free"
+#             , switch = 'x'
+              , labeller = labeller(lineage = fooNames) ) +
+  coord_cartesian( xlim = c(-1, 1)
+#                  , ylim = c(0, 6)
+#                  , clip = "off" 
+                ) +
+  scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4")
+                        , name = "DUET" ) + 
+  theme( axis.text.x = element_text( size = my_ats
+                                     , angle = 90
+                                     , hjust = 1
+                                     , vjust = 0.4)
+#         , axis.text.y = element_text( size = my_ats
+#                                       , angle = 0
+#                                       , hjust = 1
+#                                       , vjust = 0)
+         , axis.text.y = element_blank()
+         , axis.title.x = element_blank()
+         , axis.title.y = element_blank()
+         , axis.ticks.y = element_blank()
+         , plot.title = element_blank()
+         , strip.text = element_text(size=my_als)
+         , legend.text = element_text(size=10)
+         , legend.title = element_text(size=my_als)
+#         , legend.position = c(0.3, 0.8)
+#         , legend.key.height = unit(1, 'mm')
+        ) 
+# draw the plot on the svg device opened above, then close the device
+print(printFile)
+dev.off()
+
+#=!=!=!=!=!=!
+# COMMENT: When you look at all mutations, the lineage differences disappear...
+# The pattern we are interested in is possibly only for dr_mutations
+#=!=!=!=!=!=!
+#===================================================
+
+# COMPARING DISTRIBUTIONS: ratioDUET of every lineage against every other lineage
+head(df$lineage)
+df$lineage <- as.character(df$lineage)
+
+lin1 <- df$ratioDUET[df$lineage == "lineage1"]
+lin2 <- df$ratioDUET[df$lineage == "lineage2"]
+lin3 <- df$ratioDUET[df$lineage == "lineage3"]
+lin4 <- df$ratioDUET[df$lineage == "lineage4"]
+
+# two-sample Kolmogorov-Smirnov test for each of the six lineage pairs
+ks.test(lin1, lin2)
+ks.test(lin1, lin3)
+ks.test(lin1, lin4)
+
+ks.test(lin2, lin3)
+ks.test(lin2, lin4)
+
+ks.test(lin3, lin4)
+
+
+