import commit

2020-01-08 16:15:33 +00:00 · 2020-01-08 16:15:33 +00:00 · bccfe68192
commit bccfe68192
39 changed files with 6837 additions and 0 deletions
--- a/mcsm_analysis/pyrazinamide/scripts/.Rhistory
+++ b/mcsm_analysis/pyrazinamide/scripts/.Rhistory
@ -0,0 +1,512 @@
+###########################
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+# uncomment as necessary
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+my_df  = merged_df3
+#my_df = merged_df3_comp
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+# quick checks
+colnames(my_df)
+str(my_df)
+###########################
+# Data for bfactor figure
+# PS average
+# Lig average
+###########################
+head(my_df$Position)
+head(my_df$ratioDUET)
+# order data frame
+df = my_df[order(my_df$Position),]
+head(df$Position)
+head(df$ratioDUET)
+#***********
+# PS: average by position
+#***********
+mean_DUET_by_position <- df %>%
+group_by(Position) %>%
+summarize(averaged.DUET = mean(ratioDUET))
+#***********
+# Lig: average by position
+#***********
+mean_Lig_by_position <- df %>%
+group_by(Position) %>%
+summarize(averaged.Lig = mean(ratioPredAff))
+#***********
+# cbind:mean_DUET_by_position and mean_Lig_by_position
+#***********
+combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
+# sanity check
+# mean_PS_Lig_Bfactor
+colnames(combined)
+colnames(combined) = c("Position"
+, "average_DUETR"
+, "Position2"
+, "average_PredAffR")
+colnames(combined)
+identical(combined$Position, combined$Position2)
+n = which(colnames(combined) == "Position2"); n
+combined_df = combined[,-n]
+max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
+max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
+#=============
+# output csv
+#============
+outDir = "~/Data/pyrazinamide/input/processed/"
+outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
+print(paste0("Output file with path will be:","", outFile))
+head(combined_df$Position); tail(combined_df$Position)
+write.csv(combined_df, outFile
+, row.names = F)
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+source("../Header_TT.R")
+#source("barplot_colour_function.R")
+require(data.table)
+require(dplyr)
+########################################################################
+#		 Read file: call script for combining df for PS		   	   #
+########################################################################
+source("../combining_two_df.R")
+###########################
+# This will return:
+# df with NA:
+# merged_df2
+# merged_df3
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+###########################
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+###########################
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+# uncomment as necessary
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+my_df  = merged_df3
+#my_df = merged_df3_comp
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+# quick checks
+colnames(my_df)
+str(my_df)
+###########################
+# Data for bfactor figure
+# PS average
+# Lig average
+###########################
+head(my_df$Position)
+head(my_df$ratioDUET)
+# order data frame
+df = my_df[order(my_df$Position),]
+head(df$Position)
+head(df$ratioDUET)
+#***********
+# PS: average by position
+#***********
+mean_DUET_by_position <- df %>%
+group_by(Position) %>%
+summarize(averaged.DUET = mean(ratioDUET))
+#***********
+# Lig: average by position
+#***********
+mean_Lig_by_position <- df %>%
+group_by(Position) %>%
+summarize(averaged.Lig = mean(ratioPredAff))
+#***********
+# cbind:mean_DUET_by_position and mean_Lig_by_position
+#***********
+combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
+# sanity check
+# mean_PS_Lig_Bfactor
+colnames(combined)
+colnames(combined) = c("Position"
+, "average_DUETR"
+, "Position2"
+, "average_PredAffR")
+colnames(combined)
+identical(combined$Position, combined$Position2)
+n = which(colnames(combined) == "Position2"); n
+combined_df = combined[,-n]
+max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
+max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
+#=============
+# output csv
+#============
+outDir = "~/git/Data/pyrazinamide/input/processed/"
+outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
+print(paste0("Output file with path will be:","", outFile))
+head(combined_df$Position); tail(combined_df$Position)
+write.csv(combined_df, outFile
+, row.names = F)
+# read in pdb file complex1
+inDir = "~/git/Data/pyrazinamide/input/structure"
+inFile = paste0(inDir, "complex1_no_water.pdb")
+# read in pdb file complex1
+inDir = "~/git/Data/pyrazinamide/input/structure/"
+inFile = paste0(inDir, "complex1_no_water.pdb")
+complex1 = inFile
+my_pdb = read.pdb(complex1
+, maxlines = -1
+, multi = FALSE
+, rm.insert = FALSE
+, rm.alt = TRUE
+, ATOM.only = FALSE
+, hex = FALSE
+, verbose = TRUE)
+#########################
+#3: Read complex pdb file
+##########################
+source("Header_TT.R")
+# list of 8
+my_pdb = read.pdb(complex1
+, maxlines = -1
+, multi = FALSE
+, rm.insert = FALSE
+, rm.alt = TRUE
+, ATOM.only = FALSE
+, hex = FALSE
+, verbose = TRUE)
+rm(inDir, inFile)
+#====== end of script
+inDir = "~/git/Data/pyrazinamide/input/structure/"
+inFile = paste0(inDir, "complex1_no_water.pdb")
+complex1 = inFile
+complex1 = inFile
+my_pdb = read.pdb(complex1
+, maxlines = -1
+, multi = FALSE
+, rm.insert = FALSE
+, rm.alt = TRUE
+, ATOM.only = FALSE
+, hex = FALSE
+, verbose = TRUE)
+inFile
+inDir = "~/git/Data/pyrazinamide/input/structure/"
+inFile = paste0(inDir, "complex1_no_water.pdb")
+complex1 = inFile
+#inFile2 = paste0(inDir, "complex2_no_water.pdb")
+#complex2 = inFile2
+# list of 8
+my_pdb = read.pdb(complex1
+, maxlines = -1
+, multi = FALSE
+, rm.insert = FALSE
+, rm.alt = TRUE
+, ATOM.only = FALSE
+, hex = FALSE
+, verbose = TRUE)
+rm(inDir, inFile, complex1)
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
+getwd()
+source("Header_TT.R")
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
+getwd()
+########################################################################
+# 				Installing and loading required packages 			               #
+########################################################################
+source("Header_TT.R")
+#########################################################
+# TASK: replace B-factors in the pdb file with normalised values
+# use the complex file with no water as mCSM lig was
+# performed on this file. You can check it in the script: read_pdb file.
+#########################################################
+###########################
+# 2: Read file: average stability values
+# or mcsm_normalised file, output of step 4 mcsm pipeline
+###########################
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
+my_df <- read.csv(inFile
+#                  , row.names = 1
+#                  , stringsAsFactors = F
+, header = T)
+str(my_df)
+source("read_pdb.R") # list of 8
+# extract atom list into a variable
+# since in the list this corresponds to data frame, variable will be a df
+d = my_pdb[[1]]
+# make a copy: required for downstream sanity checks
+d2 = d
+# sanity checks: B factor
+max(d$b); min(d$b)
+par(oma = c(3,2,3,0)
+, mar = c(1,3,5,2)
+, mfrow = c(3,2))
+#par(mfrow = c(3,2))
+#1: Original B-factor
+hist(d$b
+, xlab = ""
+, main = "B-factor")
+plot(density(d$b)
+, xlab = ""
+, main = "B-factor")
+# 2: DUET scores
+hist(my_df$average_DUETR
+, xlab = ""
+, main = "Norm_DUET")
+plot(density(my_df$average_DUETR)
+, xlab = ""
+, main = "Norm_DUET")
+# Set the margin on all sides
+par(oma = c(3,2,3,0)
+, mar = c(1,3,5,2)
+, mfrow = c(3,2))
+#par(mfrow = c(3,2))
+#1: Original B-factor
+hist(d$b
+, xlab = ""
+, main = "B-factor")
+plot(density(d$b)
+, xlab = ""
+, main = "B-factor")
+# 2: DUET scores
+hist(my_df$average_DUETR
+, xlab = ""
+, main = "Norm_DUET")
+plot(density(my_df$average_DUETR)
+, xlab = ""
+, main = "Norm_DUET")
+#=========
+# step 1_P1
+#=========
+# Be brave and replace in place now (don't run sanity check)
+# this makes all the B-factor values in the non-matched positions as NA
+d$b = my_df$average_DUETR[match(d$resno, my_df$Position)]
+#=========
+# step 2_P1
+#=========
+# count NA in Bfactor
+b_na = sum(is.na(d$b)) ; b_na
+# count number of 0's in Bactor
+sum(d$b == 0)
+# replace all NA in b factor with 0
+d$b[is.na(d$b)] = 0
+# sanity check: should be 0
+sum(is.na(d$b))
+# sanity check: should be True
+if (sum(d$b == 0) == b_na){
+print ("Sanity check passed: NA's replaced with 0's successfully")
+} else {
+print("Error: NA replacement NOT successful, Debug code!")
+}
+max(d$b); min(d$b)
+# sanity checks: should be True
+if(max(d$b) == max(my_df$average_DUETR)){
+print("Sanity check passed: B-factors replaced correctly")
+} else {
+print ("Error: Debug code please")
+}
+if (min(d$b) == min(my_df$average_DUETR)){
+print("Sanity check passed: B-factors replaced correctly")
+} else {
+print ("Error: Debug code please")
+}
+#=========
+# step 3_P1
+#=========
+# sanity check: dim should be same before reassignment
+# should be TRUE
+dim(d) == dim(d2)
+#=========
+# step 4_P1
+#=========
+# assign it back to the pdb file
+my_pdb[[1]] = d
+max(d$b); min(d$b)
+#=========
+# step 5_P1
+#=========
+# output dir
+getwd()
+outDir = "~/git/Data/pyrazinamide/output/"
+getwd()
+outFile = paste0(outDir, "complex1_BwithNormDUET.pdb")
+outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
+outDir = "~/git/Data/pyrazinamide/input/structure"
+outDir = "~/git/Data/pyrazinamide/input/structure/"
+outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
+write.pdb(my_pdb, outFile)
+hist(d$b
+, xlab = ""
+, main = "repalced-B")
+plot(density(d$b)
+, xlab = ""
+, main = "replaced-B")
+# graph titles
+mtext(text = "Frequency"
+, side = 2
+, line = 0
+, outer = TRUE)
+mtext(text = "DUET_stability"
+, side = 3
+, line = 0
+, outer = TRUE)
+#=========================================================
+# Processing P2: Replacing  B values with PredAff Scores
+#=========================================================
+# clear workspace
+rm(list = ls())
+#=========================================================
+# Processing P2: Replacing  B values with PredAff Scores
+#=========================================================
+# clear workspace
+rm(list = ls())
+###########################
+# 2: Read file: average stability values
+# or mcsm_normalised file, output of step 4 mcsm pipeline
+###########################
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
+my_df <- read.csv("../Data/mean_PS_Lig_Bfactor.csv"
+#                  , row.names = 1
+#                  , stringsAsFactors = F
+, header = T)
+str(my_df)
+#=========================================================
+# Processing P2: Replacing B factor with mean ratioLig scores
+#=========================================================
+#########################
+# 3: Read complex pdb file
+# form the R script
+##########################
+source("read_pdb.R") # list of 8
+# extract atom list into a vari
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
+my_df <- read.csv(inFile
+#                  , row.names = 1
+#                  , stringsAsFactors = F
+, header = T)
+str(my_df)
+# extract atom list into a variable
+# since in the list this corresponds to data frame, variable will be a df
+d = my_pdb[[1]]
+# make a copy: required for downstream sanity checks
+d2 = d
+# sanity checks: B factor
+max(d$b); min(d$b)
+par(oma = c(3,2,3,0)
+, mar = c(1,3,5,2)
+, mfrow = c(3,2))
+#par(mfrow = c(3,2))
+# 1: Original B-factor
+hist(d$b
+, xlab = ""
+, main = "B-factor")
+plot(density(d$b)
+, xlab = ""
+, main = "B-factor")
+# 2: Pred Aff scores
+hist(my_df$average_PredAffR
+, xlab = ""
+, main = "Norm_lig_average")
+plot(density(my_df$average_PredAffR)
+, xlab = ""
+, main = "Norm_lig_average")
+# 3: After the following replacement
+#********************************
+par(oma = c(3,2,3,0)
+, mar = c(1,3,5,2)
+, mfrow = c(3,2))
+#par(mfrow = c(3,2))
+# 1: Original B-factor
+hist(d$b
+, xlab = ""
+, main = "B-factor")
+plot(density(d$b)
+, xlab = ""
+, main = "B-factor")
+# 2: Pred Aff scores
+hist(my_df$average_PredAffR
+, xlab = ""
+, main = "Norm_lig_average")
+plot(density(my_df$average_PredAffR)
+, xlab = ""
+, main = "Norm_lig_average")
+# 3: After the following replacement
+#********************************
+#=========
+# step 1_P2: BE BRAVE and replace in place now (don't run step 0)
+#=========
+# this makes all the B-factor values in the non-matched positions as NA
+d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)]
+#=========
+# step 2_P2
+#=========
+# count NA in Bfactor
+b_na = sum(is.na(d$b)) ; b_na
+# count number of 0's in Bactor
+sum(d$b == 0)
+# replace all NA in b factor with 0
+d$b[is.na(d$b)] = 0
+# sanity check: should be 0
+sum(is.na(d$b))
+if (sum(d$b == 0) == b_na){
+print ("Sanity check passed: NA's replaced with 0's successfully")
+} else {
+print("Error: NA replacement NOT successful, Debug code!")
+}
+max(d$b); min(d$b)
+# sanity checks: should be True
+if (max(d$b) == max(my_df$average_PredAffR)){
+print("Sanity check passed: B-factors replaced correctly")
+} else {
+print ("Error: Debug code please")
+}
+if (min(d$b) == min(my_df$average_PredAffR)){
+print("Sanity check passed: B-factors replaced correctly")
+} else {
+print ("Error: Debug code please")
+}
+#=========
+# step 3_P2
+#=========
+# sanity check: dim should be same before reassignment
+# should be TRUE
+dim(d) == dim(d2)
+#=========
+# step 4_P2
+#=========
+# assign it back to the pdb file
+my_pdb[[1]] = d
+max(d$b); min(d$b)
+#=========
+# step 5_P2
+#=========
+write.pdb(my_pdb, "Plotting/structure/complex1_BwithNormLIG.pdb")
+# output dir
+getwd()
+# output dir
+outDir = "~/git/Data/pyrazinamide/input/structure/"
+outFile = paste0(outDir, "complex1_BwithNormLIG.pdb")
+outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile
+write.pdb(my_pdb, outFile)
--- a/mcsm_analysis/pyrazinamide/scripts/Header_TT.R
+++ b/mcsm_analysis/pyrazinamide/scripts/Header_TT.R
@ -0,0 +1,129 @@
+#########################################################
+### A) Installing and loading required packages
+#########################################################
+# Each block attaches one package and installs it first when it is missing.
+#if (!require("gplots")) {
+#  install.packages("gplots", dependencies = TRUE)
+#  library(gplots)
+#}
+
+if (!require("tidyverse")) {
+  install.packages("tidyverse", dependencies = TRUE)
+  library(tidyverse)
+}
+
+if (!require("ggplot2")) {
+  install.packages("ggplot2", dependencies = TRUE)
+  library(ggplot2)
+}
+
+if (!require("cowplot")) {
+  install.packages("cowplot", dependencies = TRUE)
+  library(cowplot)
+}
+
+if (!require("ggcorrplot")) {
+  install.packages("ggcorrplot", dependencies = TRUE)
+  library(ggcorrplot)
+}
+
+if (!require("ggpubr")) {
+  install.packages("ggpubr", dependencies = TRUE)
+  library(ggpubr)
+}
+
+if (!require("RColorBrewer")) {
+  install.packages("RColorBrewer", dependencies = TRUE)
+  library(RColorBrewer)
+}
+
+if (!require ("GOplot")) {
+  install.packages("GOplot")
+  library(GOplot)
+}
+
+if(!require("VennDiagram")) {
+
+  install.packages("VennDiagram", dependencies = TRUE)
+  library(VennDiagram)
+}
+
+if(!require("scales")) {
+
+  install.packages("scales", dependencies = TRUE)
+  library(scales)
+}
+
+if(!require("plotrix")) {
+
+  install.packages("plotrix", dependencies = TRUE)
+  library(plotrix)
+}
+
+if(!require("stats")) {
+  # "stats" ships with base R, so this branch is only a safeguard
+  install.packages("stats", dependencies = TRUE)
+  library(stats)
+}
+
+if(!require("stats4")) {
+  # "stats4" ships with base R, so this branch is only a safeguard
+  install.packages("stats4", dependencies = TRUE)
+  library(stats4)
+}
+
+if(!require("data.table")) {
+  install.packages("data.table", dependencies = TRUE)
+  library(data.table)
+}
+
+if (!require("PerformanceAnalytics")){
+  install.packages("PerformanceAnalytics", dependencies = TRUE)
+  library(PerformanceAnalytics)
+}
+
+if (!require ("GGally")){
+  install.packages("GGally")
+  library(GGally)
+}
+
+if (!require ("corrr")){
+  install.packages("corrr")
+  library(corrr)
+}
+
+if (!require ("psych")){
+  install.packages("psych")
+  library(psych)
+}
+
+if (!require ("dplyr")){
+  install.packages("dplyr")
+  library(dplyr)
+}
+
+if (!require ("compare")){
+  install.packages("compare")
+  library(compare)
+}
+
+if (!require ("arsenal")){
+  install.packages("arsenal")
+  library(arsenal)
+}
+
+
+####TIDYVERSE
+# Install
+#if(!require(devtools)) install.packages("devtools")
+#devtools::install_github("kassambara/ggcorrplot")
+
+library(ggcorrplot)
+
+
+###for PDB files
+#install.packages("bio3d") 
+if(!require(bio3d)){
+  install.packages("bio3d")
+  library(bio3d)
+}
--- a/mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R
+++ b/mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R
@ -0,0 +1,27 @@
+#########################################################
+# 1b: Define function: coloured barplot by subgroup
+# LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar
+#########################################################
+# Build one light-to-dark colour ramp per group, with one shade per subgroup.
+#   df       : data frame holding both columns
+#   group    : name (string) of the grouping column
+#   subgroup : name (string) of the column nested within each group
+# Returns a character vector of colours: groups in aggregate() order, and
+# within each group one colour for every distinct subgroup value.
+ColourPalleteMulti <- function(df, group, subgroup){
+  # Number of distinct subgroup values in each group (column 2 of the result)
+  categories <- aggregate(as.formula(paste(subgroup, group, sep="~" ))
+                          , df
+                          , function(x) length(unique(x)))
+  n_groups <- nrow(categories)
+  # Same hues at two luminance levels: light start and dark end of each ramp
+  category.start <- scales::hue_pal(l = 100)(n_groups)
+  category.end   <- scales::hue_pal(l = 40)(n_groups)
+  # seq_len() (not 1:n) so an empty 'categories' gives no iterations
+  colours <- unlist(lapply(seq_len(n_groups),
+                           function(i){
+                             colorRampPalette(colors = c(category.start[i]
+                                                         , category.end[i]))(categories[i,2])}))
+  return(colours)
+}
+#########################################################
--- a/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R
+++ b/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R
@ -0,0 +1,299 @@
+# NOTE: the working directory is switched to the scripts folder so that the
+# relative source() call below resolves; a script that sources this file
+# will find its own working directory changed afterwards.
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/")
+getwd()
+
+#########################################################
+# TASK: To combine mcsm and meta data with af and or
+#########################################################
+
+#########################################################
+# Installing and loading required packages
+#########################################################
+
+source("Header_TT.R")
+#require(data.table)
+#require(arsenal)
+#require(compare)
+#library(tidyverse)
+
+#################################
+# Read file: normalised file
+# output of step 4 mcsm_pipeline
+#################################
+
+inDir <- "~/git/Data/pyrazinamide/input/processed/"
+inFile <- paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
+
+# first csv column supplies the row names; strings stay character vectors
+mcsm_data <- read.csv(inFile
+                      , row.names = 1
+                      , stringsAsFactors = FALSE
+                      , header = TRUE)
+rm(inDir, inFile)
+
+str(mcsm_data)
+
+table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
+
+# spelling Correction 1: DUET
+mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising'
+mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising'
+
+# checks: totals should be the same as above
+table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
+head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)
+
+# spelling Correction 2: Ligand
+table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
+
+mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising'
+mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising'
+
+# checks: totals should be the same as above
+table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
+head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)
+
+# count NA in each column (one integer per column)
+na_count <- vapply(mcsm_data, function(y) sum(is.na(y)), integer(1)); na_count
+
+# sort by Mutationinformation
+mcsm_data <- mcsm_data[order(mcsm_data$Mutationinformation),]
+head(mcsm_data$Mutationinformation)
+
+# get freq count of positions and add to the df
+# setDT() turns mcsm_data into a data.table by reference; 'occurrence' is
+# the number of rows sharing the same Position
+setDT(mcsm_data)[, occurrence := .N, by = .(Position)] 
+
+pos_count_check <- data.frame(mcsm_data$Position, mcsm_data$occurrence)
+
+###########################
+# 2: Read file: meta data with AFandOR
+###########################
+
+inDir <- "~/git/Data/pyrazinamide/input/processed/"
+inFile2 <- paste0(inDir, "meta_data_with_AFandOR.csv"); inFile2
+
+meta_with_afor <- read.csv(inFile2
+                           , stringsAsFactors = FALSE
+                           , header = TRUE)
+
+rm(inDir, inFile2)
+
+str(meta_with_afor)
+
+# sort by Mutationinformation
+head(meta_with_afor$Mutationinformation)
+meta_with_afor <- meta_with_afor[order(meta_with_afor$Mutationinformation),]
+head(meta_with_afor$Mutationinformation)
+
+# sanity check: should be True for all the mentioned columns
+#is.numeric(meta_with_afor$OR)
+na_var <- c("AF", "OR", "pvalue", "logor", "neglog10pvalue")
+
+# one logical per column of interest; the verdict is reported once, not once
+# per column
+is_num_col <- vapply(na_var
+                     , function(v) is.numeric(meta_with_afor[[v]])
+                     , logical(1))
+if ( all(is_num_col) ){
+  print("Sanity check passed: These are all numeric cols")
+} else{
+  print("Error: Please check your respective data types")
+  # name the offending columns to make debugging easier
+  print(na_var[!is_num_col])
+}
+
+# If OR, and P value are not numeric, then convert to numeric and then count
+# else they will say 0
+na_count <- vapply(meta_with_afor, function(y) sum(is.na(y)), integer(1)); na_count
+str(na_count)
+
+# NA counts for the columns of interest only
+na_len <- na_count[na_var]
+
+# extract how many NAs:
+# all the cols should have "equal" and "same" no. of NAs,
+# i.e. there must be a single distinct count
+if ( length(unique(na_len)) == 1 ) {
+  my_nrows <- na_len[[1]]
+} else {
+  # left as NULL so a mismatch is not masked by a later matching pair
+  my_nrows <- NULL
+  print("Error: Please check your numbers")
+}
+
+my_nrows
+
+#=#=#=#=#=#=#=#=#
+# COMMENT: AF, OR, pvalue, logor and neglog10pvalue
+# these are the same 7 ones
+#=#=#=#=#=#=#=#=#
+
+# sanity check
+#which(is.na(meta_with_afor$OR)) 
+
+# row indices that are NA, one integer vector per column of interest
+# (kept in a list so columns with differing NA counts cannot break the check)
+na_idx <- lapply(na_var, function(v) which(is.na(meta_with_afor[[v]])))
+names(na_idx) <- na_var
+
+# Now compare these indices to ensure these are the same:
+# every column is compared against the first one
+same_idx <- vapply(na_idx[-1], identical, logical(1), na_idx[[1]])
+if ( all(same_idx) ) {
+  print("Sanity check passed: The indices for AF, OR, etc are all the same")
+} else {
+  print ("Error: Please check indices which are NA")
+}
+
+# remove the temporaries created for the checks above
+rm( is_num_col, my_nrows
+    , na_count, na_idx, na_len
+    , na_var, same_idx
+    , pos_count_check )
+
+###########################
+# 3: merging two dfs: with NA
+###########################
+
+# column used to link both data frames: Mutationinformation
+head(mcsm_data$Mutationinformation)
+head(meta_with_afor$Mutationinformation)
+
+#########
+# merge 1a: meta data with mcsm
+#########
+# all.y keeps every mcsm row, whether or not meta data exists for it
+merged_df2 <- merge(meta_with_afor, mcsm_data,
+                    by = "Mutationinformation",
+                    all.y = TRUE)
+
+head(merged_df2$Position)
+
+# order the rows by Position (shown before and after)
+head(merged_df2$Position)
+merged_df2 <- merged_df2[order(merged_df2$Position), ]
+head(merged_df2$Position)
+
+# counterpart merge keeping every meta data row instead
+merged_df2v2 <- merge(meta_with_afor, mcsm_data,
+                      by = "Mutationinformation",
+                      all.x = TRUE)
+#!=!=!=!=!=!=!=!
+# COMMENT: all.y is the merge that is kept, because position 186 is not
+# part of the structure and therefore has no mcsm value,
+# although 186 is associated with a mutation
+#!=!=!=!=!=!=!=!
+
+# expected to be FALSE
+identical(merged_df2, merged_df2v2)
+table(merged_df2$Position%in%merged_df2v2$Position)
+
+rm(merged_df2v2)
+
+#########
+# merge 1b: remove duplicate mutation information
+#########
+
+#==#=#=#=#=#=#
+# Lineage and country cannot be trusted in this df, since one mutation
+# can occur in many different lineages;
+# it is still fine for the numerical corr plots
+#=#=#=#=#=#=#=
+# keep the first row seen for each Mutationinformation value
+merged_df3 <- merged_df2[!duplicated(merged_df2$Mutationinformation), ]
+head(merged_df3$Position)
+tail(merged_df3$Position) # should be sorted
+
+# sanity check:
+# merged_df3 must have exactly as many rows as mcsm_data
+if (nrow(merged_df3) != nrow(mcsm_data)) {
+  print("Error!: check data, nrows is not as expected")
+} else {
+  print("sanity check: Passed")
+}
+
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+# uncomment as necessary
+# only needed when merged_df2v2 is used, i.e. non structural pos included
+#mcsm = mcsm_data$Mutationinformation
+#my_merged = merged_df3$Mutationinformation
+
+# find the index where it differs
+#diff_n = which(!my_merged%in%mcsm)
+
+# check if it is indeed pos 186
+#merged_df3[diff_n,]
+
+# remove this entry
+#merged_df3 = merged_df3[-diff_n,]
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+###########################
+# 3b :merging two dfs: without NA
+###########################
+
+#########
+# merge 2a:same as merge 1 but excluding NA
+#########
+merged_df2_comp = merged_df2[!is.na(merged_df2$AF),] 
+
+#########
+# merge 2b: remove duplicate mutation information
+#########
+merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),] 
+
+# alternate way of deriving merged_df3_comp
+foo = merged_df3[!is.na(merged_df3$AF),]
+# compare dfs: foo and merged_df3_comp (the two derivations should agree)
+all.equal(foo, merged_df3_comp)
+
+summary(comparedf(foo, merged_df3_comp))
+
+#=============== end of combining df
+#clear variables
+rm(mcsm_data
+   , meta_with_afor
+   , foo)
+
+#rm(diff_n, my_merged, mcsm)
+
+#=====================
+# write output files
+#=====================
+# directory receiving the merged table
+outDir <- "~/git/Data/pyrazinamide/output/"
+getwd()
+
+# full path of the csv, echoed before writing
+outFile1 <- paste0(outDir, "merged_df3.csv")
+outFile1
+write.csv(x = merged_df3, file = outFile1)
+
+#outFile2 = paste0(outDir, "merged_df3_comp.csv"); outFile2
+#write.csv(merged_df3_comp, outFile2)
+
+# drop the path helpers (add "outFile2" here if the block above is enabled)
+rm(list = c("outDir", "outFile1"))
+#============================= end of script
+
--- a/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R
+++ b/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R
@ -0,0 +1,348 @@
+# show the current working directory, switch to the scripts directory and
+# show it again to confirm the switch.
+# NOTE: nothing in this script switches back, so the working directory
+# stays changed for any code that runs after this script.
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/")
+getwd()
+
+#########################################################
+# TASK: To combine mcsm and meta data with af and or
+# by filtering for distance to ligand (<10Ang)
+#########################################################
+
+#########################################################
+# Installing and loading required packages
+#########################################################
+
+#source("Header_TT.R")
+#require(data.table)
+#require(arsenal)
+#require(compare)
+#library(tidyverse)
+
+#################################
+# Read file: normalised file
+# output of step 4 mcsm_pipeline
+#################################
+
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
+
+# row.names = 1: the first column of the csv becomes the row names.
+# TRUE/FALSE are spelled out because T and F are ordinary variables
+# that can be reassigned.
+mcsm_data = read.csv(inFile
+                     , row.names = 1
+                     , stringsAsFactors = FALSE
+                     , header = TRUE)
+rm(inDir, inFile)
+
+str(mcsm_data)
+
+# outcome counts before any change
+table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
+
+# spelling Correction 1: DUET ("-izing" labels become "-ising")
+mcsm_data$DUET_outcome[which(mcsm_data$DUET_outcome == "Destabilizing")] <- "Destabilising"
+mcsm_data$DUET_outcome[which(mcsm_data$DUET_outcome == "Stabilizing")] <- "Stabilising"
+
+# checks: same counts as before, under the new labels
+table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
+head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)
+
+# spelling Correction 2: Ligand
+table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
+
+mcsm_data$Lig_outcome[which(mcsm_data$Lig_outcome == "Destabilizing")] <- "Destabilising"
+mcsm_data$Lig_outcome[which(mcsm_data$Lig_outcome == "Stabilizing")] <- "Stabilising"
+
+# checks: should be the same as above
+table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
+head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)
+
+########################### !!! only for mcsm_lig
+# 4: Filter/subset data
+# Lig plots < 10Ang
+# Keep only the rows with Dis_lig_Ang < 10Ang
+###########################
+
+# range of distances before filtering
+max(mcsm_data$Dis_lig_Ang)
+min(mcsm_data$Dis_lig_Ang)
+
+# how many rows are inside / outside the cut-off
+table(mcsm_data$Dis_lig_Ang<10)
+
+# keep the rows below 10 Ang
+# (which() leaves out rows whose distance is NA, as subset() does)
+mcsm_data2 <- mcsm_data[which(mcsm_data$Dis_lig_Ang < 10), , drop = FALSE]
+
+# sanity checks: range after filtering
+max(mcsm_data2$Dis_lig_Ang)
+min(mcsm_data2$Dis_lig_Ang)
+
+# number of distinct positions
+length(unique(mcsm_data2$Position))
+
+# number of distinct mutations
+length(unique(mcsm_data2$Mutationinformation))
+
+# count destabilising and stabilising
+table(mcsm_data2$Lig_outcome) #{RESULT: no of mutations within 10Ang}
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT: so as not to alter the script
+mcsm_data <- mcsm_data2
+#<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+#############################
+# Extra sanity check:
+# for mcsm_lig ONLY
+# Dis_lig_Ang should be <10
+#############################
+
+if (max(mcsm_data$Dis_lig_Ang) >= 10) {
+  print ("Error: data should be filtered to be within 10Ang")
+} else {
+  print ("Sanity check passed: lig data is <10Ang")
+}
+
+# the temporary copy is no longer needed
+rm(mcsm_data2)
+
+# number of NA values in every column
+na_count <- vapply(mcsm_data, function(col) sum(is.na(col)), integer(1)); na_count
+
+# look at two example mutations before sorting
+head(mcsm_data$Mutationinformation)
+mcsm_data[mcsm_data$Mutationinformation=="Q10P",]
+mcsm_data[mcsm_data$Mutationinformation=="L4S",]
+
+# order the rows by Mutationinformation
+mcsm_data <- mcsm_data[order(mcsm_data$Mutationinformation),]
+head(mcsm_data$Mutationinformation)
+
+# check: the example mutations can still be found after sorting
+mcsm_data[grep("Q10P", mcsm_data$Mutationinformation),]
+mcsm_data[grep("A102T", mcsm_data$Mutationinformation),]
+
+# add "occurrence": the number of rows sharing each Position
+# (setDT converts mcsm_data to a data.table in place)
+setDT(mcsm_data)[, occurrence := .N, by = "Position"]
+
+# Position next to its count, for checking by eye
+pos_count_check <- data.frame(mcsm_data$Position, mcsm_data$occurrence)
+
+###########################
+# 2: Read file: meta data with AFandOR
+###########################
+
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile2 = paste0(inDir, "meta_data_with_AFandOR.csv"); inFile2
+
+# TRUE/FALSE are spelled out because T and F are ordinary variables
+# that can be reassigned
+meta_with_afor <- read.csv(inFile2
+                      , stringsAsFactors = FALSE
+                      , header = TRUE)
+
+str(meta_with_afor)
+
+# sort by Mutationinformation
+head(meta_with_afor$Mutationinformation)
+meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),]
+head(meta_with_afor$Mutationinformation)
+
+# columns that must be numeric and must be NA in exactly the same rows
+na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue")
+
+# sanity check: should be TRUE for all the mentioned columns
+# All columns are tested first and the verdict is printed once; printing
+# it inside a loop reports "passed" on partial results.
+is_num = vapply(meta_with_afor[na_var], is.numeric, logical(1))
+print(is_num)
+if ( all(is_num) ){
+  print("Sanity check passed: These are all numeric cols")
+} else{
+  print("Error: Please check your respective data types")
+}
+
+# If OR, and P value are not numeric, then convert to numeric and then count
+# else they will say 0
+
+# NOW count na in each column: if you did it before, then
+# OR and Pvalue column would say 0 na since these were not numeric
+na_count = vapply(meta_with_afor, function(y) sum(is.na(y)), integer(1)); na_count
+str(na_count)
+
+# compare if the No of "NA" are the same for all these cols
+na_len = unname(na_count[na_var])
+
+# my_nrows is only set when every column has the same number of NA
+my_nrows = NULL
+if ( length(unique(na_len)) == 1 ) {
+  my_nrows = na_len[1]
+} else {
+  print("Error: Please check your numbers")
+}
+
+my_nrows
+
+#=#=#=#=#=#=#=#=#
+# COMMENT: AF, OR, pvalue, logor and neglog10pvalue
+# all have 81 NA, with pyrazinamide with 960
+# and these are the same 7 ones
+#=#=#=#=#=#=#=#=#
+
+# sanity check
+#which(is.na(meta_with_afor$OR))
+
+# row indices that are NA, one list element per column in na_var
+# (a list copes with columns that have different numbers of NA,
+# which cbind-ing them into a data frame does not)
+na_idx = lapply(meta_with_afor[na_var], function(y) which(is.na(y)))
+
+# Now compare these indices to ensure these are the same:
+# every column is compared with the first one
+same_idx = all(vapply(na_idx, identical, logical(1), na_idx[[1]]))
+if ( same_idx ) {
+  print("Sanity check passed: The indices for AF, OR, etc are all the same")
+} else {
+  print ("Error: Please check indices which are NA")
+}
+
+rm( is_num, my_nrows
+    , na_count, na_idx, na_len
+    , na_var, same_idx
+    , pos_count_check )
+
+###########################
+# 3:merging two dfs: with NA
+###########################
+
+# link col name  = Mutationinformation
+head(mcsm_data$Mutationinformation)
+head(meta_with_afor$Mutationinformation)
+
+#########
+# merge 1a: meta data with mcsm
+#########
+# all.y = TRUE: every row of mcsm_data is kept, with or without meta data
+# (TRUE is spelled out because T is an ordinary variable that can be
+# reassigned)
+merged_df2 = merge(x = meta_with_afor
+                  , y = mcsm_data
+                  , by = "Mutationinformation"
+                  , all.y = TRUE)
+
+# sort by Position
+head(merged_df2$Position)
+merged_df2 = merged_df2[order(merged_df2$Position),]
+head(merged_df2$Position)
+
+# the same merge, keeping every row of the meta data instead (all.x)
+merged_df2v2 = merge(x = meta_with_afor
+                   ,y = mcsm_data
+                   , by = "Mutationinformation"
+                   , all.x = TRUE)
+
+#!=!=!=!=!=!=!=!
+# COMMENT: used all.y since position 186 is not part of the struc,
+# hence doesn't have a mcsm value
+# but 186 is associated with a mutation
+#!=!=!=!=!=!=!=!
+
+# should  be False
+identical(merged_df2, merged_df2v2)
+table(merged_df2$Position%in%merged_df2v2$Position)
+
+rm(merged_df2v2)
+
+#########
+# merge 1b: keep the first row of every Mutationinformation value
+#########
+
+#==#=#=#=#=#=#
+# Cannot trust lineage, country from this df as the same mutation
+# can have many different lineages
+# but this should be good for the numerical corr plots
+#=#=#=#=#=#=#=
+merged_df3 <- merged_df2[!duplicated(merged_df2$Mutationinformation), ]
+head(merged_df3$Position) ; tail(merged_df3$Position) # should be sorted
+
+# sanity check:
+# merged_df3 is expected to hold exactly as many rows as mcsm_data
+if (nrow(merged_df3) != nrow(mcsm_data)) {
+  print("Error!: check data, nrows is not as expected")
+} else {
+  print("sanity check: Passed")
+}
+
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+# uncomment as necessary
+# only need to run this if merged_df2v2 i.e non structural pos included
+#mcsm = mcsm_data$Mutationinformation
+#my_merged = merged_df3$Mutationinformation
+
+# find the index where it differs
+#diff_n = which(!my_merged%in%mcsm)
+
+#check if it is indeed pos 186
+#merged_df3[diff_n,]
+
+# remove this entry
+#merged_df3 = merged_df3[-diff_n,] 
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+###########################
+# 3b :merging two dfs: without NA
+###########################
+
+#########
+# merge 2a: same as merge 1 but excluding the rows where AF is NA
+#########
+merged_df2_comp = merged_df2[!is.na(merged_df2$AF),]
+
+#########
+# merge 2b: remove duplicate mutation information
+#########
+merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),]
+
+# FIXME: add this as a sanity check. I have manually checked!
+
+# alternate way of deriving merged_df3_comp:
+# drop the NA rows from the already de-duplicated df
+foo = merged_df3[!is.na(merged_df3$AF),]
+
+# compare dfs: foo and merged_df3_comp
+# (foo must be checked against merged_df3_comp; comparing it with
+# merged_df3, the df it was filtered from, says nothing about whether
+# the two derivations agree)
+all.equal(foo, merged_df3_comp)
+
+summary(comparedf(foo, merged_df3_comp))
+
+#=============== end of combining df
+#clear variables
+rm(mcsm_data
+   , meta_with_afor
+   , foo)
+
+#rm(diff_n, my_merged, mcsm)
+
+#===============end of script
+
+#=====================
+# write_output files
+#=====================
+ 
+# Not required as this is a subset of the "combining_two_df.R" script
+
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh
@ -0,0 +1,25 @@
+#!/bin/bash
+
+#*************************************
+# need to be in the correct directory
+#*************************************
+##: comments for code
+#: commented out code
+
+#**********************************************************************
+# TASK: Text file containing a list of SNPs; SNP in the format(C2E)
+# per line. Sort by unique, which automatically removes duplicates.
+# save the result to the path set in outfile
+#**********************************************************************
+infile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2.csv"
+outfile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2_unique.csv"
+
+# sort unique entries and output to current directory
+sort -u ${infile} > ${outfile}
+
+# count no. of unique snps mCSM will run on
+count=$(wc -l < ${outfile})
+
+# print to console no. of unique snps mCSM will run on
+echo "${count} unique mutations for mCSM to run on"
+
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh
@ -0,0 +1,72 @@
+#!/bin/bash
+
+#*************************************
+#need to be in the correct directory
+#*************************************
+##: comments for code
+#: commented out code
+
+#**********************************************************************
+# TASK: submit requests using curl: HANDLE redirects and refresh url. 
+# Iterate over mutation file and write/append result urls to a file
+# result url file: stored in the /Results directory
+# mutation file: one mutation per line, no chain ID
+# output: in a file, should be n urls (n=no. of mutations in file)
+# NOTE: these are just result urls, not actual values for results
+#**********************************************************************
+## iterate over mutation file; line by line and submit query using curl
+filename="../Data/pnca_mis_SNPs_v2_unique.csv"
+
+## request settings: the same for every mutation, so they are set once
+## here instead of on every pass through the loop
+pdb='../Data/complex1_no_water.pdb'
+chain="A"
+lig_id="PZA"
+affin_wt="0.99"
+host="http://biosig.unimelb.edu.au"
+call_url="/mcsm_lig/prediction"
+
+## number of entries: counted once and reused in the progress messages
+## and in the name of the result url file.
+## $(( )) drops any padding that some wc implementations print
+total=$(( $(wc -l < "${filename}") ))
+outfile="../Results/${total}_mCSM_lig_complex1_result_url.txt"
+
+## some useful messages
+echo "Processing ${total} entries from ${filename}"
+
+##=========================================
+##html field_names names required for curl
+##complex_field:wild=@
+##mutation_field:mutation=@
+##chain_field:chain=@
+##ligand_field:lig_id@
+##energy_field:affin_wt
+#=========================================
+COUNT=0
+while read -r line; do
+    ((COUNT++))
+    mutation="${line}"
+#    echo "${mutation}"
+
+    ## submit the form (-L follows redirects) and keep only the line that
+    ## holds the http-equiv refresh tag
+    refresh_url=$(curl -L \
+         -sS \
+         -F "wild=@${pdb}" \
+         -F "mutation=${mutation}" \
+         -F "chain=${chain}" \
+         -F "lig_id=${lig_id}" \
+         -F "affin_wt=${affin_wt}" \
+         "${host}${call_url}" | grep "http-equiv")
+
+    #echo $refresh_url
+    #echo ${host}${refresh_url}
+
+    #use regex to extract the relevant bit from the refresh url
+    #regex:sed -r 's/.*(\/mcsm.*)".*$/\1/g'
+
+    #Now build: result url using host and refresh url and write the urls to a file in the Results dir
+    #($refresh_url is left unquoted on purpose: echo then squeezes it onto one line for sed)
+    result_url=$(echo $refresh_url | sed -r 's/.*(\/mcsm.*)".*$/\1/g')
+    sleep 10
+
+    echo -e "${mutation} : processing entry ${COUNT}/${total}..."
+
+    echo -e "${host}${result_url}" >> "${outfile}"
+    #echo -n '.'
+done < "${filename}"
+
+echo
+echo "Processing Complete"
+
+##end of submitting query, receiving result url and storing results url in a file
+
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh
@ -0,0 +1,59 @@
+#!/bin/bash
+#*************************************
+#need to be in the correct directory
+#*************************************
+##: comments for code
+#: commented out code
+
+#********************************************************************
+# TASK: submit result urls and fetch actual results using curl
+# iterate over each result url from the output of step1 in the stored
+# in file in /Results.
+# Use curl to fetch results and extract relevant sections using hxtools
+# and store these in another file in /Results 
+# This script takes two arguments:
+# 	input file: file containing results url
+#				In this case: 336_mCSM_lig_complex1_result_url.txt
+# 	output file: name of the file where extracted results will be stored
+#				In this case : it is 336_mCSM_lig_complex1_output_MASTER.txt
+#*********************************************************************
+
+#if [ "$#" -ne 2 ]; then
+  #if [ -Z $1 ]; then
+#  echo "
+#  Please provide both Input and Output files.
+
+#  Usage: batch_read_urls.sh INFILE OUTFILE
+#  "
+#  exit 1
+#fi
+
+# First argument: Input File
+# Second argument: Output File
+#infile=$1
+#outfile=$2
+
+infile="${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_result_url.txt"
+outfile="${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_output_MASTER.txt"
+
+echo -n "Processing $(wc -l < ${infile}) entries from ${infile}"
+echo
+COUNT=0
+while read -r line; do
+#COUNT=$(($COUNT+1))
+((COUNT++))
+  curl --silent ${line} \
+    | hxnormalize -x \
+    | hxselect -c div.span4 \
+    | hxselect -c div.well \
+    | sed -r -e 's/<[^>]*>//g' \
+    | sed -re 's/ +//g' \
+    >> ${outfile}
+  #| tee -a ${outfile}
+#  echo -n '.'
+echo -e "Processing entry ${COUNT}/$(wc -l < ${infile})..."  
+  
+done < "${infile}"
+
+echo
+echo "Processing Complete"
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh
@ -0,0 +1,52 @@
+#!/bin/bash
+#*************************************
+#need to be in the correct directory
+#*************************************
+##: comments for code
+#: commented out code
+
+#********************************************************************
+# TASK: Intermediate results processing
+# output file has a convenient delimiter of ":" that can be used to 
+# format the file into two columns (col1: field_desc and col2: values)
+# However the section "PredictedAffinityChange:...." and 
+# "DUETstabilitychange:.." are split over multiple lines and 
+# prevent this from happening. Additionally, there are other empty lines
+# that need to be omitted. This script is written to ensure these sections
+# are not split
+#*********************************************************************
+
+# file that is rewritten in place by the sed call at the end of this script
+infile="../Results/336_mCSM_lig_complex1_output_processed.txt"
+
+#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${infile} \
+# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${infile}
+
+# Outputs records separated by a newline, that look something like this:
+# PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing
+# Mutationinformation:
+# Wild-type:L
+# Position:4
+# Mutant-type:W
+# Chain:A
+# LigandID:PZA
+# Distancetoligand:15.911&Aring;
+# DUETstabilitychange:-2.169Kcal/mol
+#
+# PredictedAffinityChange:-1.538log(affinityfoldchange)-Destabilizing
+# (...etc)
+
+# This script brings everything in a convenient format for further processing in python.
+# bear in mind, this replaces the file in place, so make sure you retain a copy for your records
+#
+# What the sed program below does:
+#  - on a line matching PredictedAffinityChange: N appends the next input
+#    line to the pattern space (done 4 times), then s/\n//g removes the
+#    embedded newlines, i.e. 5 input lines are joined into one
+#  - on a line matching DUETstabilitychange: the same with 2 extra lines,
+#    i.e. 3 input lines are joined into one
+#  - /^$/d deletes empty lines
+sed -i '/PredictedAffinityChange/ {
+N
+N
+N
+N
+s/\n//g
+}
+/DUETstabilitychange:/ {
+N
+N
+s/\n//g
+}
+/^$/d' ${infile}
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py
@ -0,0 +1,29 @@
+#!/usr/bin/python
+import pandas as pd
+from collections import defaultdict
+
+#file = r'../Results/322_mCSM_lig_complex1_output_processed.txt'
+
+outCols=[
+        'PredictedAffinityChange',
+        'Mutationinformation',
+        'Wild-type',
+        'Position',
+        'Mutant-type',
+        'Chain',
+        'LigandID',
+        'Distancetoligand',
+        'DUETstabilitychange'
+        ]
+
+lines = [line.rstrip('\n') for line in open('../Results/336_mCSM_lig_complex1_output_processed.txt')]
+
+outputs = defaultdict(list)
+
+for item in lines:
+	col, val = item.split(':')
+	outputs[col].append(val)
+
+dfOut=pd.DataFrame(outputs)
+
+pd.DataFrame.to_csv(dfOut,'../Results/336_complex1_formatted_results.csv', columns=outCols)
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_data_cleaning.R
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_data_cleaning.R
@ -0,0 +1,207 @@
+getwd()
+#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
+setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
+#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
+getwd()
+
+#=======================================================
+#TASK: To tidy the columns so you can generate figures
+#=======================================================
+####################
+#### read file #####: this will be the output from python script (csv file)
+####################
+# TRUE is spelled out because T is an ordinary variable that can be
+# reassigned
+data = read.csv("336_complex1_formatted_results.csv"
+              , header = TRUE
+              , stringsAsFactors = FALSE)
+dim(data)
+#335, 10
+str(data)
+
+###########################
+##### Data processing #####
+###########################
+
+# the Mutationinformation column arrives empty and is filled in here
+head(data$Mutationinformation)
+tail(data$Mutationinformation)
+
+# should not be blank: build the mutation label as
+# wild-type + position + mutant-type
+data$Mutationinformation <- with(data, paste0(Wild.type, Position, Mutant.type))
+
+# confirm that the column is now populated
+head(data$Mutationinformation)
+tail(data$Mutationinformation)
+#write.csv(data, 'test.csv')
+##########################################
+# Remove duplicate SNPs as a sanity check
+##########################################
+#very important
+#(TRUE = duplicated rows; with no duplicates only FALSE is listed)
+table(duplicated(data$Mutationinformation))
+
+#extract duplicated entries
+dups = data[duplicated(data$Mutationinformation),]
+
+#No of dups should match with the no. of TRUE in the above table
+#u_dups = unique(dups$Mutationinformation)
+sum( table(dups$Mutationinformation) )
+
+rm(dups)
+
+#***************************************************************
+#select non-duplicated SNPs and create a new df
+df = data[!duplicated(data$Mutationinformation),]
+#***************************************************************
+#sanity check
+u = unique(df$Mutationinformation)
+u2 = unique(data$Mutationinformation)
+#should be TRUE for every entry
+table(u%in%u2)
+
+#every mutation should occur exactly once in df, hence this should be
+#equal to the number of rows of df
+sum(table(df$Mutationinformation) == 1)
+
+#sort df by Position
+#MANUAL CHECKPOINT:
+#foo <- df[order(df$Position),]
+#df <- df[order(df$Position),]
+
+#dups was already removed above; listing it here again only raised an
+#"object 'dups' not found" warning
+rm(u, u2)
+
+####################
+#### give meaningful colnames to reflect units to enable correct data type
+####################
+
+#=======
+#STEP 1
+#========
+#keep the raw PredictedAffinityChange text in a new column, Lig_outcome
+#(the outcome label is cut out of it in STEP 2)
+df[["Lig_outcome"]] <- df[["PredictedAffinityChange"]] #335, 11
+
+#drop everything from "log" onwards so that only the number is left
+head(df$PredictedAffinityChange)
+df$PredictedAffinityChange <- sub("log.*", "", df$PredictedAffinityChange)
+
+#sanity checks
+head(df$PredictedAffinityChange)
+
+#numeric before the conversion?
+is.numeric( df$PredictedAffinityChange )
+#convert
+df$PredictedAffinityChange <- as.numeric(df$PredictedAffinityChange)
+#should be TRUE
+is.numeric( df$PredictedAffinityChange )
+
+#rename the column so that its name carries the unit
+n <- which(names(df) == "PredictedAffinityChange"); n
+names(df)[n] <- "PredAffLog"
+names(df)[n]
+
+#========
+#STEP 2
+#========
+#Lig_outcome: keep only the text after the last "-", i.e. the effect of
+#the mutation
+head(df$Lig_outcome)
+df$Lig_outcome <- sub("^.*-", "", df$Lig_outcome)
+#sanity checks
+head(df$Lig_outcome)
+#factor before the conversion?
+is.factor(df$Lig_outcome)
+#convert
+df$Lig_outcome <- as.factor(df$Lig_outcome)
+#should be TRUE
+is.factor(df$Lig_outcome)
+
+#========
+#STEP 3
+#========
+#Distancetoligand: remove the "&Aring;" unit text
+head(df$Distancetoligand)
+df$Distancetoligand <- gsub("&Aring;", "", df$Distancetoligand, fixed = TRUE)
+#sanity checks
+head(df$Distancetoligand)
+#numeric before the conversion?
+is.numeric(df$Distancetoligand)
+#convert
+df$Distancetoligand <- as.numeric(df$Distancetoligand)
+#should be TRUE
+is.numeric(df$Distancetoligand)
+
+#rename the column so that its name carries the unit
+n <- which(names(df) == "Distancetoligand")
+names(df)[n] <- "Dis_lig_Ang"
+names(df)[n]
+
+#========
+#STEP 4
+#========
+#DUETstabilitychange: remove the "Kcal/mol" unit text
+head(df$DUETstabilitychange)
+df$DUETstabilitychange <- gsub("Kcal/mol", "", df$DUETstabilitychange, fixed = TRUE)
+#sanity checks
+head(df$DUETstabilitychange)
+#numeric before the conversion?
+is.numeric(df$DUETstabilitychange)
+#convert
+df$DUETstabilitychange <- as.numeric(df$DUETstabilitychange)
+#should be TRUE
+is.numeric(df$DUETstabilitychange)
+
+#rename the column so that its name carries the unit
+n <- which(names(df) == "DUETstabilitychange"); n
+names(df)[n] <- "DUETStability_Kcalpermol"
+names(df)[n]
+
+#========
+#STEP 5
+#========
+#one more column: DUET outcome, decided by the sign of the DUET score
+#(below 0 is Destabilizing, 0 and above is Stabilizing)
+df$DUET_outcome <- ifelse(df$DUETStability_Kcalpermol < 0
+                          , "Destabilizing"
+                          , "Stabilizing")  #335, 12
+
+#counts per ligand outcome
+table(df$Lig_outcome)
+#Destabilizing   Stabilizing 
+#281             54 
+
+#counts per DUET outcome
+table(df$DUET_outcome)
+#Destabilizing   Stabilizing 
+#288             47 
+#==============================
+#FIXME
+#Insert a venn diagram
+
+#================================
+
+
+#========
+#STEP 6
+#========
+# rename the wild-type and mutant-type columns (underscore for the dot)
+
+wt <- which(names(df) == "Wild.type"); wt
+names(df)[wt] <- "Wild_type"
+names(df[wt])
+
+mut <- which(names(df) == "Mutant.type"); mut
+names(df)[mut] <- "Mutant_type"
+names(df[mut])
+
+#========
+#STEP 7
+#========
+#extra column, maybe useful for some plots: wild-type + position
+df$WildPos <- with(df, paste0(Wild_type, Position)) #335, 13
+
+#the index helpers are not needed any more
+rm(n, wt, mut)
+
+################ end of data cleaning
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R
@ -0,0 +1,252 @@
+getwd()
+#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
+setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
+#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
+getwd()
+
+#=======================================================
+#TASK:read cleaned data and perform rescaling
+  # of DUET stability scores
+  # of Pred affinity
+#compare scaling methods with plots
+#output normalised file
+#=======================================================
+
+####################
+#### read file #####: this will be the output of my R script that cleans the data columns
+####################
+source("../Scripts/step3c_data_cleaning.R")
+##This will output two dataframes:
+##data: unclean data: 335, 10
+##df : cleaned df 335, 13
+## you can remove data if you want, as you will not need it
+rm(data)
+
+colnames(df)
+
+#===================
+# helpers used by both rescaling sections below
+#===================
+
+# rescale_ratio: "ratio" rescaling, done separately for each sign
+# (WITHIN GROUP rescaling 2; Rescaling separately (Less dangerous)
+#       =====> chosen one:as Nick prefers)
+# values below 0 are divided by abs(min(x)), all other values by max(x)
+# x      : numeric vector of raw scores
+# returns: numeric vector of the same length as x
+rescale_ratio <- function(x) {
+  ifelse(x < 0, x / abs(min(x)), x / max(x))
+}
+
+# plot_rescaling: 2 x 2 panel (Base R) comparing raw and rescaled values:
+# histograms on the top row, density plots underneath
+# raw, rescaled: numeric vectors to plot
+# my_title     : text written in the outer top margin
+plot_rescaling <- function(raw, rescaled, my_title) {
+  # Set the margin on all sides; the previous settings are put back on exit
+  old_par <- par(oma = c(3,2,3,0)
+                 , mar = c(1,3,5,2)
+                 , mfrow = c(2,2))
+  on.exit(par(old_par), add = TRUE)
+
+  hist(raw, xlab = "", main = "Raw values")
+  hist(rescaled, xlab = "", main = "ratio rescaling")
+
+  # Plot density plots underneath
+  plot(density(raw), main = "Raw values")
+  plot(density(rescaled), main = "ratio rescaling")
+
+  # titles
+  mtext(text = "Frequency", side = 2, line = 0, outer = TRUE)
+  mtext(text = my_title, side = 3, line = 0, outer = TRUE)
+}
+
+#===================
+#3a: PredAffLog
+#===================
+
+#===================================================
+# order according to PredAffLog values
+#===================================================
+# This is because this makes it easier to see the results of rescaling for debugging
+head(df$PredAffLog)
+
+#ORDER BY PredAff scores: negative values at the top and positive at the bottom
+df = df[order(df$PredAffLog),]
+
+#sanity checks
+head(df$PredAffLog) #all negatives
+tail(df$PredAffLog) #all positives
+
+#sanity checks
+mean(df$PredAffLog)
+#-0.9526746
+
+tapply(df$PredAffLog, df$Lig_outcome, mean)
+#Destabilizing   Stabilizing 
+#-1.2112100      0.3926667 
+
+#find range of your data
+min(df$PredAffLog) #-3.948
+max(df$PredAffLog) #2.23
+
+# create column to store the rescaled values
+df$ratioPredAff = rescale_ratio(df$PredAffLog) #335 14
+
+#sanity checks
+head(df$ratioPredAff)
+tail(df$ratioPredAff)
+
+min(df$ratioPredAff); max(df$ratioPredAff)
+
+tapply(df$ratioPredAff, df$Lig_outcome, min)
+#Destabilizing   Stabilizing 
+#-1.000000000   0.005381166 
+
+tapply(df$ratioPredAff, df$Lig_outcome, max)
+#Destabilizing   Stabilizing 
+#-0.001266464   1.000000000
+
+#should be the same as below (281 and 54)
+sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0)
+
+table(df$Lig_outcome)
+#Destabilizing   Stabilizing 
+#281              54
+
+# Hist and density plots to compare the rescaling
+plot_rescaling(df$PredAffLog, df$ratioPredAff, "Ligand_stability")
+
+#===================
+# 3b: DUET stability
+#===================
+dim(df) #335, 14
+
+#===================================================
+# order according to DUET scores
+#===================================================
+# This is because this makes it easier to see the results of rescaling for debugging
+head(df$DUETStability_Kcalpermol)
+
+#ORDER BY DUET scores: negative values at the top and positive at the bottom
+df = df[order(df$DUETStability_Kcalpermol),]
+
+#sanity checks
+head(df$DUETStability_Kcalpermol) #negatives
+tail(df$DUETStability_Kcalpermol) #positives
+
+#sanity checks
+mean(df$DUETStability_Kcalpermol)
+#[1] -1.173316
+
+tapply(df$DUETStability_Kcalpermol, df$DUET_outcome, mean)
+#Destabilizing   Stabilizing 
+#-1.4297257     0.3978723
+
+#find range of your data
+min(df$DUETStability_Kcalpermol) #-3.87
+max(df$DUETStability_Kcalpermol) #1.689
+
+# create column to store the rescaled values
+df$ratioDUET = rescale_ratio(df$DUETStability_Kcalpermol) #335, 15
+
+#sanity check
+head(df$ratioDUET)
+tail(df$ratioDUET)
+
+min(df$ratioDUET); max(df$ratioDUET)
+
+#sanity checks
+tapply(df$ratioDUET, df$DUET_outcome, min)
+#Destabilizing   Stabilizing 
+#-1.00000000    0.01065719
+
+tapply(df$ratioDUET, df$DUET_outcome, max)
+#Destabilizing   Stabilizing 
+#-0.003875969   1.000000000 
+
+#the counts of negative and positive values should agree with the
+#Destabilizing and Stabilizing counts of the table below
+sum(df$ratioDUET < 0); sum(df$ratioDUET > 0)
+
+table(df$DUET_outcome)
+#Destabilizing   Stabilizing 
+#288             47
+
+# Hist and density plots to compare the rescaling
+plot_rescaling(df$DUETStability_Kcalpermol, df$ratioDUET, "DUET_stability")
+
+#===================
+# write output as csv file
+#===================
+write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE) #335, 15
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R
@ -0,0 +1,131 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+#source("barplot_colour_function.R")
+library(data.table) # library() stops immediately if the package is missing
+library(dplyr)      # needed for %>%, group_by() and summarize() below
+
+########################################################################
+#		 Read file: call script for combining df for PS		   	   #
+########################################################################
+
+source("../combining_two_df.R")
+
+###########################
+# This will return:
+
+# df with NA:
+# merged_df2 
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+###########################
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+###########################
+# you need merged_df3 
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+my_df  = merged_df3 
+#my_df = merged_df3_comp
+#%%%%%%%%%%%%%%%%%%%%%%%%
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+###########################
+# Data for bfactor figure
+# PS average 
+# Lig average
+###########################
+
+head(my_df$Position)
+head(my_df$ratioDUET)
+
+# order data frame 
+df = my_df[order(my_df$Position),]
+
+head(df$Position)
+head(df$ratioDUET)
+
+#***********
+# PS: average by position
+#***********
+
+mean_DUET_by_position <- df %>%
+  group_by(Position) %>%
+  summarize(averaged.DUET = mean(ratioDUET))
+
+#***********
+# Lig: average by position
+#***********
+mean_Lig_by_position <- df %>%
+  group_by(Position) %>%
+  summarize(averaged.Lig = mean(ratioPredAff))
+
+
+#***********
+# cbind:mean_DUET_by_position and mean_Lig_by_position
+#***********
+
+combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
+
+# sanity check
+# mean_PS_Lig_Bfactor
+
+colnames(combined)
+
+colnames(combined) = c("Position"
+                       , "average_DUETR"
+                       , "Position2"
+                       , "average_PredAffR")
+
+colnames(combined)
+
+identical(combined$Position, combined$Position2)
+
+n = which(colnames(combined) == "Position2"); n
+
+combined_df = combined[,-n]
+
+max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
+
+max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
+
+#=============
+# output csv
+#============
+outDir = "~/git/Data/pyrazinamide/input/processed/"
+outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
+print(paste0("Output file with path will be:", outFile))
+
+head(combined_df$Position); tail(combined_df$Position)
+
+write.csv(combined_df, outFile
+          , row.names = FALSE) # spell out FALSE: F is an ordinary, reassignable variable
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/.RData
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/.RData
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/.Rhistory
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/.Rhistory
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/OR_PS_Ligand_combined_plot.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/OR_PS_Ligand_combined_plot.R
@ -0,0 +1,250 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+#source("barplot_colour_function.R")
+library(cowplot) # library() stops immediately if cowplot is missing (require() only returns FALSE)
+
+########################################################################
+#		 Read file: call script for combining df for PS			   	   #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for OR and stability plots
+# you need merged_df3_comp
+# since these are matched 
+# to allow pairwise corr
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3_comp
+#my_df = merged_df3
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# sanity check
+# Ensure correct data type in column to plot: OR needs to be numeric
+is.numeric(my_df$OR)
+#[1] TRUE
+
+#<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+# FOR PS Plots
+#<<<<<<<<<<<<<<<<<<<
+
+PS_df  = my_df
+
+rm(my_df)
+#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
+
+########################################################################
+#		 Read file: call script for combining df for lig		   	   #
+########################################################################
+
+getwd()
+
+source("combining_two_df_lig.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for OR and stability plots
+# you need merged_df3_comp
+# since these are matched 
+# to allow pairwise corr
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df2  = merged_df3_comp
+#my_df2 = merged_df3
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df2)
+str(my_df2)
+
+# sanity check
+# Ensure correct data type in column to plot: OR needs to be numeric
+is.numeric(my_df2$OR)
+#[1] TRUE
+
+# sanity check: should be <10
+if (max(my_df2$Dis_lig_Ang) < 10){
+  print ("Sanity check passed: lig data is <10Ang")
+}else{
+  print ("Error: data should be filtered to be within 10Ang")
+}
+
+#<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+# FOR Lig Plots
+#<<<<<<<<<<<<<<<<
+
+Lig_df  = my_df2
+
+rm(my_df2)
+
+#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
+
+#############
+# Plots: Bubble plot
+# x = Position, Y = stability
+# size of dots = OR
+# col: stability
+#############
+
+#=================
+# generate plot 1: DUET vs OR by position as geom_points
+#=================  
+
+my_ats = 20 # axis text size
+my_als = 22 # axis label size
+
+# Spelling Correction: made redundant as already corrected at the source
+
+#PS_df$DUET_outcome[PS_df$DUET_outcome=='Stabilizing'] <- 'Stabilising'
+#PS_df$DUET_outcome[PS_df$DUET_outcome=='Destabilizing'] <- 'Destabilising'
+
+table(PS_df$DUET_outcome) ; sum(table(PS_df$DUET_outcome))
+
+g = ggplot(PS_df, aes(x = factor(Position)
+                   , y = ratioDUET))
+
+p1 = g + 
+  geom_point(aes(col = DUET_outcome
+                 , size = OR)) +
+  theme(axis.text.x = element_text(size = my_ats
+                                   , angle = 90
+                                   , hjust = 1
+                                   , vjust = 0.4)
+        , axis.text.y = element_text(size = my_ats
+                                     , angle = 0
+                                     , hjust = 1
+                                     , vjust = 0)
+        , axis.title.x = element_text(size = my_als)
+        , axis.title.y = element_text(size = my_als) 
+        , legend.text = element_text(size = my_als)
+        , legend.title = element_text(size = my_als) ) +
+  #, legend.key.size = unit(1, "cm")) +
+  labs(title = ""
+       , x = "Position"
+       , y = "DUET(PS)"
+       , size = "Odds Ratio"
+       , colour = "DUET Outcome") +
+  guides(colour = guide_legend(override.aes = list(size=4))) 
+
+p1 
+
+#=================
+# generate plot 2: Lig vs OR by position as geom_points
+#=================  
+my_ats = 20 # axis text size
+my_als = 22 # axis label size
+
+# Spelling Correction: made redundant as already corrected at the source
+
+#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Stabilizing'] <- 'Stabilising'
+#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Destabilizing'] <- 'Destabilising'
+
+table(Lig_df$Lig_outcome)
+
+g = ggplot(Lig_df, aes(x = factor(Position)
+                   , y = ratioPredAff))
+
+p2 = g + 
+  geom_point(aes(col = Lig_outcome
+                   , size = OR))+
+  theme(axis.text.x = element_text(size = my_ats
+                                   , angle = 90
+                                   , hjust = 1
+                                   , vjust = 0.4)
+        , axis.text.y = element_text(size = my_ats
+                                     , angle = 0
+                                     , hjust = 1
+                                     , vjust = 0)
+        , axis.title.x = element_text(size = my_als)
+        , axis.title.y = element_text(size = my_als) 
+        , legend.text = element_text(size = my_als)
+        , legend.title = element_text(size = my_als) ) +
+  #, legend.key.size = unit(1, "cm")) +
+  labs(title = ""
+       , x = "Position"
+       , y = "Ligand Affinity"
+       , size = "Odds Ratio"
+       , colour = "Ligand Outcome"
+       ) +
+  guides(colour = guide_legend(override.aes = list(size=4))) 
+
+p2
+
+#======================
+#combine using cowplot
+#======================
+# set output dir for plots (the svg below is written relative to this dir)
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+svg('PS_Lig_OR_combined.svg', width = 32, height = 12) #inches
+#png('PS_Lig_OR_combined.png', width = 2800, height = 1080) #300dpi
+theme_set(theme_gray()) # to preserve default theme
+
+printFile = cowplot::plot_grid(plot_grid(p1, p2
+                             , ncol = 1
+                             , align = 'v'
+                             , labels = c("A", "B")
+                             , label_size = my_als+5))
+print(printFile)
+dev.off()
+
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R
@ -0,0 +1,154 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+
+########################################################################
+#		 Read file: call script for combining df for lig		   	   #
+########################################################################
+
+source("../combining_two_df_lig.R") 
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for Lig plots
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+my_df  = merged_df3
+#my_df = merged_df3_comp
+#%%%%%%%%%%%%%%%%%%%%%%%%
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+#############################
+# Extra sanity check:
+# for mcsm_lig ONLY
+# Dis_lig_Ang should be <10
+#############################
+
+if (max(my_df$Dis_lig_Ang) < 10){
+  print ("Sanity check passed: lig data is <10Ang")
+}else{
+  print ("Error: data should be filtered to be within 10Ang")
+}
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Plot: Barplot with scores (unordered)
+# corresponds to Lig_outcome
+# Stacked Barplot with colours: Lig_outcome @ position coloured by 
+# Lig_outcome. This is a barplot where each bar corresponds 
+# to a SNP and is coloured by its corresponding Lig_outcome.
+#============================
+
+#===================
+# Data for plots
+#===================
+
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+df  = my_df 
+#%%%%%%%%%%%%%%%%%%%%%%%%
+
+rm(my_df)
+
+# sanity checks (my_df was removed above, so use the working copy df)
+upos = unique(df$Position)
+
+# should be a factor
+is.factor(df$Lig_outcome)
+#TRUE
+
+table(df$Lig_outcome)
+
+# should be -1 and 1: may not be in this case because you have filtered the data
+# FIXME: normalisation before or after filtering?
+min(df$ratioPredAff) #
+max(df$ratioPredAff) #
+
+# sanity checks
+tapply(df$ratioPredAff, df$Lig_outcome, min)
+tapply(df$ratioPredAff, df$Lig_outcome, max)
+
+#******************
+# generate plot
+#******************
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+my_title = "Ligand affinity"
+
+# axis label size
+my_xaxls = 13
+my_yaxls = 15
+
+# axes text size
+my_xaxts = 15
+my_yaxts = 15
+
+# no ordering of x-axis
+g = ggplot(df, aes(factor(Position, ordered = T)))
+g + 
+  geom_bar(aes(fill = Lig_outcome), colour = "grey") +
+  theme( axis.text.x = element_text(size = my_xaxls
+                                    , angle = 90
+                                    , hjust = 1
+                                    , vjust = 0.4)
+         , axis.text.y = element_text(size = my_yaxls 
+                                      , angle = 0
+                                      , hjust = 1
+                                      , vjust = 0)
+         , axis.title.x = element_text(size = my_xaxts)
+         , axis.title.y = element_text(size = my_yaxts ) ) +
+  labs(title = my_title
+       , x = "Position"
+       , y = "Frequency")
+
+# for sanity and good practice
+rm(df)
+#======================= end of plot
+# axis colours labels
+# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
+# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R
@ -0,0 +1,149 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 		Installing and loading required packages and functions		   #	
+########################################################################
+
+source("../Header_TT.R")
+
+########################################################################
+#		 Read file: call script for combining df for PS			   	   #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#==========================
+
+###########################
+# Data for DUET plots
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3 
+#my_df  = merged_df3_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+# sanity check
+is.factor(my_df$DUET_outcome)
+my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
+is.factor(my_df$DUET_outcome)
+#[1] TRUE
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Plot 2: Barplot with scores (unordered)
+# corresponds to DUET_outcome
+# Stacked Barplot with colours: DUET_outcome @ position coloured by 
+# DUET outcome. This is a barplot where each bar corresponds 
+# to a SNP and is coloured by its corresponding DUET_outcome
+#============================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+df  = my_df 
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(my_df)
+
+# sanity checks (my_df was removed above, so use the working copy df)
+upos = unique(df$Position)
+
+# should be a factor
+is.factor(df$DUET_outcome)
+#[1] TRUE
+
+table(df$DUET_outcome)
+
+# should be -1 and 1
+min(df$ratioDUET)
+max(df$ratioDUET)
+
+tapply(df$ratioDUET, df$DUET_outcome, min)
+tapply(df$ratioDUET, df$DUET_outcome, max)
+
+#******************
+# generate plot
+#******************
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+my_title = "Protein stability (DUET)"
+
+# axis label size
+my_xaxls = 13
+my_yaxls = 15
+
+# axes text size
+my_xaxts = 15
+my_yaxts = 15
+
+# no ordering of x-axis
+g = ggplot(df, aes(factor(Position, ordered = T)))
+g + 
+  geom_bar(aes(fill = DUET_outcome), colour = "grey") +
+  
+  theme( axis.text.x = element_text(size = my_xaxls
+                                    , angle = 90
+                                    , hjust = 1
+                                    , vjust = 0.4)
+         , axis.text.y = element_text(size = my_yaxls 
+                                      , angle = 0
+                                      , hjust = 1
+                                      , vjust = 0)
+         , axis.title.x = element_text(size = my_xaxts)
+         , axis.title.y = element_text(size = my_yaxts ) ) +
+  labs(title = my_title
+       , x = "Position"
+       , y = "Frequency")
+
+# for sanity and good practice
+rm(df)
+#======================= end of plot
+# axis colours labels
+# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
+# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R
@ -0,0 +1,202 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") 
+getwd()
+
+########################################################################
+# 		Installing and loading required packages and functions		   #	
+########################################################################
+
+source("../Header_TT.R")
+source("../barplot_colour_function.R")
+
+########################################################################
+#		 Read file: call script for combining df for lig		   	   #
+########################################################################
+
+source("../combining_two_df_lig.R") 
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for Lig plots
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3
+#my_df = merged_df3_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+# sanity check: report the type before and after coercing Lig_outcome
+is.factor(my_df$Lig_outcome)
+my_df$Lig_outcome = as.factor(my_df$Lig_outcome)
+is.factor(my_df$Lig_outcome)
+#[1] TRUE
+
+#############################
+# Extra sanity check:
+# for mcsm_lig ONLY
+# Dis_lig_Ang should be <10
+#############################
+
+if (max(my_df$Dis_lig_Ang) < 10){
+  print ("Sanity check passed: lig data is <10Ang")
+}else{
+  print ("Error: data should be filtered to be within 10Ang")
+}
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Plot: Barplot with scores (unordered)
+# corresponds to Lig_outcome
+# Stacked Barplot with colours: Lig_outcome @ position coloured by 
+# stability scores. This is a barplot where each bar corresponds 
+# to a SNP and is coloured by its corresponding Lig stability value.
+# Normalised values (range between -1 and 1 ) to aid visualisation
+# NOTE: since barplot plots discrete values, colour = score, so number of
+# colours will be equal to the no. of unique normalised scores 
+# rather than a continuous scale
+# will require generating the colour scale separately.
+#============================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+df  = my_df 
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(my_df)
+
+# sanity checks
+table(df$Lig_outcome)
+
+# should be -1 and 1: may not be in this case because you have filtered the data
+# FIXME: normalisation before or after filtering?
+min(df$ratioPredAff) #
+max(df$ratioPredAff) #
+
+# sanity checks
+# very important!!!!
+tapply(df$ratioPredAff, df$Lig_outcome, min)
+
+tapply(df$ratioPredAff, df$Lig_outcome, max)
+
+
+#******************
+# generate plot
+#******************
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+# My colour FUNCTION: based on group and subgroup
+# in my case;
+# df = df
+# group = Lig_outcome
+# subgroup = normalised score i.e ratioPredAff
+
+# Prepare data: round off ratioPredAff (ligand affinity) scores
+# round off to 3 decimal places:
+# 165 if no rounding is performed: used to generate the original graph
+# 156 if rounded to 3 places
+# FIXME: check if reducing precision creates any ML prob
+
+# check unique values in normalised data
+u = unique(df$ratioPredAff) 
+
+# <<<<< -------------------------------------------
+# Run this section if rounding is to be used
+# specify number for rounding
+n = 3 
+df$ratioLigR = round(df$ratioPredAff, n) 
+u = unique(df$ratioLigR) # 156
+# create an extra column called group which contains the "gp name and score" 
+# so colours can be generated for each unique values in this column
+my_grp = df$ratioLigR
+df$group <- paste0(df$Lig_outcome, "_", my_grp)
+
+# else 
+# uncomment the below if rounding is not required
+
+#my_grp = df$ratioPredAff
+#df$group <- paste0(df$Lig_outcome, "_", my_grp)
+
+# <<<<< -----------------------------------------------
+
+# Call the function to create the palette based on the group defined above
+colours <- ColourPalleteMulti(df, "Lig_outcome", "my_grp")
+my_title = "Ligand affinity"
+
+# axis label size
+my_xaxls = 13
+my_yaxls = 15
+
+# axes text size
+my_xaxts = 15
+my_yaxts = 15
+
+# no ordering of x-axis
+g = ggplot(df, aes(factor(Position, ordered = T)))
+g + 
+  geom_bar(aes(fill = group), colour = "grey") +
+  scale_fill_manual( values = colours
+                     , guide = 'none') +
+  theme( axis.text.x = element_text(size = my_xaxls
+                                    , angle = 90
+                                    , hjust = 1
+                                    , vjust = 0.4)
+         , axis.text.y = element_text(size = my_yaxls 
+                                      , angle = 0
+                                      , hjust = 1
+                                      , vjust = 0)
+         , axis.title.x = element_text(size = my_xaxts)
+         , axis.title.y = element_text(size = my_yaxts ) ) +
+  labs(title = my_title
+       , x = "Position"
+       , y = "Frequency")
+
+# for sanity and good practice
+rm(df)
+#======================= end of plot
+# axis colours labels
+# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
+# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R
@ -0,0 +1,192 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 		Installing and loading required packages and functions		   #	
+########################################################################
+
+source("../Header_TT.R")
+source("../barplot_colour_function.R")
+
+########################################################################
+#		 Read file: call script for combining df for PS			   	   #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for DUET plots
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3 
+#my_df  = merged_df3_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+# sanity check
+is.factor(my_df$DUET_outcome)
+my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
+is.factor(my_df$DUET_outcome)
+#[1] TRUE
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Barplot with scores (unordered)
+# corresponds to DUET_outcome
+# Stacked Barplot with colours: DUET_outcome @ position coloured by 
+# stability scores. This is a barplot where each bar corresponds 
+# to a SNP and is coloured by its corresponding DUET stability value.
+# Normalised values (range between -1 and 1 ) to aid visualisation
+# NOTE: since barplot plots discrete values, colour = score, so number of
+# colours will be equal to the no. of unique normalised scores 
+# rather than a continuous scale
+# will require generating the colour scale separately.
+#============================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+df  = my_df 
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(my_df)
+
+# sanity checks
+upos = unique(df$Position)
+
+# should be a factor (my_df was removed above, so check the working copy df)
+is.factor(df$DUET_outcome)
+#[1] TRUE
+
+table(df$DUET_outcome)
+
+# should be -1 and 1
+min(df$ratioDUET)
+max(df$ratioDUET)
+
+tapply(df$ratioDUET, df$DUET_outcome, min)
+tapply(df$ratioDUET, df$DUET_outcome, max)
+
+#******************
+# generate plot
+#******************
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+# My colour FUNCTION: based on group and subgroup
+# in my case;
+# df = df
+# group = DUET_outcome
+# subgroup = normalised score i.e ratioDUET
+
+# Prepare data: round off ratioDUET scores
+# round off to 3 decimal places:
+# 323 if no rounding is performed: used to generate the original graph
+# 287 if rounded to 3 places
+# FIXME: check if reducing precision creates any ML prob
+
+# check unique values in normalised data
+u = unique(df$ratioDUET) 
+
+# <<<<< -------------------------------------------
+# Run this section if rounding is to be used
+# specify number for rounding
+n = 3 
+df$ratioDUETR = round(df$ratioDUET, n)
+u = unique(df$ratioDUETR)
+# create an extra column called group which contains the "gp name and score" 
+# so colours can be generated for each unique values in this column
+my_grp = df$ratioDUETR
+df$group <- paste0(df$DUET_outcome, "_", my_grp)
+
+# else 
+# uncomment the below if rounding is not required
+
+#my_grp = df$ratioDUET
+#df$group <- paste0(df$DUET_outcome, "_", my_grp)
+
+# <<<<< -----------------------------------------------
+
+# Call the function to create the palette based on the group defined above
+colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp")
+my_title = "Protein stability (DUET)"
+
+# axis label size
+my_xaxls = 13
+my_yaxls = 15
+
+# axes text size
+my_xaxts = 15
+my_yaxts = 15
+
+# no ordering of x-axis
+g = ggplot(df, aes(factor(Position, ordered = T)))
+g + 
+  geom_bar(aes(fill = group), colour = "grey") +
+  scale_fill_manual( values = colours
+                     , guide = 'none') +
+  theme( axis.text.x = element_text(size = my_xaxls
+                                    , angle = 90
+                                    , hjust = 1
+                                    , vjust = 0.4)
+         , axis.text.y = element_text(size = my_yaxls 
+                                      , angle = 0
+                                      , hjust = 1
+                                      , vjust = 0)
+         , axis.title.x = element_text(size = my_xaxts)
+         , axis.title.y = element_text(size = my_yaxts ) ) +
+  labs(title = my_title
+       , x = "Position"
+       , y = "Frequency")
+
+# for sanity and good practice
+rm(df)
+#======================= end of plot
+# axis colours labels
+# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
+# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R
@ -0,0 +1,215 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+
+#require(data.table)
+#require(dplyr)
+
+########################################################################
+#		 Read file: call script for combining df for lig		   	   #
+########################################################################
+
+source("../combining_two_df_lig.R") 
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for Lig plots
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3
+#my_df = merged_df3_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor.
+# The column is spelled 'Lig_outcome' (capital L) everywhere else in this
+# script and `$` is case-sensitive, so the conversion must read that same
+# column rather than 'lig_outcome'.
+# sanity check
+is.factor(my_df$Lig_outcome)
+my_df$Lig_outcome = as.factor(my_df$Lig_outcome)
+is.factor(my_df$Lig_outcome)
+#[1] TRUE
+
+#############################
+# Extra sanity check:
+# for mcsm_lig ONLY
+# Dis_lig_Ang should be <10
+#############################
+
+# Report whether the largest Dis_lig_Ang value in my_df is below 10
+print(
+  if (max(my_df$Dis_lig_Ang) < 10) {
+    "Sanity check passed: lig data is <10Ang"
+  } else {
+    "Error: data should be filtered to be within 10Ang"
+  }
+)
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#===========================
+# Plot: Basic barplots 
+#===========================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT 
+df  = my_df 
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+rm(my_df)
+
+# sanity checks
+str(df)
+
+# Compare the 'Position' and 'position' columns and print the verdict
+print(
+  if (identical(df$Position, df$position)) {
+    "Sanity check passed: Columns 'Position' and 'position' are identical"
+  } else {
+    "Error!: Check column names and info contained"
+  }
+)
+
+#****************
+# generate plot: No of stabilising and destabilising muts
+#****************
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+svg('basic_barplots_LIG.svg')
+
+my_ats = 25 # axis text size
+my_als = 22 # axis label size
+
+# Build the plot object and print() it explicitly onto the open svg device.
+# The assignment to prinfFile must stay active: print(prinfFile) below needs
+# the object to exist, and an unassigned ggplot expression is not drawn when
+# the script is run via source().
+g = ggplot(df, aes(x = Lig_outcome))
+prinfFile = g + geom_bar(
+  #g + geom_bar(
+  aes(fill = Lig_outcome)
+  , show.legend = TRUE
+) + geom_label(
+  stat = "count"
+  , aes(label = ..count..)
+  , color = "black"
+  , show.legend = FALSE
+  , size = 10) + theme(
+    axis.text.x = element_blank()
+    , axis.title.x = element_blank()
+    , axis.title.y = element_text(size=my_als)
+    , axis.text.y = element_text(size = my_ats)
+    , legend.position = c(0.73,0.8)
+    , legend.text = element_text(size=my_als-2)
+    , legend.title = element_text(size=my_als)
+    , plot.title = element_blank()
+  ) + labs(
+    title = ""
+    , y = "Number of SNPs"
+    #, fill='Ligand Outcome'
+  )  + scale_fill_discrete(name = "Ligand Outcome"
+                           , labels = c("Destabilising", "Stabilising"))
+print(prinfFile)
+dev.off()
+
+#****************
+# generate plot: No of positions
+#****************
+#get freq count of positions so you can subset freq<1
+#require(data.table)
+setDT(df)[, pos_count := .N, by = .(Position)] #169, 36
+
+head(df$pos_count)
+table(df$pos_count)
+# this is cumulative
+#1  2  3  4  5  6 
+#5 24 36 56 30 18 
+
+# use group by on this
+snpsBYpos_df <- df %>%
+  group_by(Position) %>%
+  summarize(snpsBYpos = mean(pos_count)) 
+
+table(snpsBYpos_df$snpsBYpos)
+#1  2  3  4  5  6 
+#5 12 12 14  6  3
+# this is what will get plotted
+
+svg('position_count_LIG.svg')
+
+my_ats = 25 # axis text size
+my_als = 22 # axis label size
+
+g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
+prinfFile = g + geom_bar(
+  #g + geom_bar(
+  aes (alpha = 0.5)
+  , show.legend = FALSE
+) +
+  geom_label(
+    stat = "count", aes(label = ..count..)
+    , color = "black"
+    , size = 10
+  ) +
+  theme( 
+    axis.text.x = element_text(
+      size = my_ats
+      , angle = 0
+    )
+    , axis.text.y = element_text(
+      size = my_ats
+      , angle = 0
+      , hjust = 1
+    )
+    , axis.title.x = element_text(size = my_als)
+    , axis.title.y = element_text(size = my_als)
+    , plot.title = element_blank()
+  ) +
+  labs(
+    x = "Number of SNPs"
+    , y = "Number of Sites"
+  )
+print(prinfFile)
+dev.off()
+########################################################################
+#               			end of Lig barplots         			   #
+########################################################################
+
+
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R
@ -0,0 +1,211 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 		Installing and loading required packages and functions		   #	
+########################################################################
+
+source("../Header_TT.R")
+
+########################################################################
+#		 Read file: call script for combining df for PS			   	   #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#==========================
+
+###########################
+# Data for DUET plots
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3 
+#my_df  = merged_df3_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+# sanity check
+is.factor(my_df$DUET_outcome)
+my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
+is.factor(my_df$DUET_outcome)
+#[1] TRUE
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#===========================
+# Plot: Basic barplots 
+#===========================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT 
+df  = my_df
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(my_df)
+
+# sanity checks
+str(df)
+
+# Compare the 'Position' and 'position' columns and print the verdict
+print(
+  if (identical(df$Position, df$position)) {
+    "Sanity check passed: Columns 'Position' and 'position' are identical"
+  } else {
+    "Error!: Check column names and info contained"
+  }
+)
+
+#****************
+# generate plot: No of stabilising and destabilising muts
+#****************
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+svg('basic_barplots_DUET.svg')
+
+my_ats = 25 # axis text size
+my_als = 22 # axis label size
+
+theme_set(theme_grey())
+
+# uncomment as necessary for either directly outputting results or 
+# printing on the screen
+g = ggplot(df, aes(x = DUET_outcome))
+prinfFile = g + geom_bar(
+#g + geom_bar(
+  aes(fill = DUET_outcome)
+  , show.legend = TRUE
+  ) + geom_label(
+    stat = "count"
+    , aes(label = ..count..)
+    , color = "black"
+    , show.legend = FALSE
+    , size = 10) + theme(
+      axis.text.x = element_blank()
+      , axis.title.x = element_blank()
+      , axis.title.y = element_text(size=my_als)
+      , axis.text.y = element_text(size = my_ats)
+    , legend.position = c(0.73,0.8)
+    , legend.text = element_text(size=my_als-2)
+    , legend.title = element_text(size=my_als)
+    , plot.title = element_blank()
+    ) + labs(
+      title = ""
+      , y = "Number of SNPs"
+      #, fill='DUET Outcome'
+      ) + scale_fill_discrete(name = "DUET Outcome"
+                              , labels = c("Destabilising", "Stabilising"))
+
+print(prinfFile)
+dev.off()
+
+#****************
+# generate plot: No of positions
+#****************
+#get freq count of positions so you can subset freq<1
+#setDT(df)[, .(Freq := .N), by = .(Position)] #189, 36
+
+setDT(df)[, pos_count := .N, by = .(Position)] #335, 36
+table(df$pos_count)
+# this is cumulative
+#1   2   3   4   5   6 
+#34  76  63 104  40  18 
+
+# use group by on this
+snpsBYpos_df <- df %>%
+  group_by(Position) %>%
+  summarize(snpsBYpos = mean(pos_count))
+
+table(snpsBYpos_df$snpsBYpos)
+#1  2  3  4  5  6 
+#34 38 21 26  8  3 
+
+# Keep the mutation identifiers plus the per-position SNP count (pos_count)
+# and write them out as a csv.
+# NOTE(review): seven columns are selected here, so the trailing "335, 5"
+# dimension note looks stale - confirm.
+foo = select(df, Mutationinformation
+             , WildPos
+             , wild_type
+             , mutant_type
+             , mutation_info
+             , position
+             , pos_count) #335, 5
+
+# NOTE(review): the output path is relative to the current working directory,
+# which was set to "~/git/Data/pyrazinamide/output/plots" earlier in this
+# script, so "../Data" resolves under ".../output/" - confirm that this is
+# the intended location.
+getwd()
+write.csv(foo, "../Data/pos_count_freq.csv")
+
+svg('position_count_DUET.svg')
+my_ats = 25 # axis text size
+my_als = 22 # axis label size
+
+g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
+prinfFile = g + geom_bar(
+#g + geom_bar(
+  aes (alpha = 0.5)
+  , show.legend = FALSE
+  ) +
+  geom_label(
+    stat = "count", aes(label = ..count..)
+    , color = "black"
+    , size = 10
+    ) +
+  theme( 
+    axis.text.x = element_text(
+      size = my_ats
+      , angle = 0
+      )
+    , axis.text.y = element_text(
+      size = my_ats
+      , angle = 0
+      , hjust = 1
+      )
+  , axis.title.x = element_text(size = my_als)
+  , axis.title.y = element_text(size = my_als)
+  , plot.title = element_blank()
+  ) +
+  labs(
+    x = "Number of SNPs"
+    , y = "Number of Sites"
+    )
+print(prinfFile)
+dev.off()
+########################################################################
+#               			end of DUET barplots         			   #
+########################################################################
+
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R
@ -0,0 +1,175 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 		Installing and loading required packages and functions		   #	
+########################################################################
+
+source("../Header_TT.R")
+
+#source("barplot_colour_function.R")
+
+########################################################################
+#		 Read file: call script for combining df for PS			   	   #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#==========================
+
+###########################
+# Data for PS Corr plots
+# you need merged_df3_comp
+# since these are matched 
+# to allow pairwise corr
+###########################
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3_comp 
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#===========================
+# Plot: Correlation plots
+#===========================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+df  = my_df
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(my_df)
+
+# sanity checks
+str(df)
+
+table(df$DUET_outcome)
+
+# unique positions
+length(unique(df$Position)) #{RESULT: unique positions for comp data}
+
+
+# subset data to generate pairwise correlations
+corr_data = df[, c("ratioDUET"
+#                  , "ratioPredAff"
+#                  , "DUETStability_Kcalpermol"
+#                  , "PredAffLog"
+#                  , "OR"
+                   , "logor"
+#                  , "pvalue"
+                   , "neglog10pvalue"
+                   , "AF"
+                   , "DUET_outcome"
+#                  , "Lig_outcome"
+                   , "pyrazinamide"
+                   )]
+dim(corr_data)
+rm(df)
+
+# assign nice colnames (for display)
+my_corr_colnames = c("DUET"
+#                    , "Ligand Affinity"
+#                    , "DUET_raw"
+#                    , "Lig_raw"
+#                    , "OR"
+                     , "Log(Odds Ratio)"
+#                    , "P-value"
+                     , "-LogP"
+                     , "Allele Frequency"
+                     , "DUET_outcome"
+#                    , "Lig_outcome"
+                     , "pyrazinamide")
+
+# sanity check
+# The display names must pair one-to-one with the columns of corr_data
+print(
+  if (length(my_corr_colnames) == length(corr_data)) {
+    "Sanity check passed: corr_data and corr_names match in length"
+  } else {
+    "Error: length mismatch!"
+  }
+)
+
+colnames(corr_data)
+colnames(corr_data) <- my_corr_colnames
+colnames(corr_data)
+
+###############
+# PLOTS: corr
+# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
+###############
+#default pairs plot
+start = 1
+end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column
+offset = 1
+
+my_corr = corr_data[start:(end-offset)]
+head(my_corr)
+
+#my_cols = c("#f8766d", "#00bfc4")
+# deep blue :#007d85
+# deep red: #ae301e
+
+#==========
+# psych: informative since it draws the ellipsoid
+# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
+# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
+#==========
+
+# set output dir for plots
+# (the setwd() call needs its closing parenthesis, otherwise the script
+# does not parse)
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+svg('DUET_corr.svg', width = 15, height = 15)
+printFile = pairs.panels(my_corr[1:4]
+             , method = "spearman" # correlation method
+             , hist.col = "grey" ##00AFBB
+             , density = TRUE  # show density plots
+             , ellipses = F # show correlation ellipses
+             , stars = T
+             , rug = F
+             , breaks = "Sturges"
+             , show.points = T
+             , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$DUET_outcome))]
+             , pch = 21
+             , jitter = T
+             #, alpha = .05
+             #, points(pch = 19, col = c("#f8766d", "#00bfc4"))
+             , cex = 3
+             , cex.axis = 2.5
+             , cex.labels = 3
+             , cex.cor = 1
+             , smooth = F
+)
+
+print(printFile)
+dev.off()
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R
@ -0,0 +1,187 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages			   #	
+########################################################################
+
+source("../Header_TT.R")
+
+#source("barplot_colour_function.R")
+
+########################################################################
+#		 Read file: call script for combining df for lig		   	   #
+########################################################################
+
+source("../combining_two_df_lig.R") 
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for Lig Corr plots
+# you need merged_df3_comp
+# since these are matched 
+# to allow pairwise corr
+###########################
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3_comp 
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+#############################
+# Extra sanity check:
+# for mcsm_lig ONLY
+# Dis_lig_Ang should be <10
+#############################
+
+# Report whether the largest Dis_lig_Ang value in my_df is below 10
+print(
+  if (max(my_df$Dis_lig_Ang) < 10) {
+    "Sanity check passed: lig data is <10Ang"
+  } else {
+    "Error: data should be filtered to be within 10Ang"
+  }
+)
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#===========================
+# Plot: Correlation plots
+#===========================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT 
+df  = my_df 
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(my_df)
+
+# sanity checks
+str(df)
+
+table(df$Lig_outcome)
+
+# unique positions
+length(unique(df$Position)) #{RESULT: unique positions for comp data}
+
+# subset data to generate pairwise correlations
+corr_data = df[, c(#"ratioDUET",
+                  "ratioPredAff"
+#                  , "DUETStability_Kcalpermol"
+#                  , "PredAffLog"
+#                  , "OR"
+                   , "logor"
+#                  , "pvalue"
+                   , "neglog10pvalue"
+                   , "AF"
+#                  , "DUET_outcome"
+                   , "Lig_outcome"
+                   , "pyrazinamide"
+                   )] 
+dim(corr_data)
+rm(df)
+
+# assign nice colnames (for display)
+my_corr_colnames = c(#"DUET",
+                     "Ligand Affinity"
+#                    ,"DUET_raw" 
+#                    , "Lig_raw"
+#                    , "OR"
+                     , "Log(Odds Ratio)"
+#                    , "P-value"
+                     , "-LogP"
+                     , "Allele Frequency"
+#                    , "DUET_outcome"
+                     , "Lig_outcome"
+                     , "pyrazinamide")
+                     
+# sanity check
+# The display names must pair one-to-one with the columns of corr_data
+print(
+  if (length(my_corr_colnames) == length(corr_data)) {
+    "Sanity check passed: corr_data and corr_names match in length"
+  } else {
+    "Error: length mismatch!"
+  }
+)
+
+colnames(corr_data)
+colnames(corr_data) <- my_corr_colnames
+colnames(corr_data)
+
+###############
+# PLOTS: corr
+# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
+###############
+
+# default pairs plot
+start = 1
+end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column
+offset = 1
+
+my_corr = corr_data[start:(end-offset)]
+head(my_corr)
+
+#my_cols = c("#f8766d", "#00bfc4")
+# deep blue :#007d85
+# deep red: #ae301e
+
+#==========
+# psych: informative since it draws the ellipsoid
+# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
+# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
+#==========
+
+# set output dir for plots
+# (the setwd() call needs its closing parenthesis, otherwise the script
+# does not parse)
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+svg('Lig_corr.svg', width = 15, height = 15)
+printFile = pairs.panels(my_corr[1:4]
+             , method = "spearman" # correlation method
+             , hist.col = "grey" ##00AFBB
+             , density = TRUE  # show density plots
+             , ellipses = F # show correlation ellipses
+             , stars = T
+             , rug = F
+             , breaks = "Sturges"
+             , show.points = T
+             , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$Lig_outcome))]
+             , pch = 21
+             , jitter = T
+#            , alpha = .05
+#            , points(pch = 19, col = c("#f8766d", "#00bfc4"))
+             , cex = 3
+             , cex.axis = 2.5
+             , cex.labels = 3
+             , cex.cor = 1
+             , smooth = F
+)
+print(printFile)
+dev.off()
+
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R
@ -0,0 +1,227 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") 
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+#source("barplot_colour_function.R")
+
+require(data.table)
+
+########################################################################
+#		 Read file: call script for combining df		   	  		   #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#==========================
+
+###########################
+# Data for plots
+# you need merged_df2, comprehensive one
+# since this has one-many relationship
+# i.e the same SNP can belong to multiple lineages
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df2
+#my_df  = merged_df2_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+is.factor(my_df$lineage)
+my_df$lineage = as.factor(my_df$lineage)
+is.factor(my_df$lineage)
+
+#==========================
+# Plot: Lineage barplot
+# x = lineage y = No. of samples
+# col = Lineage
+# fill = lineage
+#============================
+table(my_df$lineage)
+
+#        lineage1   lineage2   lineage3   lineage4   lineage5   lineage6 lineageBOV 
+#3        104       1293        264       1311          6          6        105 
+
+#===========================
+# Plot: Lineage Barplots
+#===========================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+df <- my_df
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+rm(my_df)
+
+# get freq count of positions so you can subset freq<1
+#setDT(df)[, lineage_count := .N, by = .(lineage)]
+
+#******************
+# generate plot: barplot of mutation by lineage
+#******************
+sel_lineages = c("lineage1"
+                 , "lineage2"
+                 , "lineage3"
+                 , "lineage4")
+
+df_lin = subset(df, subset = lineage %in% sel_lineages )
+
+#FIXME; add sanity check for numbers.
+# Done this manually
+
+############################################################
+
+#########
+# Data for barplot: Lineage barplot
+# to show total samples and number of unique mutations 
+# within each lineage
+##########
+
+# Create df with lineage inform & no. of unique mutations
+# per lineage and total samples within lineage
+# this is essentially barplot with two y axis
+
+# One row per selected lineage; the two count columns are attached after
+# the loop.
+bar = as.data.frame(sel_lineages) #4, 1
+total_snps_u = NULL
+total_samples = NULL
+
+for (i in sel_lineages){
+  #print(i)
+  # rows of df that belong to the current lineage (%in% never yields NA)
+  in_lineage = df$lineage %in% i
+
+  # total samples = distinct ids *within* this lineage: subset the ids first,
+  # then take unique(). Indexing unique(df$id) with the row-level mask would
+  # only count the matching rows.
+  curr_total = length(unique(df$id[in_lineage]))
+  total_samples = c(total_samples, curr_total)
+  print(total_samples)
+  
+  # number of distinct mutations seen in this lineage
+  foo = df[in_lineage,]
+  print(paste0(i, "======="))
+  curr_count = length(unique(foo$Mutationinformation))
+  print(curr_count)
+
+  total_snps_u = c(total_snps_u, curr_count)
+}
+
+print(total_snps_u)
+bar$num_snps_u = total_snps_u
+bar$total_samples = total_samples
+bar
+
+#*****************
+# generate plot: lineage barplot with two y-axis
+#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2
+#*****************
+
+# Pull the plotting vectors out of `bar`: y1 = unique mutations per lineage,
+# y2 = total samples per lineage, x = lineage names. The values are read
+# from `bar` / `sel_lineages` (not written to them), since x, y1 and y2 do
+# not exist before this point.
+y1 = bar$num_snps_u
+y2 = bar$total_samples
+x = sel_lineages
+
+to_plot = data.frame(x = x
+                      , y1 = y1
+                      , y2 = y2)
+to_plot
+
+# long format: one row per (lineage, variable) pair for the stacked barplot
+melted = melt(to_plot, id = "x")
+melted
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+svg('lineage_basic_barplot.svg')
+
+my_ats = 20 # axis text size
+my_als = 22 # axis label size
+
+g = ggplot(melted
+           , aes(x = x
+                 , y = value
+                 , fill = variable)
+           )
+
+
+printFile = g + geom_bar(
+  
+#g + geom_bar(
+  stat = "identity"
+  , position = position_stack(reverse = TRUE)
+  , alpha=.75
+  , colour='grey75'
+    ) + theme(
+    axis.text.x = element_text(
+      size = my_ats
+#      , angle= 30
+    )
+  , axis.text.y = element_text(size = my_ats
+  #, angle = 30
+  , hjust = 1
+  , vjust = 0)
+  , axis.title.x = element_text(
+    size = my_als
+    , colour = 'black'
+    )
+  , axis.title.y = element_text(
+    size = my_als
+    , colour = 'black'
+  )
+  , legend.position = "top"
+  , legend.text = element_text(size = my_als)
+  
+  #) + geom_text(
+  ) + geom_label(
+    aes(label = value)
+    , size = 5
+    , hjust = 0.5
+    , vjust = 0.5
+    , colour = 'black'
+    , show.legend = FALSE
+    #, check_overlap = TRUE
+    , position = position_stack(reverse = T)
+    #, position = ('
+
+  ) + labs(
+    title = ''
+    , x = ''
+    , y = "Number"
+    , fill = 'Variable'
+    , colour = 'black'
+  ) + scale_fill_manual(
+      values = c('grey50', 'gray75')
+      , name=''
+      , labels=c('Mutations', 'Total Samples')
+    ) + scale_x_discrete(
+      breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
+      , labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
+    )
+print(printFile)
+dev.off()
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R
@ -0,0 +1,233 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+#source("barplot_colour_function.R")
+#require(data.table)
+
+########################################################################
+#		 Read file: call script for combining df for Lig		   	   #
+########################################################################
+
+source("../combining_two_df_lig.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+###########################
+# Data for plots
+# you need merged_df2 or merged_df2_comp
+# since this is one-many relationship 
+# i.e the same SNP can belong to multiple lineages
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df2
+#my_df  = merged_df2_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+is.factor(my_df$lineage)
+my_df$lineage = as.factor(my_df$lineage)
+is.factor(my_df$lineage)
+
+table(my_df$mutation_info)
+
+#############################
+# Extra sanity check:
+# for mcsm_lig ONLY
+# Dis_lig_Ang should be <10
+#############################
+
+# Report whether the largest Dis_lig_Ang value in my_df is below 10
+print(
+  if (max(my_df$Dis_lig_Ang) < 10) {
+    "Sanity check passed: lig data is <10Ang"
+  } else {
+    "Error: data should be filtered to be within 10Ang"
+  }
+)
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Plot: Lineage Distribution
+# x = mcsm_values, y = dist
+# fill = stability
+#============================
+
+#===================
+# Data for plots
+#===================
+
+# subset only lineages1-4
+sel_lineages = c("lineage1"
+                 , "lineage2"
+                 , "lineage3"
+                 , "lineage4")
+
+# uncomment as necessary
+df_lin = subset(my_df, subset = lineage %in% sel_lineages ) #2037 35
+
+# refactor
+df_lin$lineage = factor(df_lin$lineage)
+
+table(df_lin$lineage) #{RESULT: No of samples within lineage}
+#lineage1 lineage2 lineage3 lineage4 
+#78     961      195     803 
+
+# when merged_df2_comp is used
+#lineage1 lineage2 lineage3 lineage4 
+#77     955      194     770
+
+length(unique(df_lin$Mutationinformation))
+#{Result: No. of unique mutations the 4 lineages contribute to}
+
+# sanity checks
+r1 = 2:5 # when merged_df2 used: because there is missing lineages 
+if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) {
+  print ("sanity check passed: numbers match")
+} else{
+  print("Error!: check your numbers")
+} 
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+df <- df_lin
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(df_lin)
+
+#******************
+# generate distribution plot of lineages
+#******************
+# basic: could improve this!
+library(plotly)
+library(ggridges)
+
+fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
+names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
+
+# Density of ratioPredAff per lineage, filled by Lig_outcome.
+# The `+` after coord_cartesian() is required: without it the ggtitle() call
+# is a separate statement and the title never becomes part of `g`.
+g <- ggplot(df, aes(x = ratioPredAff)) + 
+  geom_density(aes(fill = Lig_outcome)
+               , alpha = 0.5) + 
+  facet_wrap( ~ lineage
+             , scales = "free"
+             , labeller = labeller(lineage = fooNames) ) +
+  coord_cartesian(xlim = c(-1, 1)
+#                  , ylim = c(0, 6)
+#                  , clip = "off"
+) +
+    ggtitle("Kernel Density estimates of Ligand affinity by lineage")
+
+ggplotly(g)
+
+# 2 : ggridges (good!)
+
+my_ats = 15 # axis text size
+my_als = 20 # axis label size
+
+fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
+names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+svg('lineage_dist_LIG.svg')
+
+printFile = ggplot( df, aes(x = ratioPredAff
+                          , y = Lig_outcome) ) +
+  
+  geom_density_ridges_gradient( aes(fill = ..x..)
+                                , scale = 3
+                                , size = 0.3 ) +
+  facet_wrap( ~lineage
+              , scales = "free"
+#              , switch = 'x'
+              , labeller = labeller(lineage = fooNames) ) +
+  coord_cartesian( xlim = c(-1, 1)
+#                  , ylim = c(0, 6)
+#                  , clip = "off"
+                  ) +
+
+  scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4")
+                        , name = "Ligand Affinity" ) +
+  theme( axis.text.x = element_text( size = my_ats
+                                     , angle = 90
+                                     , hjust = 1
+                                     , vjust = 0.4)
+#         , axis.text.y = element_text( size = my_ats
+#                                       , angle = 0
+#                                       , hjust = 1
+#                                       , vjust = 0)
+         , axis.text.y = element_blank()
+         , axis.title.x = element_blank()
+         , axis.title.y = element_blank()
+         , axis.ticks.y = element_blank()
+         , plot.title = element_blank()
+         , strip.text = element_text(size = my_als)
+         , legend.text = element_text(size = 10)
+         , legend.title = element_text(size = my_als)
+#         , legend.position = c(0.3, 0.8)
+#         , legend.key.height = unit(1, 'mm')
+      ) 
+
+print(printFile)
+dev.off()
+
+#=!=!=!=!=!=!
+# COMMENT: When you look at all mutations, the lineage differences disappear...
+# The pattern we are interested in is possibly only for dr_mutations
+#=!=!=!=!=!=!
+
+#===================================================
+
+# COMPARING DISTRIBUTIONS
+head(df$lineage)
+df$lineage = as.character(df$lineage)
+
+lin1 = df[df$lineage == "lineage1",]$ratioPredAff
+lin2 = df[df$lineage == "lineage2",]$ratioPredAff
+lin3 = df[df$lineage == "lineage3",]$ratioPredAff
+lin4 = df[df$lineage == "lineage4",]$ratioPredAff
+
+# ks test
+ks.test(lin1,lin2) 
+ks.test(lin1,lin3) 
+ks.test(lin1,lin4) 
+
+ks.test(lin2,lin3) 
+ks.test(lin2,lin4) 
+
+ks.test(lin3,lin4) 
+
+
+
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
@ -0,0 +1,212 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") # thinkpad
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+#source("barplot_colour_function.R")
+#require(data.table)
+
+########################################################################
+#		 Read file: call script for combining df for PS			   	   #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for plots
+# you need merged_df2 or merged_df2_comp
+# since this is one-many relationship 
+# i.e the same SNP can belong to multiple lineages
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df <- merged_df2
+#my_df <- merged_df2_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# drop the merged data frames now that my_df holds the one in use
+rm(list = c("merged_df2", "merged_df2_comp", "merged_df3", "merged_df3_comp"))
+
+# inspect column names and structure
+colnames(my_df)
+str(my_df)
+
+# lineage has to be a factor for plotting: report status before and after
+is.factor(my_df$lineage)
+my_df$lineage <- as.factor(my_df$lineage)
+is.factor(my_df$lineage)
+
+# counts per mutation_info category
+table(my_df$mutation_info)
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Plot: Lineage Distribution
+# x = mcsm_values, y = dist
+# fill = stability
+#============================
+
+#===================
+# Data for plots
+#===================
+
+# subset only lineages1-4
+# lineages kept for the plots (lineage1-4 only)
+sel_lineages <- c("lineage1"
+                  , "lineage2"
+                  , "lineage3"
+                  , "lineage4")
+
+# keep only the rows that belong to the selected lineages
+df_lin <- subset(my_df, subset = lineage %in% sel_lineages)
+
+# re-factor so the lineages that were filtered out do not remain as levels
+df_lin$lineage <- factor(df_lin$lineage)
+
+table(df_lin$lineage) #{RESULT: No of samples within lineage}
+#lineage1 lineage2 lineage3 lineage4 
+#104     1293      264     1311 
+
+# when merged_df2_comp is used
+#lineage1 lineage2 lineage3 lineage4 
+#99     1275      263     1255
+
+length(unique(df_lin$Mutationinformation))
+#{Result: No. of unique mutations the 4 lineages contribute to}
+
+# sanity check: the per-lineage counts in my_df must add up to nrow(df_lin).
+# Counts are picked by level NAME; the earlier positional index (2:5) was
+# only valid for merged_df2 and silently selected other levels otherwise.
+# na.rm guards against a selected lineage being absent from the data.
+lineage_counts <- table(my_df$lineage)[sel_lineages]
+if (sum(lineage_counts, na.rm = TRUE) == nrow(df_lin)) {
+  print ("sanity check passed: numbers match")
+} else{
+  print("Error!: check your numbers")
+} 
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+df <- df_lin
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(df_lin)
+
+#******************
+# generate distribution plot of lineages
+#******************
+# basic: could improve this!
+library(plotly)
+library(ggridges)
+
+# quick interactive look: kernel densities of ratioDUET filled by
+# DUET_outcome, one free-scaled facet per lineage
+g <- ggplot(data = df, mapping = aes(x = ratioDUET)) +
+  geom_density(mapping = aes(fill = DUET_outcome), alpha = 0.5) +
+  facet_wrap(facets = ~ lineage, scales = "free") +
+  ggtitle(label = "Kernel Density estimates of Protein stability by lineage")
+
+ggplotly(p = g)
+
+# 2 : ggridges (good!)
+
+my_ats <- 15 # axis text size
+my_als <- 20 # axis label size
+
+# facet strip labels, named by the lineage level each one replaces
+fooNames <- c(lineage1 = 'Lineage 1'
+              , lineage2 = 'Lineage 2'
+              , lineage3 = 'Lineage 3'
+              , lineage4 = 'Lineage 4')
+
+# move to the plot output dir (working dir is echoed before and after)
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+svg('lineage_dist_PS.svg')
+
+# ridgeline densities of ratioDUET for each DUET_outcome, one facet per
+# lineage, with the ridge fill following the x value.
+# options left disabled, kept here for reference:
+#   facet_wrap:      switch = 'x'
+#   coord_cartesian: ylim = c(0, 6), clip = "off"
+#   theme:           axis.text.y = element_text(size = my_ats, angle = 0,
+#                                               hjust = 1, vjust = 0)
+#                    legend.position = c(0.3, 0.8)
+#                    legend.key.height = unit(1, 'mm')
+printFile <- ggplot(df, aes(x = ratioDUET, y = DUET_outcome)) +
+  geom_density_ridges_gradient(aes(fill = ..x..), scale = 3, size = 0.3) +
+  facet_wrap(~lineage,
+             scales = "free",
+             labeller = labeller(lineage = fooNames)) +
+  coord_cartesian(xlim = c(-1, 1)) +
+  scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4"),
+                       name = "DUET") +
+  theme(axis.text.x = element_text(size = my_ats, angle = 90,
+                                   hjust = 1, vjust = 0.4),
+        axis.text.y = element_blank(),
+        axis.title.x = element_blank(),
+        axis.title.y = element_blank(),
+        axis.ticks.y = element_blank(),
+        plot.title = element_blank(),
+        strip.text = element_text(size = my_als),
+        legend.text = element_text(size = 10),
+        legend.title = element_text(size = my_als))
+
+# render into the svg device, then close it
+print(printFile)
+dev.off()
+
+#=!=!=!=!=!=!
+# COMMENT: When you look at all mutations, the lineage differences disappear...
+# The pattern we are interested in is possibly only for dr_mutations
+#=!=!=!=!=!=!
+#===================================================
+
+# COMPARING DISTRIBUTIONS
+# peek at the lineage column, then compare it as plain strings
+head(df$lineage)
+df$lineage <- as.character(df$lineage)
+
+# ratioDUET values, one vector per lineage
+lin1 <- df$ratioDUET[df$lineage == "lineage1"]
+lin2 <- df$ratioDUET[df$lineage == "lineage2"]
+lin3 <- df$ratioDUET[df$lineage == "lineage3"]
+lin4 <- df$ratioDUET[df$lineage == "lineage4"]
+
+# two-sample KS test for every pair of lineages
+ks.test(lin1, lin2)
+ks.test(lin1, lin3)
+ks.test(lin1, lin4)
+
+ks.test(lin2, lin3)
+ks.test(lin2, lin4)
+
+ks.test(lin3, lin4)
+
+
+
--- a/mcsm_analysis/pyrazinamide/scripts/read_pdb.R
+++ b/mcsm_analysis/pyrazinamide/scripts/read_pdb.R
@ -0,0 +1,27 @@
+#########################
+#3: Read complex pdb file
+##########################
+source("Header_TT.R")
+# This script only reads the pdb file of your complex and leaves the parsed
+# structure in `my_pdb`.
+
+# The path variables live inside local() so that sourcing this script neither
+# overwrites nor deletes variables named inDir/inFile/complex1 in the calling
+# script (they used to be assigned globally and then rm()'d).
+my_pdb <- local({
+  # read in pdb file complex1
+  inDir <- "~/git/Data/pyrazinamide/input/structure/"
+  complex1 <- paste0(inDir, "complex1_no_water.pdb")
+
+  #complex2 <- paste0(inDir, "complex2_no_water.pdb")
+
+  # list of 8
+  read.pdb(complex1
+           , maxlines = -1
+           , multi = FALSE
+           , rm.insert = FALSE
+           , rm.alt = TRUE
+           , ATOM.only = FALSE
+           , hex = FALSE
+           , verbose = TRUE)
+})
+#====== end of script
+#====== end of script
+
--- a/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R
+++ b/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R
@ -0,0 +1,386 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			               #
+########################################################################
+
+source("Header_TT.R")
+
+#########################################################
+# TASK: replace B-factors in the pdb file with normalised values
+# use the complex file with no water as mCSM lig was 
+# performed on this file. You can check it in the script: read_pdb file.
+#########################################################
+
+###########################
+# 2: Read file: average stability values
+# or mcsm_normalised file, output of step 4 mcsm pipeline
+###########################
+
+inDir <- "~/git/Data/pyrazinamide/input/processed/"
+inFile <- paste0(inDir, "mean_PS_Lig_Bfactor.csv")
+inFile
+
+# read the averaged scores; row.names = 1 and stringsAsFactors = F stay
+# switched off, as before
+my_df <- read.csv(file = inFile, header = TRUE)
+str(my_df)
+
+#=========================================================
+# Processing P1: Replacing B factor with mean ratioDUET scores
+#=========================================================
+
+#########################
+# Read complex pdb file
+# from the R script
+##########################
+
+source("read_pdb.R") # list of 8
+
+# first element of the pdb list, used below as a data frame
+# with columns b and resno
+d <- my_pdb[[1]]
+
+# untouched copy, needed for the dimension check further down
+d2 <- d
+
+# range of the original B-factors
+max(d$b)
+min(d$b)
+
+#*******************************************
+# plot histograms for inspection
+# 1: original B-factors
+# 2: original DUET Scores
+# 3: replaced B-factors with DUET Scores
+#*********************************************
+# Set the margin on all sides
+# 3 x 2 grid of panels with outer margins for the shared titles
+par(mfrow = c(3,2),
+    oma = c(3,2,3,0),
+    mar = c(1,3,5,2))
+#par(mfrow = c(3,2))
+
+# 1: Original B-factor
+hist(x = d$b, main = "B-factor", xlab = "")
+
+plot(x = density(d$b), main = "B-factor", xlab = "")
+
+# 2: DUET scores
+hist(x = my_df$average_DUETR, main = "Norm_DUET", xlab = "")
+
+plot(x = density(my_df$average_DUETR), main = "Norm_DUET", xlab = "")
+
+# 3: After the following replacement
+#********************************
+
+#=========
+# step 0_P1: DONT RUN once you have double checked the matched output
+#=========
+# sanity check:  match and assign to a separate column to double check
+# colnames(my_df)
+# d$ratioDUET = my_df$average_DUETR[match(d$resno, my_df$Position)]
+
+#=========
+# step 1_P1
+#=========
+# Be brave and replace in place now (don't run sanity check)
+# this makes all the B-factor values in the non-matched positions as NA
+# match() looks up each atom's resno in my_df$Position; atoms whose residue
+# has no row in my_df get NA as their B-factor
+d$b = my_df$average_DUETR[match(d$resno, my_df$Position)]
+
+#=========
+# step 2_P1
+#=========
+# count NA in Bfactor
+b_na = sum(is.na(d$b)) ; b_na 
+
+# count genuine 0's in Bfactor BEFORE the NA fill; na.rm is needed because
+# the unmatched positions are still NA here and would turn the sum into NA
+b_zero = sum(d$b == 0, na.rm = TRUE) ; b_zero
+#table(d$b)
+
+# replace all NA in b factor with 0
+d$b[is.na(d$b)] = 0
+
+# sanity check: should be 0
+sum(is.na(d$b))
+
+# sanity check: should be True
+# zeros now = filled NA's + zeros that were already there
+if (sum(d$b == 0) == b_na + b_zero){
+  print ("Sanity check passed: NA's replaced with 0's successfully")
+} else {
+  print("Error: NA replacement NOT successful, Debug code!")
+}
+
+max(d$b); min(d$b)
+
+# sanity checks: should be True
+# exact == is fine here: the values are copied from my_df, not recomputed.
+# NOTE: these checks assume 0 lies within the range of the averaged scores
+# (the NA fill adds 0's) and that the extreme positions occur in the pdb.
+if(max(d$b) == max(my_df$average_DUETR)){
+  print("Sanity check passed: B-factors replaced correctly")
+} else {
+  print ("Error: Debug code please")
+}
+
+if (min(d$b) == min(my_df$average_DUETR)){
+  print("Sanity check passed: B-factors replaced correctly")
+} else {
+  print ("Error: Debug code please")
+}
+
+#=========
+# step 3_P1
+#=========
+# sanity check: dim should be same before reassignment
+# should be TRUE (identical() gives one TRUE/FALSE instead of one per dimension)
+identical(dim(d), dim(d2))
+
+#=========
+# step 4_P1
+#=========
+# assign it back to the pdb file
+my_pdb[[1]] = d 
+
+max(d$b); min(d$b)
+
+#=========
+# step 5_P1
+#=========
+# output dir
+getwd()
+outDir = "~/git/Data/pyrazinamide/input/structure/"
+
+# pdb with the B-factor column replaced by the averaged DUET scores
+outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
+write.pdb(my_pdb, outFile)
+
+#********************************
+# Add the 3rd histogram and density plots for comparisons
+#********************************
+# Plots continued...
+# 3: hist and density of replaced B-factors with DUET Scores
+# panel title spelling fixed so both panels read "replaced-B"
+hist(d$b
+     , xlab = ""
+     , main = "replaced-B")
+
+plot(density(d$b)
+     , xlab = ""
+     , main = "replaced-B")
+
+# graph titles
+# outer = TRUE writes the text into the outer margin instead of a single panel
+mtext(text = "Frequency"
+      , side = 2
+      , line = 0
+      , outer = TRUE)
+
+mtext(text = "DUET_stability"
+      , side = 3
+      , line = 0
+      , outer = TRUE)
+#********************************
+
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+# NOTE: This replaced B-factor distribution has the same
+# x-axis as the DUET normalised values, but the distribution
+# is affected since 0 is overinflated. This is because all the positions
+# where there are no SNPs have been assigned 0.
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+
+
+
+
+#######################################################################
+#====================== end of section 1 ==============================
+#######################################################################
+
+
+
+
+
+#=========================================================
+# Processing P2: Replacing  B values with PredAff Scores
+#=========================================================
+# clear workspace 
+# start the second pass from an empty workspace
+rm(list = ls())
+
+###########################
+# 2: Read file: average stability values
+# or mcsm_normalised file, output of step 4 mcsm pipeline
+###########################
+
+inDir <- "~/git/Data/pyrazinamide/input/processed/"
+inFile <- paste0(inDir, "mean_PS_Lig_Bfactor.csv")
+inFile
+
+# read the averaged scores; row.names = 1 and stringsAsFactors = F stay
+# switched off, as before
+my_df <- read.csv(file = inFile, header = TRUE)
+str(my_df)
+#rm(inDir, inFile)
+
+#########################
+# 3: Read complex pdb file
+# from the R script
+##########################
+
+source("read_pdb.R") # list of 8
+
+# first element of the pdb list, used below as a data frame
+# with columns b and resno
+d <- my_pdb[[1]]
+
+# untouched copy, needed for the dimension check further down
+d2 <- d
+
+# range of the original B-factors
+max(d$b)
+min(d$b)
+
+#*******************************************
+# plot histograms for inspection
+# 1: original B-factors
+# 2: original Pred Aff Scores
+# 3: replaced B-factors with PredAff Scores
+#********************************************
+# Set the margin on all sides
+# 3 x 2 grid of panels with outer margins for the shared titles
+par(mfrow = c(3,2),
+    oma = c(3,2,3,0),
+    mar = c(1,3,5,2))
+#par(mfrow = c(3,2))
+
+# 1: Original B-factor
+hist(x = d$b, main = "B-factor", xlab = "")
+
+plot(x = density(d$b), main = "B-factor", xlab = "")
+
+# 2: Pred Aff scores
+hist(x = my_df$average_PredAffR, main = "Norm_lig_average", xlab = "")
+
+plot(x = density(my_df$average_PredAffR), main = "Norm_lig_average", xlab = "")
+
+# 3: After the following replacement
+#********************************
+
+#=================================================
+# Processing P2: Replacing  B values with ratioPredAff scores
+#=================================================
+# use match to perform this replacement linking with "position no"
+# in the pdb file, this corresponds to column "resno"
+# in my_df, this corresponds to column "Position"
+
+#=========
+# step 0_P2: DONT RUN once you have double checked the matched output
+#=========
+# sanity check:  match and assign to a separate column to double check
+# colnames(my_df)
+# d$ratioPredAff = my_df$average_PredAffR[match(d$resno, my_df$Position)] #1384, 17
+
+#=========
+# step 1_P2: BE BRAVE and replace in place now (don't run step 0)
+#=========
+# this makes all the B-factor values in the non-matched positions as NA
+# match() looks up each atom's resno in my_df$Position; atoms whose residue
+# has no row in my_df get NA as their B-factor
+d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)]
+
+#=========
+# step 2_P2
+#=========
+# count NA in Bfactor
+b_na = sum(is.na(d$b)) ; b_na
+
+# count genuine 0's in Bfactor BEFORE the NA fill; na.rm is needed because
+# the unmatched positions are still NA here and would turn the sum into NA
+b_zero = sum(d$b == 0, na.rm = TRUE) ; b_zero
+#table(d$b)
+
+# replace all NA in b factor with 0
+d$b[is.na(d$b)] = 0
+
+# sanity check: should be 0
+sum(is.na(d$b))
+
+# zeros now = filled NA's + zeros that were already there
+if (sum(d$b == 0) == b_na + b_zero){
+  print ("Sanity check passed: NA's replaced with 0's successfully")
+} else {
+  print("Error: NA replacement NOT successful, Debug code!")
+}
+
+max(d$b); min(d$b)
+
+# sanity checks: should be True
+# exact == is fine here: the values are copied from my_df, not recomputed.
+# NOTE: these checks assume 0 lies within the range of the averaged scores
+# (the NA fill adds 0's) and that the extreme positions occur in the pdb.
+if (max(d$b) == max(my_df$average_PredAffR)){
+  print("Sanity check passed: B-factors replaced correctly")
+} else {
+  print ("Error: Debug code please")
+}
+
+if (min(d$b) == min(my_df$average_PredAffR)){
+  print("Sanity check passed: B-factors replaced correctly")
+} else {
+  print ("Error: Debug code please")
+}
+
+#=========
+# step 3_P2
+#=========
+# sanity check: dim should be same before reassignment
+# should be TRUE (identical() gives one TRUE/FALSE instead of one per dimension)
+identical(dim(d), dim(d2))
+
+#=========
+# step 4_P2
+#=========
+# assign it back to the pdb file
+my_pdb[[1]] = d 
+
+max(d$b); min(d$b)
+
+#=========
+# step 5_P2
+#=========
+
+# output dir
+outDir = "~/git/Data/pyrazinamide/input/structure/"
+# pdb with the B-factor column replaced by the averaged PredAff scores
+outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile
+write.pdb(my_pdb, outFile)
+
+#********************************
+# Add the 3rd histogram and density plots for comparisons
+#********************************
+# Plots continued...
+# 3: hist and density of replaced B-factors with PredAff Scores
+# panel title spelling fixed so both panels read "replaced-B"
+hist(d$b
+     , xlab = ""
+     , main = "replaced-B")
+
+plot(density(d$b)
+     , xlab = ""
+     , main = "replaced-B")
+
+# graph titles
+# outer = TRUE writes the text into the outer margin instead of a single panel
+mtext(text = "Frequency"
+      , side = 2
+      , line = 0
+      , outer = TRUE)
+
+mtext(text = "Lig_stability"
+      , side = 3
+      , line = 0
+      , outer = TRUE)
+
+#********************************
+
+###########
+# end of output files with Bfactors
+##########
--- a/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R
+++ b/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R
@ -0,0 +1,257 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
+getwd()
+
+#########################################################
+# 1: Installing and loading required packages           #
+#########################################################
+
+source("Header_TT.R")
+#source("barplot_colour_function.R")
+
+##########################################################
+#           Checking: Entire data frame and for PS      #
+##########################################################
+
+###########################
+#2) Read file: combined one from the script
+###########################
+source("combining_two_df.R")
+
+# df with NA:
+# merged_df2
+# merged_df3:
+
+# df without NA:
+# merged_df2_comp:
+# merged_df3_comp:
+
+######################
+# You need to check it
+# with the merged_df3
+########################
+
+#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+my_df <- merged_df3
+#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+# drop the merged data frames now that my_df holds the one in use
+rm(list = c("merged_df2", "merged_df2_comp", "merged_df3", "merged_df3_comp"))
+
+# should be true
+identical(my_df$Position, my_df$position)
+
+#################################
+# Read file: normalised file
+# output of step 4 mcsm_pipeline
+#################################
+
+inDir <- "~/git/Data/pyrazinamide/input/processed/"
+inFile <- paste0(inDir, "mcsm_complex1_normalised.csv")
+inFile
+
+# first csv column becomes the row names; strings stay character
+mcsm_data <- read.csv(file = inFile,
+                      header = TRUE,
+                      row.names = 1,
+                      stringsAsFactors = FALSE)
+str(mcsm_data)
+
+# column names used below to cut my_df down to the same columns
+my_colnames <- colnames(mcsm_data)
+
+#====================================
+# subset my_df to include only the columns in mcsm data
+# my_df restricted to the columns present in mcsm_data
+my_df2 <- my_df[my_colnames]
+#====================================
+# eyeball the first few keys of both data frames
+head(mcsm_data$Mutationinformation)
+head(mcsm_data$Position)
+
+head(my_df2$Mutationinformation)
+head(my_df2$Position)
+
+# order the mcsm rows by Mutationinformation before comparing
+mcsm_order <- order(mcsm_data$Mutationinformation)
+mcsm_data_s <- mcsm_data[mcsm_order, ]
+head(mcsm_data_s$Mutationinformation)
+head(mcsm_data_s$Position)
+
+# now compare: should be True, but is false....
+# possibly due to rownames!?!
+identical(mcsm_data_s, my_df2)
+
+# from library dplyr
+setdiff(mcsm_data_s, my_df2)
+
+# from lib compare
+compare(mcsm_data_s, my_df2) # seems rownames are the problem
+
+# FIXME: automate this
+# write files: checked using meld and files are indeed identical
+#write.csv(mcsm_data_s, "mcsm_data_s.csv", row.names = F)
+#write.csv(my_df2, "my_df2.csv", row.names = F)
+
+
+#====================================================== end of section 1
+
+
+
+##########################################################
+#             Checking: LIG(Filtered dataframe)          #
+##########################################################
+
+# clear workspace
+# start the LIG checks from an empty workspace
+rm(list = ls())
+
+###########################
+#3) Read file: combined_lig from the script
+###########################
+source("combining_two_df_lig.R")
+
+# df with NA:
+# merged_df2, merged_df3
+
+# df without NA:
+# merged_df2_comp, merged_df3_comp
+
+######################
+# The checks below are
+# done on merged_df3
+########################
+
+#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+my_df <- merged_df3
+#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+# drop the merged data frames now that my_df holds the one in use
+rm(list = c("merged_df2", "merged_df2_comp", "merged_df3", "merged_df3_comp"))
+
+# should be true
+identical(my_df$Position, my_df$position)
+
+#################################
+# Read file: normalised file
+# output of step 4 mcsm_pipeline
+#################################
+
+inDir <- "~/git/Data/pyrazinamide/input/processed/"
+inFile <- paste0(inDir, "mcsm_complex1_normalised.csv")
+inFile
+
+# first csv column becomes the row names; strings stay character
+mcsm_data <- read.csv(file = inFile,
+                      header = TRUE,
+                      row.names = 1,
+                      stringsAsFactors = FALSE)
+str(mcsm_data)
+
+###########################
+# 4a: Filter/subset data: ONLY for LIGand analysis
+# Lig plots < 10Ang
+# Filter the lig plots for Dis_to_lig < 10Ang
+###########################
+# sanity checks
+# unique positions before filtering
+upos <- unique(mcsm_data$Position)
+
+# check range of distances
+max(mcsm_data$Dis_lig_Ang)
+min(mcsm_data$Dis_lig_Ang)
+
+# Lig filtered: keep only rows with Dis_lig_Ang below 10 Ang
+mcsm_data2 <- subset(mcsm_data, Dis_lig_Ang < 10)
+
+rm(mcsm_data) #to avoid confusion
+
+# after filtering every row should fall under 10 Ang
+table(mcsm_data2$Dis_lig_Ang<10)
+table(mcsm_data2$Dis_lig_Ang>10)
+
+max(mcsm_data2$Dis_lig_Ang)
+min(mcsm_data2$Dis_lig_Ang)
+
+# unique positions that remain after filtering
+upos_f <- unique(mcsm_data2$Position)
+upos_f
+
+# column names used to cut the bigger df down to the same columns
+my_colnames <- colnames(mcsm_data2)
+#====================================
+# my_df restricted to the columns present in mcsm_data2
+my_df2 <- my_df[my_colnames]
+
+rm(my_df) #to avoid confusion
+#====================================
+# eyeball the first few keys of both data frames
+head(mcsm_data2$Mutationinformation)
+head(mcsm_data2$Position)
+
+head(my_df2$Mutationinformation)
+head(my_df2$Position)
+
+# order the mcsm rows by Mutationinformation before comparing
+mcsm_order <- order(mcsm_data2$Mutationinformation)
+mcsm_data2_s <- mcsm_data2[mcsm_order, ]
+head(mcsm_data2_s$Mutationinformation)
+head(mcsm_data2_s$Position)
+
+# now compare: should be True, but is false....
+# possibly due to rownames!?!
+identical(mcsm_data2_s, my_df2)
+
+# from library dplyr
+setdiff(mcsm_data2_s, my_df2)
+
+# from library compare
+compare(mcsm_data2_s, my_df2) # seems rownames are the problem
+
+#FIXME: automate this
+# write files: checked using meld and files are indeed identical
+#write.csv(mcsm_data2_s, "mcsm_data2_s.csv", row.names = F)
+#write.csv(my_df2, "my_df2.csv", row.names = F)
+
+
+##########################################################
+#  extract and write output file for SNP posn: all     #
+##########################################################
+
+# merged_df3 and merged_df3_comp were rm()'d earlier in this script, so they
+# are rebuilt here before use (this section used to stop with
+# "object 'merged_df3' not found").
+# NOTE(review): assumes the all-mutation merge (combining_two_df.R) is the one
+# wanted for the "all" output, not the ligand-filtered one - please confirm.
+source("combining_two_df.R")
+
+head(merged_df3$Position)
+
+# order by position, then keep each position once
+foo = merged_df3[order(merged_df3$Position),]
+head(foo$Position)
+
+snp_pos_unique = unique(foo$Position); snp_pos_unique
+
+# sanity check: 
+# combined_df is not created anywhere in this script, so the comparison is
+# only run when that object is available in the workspace
+if (exists("combined_df")) {
+  table(snp_pos_unique == combined_df$Position)
+} else {
+  message("combined_df not found: skipping the SNP position comparison")
+}
+
+#=====================
+# write_output files
+#=====================
+# NOTE(review): every other path in these scripts starts with "~/git/Data/";
+# confirm that "~/Data/" is really the intended output location.
+outDir = "~/Data/pyrazinamide/input/processed/"
+
+
+outFile1 = paste0(outDir, "snp_pos_unique.txt"); outFile1
+print(paste0("Output file name and path will be:", outFile1))
+
+# one position per line, without row or column names
+write.table(snp_pos_unique
+            , outFile1
+            , row.names = FALSE
+            , col.names = FALSE)
+            
+##############################################################
+#  extract and write output file for SNP posn: complete only #
+##############################################################
+head(merged_df3_comp$Position)
+
+foo = merged_df3_comp[order(merged_df3_comp$Position),]
+head(foo$Position)
+
+snp_pos_unique = unique(foo$Position); snp_pos_unique 
+
+# outDir = "~/Data/pyrazinamide/input/processed/" # already set
+
+outFile2 = paste0(outDir, "snp_pos_unique_comp.txt")
+print(paste0("Output file name and path will be:", outFile2))
+
+write.table(snp_pos_unique
+            , outFile2
+            , row.names = FALSE
+            , col.names = FALSE)
+#============================== end of script
+
+