import commit

2020-01-08 16:15:33 +00:00 · 2020-01-08 16:15:33 +00:00 · bccfe68192
commit bccfe68192
39 changed files with 6837 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -0,0 +1,59 @@
+mCSM Analysis
+=============
+
+This repo does mCSM analysis using Python (Pandas, numpy), bash and R.
+
+Requires an additional 'Data' directory (Supplied separately). Batteries not included.
+
+ # directory struc
+
+ # Assumptions
+
+ 1. git repos are cloned to `~/git`
+ 2. The script
+
+## LSHTM\_analysis: Repo
+
+ subdirs
+
+ needs to call `../Data/input/original/`
+
+ meta\_data\_analysis/
+ 	*.R
+ 	*.py
+	
+ needs to call `../Data/input/processed/<drug>`
+ needs to output `../Data/output/results/<drug>`
+
+ mcsm\_analysis/
+	<drug>/ (generated by `meta_data_analysis/pnca_data_extraction.py`. To be replaced with command line args or config option "soon")
+		scripts/ (changed from `Scripts/`)
+		*.R
+		*.py
+			mcsm/
+			*.sh
+			*.py
+			*.R
+			plotting/
+			*.R
+
+
+Data: Repo:
+
+ # subdirs
+
+	input/
+		original/
+		processed/
+	
+	output/
+	    <drug>/ (generated by `meta_data_analysis/pnca_data_extraction.py`. To be replaced with command line args or config option "soon")
+    		results/
+        		*.csv
+        		*.xlsx
+        		*.doc
+        		*.txt
+                plots/
+                structure/
+
+More docs here as I write them.
--- a/mcsm.conf
+++ b/mcsm.conf
@ -0,0 +1,12 @@
+# This is not yet used, but will be soon :)
+[DEFAULT]
+mcsm_home = /home/tanu/git/github/LSHTM_analysis
+
+[pyrazinamide]
+  gene = 'pnca'
+  server = 'http://mcsm.melb.ac.uk/'
+
+[monkeyamide]
+  gene = 'abcs'
+  server = 'http://myserver.local/'
+
--- a/mcsm_analysis/pyrazinamide/scripts/.Rhistory
+++ b/mcsm_analysis/pyrazinamide/scripts/.Rhistory
@ -0,0 +1,512 @@
+###########################
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+# uncomment as necessary
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+my_df  = merged_df3
+#my_df = merged_df3_comp
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+# quick checks
+colnames(my_df)
+str(my_df)
+###########################
+# Data for bfactor figure
+# PS average
+# Lig average
+###########################
+head(my_df$Position)
+head(my_df$ratioDUET)
+# order data frame
+df = my_df[order(my_df$Position),]
+head(df$Position)
+head(df$ratioDUET)
+#***********
+# PS: average by position
+#***********
+mean_DUET_by_position <- df %>%
+group_by(Position) %>%
+summarize(averaged.DUET = mean(ratioDUET))
+#***********
+# Lig: average by position
+#***********
+mean_Lig_by_position <- df %>%
+group_by(Position) %>%
+summarize(averaged.Lig = mean(ratioPredAff))
+#***********
+# cbind:mean_DUET_by_position and mean_Lig_by_position
+#***********
+combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
+# sanity check
+# mean_PS_Lig_Bfactor
+colnames(combined)
+colnames(combined) = c("Position"
+, "average_DUETR"
+, "Position2"
+, "average_PredAffR")
+colnames(combined)
+identical(combined$Position, combined$Position2)
+n = which(colnames(combined) == "Position2"); n
+combined_df = combined[,-n]
+max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
+max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
+#=============
+# output csv
+#============
+outDir = "~/Data/pyrazinamide/input/processed/"
+outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
+print(paste0("Output file with path will be:","", outFile))
+head(combined_df$Position); tail(combined_df$Position)
+write.csv(combined_df, outFile
+, row.names = F)
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+source("../Header_TT.R")
+#source("barplot_colour_function.R")
+require(data.table)
+require(dplyr)
+########################################################################
+#		 Read file: call script for combining df for PS		   	   #
+########################################################################
+source("../combining_two_df.R")
+###########################
+# This will return:
+# df with NA:
+# merged_df2
+# merged_df3
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+###########################
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+###########################
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+# uncomment as necessary
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+my_df  = merged_df3
+#my_df = merged_df3_comp
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+# quick checks
+colnames(my_df)
+str(my_df)
+###########################
+# Data for bfactor figure
+# PS average
+# Lig average
+###########################
+head(my_df$Position)
+head(my_df$ratioDUET)
+# order data frame
+df = my_df[order(my_df$Position),]
+head(df$Position)
+head(df$ratioDUET)
+#***********
+# PS: average by position
+#***********
+mean_DUET_by_position <- df %>%
+group_by(Position) %>%
+summarize(averaged.DUET = mean(ratioDUET))
+#***********
+# Lig: average by position
+#***********
+mean_Lig_by_position <- df %>%
+group_by(Position) %>%
+summarize(averaged.Lig = mean(ratioPredAff))
+#***********
+# cbind:mean_DUET_by_position and mean_Lig_by_position
+#***********
+combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
+# sanity check
+# mean_PS_Lig_Bfactor
+colnames(combined)
+colnames(combined) = c("Position"
+, "average_DUETR"
+, "Position2"
+, "average_PredAffR")
+colnames(combined)
+identical(combined$Position, combined$Position2)
+n = which(colnames(combined) == "Position2"); n
+combined_df = combined[,-n]
+max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
+max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
+#=============
+# output csv
+#============
+outDir = "~/git/Data/pyrazinamide/input/processed/"
+outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
+print(paste0("Output file with path will be:","", outFile))
+head(combined_df$Position); tail(combined_df$Position)
+write.csv(combined_df, outFile
+, row.names = F)
+# read in pdb file complex1
+inDir = "~/git/Data/pyrazinamide/input/structure"
+inFile = paste0(inDir, "complex1_no_water.pdb")
+# read in pdb file complex1
+inDir = "~/git/Data/pyrazinamide/input/structure/"
+inFile = paste0(inDir, "complex1_no_water.pdb")
+complex1 = inFile
+my_pdb = read.pdb(complex1
+, maxlines = -1
+, multi = FALSE
+, rm.insert = FALSE
+, rm.alt = TRUE
+, ATOM.only = FALSE
+, hex = FALSE
+, verbose = TRUE)
+#########################
+#3: Read complex pdb file
+##########################
+source("Header_TT.R")
+# list of 8
+my_pdb = read.pdb(complex1
+, maxlines = -1
+, multi = FALSE
+, rm.insert = FALSE
+, rm.alt = TRUE
+, ATOM.only = FALSE
+, hex = FALSE
+, verbose = TRUE)
+rm(inDir, inFile)
+#====== end of script
+inDir = "~/git/Data/pyrazinamide/input/structure/"
+inFile = paste0(inDir, "complex1_no_water.pdb")
+complex1 = inFile
+complex1 = inFile
+my_pdb = read.pdb(complex1
+, maxlines = -1
+, multi = FALSE
+, rm.insert = FALSE
+, rm.alt = TRUE
+, ATOM.only = FALSE
+, hex = FALSE
+, verbose = TRUE)
+inFile
+inDir = "~/git/Data/pyrazinamide/input/structure/"
+inFile = paste0(inDir, "complex1_no_water.pdb")
+complex1 = inFile
+#inFile2 = paste0(inDir, "complex2_no_water.pdb")
+#complex2 = inFile2
+# list of 8
+my_pdb = read.pdb(complex1
+, maxlines = -1
+, multi = FALSE
+, rm.insert = FALSE
+, rm.alt = TRUE
+, ATOM.only = FALSE
+, hex = FALSE
+, verbose = TRUE)
+rm(inDir, inFile, complex1)
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
+getwd()
+source("Header_TT.R")
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
+getwd()
+########################################################################
+# 				Installing and loading required packages 			               #
+########################################################################
+source("Header_TT.R")
+#########################################################
+# TASK: replace B-factors in the pdb file with normalised values
+# use the complex file with no water as mCSM lig was
+# performed on this file. You can check it in the script: read_pdb file.
+#########################################################
+###########################
+# 2: Read file: average stability values
+# or mcsm_normalised file, output of step 4 mcsm pipeline
+###########################
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
+my_df <- read.csv(inFile
+#                  , row.names = 1
+#                  , stringsAsFactors = F
+, header = T)
+str(my_df)
+source("read_pdb.R") # list of 8
+# extract atom list into a variable
+# since in the list this corresponds to data frame, variable will be a df
+d = my_pdb[[1]]
+# make a copy: required for downstream sanity checks
+d2 = d
+# sanity checks: B factor
+max(d$b); min(d$b)
+par(oma = c(3,2,3,0)
+, mar = c(1,3,5,2)
+, mfrow = c(3,2))
+#par(mfrow = c(3,2))
+#1: Original B-factor
+hist(d$b
+, xlab = ""
+, main = "B-factor")
+plot(density(d$b)
+, xlab = ""
+, main = "B-factor")
+# 2: DUET scores
+hist(my_df$average_DUETR
+, xlab = ""
+, main = "Norm_DUET")
+plot(density(my_df$average_DUETR)
+, xlab = ""
+, main = "Norm_DUET")
+# Set the margin on all sides
+par(oma = c(3,2,3,0)
+, mar = c(1,3,5,2)
+, mfrow = c(3,2))
+#par(mfrow = c(3,2))
+#1: Original B-factor
+hist(d$b
+, xlab = ""
+, main = "B-factor")
+plot(density(d$b)
+, xlab = ""
+, main = "B-factor")
+# 2: DUET scores
+hist(my_df$average_DUETR
+, xlab = ""
+, main = "Norm_DUET")
+plot(density(my_df$average_DUETR)
+, xlab = ""
+, main = "Norm_DUET")
+#=========
+# step 1_P1
+#=========
+# Be brave and replace in place now (don't run sanity check)
+# this makes all the B-factor values in the non-matched positions as NA
+d$b = my_df$average_DUETR[match(d$resno, my_df$Position)]
+#=========
+# step 2_P1
+#=========
+# count NA in Bfactor
+b_na = sum(is.na(d$b)) ; b_na
+# count number of 0's in Bactor
+sum(d$b == 0)
+# replace all NA in b factor with 0
+d$b[is.na(d$b)] = 0
+# sanity check: should be 0
+sum(is.na(d$b))
+# sanity check: should be True
+if (sum(d$b == 0) == b_na){
+print ("Sanity check passed: NA's replaced with 0's successfully")
+} else {
+print("Error: NA replacement NOT successful, Debug code!")
+}
+max(d$b); min(d$b)
+# sanity checks: should be True
+if(max(d$b) == max(my_df$average_DUETR)){
+print("Sanity check passed: B-factors replaced correctly")
+} else {
+print ("Error: Debug code please")
+}
+if (min(d$b) == min(my_df$average_DUETR)){
+print("Sanity check passed: B-factors replaced correctly")
+} else {
+print ("Error: Debug code please")
+}
+#=========
+# step 3_P1
+#=========
+# sanity check: dim should be same before reassignment
+# should be TRUE
+dim(d) == dim(d2)
+#=========
+# step 4_P1
+#=========
+# assign it back to the pdb file
+my_pdb[[1]] = d
+max(d$b); min(d$b)
+#=========
+# step 5_P1
+#=========
+# output dir
+getwd()
+outDir = "~/git/Data/pyrazinamide/output/"
+getwd()
+outFile = paste0(outDir, "complex1_BwithNormDUET.pdb")
+outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
+outDir = "~/git/Data/pyrazinamide/input/structure"
+outDir = "~/git/Data/pyrazinamide/input/structure/"
+outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
+write.pdb(my_pdb, outFile)
+hist(d$b
+, xlab = ""
+, main = "repalced-B")
+plot(density(d$b)
+, xlab = ""
+, main = "replaced-B")
+# graph titles
+mtext(text = "Frequency"
+, side = 2
+, line = 0
+, outer = TRUE)
+mtext(text = "DUET_stability"
+, side = 3
+, line = 0
+, outer = TRUE)
+#=========================================================
+# Processing P2: Replacing  B values with PredAff Scores
+#=========================================================
+# clear workspace
+rm(list = ls())
+#=========================================================
+# Processing P2: Replacing  B values with PredAff Scores
+#=========================================================
+# clear workspace
+rm(list = ls())
+###########################
+# 2: Read file: average stability values
+# or mcsm_normalised file, output of step 4 mcsm pipeline
+###########################
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
+my_df <- read.csv("../Data/mean_PS_Lig_Bfactor.csv"
+#                  , row.names = 1
+#                  , stringsAsFactors = F
+, header = T)
+str(my_df)
+#=========================================================
+# Processing P2: Replacing B factor with mean ratioLig scores
+#=========================================================
+#########################
+# 3: Read complex pdb file
+# form the R script
+##########################
+source("read_pdb.R") # list of 8
+# extract atom list into a vari
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
+my_df <- read.csv(inFile
+#                  , row.names = 1
+#                  , stringsAsFactors = F
+, header = T)
+str(my_df)
+# extract atom list into a variable
+# since in the list this corresponds to data frame, variable will be a df
+d = my_pdb[[1]]
+# make a copy: required for downstream sanity checks
+d2 = d
+# sanity checks: B factor
+max(d$b); min(d$b)
+par(oma = c(3,2,3,0)
+, mar = c(1,3,5,2)
+, mfrow = c(3,2))
+#par(mfrow = c(3,2))
+# 1: Original B-factor
+hist(d$b
+, xlab = ""
+, main = "B-factor")
+plot(density(d$b)
+, xlab = ""
+, main = "B-factor")
+# 2: Pred Aff scores
+hist(my_df$average_PredAffR
+, xlab = ""
+, main = "Norm_lig_average")
+plot(density(my_df$average_PredAffR)
+, xlab = ""
+, main = "Norm_lig_average")
+# 3: After the following replacement
+#********************************
+par(oma = c(3,2,3,0)
+, mar = c(1,3,5,2)
+, mfrow = c(3,2))
+#par(mfrow = c(3,2))
+# 1: Original B-factor
+hist(d$b
+, xlab = ""
+, main = "B-factor")
+plot(density(d$b)
+, xlab = ""
+, main = "B-factor")
+# 2: Pred Aff scores
+hist(my_df$average_PredAffR
+, xlab = ""
+, main = "Norm_lig_average")
+plot(density(my_df$average_PredAffR)
+, xlab = ""
+, main = "Norm_lig_average")
+# 3: After the following replacement
+#********************************
+#=========
+# step 1_P2: BE BRAVE and replace in place now (don't run step 0)
+#=========
+# this makes all the B-factor values in the non-matched positions as NA
+d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)]
+#=========
+# step 2_P2
+#=========
+# count NA in Bfactor
+b_na = sum(is.na(d$b)) ; b_na
+# count number of 0's in Bactor
+sum(d$b == 0)
+# replace all NA in b factor with 0
+d$b[is.na(d$b)] = 0
+# sanity check: should be 0
+sum(is.na(d$b))
+if (sum(d$b == 0) == b_na){
+print ("Sanity check passed: NA's replaced with 0's successfully")
+} else {
+print("Error: NA replacement NOT successful, Debug code!")
+}
+max(d$b); min(d$b)
+# sanity checks: should be True
+if (max(d$b) == max(my_df$average_PredAffR)){
+print("Sanity check passed: B-factors replaced correctly")
+} else {
+print ("Error: Debug code please")
+}
+if (min(d$b) == min(my_df$average_PredAffR)){
+print("Sanity check passed: B-factors replaced correctly")
+} else {
+print ("Error: Debug code please")
+}
+#=========
+# step 3_P2
+#=========
+# sanity check: dim should be same before reassignment
+# should be TRUE
+dim(d) == dim(d2)
+#=========
+# step 4_P2
+#=========
+# assign it back to the pdb file
+my_pdb[[1]] = d
+max(d$b); min(d$b)
+#=========
+# step 5_P2
+#=========
+write.pdb(my_pdb, "Plotting/structure/complex1_BwithNormLIG.pdb")
+# output dir
+getwd()
+# output dir
+outDir = "~/git/Data/pyrazinamide/input/structure/"
+outFile = paste0(outDir, "complex1_BwithNormLIG.pdb")
+outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile
+write.pdb(my_pdb, outFile)
--- a/mcsm_analysis/pyrazinamide/scripts/Header_TT.R
+++ b/mcsm_analysis/pyrazinamide/scripts/Header_TT.R
@ -0,0 +1,129 @@
+#########################################################
+### A) Installing and loading required packages
+#########################################################
+
+#if (!require("gplots")) {
+#  install.packages("gplots", dependencies = TRUE)
+#  library(gplots)
+#}
+
+if (!require("tidyverse")) {
+  install.packages("tidyverse", dependencies = TRUE)
+  library(tidyverse)
+}
+
+if (!require("ggplot2")) {
+  install.packages("ggplot2", dependencies = TRUE)
+  library(ggplot2)
+}
+
+if (!require("cowplot")) {
+  install.packages("copwplot", dependencies = TRUE)
+  library(ggplot2)
+}
+
+if (!require("ggcorrplot")) {
+  install.packages("ggcorrplot", dependencies = TRUE)
+  library(ggcorrplot)
+}
+
+if (!require("ggpubr")) {
+  install.packages("ggpubr", dependencies = TRUE)
+  library(ggpubr)
+}
+
+if (!require("RColorBrewer")) {
+  install.packages("RColorBrewer", dependencies = TRUE)
+  library(RColorBrewer)
+}
+
+if (!require ("GOplot")) {
+  install.packages("GOplot")
+  library(GOplot)
+}
+
+if(!require("VennDiagram")) {
+  
+  install.packages("VennDiagram", dependencies = T)
+  library(VennDiagram)
+}
+
+if(!require("scales")) {
+  
+  install.packages("scales", dependencies = T)
+  library(scales)
+}
+
+if(!require("plotrix")) {
+  
+  install.packages("plotrix", dependencies = T)
+  library(plotrix)
+}
+
+if(!require("stats")) {
+  
+  install.packages("stats", dependencies = T)
+  library(stats)
+}
+
+if(!require("stats4")) {
+  
+  install.packages("stats4", dependencies = T)
+  library(stats4)
+}
+
+if(!require("data.table")) {
+  library(stats4)
+}
+
+if (!require("PerformanceAnalytics")){
+  install.packages("PerformanceAnalytics", dependencies = T)
+  library(PerformaceAnalytics)
+}
+
+if (!require ("GGally")){
+  install.packages("GGally")
+  library(GGally)
+}
+
+if (!require ("corrr")){
+  install.packages("corrr")
+  library(corrr)
+}
+
+if (!require ("psych")){
+  install.packages("psych")
+  library(psych)
+}
+
+if (!require ("dplyr")){
+  install.packages("dplyr")
+  library(psych)
+}
+
+
+if (!require ("compare")){
+  install.packages("compare")
+  library(psych)
+}
+
+if (!require ("arsenal")){
+  install.packages("arsenal")
+  library(psych)
+}
+
+
+####TIDYVERSE
+# Install
+#if(!require(devtools)) install.packages("devtools")
+#devtools::install_github("kassambara/ggcorrplot")
+
+library(ggcorrplot)
+
+
+###for PDB files
+#install.packages("bio3d") 
+if(!require(bio3d)){
+  install.packages("bio3d")
+  library(bio3d)
+}
--- a/mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R
+++ b/mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R
@ -0,0 +1,27 @@
+#########################################################
+# 1b: Define function: coloured barplot by subgroup
+# LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar
+#########################################################
+
+ColourPalleteMulti <- function(df, group, subgroup){
+  
+  # Find how many colour categories to create and the number of colours in each
+  categories <- aggregate(as.formula(paste(subgroup, group, sep="~" ))
+                          , df
+                          , function(x) length(unique(x)))
+  #  return(categories) }
+  
+  category.start <- (scales::hue_pal(l = 100)(nrow(categories))) # Set the top of the colour pallete
+  
+  category.end  <- (scales::hue_pal(l = 40)(nrow(categories))) # set the bottom
+  
+  #return(category.start); return(category.end)}
+  
+  # Build Colour pallette
+  colours <- unlist(lapply(1:nrow(categories),
+                           function(i){
+                             colorRampPalette(colors = c(category.start[i]
+                                                         , category.end[i]))(categories[i,2])}))
+  return(colours)
+}
+#########################################################
--- a/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R
+++ b/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R
@ -0,0 +1,299 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/")
+getwd()
+
+#########################################################
+# TASK: To combine mcsm and meta data with af and or
+#########################################################
+
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("Header_TT.R")
+#require(data.table)
+#require(arsenal)
+#require(compare)
+#library(tidyverse)
+
+#################################
+# Read file: normalised file
+# output of step 4 mcsm_pipeline
+#################################
+
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
+
+mcsm_data = read.csv(inFile
+                     , row.names = 1
+                     , stringsAsFactors = F
+                     , header = T) 
+rm(inDir, inFile)
+
+str(mcsm_data)
+
+table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
+
+# spelling Correction 1: DUET
+mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising'
+mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising'
+
+# checks: should be the same as above
+table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
+head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)
+
+# spelling Correction 2: Ligand
+table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
+
+mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising'
+mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising'
+
+# checks: should be the same as above
+table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
+head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)
+
+# count na in each column
+na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count
+
+# sort by Mutationinformation
+mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),]
+head(mcsm_data$Mutationinformation)
+
+# get freq count of positions and add to the df
+setDT(mcsm_data)[, occurrence := .N, by = .(Position)] 
+
+pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence)
+
+###########################
+# 2: Read file: meta data with AFandOR
+###########################
+
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile2 = paste0(inDir, "meta_data_with_AFandOR.csv"); inFile2
+
+meta_with_afor <- read.csv(inFile2
+                      , stringsAsFactors = F
+                      , header = T)
+
+rm(inDir, inFile2)
+
+str(meta_with_afor)
+
+# sort by Mutationinformation
+head(meta_with_afor$Mutationinformation)
+meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),]
+head(meta_with_afor$Mutationinformation)
+
+# sanity check: should be True for all the mentioned columns
+#is.numeric(meta_with_afor$OR)
+na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue")
+
+c1 = NULL
+for (i in na_var){
+  print(i)
+  c0 = is.numeric(meta_with_afor[,i])
+  c1 = c(c0, c1)
+  if ( all(c1) ){
+    print("Sanity check passed: These are all numeric cols")
+  } else{
+    print("Error: Please check your respective data types")
+  }
+}
+
+# If OR, and P value are not numeric, then convert to numeric and then count
+# else they will say 0
+na_count = sapply(meta_with_afor, function(y) sum(length(which(is.na(y))))); na_count
+str(na_count)
+
+# compare if the No of "NA" are the same for all these cols
+na_len = NULL
+for (i in na_var){
+  temp = na_count[[i]]
+  na_len = c(na_len, temp)
+}
+
+# extract how many NAs: 
+# should be all TRUE 
+# should be a single number since 
+# all the cols should have "equal" and "same" no. of NAs
+
+my_nrows = NULL
+for ( i in 1: (length(na_len)-1) ){
+  #print(compare(na_len[i]), na_len[i+1])
+  c = compare(na_len[i], na_len[i+1])
+  if ( c$result ) {
+    my_nrows = na_len[i] }
+  else { 
+  print("Error: Please check your numbers") 
+  }
+}
+
+my_nrows
+
+#=#=#=#=#=#=#=#=#
+# COMMENT: AF, OR, pvalue, logor and neglog10pvalue
+# these are the same 7 ones
+#=#=#=#=#=#=#=#=#
+
+# sanity check
+#which(is.na(meta_with_afor$OR)) 
+
+# initialise an empty df with nrows as extracted above
+na_count_df = data.frame(matrix(vector(mode = 'numeric'
+#                                       , length = length(na_var)
+                                       )
+                                , nrow = my_nrows
+#                                , ncol = length(na_var)
+                              ))
+
+# populate the df with the indices of the cols that are NA
+for (i in na_var){
+  print(i)
+  na_i = which(is.na(meta_with_afor[i]))
+  na_count_df = cbind(na_count_df, na_i)
+  colnames(na_count_df)[which(na_var == i)] <- i
+}
+
+# Now compare these indices to ensure these are the same
+c2 = NULL
+for ( i in 1: ( length(na_count_df)-1 ) ) {
+#  print(na_count_df[i] == na_count_df[i+1])
+  c1 = identical(na_count_df[[i]], na_count_df[[i+1]])
+  c2 = c(c1, c2)
+  if ( all(c2) ) {
+    print("Sanity check passed: The indices for AF, OR, etc are all the same")
+  } else {
+    print ("Error: Please check indices which are NA")
+  }
+}
+
+rm( c, c0, c1, c2, i, my_nrows
+    , na_count, na_i, na_len
+    , na_var, temp
+    , na_count_df
+    , pos_count_check )
+
+###########################
+# 3:merging two dfs: with NA
+###########################
+
+# link col name  = Mutationinforamtion
+head(mcsm_data$Mutationinformation)
+head(meta_with_afor$Mutationinformation)
+
+#########
+# merge 1a: meta data with mcsm
+#########
+merged_df2 = merge(x = meta_with_afor
+                  ,y = mcsm_data
+                  , by = "Mutationinformation"
+                  , all.y = T)
+
+head(merged_df2$Position)
+
+# sort by Position
+head(merged_df2$Position)
+merged_df2 = merged_df2[order(merged_df2$Position),]
+head(merged_df2$Position)
+
+merged_df2v2 = merge(x = meta_with_afor
+                   ,y = mcsm_data
+                   , by = "Mutationinformation"
+                   , all.x = T) 
+#!=!=!=!=!=!=!=!
+# COMMENT: used all.y since position 186 is not part of the struc,
+# hence doesn't have a mcsm value
+# but 186 is associated with with mutation
+#!=!=!=!=!=!=!=!
+
+# should  be False
+identical(merged_df2, merged_df2v2)
+table(merged_df2$Position%in%merged_df2v2$Position)
+
+rm(merged_df2v2)
+
+#########
+# merge 1b:remove duplicate mutation information
+#########
+
+#==#=#=#=#=#=#
+# Cannot trust lineage, country from this df as the same mutation
+# can have many different lineages
+# but this should be good for the numerical corr plots
+#=#=#=#=#=#=#=
+merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),] 
+head(merged_df3$Position); tail(merged_df3$Position) # should be sorted
+
+# sanity checks
+# nrows of merged_df3 should be the same as the nrows of mcsm_data
+if(nrow(mcsm_data) == nrow(merged_df3)){
+  print("sanity check: Passed")
+} else {
+  print("Error!: check data, nrows is not as expected")
+}
+
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+# uncomment as necessary
+# only need to run this if merged_df2v2 i.e non structural pos included
+#mcsm = mcsm_data$Mutationinformation
+#my_merged = merged_df3$Mutationinformation
+
+# find the index where it differs
+#diff_n = which(!my_merged%in%mcsm)
+
+#check if it is indeed pos 186
+#merged_df3[diff_n,]
+
+# remove this entry
+#merged_df3 = merged_df3[-diff_n,]]
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+###########################
+# 3b :merging two dfs: without NA
+###########################
+
+#########
+# merge 2a:same as merge 1 but excluding NA
+#########
+merged_df2_comp = merged_df2[!is.na(merged_df2$AF),] 
+
+#########
+# merge 2b: remove duplicate mutation information
+#########
+merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),] 
+
+# alternate way of deriving merged_df3_comp
+foo = merged_df3[!is.na(merged_df3$AF),]
+# compare dfs: foo and merged_df3_com
+all.equal(foo, merged_df3)
+
+summary(comparedf(foo, merged_df3))
+
+#=============== end of combining df
+#clear variables
+rm(mcsm_data
+   , meta_with_afor
+   , foo)
+
+#rm(diff_n, my_merged, mcsm)
+
+#=====================
+# write_output files
+#=====================
+# output dir
+outDir = "~/git/Data/pyrazinamide/output/"
+getwd()
+
+outFile1 = paste0(outDir, "merged_df3.csv"); outFile1
+write.csv(merged_df3, outFile1)
+
+#outFile2 = paste0(outDir, "merged_df3_comp.csv"); outFile2
+#write.csv(merged_df3_comp, outFile2)
+
+rm(outDir
+   , outFile1
+#   , outFile2
+)
+#============================= end of script
+
--- a/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R
+++ b/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R
@ -0,0 +1,348 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/")
+getwd()
+
+#########################################################
+# TASK: To combine mcsm and meta data with af and or
+# by filtering for distance to ligand (<10Ang)
+#########################################################
+
+#########################################################
+# Installing and loading required packages
+#########################################################
+
+#source("Header_TT.R")
+#require(data.table)
+#require(arsenal)
+#require(compare)
+#library(tidyverse)
+
+#################################
+# Read file: normalised file
+# output of step 4 mcsm_pipeline
+#################################
+
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
+
+mcsm_data = read.csv(inFile
+                     , row.names = 1
+                     , stringsAsFactors = F
+                     , header = T) 
+rm(inDir, inFile)
+
+str(mcsm_data)
+
+table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
+
+# spelling Correction 1: DUET
+mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising'
+mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising'
+
+# checks
+table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
+head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)
+
+# spelling Correction 2: Ligand
+table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
+
+mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising'
+mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising'
+
+# checks: should be the same as above
+table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
+head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)
+
+########################### !!! only for mcsm_lig
+# 4: Filter/subset data 
+# Lig plots < 10Ang
+# Filter the lig plots for Dis_to_lig < 10Ang
+###########################
+
+# check range of distances
+max(mcsm_data$Dis_lig_Ang)
+min(mcsm_data$Dis_lig_Ang)
+
+# count
+table(mcsm_data$Dis_lig_Ang<10)
+
+# subset data to have only values less than 10 Ang
+mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10)
+
+# sanity checks
+max(mcsm_data2$Dis_lig_Ang)
+min(mcsm_data2$Dis_lig_Ang)
+
+# count no of unique positions
+length(unique(mcsm_data2$Position))
+
+# count no of unique mutations
+length(unique(mcsm_data2$Mutationinformation))
+
+# count Destabilisinga and stabilising
+table(mcsm_data2$Lig_outcome) #{RESULT: no of mutations within 10Ang}
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT: so as not to alter the script
+mcsm_data = mcsm_data2
+#<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+#############################
+# Extra sanity check:
+# for mcsm_lig ONLY
+# Dis_lig_Ang should be <10
+#############################
+
+if (max(mcsm_data$Dis_lig_Ang) < 10){
+  print ("Sanity check passed: lig data is <10Ang")
+}else{
+  print ("Error: data should be filtered to be within 10Ang")
+}
+
+# clear variables
+rm(mcsm_data2)
+
+# count na in each column
+na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count
+
+head(mcsm_data$Mutationinformation)
+mcsm_data[mcsm_data$Mutationinformation=="Q10P",]
+mcsm_data[mcsm_data$Mutationinformation=="L4S",]
+
+# sort by Mutationinformation
+mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),]
+head(mcsm_data$Mutationinformation)
+
+# check
+mcsm_data[grep("Q10P", mcsm_data$Mutationinformation),]
+mcsm_data[grep("A102T", mcsm_data$Mutationinformation),]
+
+# get freq count of positions and add to the df
+setDT(mcsm_data)[, occurrence := .N, by = .(Position)]
+
+pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence)
+
+###########################
+# 2: Read file: meta data with AFandOR
+###########################
+
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile2 = paste0(inDir, "meta_data_with_AFandOR.csv"); inFile2
+
+meta_with_afor <- read.csv(inFile2
+                      , stringsAsFactors = F
+                      , header = T)
+
+str(meta_with_afor)
+
+# sort by Mutationinformation
+head(meta_with_afor$Mutationinformation)
+meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),]
+head(meta_with_afor$Mutationinformation)
+
+# sanity check: should be True for all the mentioned columns
+#is.numeric(meta_with_afor$OR)
+na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue")
+
+c1 = NULL
+for (i in na_var){
+  print(i)
+  c0 = is.numeric(meta_with_afor[,i])
+  c1 = c(c0, c1)
+  if ( all(c1) ){
+    print("Sanity check passed: These are all numeric cols")
+  } else{
+    print("Error: Please check your respective data types")
+  }
+}
+
+# If OR, and P value are not numeric, then convert to numeric and then count
+# else they will say 0
+
+# NOW count na in each column: if you did it before, then 
+# OR and Pvalue column would say 0 na since these were not numeric
+na_count = sapply(meta_with_afor, function(y) sum(length(which(is.na(y))))); na_count
+str(na_count)
+
+# compare if the No of "NA" are the same for all these cols
+na_len = NULL
+na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue")
+for (i in na_var){
+  temp = na_count[[i]]
+  na_len = c(na_len, temp)
+}
+
+my_nrows = NULL
+
+for ( i in 1: (length(na_len)-1) ){
+  #print(compare(na_len[i]), na_len[i+1])
+  c = compare(na_len[i], na_len[i+1])
+  if ( c$result ) {
+    my_nrows = na_len[i] }
+  else { 
+    print("Error: Please check your numbers") 
+  }
+}
+
+my_nrows
+
+#=#=#=#=#=#=#=#=#
+# COMMENT: AF, OR, pvalue, logor and neglog10pvalue
+# all have 81 NA, with pyrazinamide with 960
+# and these are the same 7 ones
+#=#=#=#=#=#=#=#=#
+
+# sanity check
+#which(is.na(meta_with_afor$OR)) 
+
+# initialise an empty df with nrows as extracted above
+na_count_df = data.frame(matrix(vector(mode = 'numeric'
+#                                      , length = length(na_var) 
+                                      )
+                                , nrow = my_nrows
+#                                , ncol = length(na_var)
+                                ))
+
+# populate the df with the indices of the cols that are NA
+for (i in na_var){
+  print(i)
+  na_i = which(is.na(meta_with_afor[i]))
+  na_count_df = cbind(na_count_df, na_i)
+  colnames(na_count_df)[which(na_var == i)] <- i
+} 
+
+# Now compare these indices to ensure these are the same
+c2 = NULL
+for ( i in 1: ( length(na_count_df)-1 ) ) {
+  #  print(na_count_df[i] == na_count_df[i+1])
+  c1 = identical(na_count_df[[i]], na_count_df[[i+1]])
+  c2 = c(c1, c2)
+  if ( all(c2) ) {
+    print("Sanity check passed: The indices for AF, OR, etc are all the same")
+  } else {
+    print ("Error: Please check indices which are NA")
+  }
+}
+
+rm( c, c1, c2, i, my_nrows
+    , na_count, na_i, na_len
+    , na_var, temp
+    , na_count_df
+    , pos_count_check )
+
+###########################
+# 3:merging two dfs: with NA
+###########################
+
+# link col name  = Mutationinforamtion
+head(mcsm_data$Mutationinformation)
+head(meta_with_afor$Mutationinformation)
+
+#########
+# merge 1a: meta data with mcsm
+#########
+merged_df2 = merge(x = meta_with_afor
+                  , y = mcsm_data
+                  , by = "Mutationinformation"
+                  , all.y = T)
+
+head(merged_df2$Position)
+
+# sort by Position
+head(merged_df2$Position)
+merged_df2 = merged_df2[order(merged_df2$Position),]
+head(merged_df2$Position)
+
+merged_df2v2 = merge(x = meta_with_afor
+                   ,y = mcsm_data
+                   , by = "Mutationinformation"
+                   , all.x = T) 
+
+#!=!=!=!=!=!=!=!
+# COMMENT: used all.y since position 186 is not part of the struc,
+# hence doesn't have a mcsm value
+# but 186 is associated with with mutation
+#!=!=!=!=!=!=!=!
+
+# should  be False
+identical(merged_df2, merged_df2v2)
+table(merged_df2$Position%in%merged_df2v2$Position)
+
+rm(merged_df2v2)
+
+#########
+# merge 1b:remove duplicate mutation information
+#########
+
+#==#=#=#=#=#=#
+# Cannot trust lineage, country from this df as the same mutation
+# can have many different lineages
+# but this should be good for the numerical corr plots
+#=#=#=#=#=#=#=
+merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),] 
+head(merged_df3$Position) ; tail(merged_df3$Position) # should be sorted
+
+# sanity checks
+# nrows of merged_df3 should be the same as the nrows of mcsm_data
+if(nrow(mcsm_data) == nrow(merged_df3)){
+  print("sanity check: Passed")
+} else {
+  print("Error!: check data, nrows is not as expected")
+}
+
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+# uncomment as necessary
+# only need to run this if merged_df2v2 i.e non structural pos included
+#mcsm = mcsm_data$Mutationinformation
+#my_merged = merged_df3$Mutationinformation
+
+# find the index where it differs
+#diff_n = which(!my_merged%in%mcsm)
+
+#check if it is indeed pos 186
+#merged_df3[diff_n,]
+
+# remove this entry
+#merged_df3 = merged_df3[-diff_n,] 
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+###########################
+# 3b :merging two dfs: without NA
+###########################
+
+#########
+# merge 2a:same as merge 1 but excluding NA
+#########
+merged_df2_comp = merged_df2[!is.na(merged_df2$AF),]
+
+#########
+# merge 2b: remove duplicate mutation information
+#########
+merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),]
+
+# FIXME: add this as a sanity check. I have manually checked!
+
+# alternate way of deriving merged_df3_comp
+foo = merged_df3[!is.na(merged_df3$AF),]
+
+# compare dfs: foo and merged_df3_com
+all.equal(foo, merged_df3)
+
+summary(comparedf(foo, merged_df3))
+
+#=============== end of combining df
+#clear variables
+rm(mcsm_data
+   , meta_with_afor
+   , foo)
+
+#rm(diff_n, my_merged, mcsm)
+
+#===============end of script
+
+#=====================
+# write_output files
+#=====================
+ 
+# Not required as this is a subset of the "combining_two_df.R" script
+
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh
@ -0,0 +1,25 @@
+#!/bin/bash
+
+#*************************************
+# need to be in the correct directory
+#*************************************
+##: comments for code
+#: commented out code
+
+#**********************************************************************
+# TASK: Text file containing a list of SNPs; SNP in the format(C2E)
+# per line. Sort by unique, which automatically removes duplicates.
+# sace file in current directory
+#**********************************************************************
+infile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2.csv"
+outfile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2_unique.csv"
+
+# sort unique entries and output to current directory
+sort -u ${infile} > ${outfile}
+
+# count no. of unique snps mCSM will run on
+count=$(wc -l < ${outfile})
+
+# print to console no. of unique snps mCSM will run on
+echo "${count} unique mutations for mCSM to run on"
+
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh
@ -0,0 +1,72 @@
+#!/bin/bash
+
+#*************************************
+#need to be in the correct directory
+#*************************************
+##: comments for code
+#: commented out code
+
+#**********************************************************************
+# TASK: submit requests using curl: HANDLE redirects and refresh url. 
+# Iterate over mutation file and write/append result urls to a file
+# result url file: stored in the /Results directory
+# mutation file: one mutation per line, no chain ID
+# output: in a file, should be n urls (n=no. of mutations in file)
+# NOTE: these are just result urls, not actual values for results
+#**********************************************************************
+## iterate over mutation file; line by line and submit query using curl
+filename="../Data/pnca_mis_SNPs_v2_unique.csv"
+
+## some useful messages
+echo -n -e "Processing $(wc -l < ${filename}) entries from ${filename}\n"
+COUNT=0
+while read -r line; do
+((COUNT++))
+    mutation="${line}"
+#    echo "${mutation}"
+pdb='../Data/complex1_no_water.pdb'
+mutation="${mutation}"
+chain="A"
+lig_id="PZA"
+affin_wt="0.99"
+host="http://biosig.unimelb.edu.au"
+call_url="/mcsm_lig/prediction"
+
+##=========================================
+##html field_names names required for curl
+##complex_field:wild=@
+##mutation_field:mutation=@
+##chain_field:chain=@
+##ligand_field:lig_id@
+##energy_field:affin_wt
+#=========================================
+refresh_url=$(curl -L \
+     -sS \
+     -F "wild=@${pdb}" \
+     -F "mutation=${mutation}" \
+     -F "chain=${chain}" \
+     -F "lig_id=${lig_id}" \
+     -F "affin_wt=${affin_wt}" \
+     ${host}${call_url} | grep "http-equiv")
+
+#echo $refresh_url
+#echo ${host}${refresh_url}
+
+#use regex to extract the relevant bit from the refresh url
+#regex:sed -r 's/.*(\/mcsm.*)".*$/\1/g'
+
+#Now build: result url using host and refresh url and write the urls to a file in the Results dir
+result_url=$(echo $refresh_url | sed -r 's/.*(\/mcsm.*)".*$/\1/g')
+sleep 10
+
+echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${filename})..."
+
+echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_mCSM_lig_complex1_result_url.txt
+#echo -n '.'
+done < "${filename}"
+
+echo
+echo "Processing Complete"
+
+##end of submitting query, receiving result url and storing results url in a file
+
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh
@ -0,0 +1,59 @@
+#!/bin/bash
+#*************************************
+#need to be in the correct directory
+#*************************************
+##: comments for code
+#: commented out code
+
+#********************************************************************
+# TASK: submit result urls and fetch actual results using curl
+# iterate over each result url from the output of step1 in the stored
+# in file in /Results.
+# Use curl to fetch results and extract relevant sections using hxtools
+# and store these in another file in /Results 
+# This script takes two arguments:
+# 	input file: file containing results url
+#				In this case: 336_mCSM_lig_complex1_result_url.txt
+# 	output file: name of the file where extracted results will be stored
+#				In this case : it is 336_mCSM_lig_complex1_output_MASTER.txt
+#*********************************************************************
+
+#if [ "$#" -ne 2 ]; then
+  #if [ -Z $1 ]; then
+#  echo "
+#  Please provide both Input and Output files.
+
+#  Usage: batch_read_urls.sh INFILE OUTFILE
+#  "
+#  exit 1
+#fi
+
+# First argument: Input File
+# Second argument: Output File
+#infile=$1
+#outfile=$2
+
+infile="${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_result_url.txt"
+outfile="${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_output_MASTER.txt"
+
+echo -n "Processing $(wc -l < ${infile}) entries from ${infile}"
+echo
+COUNT=0
+while read -r line; do
+#COUNT=$(($COUNT+1))
+((COUNT++))
+  curl --silent ${line} \
+    | hxnormalize -x \
+    | hxselect -c div.span4 \
+    | hxselect -c div.well \
+    | sed -r -e 's/<[^>]*>//g' \
+    | sed -re 's/ +//g' \
+    >> ${outfile}
+  #| tee -a ${outfile}
+#  echo -n '.'
+echo -e "Processing entry ${COUNT}/$(wc -l < ${infile})..."  
+  
+done < "${infile}"
+
+echo
+echo "Processing Complete"
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh
@ -0,0 +1,52 @@
+#!/bin/bash
+#*************************************
+#need to be in the correct directory
+#*************************************
+##: comments for code
+#: commented out code
+
+#********************************************************************
+# TASK: Intermediate results processing
+# output file has a convenient delimiter of ":" that can be used to 
+# format the file into two columns (col1: field_desc and col2: values)
+# However the section "PredictedAffinityChange:...." and 
+# "DUETstabilitychange:.." are split over multiple lines and 
+# prevent this from happening.Additionally there are other empty lines
+# that need to be omiited. In order ensure these sections are not split
+# over multiple lines, this script is written.
+#*********************************************************************
+
+infile="../Results/336_mCSM_lig_complex1_output_processed.txt"
+
+#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${infile} \
+# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${infile}
+
+# Outputs records separated by a newline, that look something like this:
+# PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing
+# Mutationinformation:
+# Wild-type:L
+# Position:4
+# Mutant-type:W
+# Chain:A
+# LigandID:PZA
+# Distancetoligand:15.911&Aring;
+# DUETstabilitychange:-2.169Kcal/mol
+# 
+# PredictedAffinityChange:-1.538log(affinityfoldchange)-Destabilizing
+# (...etc)
+
+# This script brings everything in a convenient format for further processing in python.
+# bear in mind, this replaces the file in place, so make sure you retain a copy for your records
+sed -i '/PredictedAffinityChange/ {
+N
+N
+N
+N
+s/\n//g
+}
+/DUETstabilitychange:/ {
+N
+N
+s/\n//g
+}
+/^$/d' ${infile}
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py
@ -0,0 +1,29 @@
+#!/usr/bin/python
+import pandas as pd
+from collections import defaultdict
+
+#file = r'../Results/322_mCSM_lig_complex1_output_processed.txt'
+
+outCols=[
+        'PredictedAffinityChange',
+        'Mutationinformation',
+        'Wild-type',
+        'Position',
+        'Mutant-type',
+        'Chain',
+        'LigandID',
+        'Distancetoligand',
+        'DUETstabilitychange'
+        ]
+
+lines = [line.rstrip('\n') for line in open('../Results/336_mCSM_lig_complex1_output_processed.txt')]
+
+outputs = defaultdict(list)
+
+for item in lines:
+	col, val = item.split(':')
+	outputs[col].append(val)
+
+dfOut=pd.DataFrame(outputs)
+
+pd.DataFrame.to_csv(dfOut,'../Results/336_complex1_formatted_results.csv', columns=outCols)
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_data_cleaning.R
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_data_cleaning.R
@ -0,0 +1,207 @@
+getwd()
+#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
+setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
+#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
+getwd()
+
+#=======================================================
+#TASK: To tidy the columns so you can generate figures
+#=======================================================
+####################
+#### read file #####: this will be the output from python script (csv file)
+####################
+data = read.csv("336_complex1_formatted_results.csv"
+              , header = T
+              , stringsAsFactors = FALSE)
+dim(data)
+#335, 10
+str(data)
+
+###########################
+##### Data processing #####
+###########################
+
+# populate mutation information columns as currently it is empty
+head(data$Mutationinformation)
+tail(data$Mutationinformation)
+
+# should not be blank: create muation information
+data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.type)
+
+head(data$Mutationinformation)
+tail(data$Mutationinformation)
+#write.csv(data, 'test.csv')
+##########################################
+# Remove duplicate SNPs as a sanity check
+##########################################
+#very important
+table(duplicated(data$Mutationinformation))
+#FALSE   
+#335
+
+#extract duplicated entries
+dups = data[duplicated(data$Mutationinformation),] #0
+
+#No of dups should match with the no. of TRUE in the above table 
+#u_dups = unique(dups$Mutationinformation) #10
+sum( table(dups$Mutationinformation) ) #13
+
+rm(dups)
+
+#***************************************************************
+#select non-duplicated SNPs and create a new df
+df = data[!duplicated(data$Mutationinformation),] #309, 10
+#***************************************************************
+#sanity check
+u = unique(df$Mutationinformation)
+u2 = unique(data$Mutationinformation)
+table(u%in%u2)
+#TRUE 
+#309 
+#should all be 1, hence 309 1's
+sum(table(df$Mutationinformation) == 1)
+
+#sort df by Position
+#MANUAL CHECKPOINT:  
+#foo <- df[order(df$Position),]
+#df <- df[order(df$Position),]
+
+rm(u, u2, dups)
+
+####################
+#### give meaningful colnames to reflect units to enable correct data type
+####################
+
+#=======
+#STEP 1
+#========
+#make a copy of the PredictedAffinityColumn and call it Lig_outcome
+df$Lig_outcome = df$PredictedAffinityChange #335, 11
+
+#make Predicted...column numeric and outcome column categorical
+head(df$PredictedAffinityChange)
+df$PredictedAffinityChange = gsub("log.*"
+                                  , ""
+                                  , df$PredictedAffinityChange)
+
+#sanity checks
+head(df$PredictedAffinityChange)
+
+#should be numeric, check and if not make it numeric
+is.numeric( df$PredictedAffinityChange )
+#change to numeric
+df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
+#should be TRUE
+is.numeric( df$PredictedAffinityChange )
+
+#change the column name to indicate units
+n = which(colnames(df) == "PredictedAffinityChange"); n
+colnames(df)[n] = "PredAffLog"
+colnames(df)[n]
+
+#========
+#STEP 2
+#========
+#make Lig_outcome column categorical showing effect of mutation
+head(df$Lig_outcome)
+df$Lig_outcome = gsub("^.*-"
+                  , "",
+                  df$Lig_outcome)
+#sanity checks
+head(df$Lig_outcome)
+#should be factor, check and if not change it to factor
+is.factor(df$Lig_outcome) 
+#change to factor
+df$Lig_outcome = as.factor(df$Lig_outcome)
+#should be TRUE
+is.factor(df$Lig_outcome) 
+
+#========
+#STEP 3
+#========
+#gsub
+head(df$Distancetoligand)
+df$Distancetoligand = gsub("&Aring;"
+                           , ""
+                           , df$Distancetoligand)
+#sanity checks
+head(df$Distancetoligand)
+#should be numeric, check if not change it to numeric
+is.numeric(df$Distancetoligand)
+#change to numeric
+df$Distancetoligand = as.numeric(df$Distancetoligand)
+#should be TRUE
+is.numeric(df$Distancetoligand)
+
+#change the column name to indicate units
+n = which(colnames(df) == "Distancetoligand")
+colnames(df)[n] <- "Dis_lig_Ang"
+colnames(df)[n]
+
+#========
+#STEP 4
+#========
+#gsub
+head(df$DUETstabilitychange)
+df$DUETstabilitychange = gsub("Kcal/mol"
+                              , ""
+                              , df$DUETstabilitychange)
+#sanity checks
+head(df$DUETstabilitychange)
+#should be numeric, check if not change it to numeric
+is.numeric(df$DUETstabilitychange)
+#change to numeric 
+df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
+#should be TRUE
+is.numeric(df$DUETstabilitychange)
+
+#change the column name to indicate units
+n = which(colnames(df) == "DUETstabilitychange"); n
+colnames(df)[n] = "DUETStability_Kcalpermol"
+colnames(df)[n]
+
+#========
+#STEP 5
+#========
+#create yet another extra column: classification of DUET stability only
+df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0
+                         , "Stabilizing"
+                         , "Destabilizing")  #335, 12
+
+table(df$Lig_outcome)
+#Destabilizing   Stabilizing 
+#281             54 
+
+table(df$DUET_outcome)
+#Destabilizing   Stabilizing 
+#288             47 
+#==============================
+#FIXME
+#Insert a venn diagram
+
+#================================
+
+
+#========
+#STEP 6
+#========
+# assign wild and mutant colnames correctly
+
+wt = which(colnames(df) == "Wild.type"); wt
+colnames(df)[wt] <- "Wild_type"
+colnames(df[wt])
+
+mut = which(colnames(df) == "Mutant.type"); mut
+colnames(df)[mut] <- "Mutant_type"
+colnames(df[mut])
+
+#========
+#STEP 7
+#========
+#create an extra column: maybe useful for some plots
+df$WildPos = paste0(df$Wild_type, df$Position) #335, 13
+
+#clear variables
+rm(n, wt, mut)
+
+################ end of data cleaning
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R
@ -0,0 +1,252 @@
+getwd()
+#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
+setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
+#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
+getwd()
+
+#=======================================================
+#TASK:read cleaned data and perform rescaling
+  # of DUET stability scores
+  # of Pred affinity
+#compare scaling methods with plots
+#output normalised file
+#=======================================================
+
+####################
+#### read file #####: this will be the output of my R script that cleans the data columns
+####################
+source("../Scripts/step3c_data_cleaning.R")
+##This will outut two dataframes:
+##data: unclean data: 335, 10
+##df : cleaned df 335, 13
+## you can remove data if you want as you will not need it
+rm(data)
+
+colnames(df)
+
+#===================
+#3a: PredAffLog
+#===================
+n = which(colnames(df) == "PredAffLog"); n
+group = which(colnames(df) == "Lig_outcome"); group 
+
+#===================================================
+# order according to PredAffLog values
+#===================================================
+# This is because this makes it easier to see the results of rescaling for debugging
+head(df$PredAffLog)
+
+#ORDER BY PredAff scrores: negative  values at the top and positive at the bottoom
+df = df[order(df$PredAffLog),] 
+head(df$PredAffLog)
+
+#sanity checks
+head(df[,n]) #all negatives
+tail(df[,n]) #all positives
+
+#sanity checks
+mean(df[,n])
+#-0.9526746
+
+tapply(df[,n], df[,group], mean)
+#Destabilizing   Stabilizing 
+#-1.2112100      0.3926667 
+#===========================
+#Same as above: in 2 steps
+#===========================
+
+#find range of your data
+my_min = min(df[,n]); my_min #-3.948
+my_max = max(df[,n]); my_max #2.23
+
+#===============================================
+# WITHIN GROUP rescaling 2: method "ratio"
+# create column to store the rescaled values
+# Rescaling separately (Less dangerous) 
+#       =====> chosen one:as Nick prefers
+#===============================================
+df$ratioPredAff = ifelse(df[,n] < 0
+                      , df[,n]/abs(my_min)
+                      , df[,n]/my_max
+                      )#335 14
+#sanity checks
+head(df$ratioPredAff)
+tail(df$ratioPredAff)
+
+min(df$ratioPredAff); max(df$ratioPredAff)
+
+tapply(df$ratioPredAff, df$Lig_outcome, min)
+#Destabilizing   Stabilizing 
+#-1.000000000   0.005381166 
+
+tapply(df$ratioPredAff, df$Lig_outcome, max)
+#Destabilizing   Stabilizing 
+#-0.001266464   1.000000000
+
+#should be the same as below (281 and 54)
+sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0)
+
+table(df$Lig_outcome)
+#Destabilizing   Stabilizing 
+#281              54
+
+#===============================================
+# Hist and density plots to compare the rescaling 
+# methods: Base R
+#===============================================
+#uncomment as necessary
+my_title = "Ligand_stability"
+#my_title = colnames(df[n])
+
+# Set the margin on all sides
+par(oma = c(3,2,3,0)
+    , mar = c(1,3,5,2)
+    , mfrow = c(2,2))
+
+hist(df[,n]
+     , xlab = ""
+     , main = "Raw values"
+)
+
+hist(df$ratioPredAff
+     , xlab = ""
+     , main = "ratio rescaling"
+)
+
+# Plot density plots underneath
+plot(density( df[,n] )
+     , main = "Raw values"
+)
+
+plot(density( df$ratioPredAff )
+     , main = "ratio rescaling"
+)
+
+# titles
+mtext(text = "Frequency"
+       , side = 2
+       , line = 0
+       , outer = TRUE)
+
+mtext(text = my_title
+      , side = 3
+      , line = 0
+      , outer = TRUE)
+
+
+#clear variables 
+rm(my_min, my_max, my_title, n, group)
+
+#===================
+# 3b: DUET stability
+#===================
+dim(df) #335, 14
+
+n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10
+group = which(colnames(df) == "DUET_outcome"); group #12
+
+#===================================================
+# order according to DUET scores
+#===================================================
+# This is because this makes it easier to see the results of rescaling for debugging
+head(df$DUETStability_Kcalpermol)
+
+#ORDER BY DUET scores: negative values at the top and positive at the bottom
+df = df[order(df$DUETStability_Kcalpermol),] 
+
+#sanity checks
+head(df[,n]) #negatives
+tail(df[,n]) #positives
+
+#sanity checks
+mean(df[,n])
+#[1] -1.173316
+
+tapply(df[,n], df[,group], mean)
+#Destabilizing   Stabilizing 
+#-1.4297257     0.3978723
+
+#===============================================
+# WITHIN GROUP rescaling 2: method "ratio"
+# create column to store the rescaled values
+# Rescaling separately (Less dangerous) 
+#       =====> chosen one:as Nick prefers
+#===============================================
+#find range of your data
+my_min = min(df[,n]); my_min #-3.87
+my_max = max(df[,n]); my_max #1.689
+
+df$ratioDUET = ifelse(df[,n] < 0
+                      , df[,n]/abs(my_min)
+                      , df[,n]/my_max
+                    ) #335, 15
+#sanity check
+head(df$ratioDUET)
+tail(df$ratioDUET)
+
+min(df$ratioDUET); max(df$ratioDUET)
+
+#sanity checks
+tapply(df$ratioDUET, df$DUET_outcome, min)
+#Destabilizing   Stabilizing 
+#-1.00000000    0.01065719
+
+tapply(df$ratioDUET, df$DUET_outcome, max)
+#Destabilizing   Stabilizing 
+#-0.003875969   1.000000000 
+
+#should be the same as below (267 and 42)
+sum(df$ratioDUET < 0); sum(df$ratioDUET > 0)
+
+table(df$DUET_outcome)
+#Destabilizing   Stabilizing 
+#288             47
+
+#===============================================
+# Hist and density plots to compare the rescaling 
+# methods: Base R
+#===============================================
+#uncomment as necessary
+
+my_title = "DUET_stability"
+#my_title = colnames(df[n])
+
+# Set the margin on all sides
+par(oma = c(3,2,3,0)
+    , mar = c(1,3,5,2)
+    , mfrow = c(2,2))
+
+hist(df[,n]
+     , xlab = ""
+     , main = "Raw values"
+)
+
+hist(df$ratioDUET
+     , xlab = ""
+     , main = "ratio rescaling"
+)
+
+# Plot density plots underneath
+plot(density( df[,n] )
+     , main = "Raw values"
+)
+
+plot(density( df$ratioDUET )
+     , main = "ratio rescaling"
+)
+
+# graph titles
+mtext(text = "Frequency"
+      , side = 2
+      , line = 0
+      , outer = TRUE)
+
+mtext(text = my_title
+      , side = 3
+      , line = 0
+      , outer = TRUE)
+
+#===================
+# write output as csv file
+#===================
+write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE) #335, 15
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R
@ -0,0 +1,131 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+#source("barplot_colour_function.R")
+require(data.table)
+require(dplyr)
+
+########################################################################
+#		 Read file: call script for combining df for PS		   	   #
+########################################################################
+
+source("../combining_two_df.R")
+
+###########################
+# This will return:
+
+# df with NA:
+# merged_df2 
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+###########################
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+###########################
+# you need merged_df3 
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+my_df  = merged_df3 
+#my_df = merged_df3_comp
+#%%%%%%%%%%%%%%%%%%%%%%%%
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+###########################
+# Data for bfactor figure
+# PS average 
+# Lig average
+###########################
+
+head(my_df$Position)
+head(my_df$ratioDUET)
+
+# order data frame 
+df = my_df[order(my_df$Position),]
+
+head(df$Position)
+head(df$ratioDUET)
+
+#***********
+# PS: average by position
+#***********
+
+mean_DUET_by_position <- df %>%
+  group_by(Position) %>%
+  summarize(averaged.DUET = mean(ratioDUET))
+
+#***********
+# Lig: average by position
+#***********
+mean_Lig_by_position <- df %>%
+  group_by(Position) %>%
+  summarize(averaged.Lig = mean(ratioPredAff))
+
+
+#***********
+# cbind:mean_DUET_by_position and mean_Lig_by_position
+#***********
+
+combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
+
+# sanity check
+# mean_PS_Lig_Bfactor
+
+colnames(combined)
+
+colnames(combined) = c("Position"
+                       , "average_DUETR"
+                       , "Position2"
+                       , "average_PredAffR")
+
+colnames(combined)
+
+identical(combined$Position, combined$Position2)
+
+n = which(colnames(combined) == "Position2"); n
+
+combined_df = combined[,-n]
+
+max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
+
+max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
+
+#=============
+# output csv
+#============
+outDir = "~/git/Data/pyrazinamide/input/processed/"
+outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
+print(paste0("Output file with path will be:","", outFile))
+
+head(combined_df$Position); tail(combined_df$Position)
+
+write.csv(combined_df, outFile
+          , row.names = F)
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/.RData
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/.RData
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/.Rhistory
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/.Rhistory
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/OR_PS_Ligand_combined_plot.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/OR_PS_Ligand_combined_plot.R
@ -0,0 +1,250 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+#source("barplot_colour_function.R")
+require(cowplot)
+
+########################################################################
+#		 Read file: call script for combining df for PS			   	   #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for OR and stability plots
+# you need merged_df3_comp
+# since these are matched 
+# to allow pairwise corr
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3_comp
+#my_df = merged_df3
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# sanity check
+# Ensure correct data type in columns to plot: need to be factor
+is.numeric(my_df$OR)
+#[1] TRUE
+
+#<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+# FOR PS Plots
+#<<<<<<<<<<<<<<<<<<<
+
+PS_df  = my_df
+
+rm(my_df)
+#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
+
+########################################################################
+#		 Read file: call script for combining df for lig		   	   #
+########################################################################
+
+getwd()
+
+source("combining_two_df_lig.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for OR and stability plots
+# you need merged_df3_comp
+# since these are matched 
+# to allow pairwise corr
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df2  = merged_df3_comp
+#my_df2 = merged_df3
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df2)
+str(my_df2)
+
+# sanity check
+# Ensure correct data type in columns to plot: need to be factor
+is.numeric(my_df2$OR)
+#[1] TRUE
+
+# sanity check: should be <10
+if (max(my_df2$Dis_lig_Ang) < 10){
+  print ("Sanity check passed: lig data is <10Ang")
+}else{
+  print ("Error: data should be filtered to be within 10Ang")
+}
+
+#<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+# FOR Lig Plots
+#<<<<<<<<<<<<<<<<
+
+Lig_df  = my_df2
+
+rm(my_df2)
+
+#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
+
+#############
+# Plots: Bubble plot
+# x = Position, Y = stability
+# size of dots = OR
+# col: stability
+#############
+
+#=================
+# generate plot 1: DUET vs OR by position as geom_points
+#=================  
+
+my_ats = 20 # axis text size
+my_als = 22 # axis label size
+
+# Spelling Correction: made redundant as already corrected at the source
+
+#PS_df$DUET_outcome[PS_df$DUET_outcome=='Stabilizing'] <- 'Stabilising'
+#PS_df$DUET_outcome[PS_df$DUET_outcome=='Destabilizing'] <- 'Destabilising'
+
+table(PS_df$DUET_outcome) ; sum(table(PS_df$DUET_outcome))
+
+g = ggplot(PS_df, aes(x = factor(Position)
+                   , y = ratioDUET))
+
+p1 = g + 
+  geom_point(aes(col = DUET_outcome
+                 , size = OR)) +
+  theme(axis.text.x = element_text(size = my_ats
+                                   , angle = 90
+                                   , hjust = 1
+                                   , vjust = 0.4)
+        , axis.text.y = element_text(size = my_ats
+                                     , angle = 0
+                                     , hjust = 1
+                                     , vjust = 0)
+        , axis.title.x = element_text(size = my_als)
+        , axis.title.y = element_text(size = my_als) 
+        , legend.text = element_text(size = my_als)
+        , legend.title = element_text(size = my_als) ) +
+  #, legend.key.size = unit(1, "cm")) +
+  labs(title = ""
+       , x = "Position"
+       , y = "DUET(PS)"
+       , size = "Odds Ratio"
+       , colour = "DUET Outcome") +
+  guides(colour = guide_legend(override.aes = list(size=4))) 
+
+p1 
+
+#=================
+# generate plot 2: Lig vs OR by position as geom_points
+#=================  
+my_ats = 20 # axis text size
+my_als = 22 # axis label size
+
+# Spelling Correction: made redundant as already corrected at the source
+
+#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Stabilizing'] <- 'Stabilising'
+#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Destabilizing'] <- 'Destabilising'
+
+table(Lig_df$Lig_outcome)
+
+g = ggplot(Lig_df, aes(x = factor(Position)
+                   , y = ratioPredAff))
+
+p2 = g + 
+  geom_point(aes(col = Lig_outcome
+                   , size = OR))+
+  theme(axis.text.x = element_text(size = my_ats
+                                   , angle = 90
+                                   , hjust = 1
+                                   , vjust = 0.4)
+        , axis.text.y = element_text(size = my_ats
+                                     , angle = 0
+                                     , hjust = 1
+                                     , vjust = 0)
+        , axis.title.x = element_text(size = my_als)
+        , axis.title.y = element_text(size = my_als) 
+        , legend.text = element_text(size = my_als)
+        , legend.title = element_text(size = my_als) ) +
+  #, legend.key.size = unit(1, "cm")) +
+  labs(title = ""
+       , x = "Position"
+       , y = "Ligand Affinity"
+       , size = "Odds Ratio"
+       , colour = "Ligand Outcome"
+       ) +
+  guides(colour = guide_legend(override.aes = list(size=4))) 
+
+p2
+
+#======================
+#combine using cowplot
+#======================
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots"
+getwd()
+
+svg('PS_Lig_OR_combined.svg', width = 32, height = 12) #inches
+#png('PS_Lig_OR_combined.png', width = 2800, height = 1080) #300dpi
+theme_set(theme_gray()) # to preserve default theme
+
+printFile = cowplot::plot_grid(plot_grid(p1, p2
+                             , ncol = 1
+                             , align = 'v'
+                             , labels = c("A", "B")
+                             , label_size = my_als+5))
+print(printFile)
+dev.off()
+
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R
@ -0,0 +1,154 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+
+########################################################################
+#		 Read file: call script for combining df for lig		   	   #
+########################################################################
+
+source("../combining_two_df_lig.R") 
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for Lig plots
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+my_df  = merged_df3
+#my_df = merged_df3_comp
+#%%%%%%%%%%%%%%%%%%%%%%%%
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+#############################
+# Extra sanity check:
+# for mcsm_lig ONLY
+# Dis_lig_Ang should be <10
+#############################
+
+if (max(my_df$Dis_lig_Ang) < 10){
+  print ("Sanity check passed: lig data is <10Ang")
+}else{
+  print ("Error: data should be filtered to be within 10Ang")
+}
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Plot: Barplot with scores (unordered)
+# corresponds to Lig_outcome
+# Stacked Barplot with colours: Lig_outcome @ position coloured by 
+# Lig_outcome. This is a barplot where each bar corresponds 
+# to a SNP and is coloured by its corresponding Lig_outcome.
+#============================
+
+#===================
+# Data for plots
+#===================
+
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+df  = my_df 
+#%%%%%%%%%%%%%%%%%%%%%%%%
+
+rm(my_df)
+
+# sanity checks
+upos = unique(my_df$Position)
+
+# should be a factor
+is.factor(df$Lig_outcome)
+#TRUE
+
+table(df$Lig_outcome)
+
+# should be -1 and 1: may not be in this case because you have filtered the data
+# FIXME: normalisation before or after filtering?
+min(df$ratioPredAff) #
+max(df$ratioPredAff) #
+
+# sanity checks
+tapply(df$ratioPredAff, df$Lig_outcome, min)
+tapply(df$ratioPredAff, df$Lig_outcome, max)
+
+#******************
+# generate plot
+#******************
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+my_title = "Ligand affinity"
+
+# axis label size
+my_xaxls = 13
+my_yaxls = 15
+
+# axes text size
+my_xaxts = 15
+my_yaxts = 15
+
+# no ordering of x-axis
+g = ggplot(df, aes(factor(Position, ordered = T)))
+g + 
+  geom_bar(aes(fill = Lig_outcome), colour = "grey") +
+  theme( axis.text.x = element_text(size = my_xaxls
+                                    , angle = 90
+                                    , hjust = 1
+                                    , vjust = 0.4)
+         , axis.text.y = element_text(size = my_yaxls 
+                                      , angle = 0
+                                      , hjust = 1
+                                      , vjust = 0)
+         , axis.title.x = element_text(size = my_xaxts)
+         , axis.title.y = element_text(size = my_yaxts ) ) +
+  labs(title = my_title
+       , x = "Position"
+       , y = "Frequency")
+
+# for sanity and good practice
+rm(df)
+#======================= end of plot
+# axis colours labels
+# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
+# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R
@ -0,0 +1,149 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 		Installing and loading required packages and functions		   #	
+########################################################################
+
+source("../Header_TT.R")
+
+########################################################################
+#		 Read file: call script for combining df for PS			   	   #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#==========================
+
+###########################
+# Data for DUET plots
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3 
+#my_df  = merged_df3_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+# sanity check
+is.factor(my_df$DUET_outcome)
+my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
+is.factor(my_df$DUET_outcome)
+#[1] TRUE
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Plot 2: Barplot with scores (unordered)
+# corresponds to DUET_outcome
+# Stacked Barplot with colours: DUET_outcome @ position coloured by 
+# DUET outcome. This is a barplot where each bar corresponds 
+# to a SNP and is coloured by its corresponding DUET_outcome
+#============================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+df  = my_df 
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(my_df)
+
+# sanity checks
+upos = unique(df$Position)
+
+# should be a factor
+is.factor(my_df$DUET_outcome)
+#[1] TRUE
+
+table(my_df$DUET_outcome)
+
+# should be -1 and 1
+min(df$ratioDUET)
+max(df$ratioDUET)
+
+tapply(df$ratioDUET, df$DUET_outcome, min)
+tapply(df$ratioDUET, df$DUET_outcome, max)
+
+#******************
+# generate plot
+#******************
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+my_title = "Protein stability (DUET)"
+
+# axis label size
+my_xaxls = 13
+my_yaxls = 15
+
+# axes text size
+my_xaxts = 15
+my_yaxts = 15
+
+# no ordering of x-axis
+g = ggplot(df, aes(factor(Position, ordered = T)))
+g + 
+  geom_bar(aes(fill = DUET_outcome), colour = "grey") +
+  
+  theme( axis.text.x = element_text(size = my_xaxls
+                                    , angle = 90
+                                    , hjust = 1
+                                    , vjust = 0.4)
+         , axis.text.y = element_text(size = my_yaxls 
+                                      , angle = 0
+                                      , hjust = 1
+                                      , vjust = 0)
+         , axis.title.x = element_text(size = my_xaxts)
+         , axis.title.y = element_text(size = my_yaxts ) ) +
+  labs(title = my_title
+       , x = "Position"
+       , y = "Frequency")
+
+# for sanity and good practice
+rm(df)
+#======================= end of plot
+# axis colours labels
+# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
+# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R
@ -0,0 +1,202 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") 
+getwd()
+
+########################################################################
+# 		Installing and loading required packages and functions		   #	
+########################################################################
+
+source("../Header_TT.R")
+source("../barplot_colour_function.R")
+
+########################################################################
+#		 Read file: call script for combining df for lig		   	   #
+########################################################################
+
+source("../combining_two_df_lig.R") 
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for Lig plots
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3
+#my_df = merged_df3_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+# sanity check
+is.factor(my_df$Lig_outcome)
+my_df$Lig_outcome = as.factor(my_df$Ligoutcome)
+is.factor(my_df$Lig_outcome)
+#[1] TRUE
+
+#############################
+# Extra sanity check:
+# for mcsm_lig ONLY
+# Dis_lig_Ang should be <10
+#############################
+
+if (max(my_df$Dis_lig_Ang) < 10){
+  print ("Sanity check passed: lig data is <10Ang")
+}else{
+  print ("Error: data should be filtered to be within 10Ang")
+}
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Plot: Barplot with scores (unordered)
+# corresponds to Lig_outcome
+# Stacked Barplot with colours: Lig_outcome @ position coloured by 
+# stability scores. This is a barplot where each bar corresponds 
+# to a SNP and is coloured by its corresponding Lig stability value.
+# Normalised values (range between -1 and 1 ) to aid visualisation
+# NOTE: since barplot plots discrete values, colour = score, so number of
+# colours will be equal to the no. of unique normalised scores 
+# rather than a continuous scale
+# will require generating the colour scale separately.
+#============================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+df  = my_df 
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(my_df)
+
+# sanity checks
+table(df$Lig_outcome)
+
+# should be -1 and 1: may not be in this case because you have filtered the data
+# FIXME: normalisation before or after filtering?
+min(df$ratioPredAff) #
+max(df$ratioPredAff) #
+
+# sanity checks
+# very important!!!!
+tapply(df$ratioPredAff, df$Lig_outcome, min)
+
+tapply(df$ratioPredAff, df$Lig_outcome, max)
+
+
+#******************
+# generate plot
+#******************
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+# My colour FUNCTION: based on group and subgroup
+# in my case;
+# df = df
+# group = Lig_outcome
+# subgroup = normalised score i.e ratioPredAff
+
+# Prepare data: round off ratioLig scores
+# round off to 3 significant digits:
+# 165 if no rounding is performed: used to generate the originalgraph
+# 156 if rounded to 3 places
+# FIXME: check if reducing precision creates any ML prob
+
+# check unique values in normalised data
+u = unique(df$ratioPredAff) 
+
+# <<<<< -------------------------------------------
+# Run this section if rounding is to be used
+# specify number for rounding
+n = 3 
+df$ratioLigR = round(df$ratioPredAff, n) 
+u = unique(df$ratioLigR) # 156
+# create an extra column called group which contains the "gp name and score" 
+# so colours can be generated for each unique values in this column
+my_grp = df$ratioLigR
+df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "")
+
+# else 
+# uncomment the below if rounding is not required
+
+#my_grp = df$ratioLig
+#df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "")
+
+# <<<<< -----------------------------------------------
+
+# Call the function to create the palette based on the group defined above
+colours <- ColourPalleteMulti(df, "Lig_outcome", "my_grp")
+my_title = "Ligand affinity"
+
+# axis label size
+my_xaxls = 13
+my_yaxls = 15
+
+# axes text size
+my_xaxts = 15
+my_yaxts = 15
+
+# no ordering of x-axis
+g = ggplot(df, aes(factor(Position, ordered = T)))
+g + 
+  geom_bar(aes(fill = group), colour = "grey") +
+  scale_fill_manual( values = colours
+                     , guide = 'none') +
+  theme( axis.text.x = element_text(size = my_xaxls
+                                    , angle = 90
+                                    , hjust = 1
+                                    , vjust = 0.4)
+         , axis.text.y = element_text(size = my_yaxls 
+                                      , angle = 0
+                                      , hjust = 1
+                                      , vjust = 0)
+         , axis.title.x = element_text(size = my_xaxts)
+         , axis.title.y = element_text(size = my_yaxts ) ) +
+  labs(title = my_title
+       , x = "Position"
+       , y = "Frequency")
+
+# for sanity and good practice
+rm(df)
+#======================= end of plot
+# axis colours labels
+# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
+# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R
@ -0,0 +1,192 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 		Installing and loading required packages and functions		   #	
+########################################################################
+
+source("../Header_TT.R")
+source("../barplot_colour_function.R")
+
+########################################################################
+#		 Read file: call script for combining df for PS			   	   #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for DUET plots
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3 
+#my_df  = merged_df3_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+# sanity check
+is.factor(my_df$DUET_outcome)
+my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
+is.factor(my_df$DUET_outcome)
+#[1] TRUE
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Barplot with scores (unordered)
+# corresponds to DUET_outcome
+# Stacked Barplot with colours: DUET_outcome @ position coloured by 
+# stability scores. This is a barplot where each bar corresponds 
+# to a SNP and is coloured by its corresponding DUET stability value.
+# Normalised values (range between -1 and 1 ) to aid visualisation
+# NOTE: since barplot plots discrete values, colour = score, so number of
+# colours will be equal to the no. of unique normalised scores 
+# rather than a continuous scale
+# will require generating the colour scale separately.
+#============================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+df  = my_df 
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(my_df)
+
+# sanity checks
+upos = unique(df$Position)
+
+# should be a factor
+is.factor(my_df$DUET_outcome)
+#[1] TRUE
+
+table(df$DUET_outcome)
+
+# should be -1 and 1
+min(df$ratioDUET)
+max(df$ratioDUET)
+
+tapply(df$ratioDUET, df$DUET_outcome, min)
+tapply(df$ratioDUET, df$DUET_outcome, max)
+
+#******************
+# generate plot
+#******************
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+# My colour FUNCTION: based on group and subgroup
+# in my case;
+# df = df
+# group = DUET_outcome
+# subgroup = normalised score i.e ratioDUET
+
+# Prepare data: round off ratioDUET scores
+# round off to 3 significant digits:
+# 323 if no rounding is performed: used to generate the original graph
+# 287 if rounded to 3 places
+# FIXME: check if reducing precicion creates any ML prob
+
+# check unique values in normalised data
+u = unique(df$ratioDUET) 
+
+# <<<<< -------------------------------------------
+# Run this section if rounding is to be used
+# specify number for rounding
+n = 3 
+df$ratioDUETR = round(df$ratioDUET, n)
+u = unique(df$ratioDUETR)
+# create an extra column called group which contains the "gp name and score" 
+# so colours can be generated for each unique values in this column
+my_grp = df$ratioDUETR
+df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "")
+
+# else 
+# uncomment the below if rounding is not required
+
+#my_grp = df$ratioDUET
+#df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "")
+
+# <<<<< -----------------------------------------------
+
+# Call the function to create the palette based on the group defined above
+colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp")
+my_title = "Protein stability (DUET)"
+
+# axis label size
+my_xaxls = 13
+my_yaxls = 15
+
+# axes text size
+my_xaxts = 15
+my_yaxts = 15
+
+# no ordering of x-axis
+g = ggplot(df, aes(factor(Position, ordered = T)))
+g + 
+  geom_bar(aes(fill = group), colour = "grey") +
+  scale_fill_manual( values = colours
+                     , guide = 'none') +
+  theme( axis.text.x = element_text(size = my_xaxls
+                                    , angle = 90
+                                    , hjust = 1
+                                    , vjust = 0.4)
+         , axis.text.y = element_text(size = my_yaxls 
+                                      , angle = 0
+                                      , hjust = 1
+                                      , vjust = 0)
+         , axis.title.x = element_text(size = my_xaxts)
+         , axis.title.y = element_text(size = my_yaxts ) ) +
+  labs(title = my_title
+       , x = "Position"
+       , y = "Frequency")
+
+# for sanity and good practice
+rm(df)
+#======================= end of plot
+# axis colours labels
+# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
+# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R
@ -0,0 +1,215 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+
+#require(data.table)
+#require(dplyr)
+
+########################################################################
+#		 Read file: call script for combining df for lig		   	   #
+########################################################################
+
+source("../combining_two_df_lig.R") 
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for Lig plots
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3
+#my_df = merged_df3_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+# sanity check
+is.factor(my_df$Lig_outcome)
+my_df$Lig_outcome = as.factor(my_df$lig_outcome)
+is.factor(my_df$Lig_outcome)
+#[1] TRUE
+
+#############################
+# Extra sanity check:
+# for mcsm_lig ONLY
+# Dis_lig_Ang should be <10
+#############################
+
+if (max(my_df$Dis_lig_Ang) < 10){
+  print ("Sanity check passed: lig data is <10Ang")
+}else{
+  print ("Error: data should be filtered to be within 10Ang")
+}
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#===========================
+# Plot: Basic barplots 
+#===========================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT 
+df  = my_df 
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+rm(my_df)
+
+# sanity checks
+str(df)
+
+if (identical(df$Position, df$position)){
+  print("Sanity check passed: Columns 'Position' and 'position' are identical")
+} else{
+  print("Error!: Check column names and info contained")
+}
+
+#****************
+# generate plot: No of stabilising and destabilsing muts
+#****************
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+svg('basic_barplots_LIG.svg')
+
+my_ats = 25 # axis text size
+my_als = 22 # axis label size
+
+# uncomment as necessary for either directly outputting results or 
+# printing on the screen
+g = ggplot(df, aes(x = Lig_outcome))
+#prinfFile = g + geom_bar(
+  g + geom_bar(
+  aes(fill = Lig_outcome)
+  , show.legend = TRUE
+) + geom_label(
+  stat = "count"
+  , aes(label = ..count..)
+  , color = "black"
+  , show.legend = FALSE
+  , size = 10) + theme(
+    axis.text.x = element_blank()
+    , axis.title.x = element_blank()
+    , axis.title.y = element_text(size=my_als)
+    , axis.text.y = element_text(size = my_ats)
+    , legend.position = c(0.73,0.8)
+    , legend.text = element_text(size=my_als-2)
+    , legend.title = element_text(size=my_als)
+    , plot.title = element_blank()
+  ) + labs(
+    title = ""
+    , y = "Number of SNPs"
+    #, fill='Ligand Outcome'
+  )  + scale_fill_discrete(name = "Ligand Outcome"
+                           , labels = c("Destabilising", "Stabilising"))
+print(prinfFile)
+dev.off()
+
+#****************
+# generate plot: No of positions
+#****************
+#get freq count of positions so you can subset freq<1
+#require(data.table)
+setDT(df)[, pos_count := .N, by = .(Position)] #169, 36
+
+head(df$pos_count)
+table(df$pos_count)
+# this is cummulative
+#1  2  3  4  5  6 
+#5 24 36 56 30 18 
+
+# use group by on this
+snpsBYpos_df <- df %>%
+  group_by(Position) %>%
+  summarize(snpsBYpos = mean(pos_count)) 
+
+table(snpsBYpos_df$snpsBYpos)
+#1  2  3  4  5  6 
+#5 12 12 14  6  3
+# this is what will get plotted
+
+svg('position_count_LIG.svg')
+
+my_ats = 25 # axis text size
+my_als = 22 # axis label size
+
+g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
+prinfFile = g + geom_bar(
+  #g + geom_bar(
+  aes (alpha = 0.5)
+  , show.legend = FALSE
+) +
+  geom_label(
+    stat = "count", aes(label = ..count..)
+    , color = "black"
+    , size = 10
+  ) +
+  theme( 
+    axis.text.x = element_text(
+      size = my_ats
+      , angle = 0
+    )
+    , axis.text.y = element_text(
+      size = my_ats
+      , angle = 0
+      , hjust = 1
+    )
+    , axis.title.x = element_text(size = my_als)
+    , axis.title.y = element_text(size = my_als)
+    , plot.title = element_blank()
+  ) +
+  labs(
+    x = "Number of SNPs"
+    , y = "Number of Sites"
+  )
+print(prinfFile)
+dev.off()
+########################################################################
+#               			end of Lig barplots         			   #
+########################################################################
+
+
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R
@ -0,0 +1,211 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 		Installing and loading required packages and functions		   #	
+########################################################################
+
+source("../Header_TT.R")
+
+########################################################################
+#		 Read file: call script for combining df for PS			   	   #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#==========================
+
+###########################
+# Data for DUET plots
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3 
+#my_df  = merged_df3_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+# sanity check
+is.factor(my_df$DUET_outcome)
+my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
+is.factor(my_df$DUET_outcome)
+#[1] TRUE
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#===========================
+# Plot: Basic barplots 
+#===========================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT 
+df  = my_df
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(my_df)
+
+# sanity checks
+str(df)
+
+if (identical(df$Position, df$position)){
+  print("Sanity check passed: Columns 'Position' and 'position' are identical")
+} else{
+  print("Error!: Check column names and info contained")
+  }
+
+#****************
+# generate plot: No of stabilising and destabilsing muts
+#****************
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+svg('basic_barplots_DUET.svg')
+
+my_ats = 25 # axis text size
+my_als = 22 # axis label size
+
+theme_set(theme_grey())
+
+# uncomment as necessary for either directly outputting results or 
+# printing on the screen
+g = ggplot(df, aes(x = DUET_outcome))
+prinfFile = g + geom_bar(
+#g + geom_bar(
+  aes(fill = DUET_outcome)
+  , show.legend = TRUE
+  ) + geom_label(
+    stat = "count"
+    , aes(label = ..count..)
+    , color = "black"
+    , show.legend = FALSE
+    , size = 10) + theme(
+      axis.text.x = element_blank()
+      , axis.title.x = element_blank()
+      , axis.title.y = element_text(size=my_als)
+      , axis.text.y = element_text(size = my_ats)
+    , legend.position = c(0.73,0.8)
+    , legend.text = element_text(size=my_als-2)
+    , legend.title = element_text(size=my_als)
+    , plot.title = element_blank()
+    ) + labs(
+      title = ""
+      , y = "Number of SNPs"
+      #, fill='DUET Outcome'
+      ) + scale_fill_discrete(name = "DUET Outcome"
+                              , labels = c("Destabilising", "Stabilising"))
+
+print(prinfFile)
+dev.off()
+
+#****************
+# generate plot: No of positions
+#****************
+#get freq count of positions so you can subset freq<1
+#setDT(df)[, .(Freq := .N), by = .(Position)] #189, 36
+
+setDT(df)[, pos_count := .N, by = .(Position)] #335, 36
+table(df$pos_count)
+# this is cummulative
+#1   2   3   4   5   6 
+#34  76  63 104  40  18 
+
+# use group by on this
+snpsBYpos_df <- df %>%
+  group_by(Position) %>%
+  summarize(snpsBYpos = mean(pos_count))
+
+table(snpsBYpos_df$snpsBYpos)
+#1  2  3  4  5  6 
+#34 38 21 26  8  3 
+
+foo = select(df, Mutationinformation
+             , WildPos
+             , wild_type
+             , mutant_type
+             , mutation_info
+             , position
+             , pos_count) #335, 5
+
+getwd()
+write.csv(foo, "../Data/pos_count_freq.csv")
+
+svg('position_count_DUET.svg')
+my_ats = 25 # axis text size
+my_als = 22 # axis label size
+
+g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
+prinfFile = g + geom_bar(
+#g + geom_bar(
+  aes (alpha = 0.5)
+  , show.legend = FALSE
+  ) +
+  geom_label(
+    stat = "count", aes(label = ..count..)
+    , color = "black"
+    , size = 10
+    ) +
+  theme( 
+    axis.text.x = element_text(
+      size = my_ats
+      , angle = 0
+      )
+    , axis.text.y = element_text(
+      size = my_ats
+      , angle = 0
+      , hjust = 1
+      )
+  , axis.title.x = element_text(size = my_als)
+  , axis.title.y = element_text(size = my_als)
+  , plot.title = element_blank()
+  ) +
+  labs(
+    x = "Number of SNPs"
+    , y = "Number of Sites"
+    )
+print(prinfFile)
+dev.off()
+########################################################################
+#               			end of DUET barplots         			   #
+########################################################################
+
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R
@ -0,0 +1,175 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 		Installing and loading required packages and functions		   #	
+########################################################################
+
+source("../Header_TT.R")
+
+#source("barplot_colour_function.R")
+
+########################################################################
+#		 Read file: call script for combining df for PS			   	   #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#==========================
+
+###########################
+# Data for PS Corr plots
+# you need merged_df3_comp
+# since these are matched 
+# to allow pairwise corr
+###########################
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3_comp 
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#===========================
+# Plot: Correlation plots
+#===========================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+df  = my_df
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(my_df)
+
+# sanity checks
+str(df)
+
+table(df$DUET_outcome)
+
+# unique positions
+length(unique(df$Position)) #{RESULT: unique positions for comp data}
+
+
+# subset data to generate pairwise correlations
+corr_data = df[, c("ratioDUET"
+#                  , "ratioPredAff"
+#                  , "DUETStability_Kcalpermol"
+#                  , "PredAffLog"
+#                  , "OR"
+                   , "logor"
+#                  , "pvalue"
+                   , "neglog10pvalue"
+                   , "AF"
+                   , "DUET_outcome"
+#                  , "Lig_outcome"
+                   , "pyrazinamide"
+                   )]
+dim(corr_data)
+rm(df)
+
+# assign nice colnames (for display)
+my_corr_colnames = c("DUET"
+#                    , "Ligand Affinity"
+#                    , "DUET_raw"
+#                    , "Lig_raw"
+#                    , "OR"
+                     , "Log(Odds Ratio)"
+#                    , "P-value"
+                     , "-LogP"
+                     , "Allele Frequency"
+                     , "DUET_outcome"
+#                    , "Lig_outcome"
+                     , "pyrazinamide")
+
+# sanity check
+if (length(my_corr_colnames) == length(corr_data)){
+  print("Sanity check passed: corr_data and corr_names match in length")
+}else{
+  print("Error: length mismatch!")
+}
+
+colnames(corr_data)
+colnames(corr_data) <- my_corr_colnames
+colnames(corr_data)
+
+###############
+# PLOTS: corr
+# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
+###############
+#default pairs plot
+start = 1
+end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column
+offset = 1
+
+my_corr = corr_data[start:(end-offset)]
+head(my_corr)
+
+#my_cols = c("#f8766d", "#00bfc4")
+# deep blue :#007d85
+# deep red: #ae301e
+
+#==========
+# psych: ionformative since it draws the ellipsoid
+# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
+# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
+#==========
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots"
+getwd()
+
+svg('DUET_corr.svg', width = 15, height = 15)
+printFile = pairs.panels(my_corr[1:4]
+             , method = "spearman" # correlation method
+             , hist.col = "grey" ##00AFBB
+             , density = TRUE  # show density plots
+             , ellipses = F # show correlation ellipses
+             , stars = T
+             , rug = F
+             , breaks = "Sturges"
+             , show.points = T
+             , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$DUET_outcome))]
+             , pch = 21
+             , jitter = T
+             #, alpha = .05
+             #, points(pch = 19, col = c("#f8766d", "#00bfc4"))
+             , cex = 3
+             , cex.axis = 2.5
+             , cex.labels = 3
+             , cex.cor = 1
+             , smooth = F
+)
+
+print(printFile)
+dev.off()
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R
@ -0,0 +1,187 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages			   #	
+########################################################################
+
+source("../Header_TT.R")
+
+#source("barplot_colour_function.R")
+
+########################################################################
+#		 Read file: call script for combining df for lig		   	   #
+########################################################################
+
+source("../combining_two_df_lig.R") 
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for Lig Corr plots
+# you need merged_df3_comp
+# since these are matched 
+# to allow pairwise corr
+###########################
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3_comp 
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+#############################
+# Extra sanity check:
+# for mcsm_lig ONLY
+# Dis_lig_Ang should be <10
+#############################
+
+if (max(my_df$Dis_lig_Ang) < 10){
+  print ("Sanity check passed: lig data is <10Ang")
+}else{
+  print ("Error: data should be filtered to be within 10Ang")
+}
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#===========================
+# Plot: Correlation plots
+#===========================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT 
+df  = my_df 
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(my_df)
+
+# sanity checks
+str(df)
+
+table(df$Lig_outcome)
+
+# unique positions
+length(unique(df$Position)) #{RESULT: unique positions for comp data}
+
+# subset data to generate pairwise correlations
+corr_data = df[, c(#"ratioDUET",
+                  "ratioPredAff"
+#                  , "DUETStability_Kcalpermol"
+#                  , "PredAffLog"
+#                  , "OR"
+                   , "logor"
+#                  , "pvalue"
+                   , "neglog10pvalue"
+                   , "AF"
+#                  , "DUET_outcome"
+                   , "Lig_outcome"
+                   , "pyrazinamide"
+                   )] 
+dim(corr_data)
+rm(df)
+
+# assign nice colnames (for display)
+my_corr_colnames = c(#"DUET",
+                     "Ligand Affinity"
+#                    ,"DUET_raw" 
+#                    , "Lig_raw"
+#                    , "OR"
+                     , "Log(Odds Ratio)"
+#                    , "P-value"
+                     , "-LogP"
+                     , "Allele Frequency"
+#                    , "DUET_outcome"
+                     , "Lig_outcome"
+                     , "pyrazinamide")
+                     
+# sanity check
+if (length(my_corr_colnames) == length(corr_data)){
+  print("Sanity check passed: corr_data and corr_names match in length")
+}else{
+  print("Error: length mismatch!")
+}
+
+colnames(corr_data)
+colnames(corr_data) <- my_corr_colnames
+colnames(corr_data)
+
+###############
+# PLOTS: corr
+# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
+###############
+
+# default pairs plot
+start = 1
+end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column
+offset = 1
+
+my_corr = corr_data[start:(end-offset)]
+head(my_corr)
+
+#my_cols = c("#f8766d", "#00bfc4")
+# deep blue :#007d85
+# deep red: #ae301e
+
+#==========
+# psych: ionformative since it draws the ellipsoid
+# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
+# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
+#==========
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots"
+getwd()
+
+svg('Lig_corr.svg', width = 15, height = 15)
+printFile = pairs.panels(my_corr[1:4]
+             , method = "spearman" # correlation method
+             , hist.col = "grey" ##00AFBB
+             , density = TRUE  # show density plots
+             , ellipses = F # show correlation ellipses
+             , stars = T
+             , rug = F
+             , breaks = "Sturges"
+             , show.points = T
+             , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$Lig_outcome))]
+             , pch = 21
+             , jitter = T
+#            , alpha = .05
+#            , points(pch = 19, col = c("#f8766d", "#00bfc4"))
+             , cex = 3
+             , cex.axis = 2.5
+             , cex.labels = 3
+             , cex.cor = 1
+             , smooth = F
+)
+print(printFile)
+dev.off()
+
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R
@ -0,0 +1,227 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") 
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+#source("barplot_colour_function.R")
+
+require(data.table)
+
+########################################################################
+#		 Read file: call script for combining df		   	  		   #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#==========================
+
+###########################
+# Data for plots
+# you need merged_df2, comprehensive one
+# since this has one-many relationship
+# i.e the same SNP can belong to multiple lineages
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df2
+#my_df  = merged_df2_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+is.factor(my_df$lineage)
+my_df$lineage = as.factor(my_df$lineage)
+is.factor(my_df$lineage)
+
+#==========================
+# Plot: Lineage barplot
+# x = lineage y = No. of samples
+# col = Lineage
+# fill = lineage
+#============================
+table(my_df$lineage)
+
+#        lineage1   lineage2   lineage3   lineage4   lineage5   lineage6 lineageBOV 
+#3        104       1293        264       1311          6          6        105 
+
+#===========================
+# Plot: Lineage Barplots
+#===========================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+df <- my_df
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+rm(my_df)
+
+# get freq count of positions so you can subset freq<1
+#setDT(df)[, lineage_count := .N, by = .(lineage)]
+
+#******************
+# generate plot: barplot of mutation by lineage
+#******************
+sel_lineages = c("lineage1"
+                 , "lineage2"
+                 , "lineage3"
+                 , "lineage4")
+
+df_lin = subset(df, subset = lineage %in% sel_lineages )
+
+#FIXME; add sanity check for numbers.
+# Done this manually
+
+############################################################
+
+#########
+# Data for barplot: Lineage barplot
+# to show total samples and number of unique mutations 
+# within each linege
+##########
+
+# Create df with lineage inform & no. of unique mutations
+# per lineage and total samples within lineage
+# this is essentially barplot with two y axis
+
+bar = bar = as.data.frame(sel_lineages) #4, 1
+total_snps_u = NULL
+total_samples = NULL
+
+for (i in sel_lineages){
+  #print(i)
+  curr_total = length(unique(df$id)[df$lineage==i])
+  total_samples = c(total_samples, curr_total)
+  print(total_samples)
+  
+  foo = df[df$lineage==i,]
+  print(paste0(i, "======="))
+  print(length(unique(foo$Mutationinformation)))
+  curr_count = length(unique(foo$Mutationinformation))
+
+  total_snps_u = c(total_snps_u, curr_count)
+}
+
+print(total_snps_u)
+bar$num_snps_u = total_snps_u
+bar$total_samples = total_samples
+bar
+
+#*****************
+# generate plot: lineage barplot with two y-axis
+#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2
+#*****************
+
+bar$num_snps_u = y1
+bar$total_samples = y2
+sel_lineages = x
+
+to_plot = data.frame(x = x
+                      , y1 = y1
+                      , y2 = y2)
+to_plot
+
+melted = melt(to_plot, id = "x")
+melted
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+svg('lineage_basic_barplot.svg')
+
+my_ats = 20 # axis text size
+my_als = 22 # axis label size
+
+g = ggplot(melted
+           , aes(x = x
+                 , y = value
+                 , fill = variable)
+           )
+
+
+printFile = g + geom_bar(
+  
+#g + geom_bar(
+  stat = "identity"
+  , position = position_stack(reverse = TRUE)
+  , alpha=.75
+  , colour='grey75'
+    ) + theme(
+    axis.text.x = element_text(
+      size = my_ats
+#      , angle= 30
+    )
+  , axis.text.y = element_text(size = my_ats
+  #, angle = 30
+  , hjust = 1
+  , vjust = 0)
+  , axis.title.x = element_text(
+    size = my_als
+    , colour = 'black'
+    )
+  , axis.title.y = element_text(
+    size = my_als
+    , colour = 'black'
+  )
+  , legend.position = "top"
+  , legend.text = element_text(size = my_als)
+  
+  #) + geom_text(
+  ) + geom_label(
+    aes(label = value)
+    , size = 5
+    , hjust = 0.5
+    , vjust = 0.5
+    , colour = 'black'
+    , show.legend = FALSE
+    #, check_overlap = TRUE
+    , position = position_stack(reverse = T)
+    #, position = ('
+
+  ) + labs(
+    title = ''
+    , x = ''
+    , y = "Number"
+    , fill = 'Variable'
+    , colour = 'black'
+  ) + scale_fill_manual(
+      values = c('grey50', 'gray75')
+      , name=''
+      , labels=c('Mutations', 'Total Samples')
+    ) + scale_x_discrete(
+      breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
+      , labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
+    )
+print(printFile)
+dev.off()
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R
@ -0,0 +1,233 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+#source("barplot_colour_function.R")
+#require(data.table)
+
+########################################################################
+#		 Read file: call script for combining df for Lig		   	   #
+########################################################################
+
+source("../combining_two_df_lig.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+###########################
+# Data for plots
+# you need merged_df2 or merged_df2_comp
+# since this is one-many relationship 
+# i.e the same SNP can belong to multiple lineages
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df2
+#my_df  = merged_df2_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+is.factor(my_df$lineage)
+my_df$lineage = as.factor(my_df$lineage)
+is.factor(my_df$lineage)
+
+table(my_df$mutation_info)
+
+#############################
+# Extra sanity check:
+# for mcsm_lig ONLY
+# Dis_lig_Ang should be <10
+#############################
+
+if (max(my_df$Dis_lig_Ang) < 10){
+  print ("Sanity check passed: lig data is <10Ang")
+}else{
+  print ("Error: data should be filtered to be within 10Ang")
+}
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Plot: Lineage Distribution
+# x = mcsm_values, y = dist
+# fill = stability
+#============================
+
+#===================
+# Data for plots
+#===================
+
+# subset only lineages1-4
+sel_lineages = c("lineage1"
+                 , "lineage2"
+                 , "lineage3"
+                 , "lineage4")
+
+# uncomment as necessary
+df_lin = subset(my_df, subset = lineage %in% sel_lineages ) #2037 35
+
+# refactor
+df_lin$lineage = factor(df_lin$lineage)
+
+table(df_lin$lineage) #{RESULT: No of samples within lineage}
+#lineage1 lineage2 lineage3 lineage4 
+#78     961      195     803 
+
+# when merged_df2_comp is used
+#lineage1 lineage2 lineage3 lineage4 
+#77     955      194     770
+
+length(unique(df_lin$Mutationinformation))
+#{Result: No. of unique mutations the 4 lineages contribute to}
+
+# sanity checks
+r1 = 2:5 # when merged_df2 used: because there is missing lineages 
+if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) {
+  print ("sanity check passed: numbers match")
+} else{
+  print("Error!: check your numbers")
+} 
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+df <- df_lin
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(df_lin)
+
+#******************
+# generate distribution plot of lineages
+#******************
+# basic: could improve this!
+library(plotly)
+library(ggridges)
+
+fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
+names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
+
+g <- ggplot(df, aes(x = ratioPredAff)) + 
+  geom_density(aes(fill = Lig_outcome)
+               , alpha = 0.5) + 
+  facet_wrap( ~ lineage
+             , scales = "free"
+             , labeller = labeller(lineage = fooNames) ) +
+  coord_cartesian(xlim = c(-1, 1)
+#                  , ylim = c(0, 6)
+#                  , clip = "off"
+) 
+    ggtitle("Kernel Density estimates of Ligand affinity by lineage")
+
+ggplotly(g)
+
+# 2 : ggridges (good!)
+
+my_ats = 15 # axis text size
+my_als = 20 # axis label size
+
+fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
+names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+svg('lineage_dist_LIG.svg')
+
+printFile = ggplot( df, aes(x = ratioPredAff
+                          , y = Lig_outcome) ) +
+  
+  geom_density_ridges_gradient( aes(fill = ..x..)
+                                , scale = 3
+                                , size = 0.3 ) +
+  facet_wrap( ~lineage
+              , scales = "free"
+#              , switch = 'x'
+              , labeller = labeller(lineage = fooNames) ) +
+  coord_cartesian( xlim = c(-1, 1)
+#                  , ylim = c(0, 6)
+#                  , clip = "off"
+                  ) +
+
+  scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4")
+                        , name = "Ligand Affinity" ) +
+  theme( axis.text.x = element_text( size = my_ats
+                                     , angle = 90
+                                     , hjust = 1
+                                     , vjust = 0.4)
+#         , axis.text.y = element_text( size = my_ats
+#                                       , angle = 0
+#                                       , hjust = 1
+#                                       , vjust = 0)
+         , axis.text.y = element_blank()
+         , axis.title.x = element_blank()
+         , axis.title.y = element_blank()
+         , axis.ticks.y = element_blank()
+         , plot.title = element_blank()
+         , strip.text = element_text(size = my_als)
+         , legend.text = element_text(size = 10)
+         , legend.title = element_text(size = my_als)
+#         , legend.position = c(0.3, 0.8)
+#         , legend.key.height = unit(1, 'mm')
+      ) 
+
+print(printFile)
+dev.off()
+
+#=!=!=!=!=!=!
+# COMMENT: When you look at all mutations, the lineage differences disappear...
+# The pattern we are interested in is possibly only for dr_mutations
+#=!=!=!=!=!=!
+
+#===================================================
+
+# COMPARING DISTRIBUTIONS
+head(df$lineage)
+df$lineage = as.character(df$lineage)
+
+lin1 = df[df$lineage == "lineage1",]$ratioPredAff
+lin2 = df[df$lineage == "lineage2",]$ratioPredAff
+lin3 = df[df$lineage == "lineage3",]$ratioPredAff
+lin4 = df[df$lineage == "lineage4",]$ratioPredAff
+
+# ks test
+ks.test(lin1,lin2) 
+ks.test(lin1,lin3) 
+ks.test(lin1,lin4) 
+
+ks.test(lin2,lin3) 
+ks.test(lin2,lin4) 
+
+ks.test(lin3,lin4) 
+
+
+
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
@ -0,0 +1,212 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") # thinkpad
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+#source("barplot_colour_function.R")
+#require(data.table)
+
+########################################################################
+#		 Read file: call script for combining df for PS			   	   #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for plots
+# you need merged_df2 or merged_df2_comp
+# since this is one-many relationship 
+# i.e the same SNP can belong to multiple lineages
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df2
+#my_df  = merged_df2_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+is.factor(my_df$lineage)
+my_df$lineage = as.factor(my_df$lineage)
+is.factor(my_df$lineage)
+
+table(my_df$mutation_info)
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Plot: Lineage Distribution
+# x = mcsm_values, y = dist
+# fill = stability
+#============================
+
+#===================
+# Data for plots
+#===================
+
+# subset only lineages1-4
+sel_lineages = c("lineage1"
+                 , "lineage2"
+                 , "lineage3"
+                 , "lineage4")
+
+# uncomment as necessary
+df_lin = subset(my_df, subset = lineage %in% sel_lineages )
+
+# refactor
+df_lin$lineage = factor(df_lin$lineage)
+
+table(df_lin$lineage) #{RESULT: No of samples within lineage}
+#lineage1 lineage2 lineage3 lineage4 
+#104     1293      264     1311 
+
+# when merged_df2_comp is used
+#lineage1 lineage2 lineage3 lineage4 
+#99     1275      263     1255
+
+length(unique(df_lin$Mutationinformation))
+#{Result: No. of unique mutations the 4 lineages contribute to}
+
+# sanity checks
+r1 = 2:5 # when merged_df2 used: because there is missing lineages 
+if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) {
+  print ("sanity check passed: numbers match")
+} else{
+  print("Error!: check your numbers")
+} 
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+df <- df_lin
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(df_lin)
+
+#******************
+# generate distribution plot of lineages
+#******************
+# basic: could improve this!
+library(plotly)
+library(ggridges)
+
+g <- ggplot(df, aes(x = ratioDUET)) + 
+  geom_density(aes(fill = DUET_outcome)
+               , alpha = 0.5) + facet_wrap(~ lineage,
+                                           scales = "free") +
+  ggtitle("Kernel Density estimates of Protein stability by lineage")
+
+ggplotly(g)
+
+# 2 : ggridges (good!)
+
+my_ats = 15 # axis text size
+my_als = 20 # axis label size
+
+fooNames=c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
+names(fooNames)=c('lineage1', 'lineage2', 'lineage3', 'lineage4')
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+svg('lineage_dist_PS.svg')
+
+printFile = ggplot( df, aes(x = ratioDUET
+                            , y = DUET_outcome) )+
+  
+  #printFile=geom_density_ridges_gradient(
+  geom_density_ridges_gradient( aes(fill = ..x..)
+                                , scale = 3
+                                , size = 0.3 ) +
+  facet_wrap( ~lineage
+              , scales = "free"
+#             , switch = 'x'
+              , labeller = labeller(lineage = fooNames) ) +
+  coord_cartesian( xlim = c(-1, 1)
+#                  , ylim = c(0, 6)
+#                  , clip = "off" 
+                ) +
+  scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4")
+                        , name = "DUET" ) + 
+  theme( axis.text.x = element_text( size = my_ats
+                                     , angle = 90
+                                     , hjust = 1
+                                     , vjust = 0.4)
+#         , axis.text.y = element_text( size = my_ats
+#                                       , angle = 0
+#                                       , hjust = 1
+#                                       , vjust = 0)
+         , axis.text.y = element_blank()
+         , axis.title.x = element_blank()
+         , axis.title.y = element_blank()
+         , axis.ticks.y = element_blank()
+         , plot.title = element_blank()
+         , strip.text = element_text(size=my_als)
+         , legend.text = element_text(size=10)
+         , legend.title = element_text(size=my_als)
+#         , legend.position = c(0.3, 0.8)
+#         , legend.key.height = unit(1, 'mm')
+        ) 
+
+print(printFile)
+dev.off()
+
+#=!=!=!=!=!=!
+# COMMENT: When you look at all mutations, the lineage differences disappear...
+# The pattern we are interested in is possibly only for dr_mutations
+#=!=!=!=!=!=!
+#===================================================
+
+# COMPARING DISTRIBUTIONS
+head(df$lineage)
+df$lineage = as.character(df$lineage)
+
+lin1 = df[df$lineage == "lineage1",]$ratioDUET
+lin2 = df[df$lineage == "lineage2",]$ratioDUET
+lin3 = df[df$lineage == "lineage3",]$ratioDUET
+lin4 = df[df$lineage == "lineage4",]$ratioDUET
+
+# ks test
+ks.test(lin1,lin2) 
+ks.test(lin1,lin3) 
+ks.test(lin1,lin4) 
+
+ks.test(lin2,lin3)
+ks.test(lin2,lin4)  
+
+ks.test(lin3,lin4)  
+
+
+
--- a/mcsm_analysis/pyrazinamide/scripts/read_pdb.R
+++ b/mcsm_analysis/pyrazinamide/scripts/read_pdb.R
@ -0,0 +1,27 @@
+#########################
+#3: Read complex pdb file
+##########################
+source("Header_TT.R")
+# This script only reads the pdb file of your complex
+
+# read in pdb file complex1 
+inDir = "~/git/Data/pyrazinamide/input/structure/"
+inFile = paste0(inDir, "complex1_no_water.pdb")
+complex1 = inFile
+
+#inFile2 = paste0(inDir, "complex2_no_water.pdb")
+#complex2 = inFile2
+
+# list of 8
+my_pdb = read.pdb(complex1
+                  , maxlines = -1
+                  , multi = FALSE 
+                  , rm.insert = FALSE
+                  , rm.alt = TRUE
+                  , ATOM.only = FALSE 
+                  , hex = FALSE
+                  , verbose = TRUE)
+
+rm(inDir, inFile, complex1)
+#====== end of script
+
--- a/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R
+++ b/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R
@ -0,0 +1,386 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			               #
+########################################################################
+
+source("Header_TT.R")
+
+#########################################################
+# TASK: replace B-factors in the pdb file with normalised values
+# use the complex file with no water as mCSM lig was 
+# performed on this file. You can check it in the script: read_pdb file.
+#########################################################
+
+###########################
+# 2: Read file: average stability values
+# or mcsm_normalised file, output of step 4 mcsm pipeline
+###########################
+
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
+
+my_df <- read.csv(inFile
+#                  , row.names = 1
+#                  , stringsAsFactors = F
+                  , header = T)
+str(my_df)
+
+#=========================================================
+# Processing P1: Replacing B factor with mean ratioDUET scores
+#=========================================================
+
+#########################
+# Read complex pdb file
+# form the R script
+##########################
+
+source("read_pdb.R") # list of 8
+
+# extract atom list into a variable
+# since in the list this corresponds to data frame, variable will be a df
+d = my_pdb[[1]]
+
+# make a copy: required for downstream sanity checks
+d2 = d
+
+# sanity checks: B factor
+max(d$b); min(d$b)
+
+#*******************************************
+# plot histograms for inspection
+# 1: original B-factors
+# 2: original DUET Scores
+# 3: replaced B-factors with DUET Scores
+#*********************************************
+# Set the margin on all sides
+par(oma = c(3,2,3,0)
+    , mar = c(1,3,5,2)
+    , mfrow = c(3,2))
+#par(mfrow = c(3,2))
+
+ #1: Original B-factor
+hist(d$b
+     , xlab = "" 
+     , main = "B-factor")
+
+plot(density(d$b)
+     , xlab = ""
+     , main = "B-factor")
+
+# 2: DUET scores
+hist(my_df$average_DUETR
+     , xlab = "" 
+     , main = "Norm_DUET")
+
+plot(density(my_df$average_DUETR)
+     , xlab = ""
+     , main = "Norm_DUET")
+
+# 3: After the following replacement
+#********************************
+
+#=========
+# step 0_P1: DONT RUN once you have double checked the matched output
+#=========
+# sanity check:  match and assign to a separate column to double check
+# colnames(my_df)
+# d$ratioDUET = my_df$averge_DUETR[match(d$resno, my_df$Position)]
+
+#=========
+# step 1_P1
+#=========
+# Be brave and replace in place now (don't run sanity check)
+# this makes all the B-factor values in the non-matched positions as NA
+d$b = my_df$average_DUETR[match(d$resno, my_df$Position)]
+
+#=========
+# step 2_P1
+#=========
+# count NA in Bfactor
+b_na = sum(is.na(d$b)) ; b_na 
+
+# count number of 0's in Bactor
+sum(d$b == 0)
+#table(d$b)
+
+# replace all NA in b factor with 0
+d$b[is.na(d$b)] = 0
+
+# sanity check: should be 0
+sum(is.na(d$b))
+
+# sanity check: should be True
+if (sum(d$b == 0) == b_na){
+  print ("Sanity check passed: NA's replaced with 0's successfully")
+} else {
+  print("Error: NA replacement NOT successful, Debug code!")
+}
+
+max(d$b); min(d$b)
+
+# sanity checks: should be True
+if(max(d$b) == max(my_df$average_DUETR)){
+  print("Sanity check passed: B-factors replaced correctly")
+} else {
+  print ("Error: Debug code please")
+}
+
+if (min(d$b) == min(my_df$average_DUETR)){
+  print("Sanity check passed: B-factors replaced correctly")
+} else {
+  print ("Error: Debug code please")
+}
+
+#=========
+# step 3_P1
+#=========
+# sanity check: dim should be same before reassignment
+# should be TRUE
+dim(d) == dim(d2)
+
+#=========
+# step 4_P1
+#=========
+# assign it back to the pdb file
+my_pdb[[1]] = d 
+
+max(d$b); min(d$b)
+
+#=========
+# step 5_P1
+#=========
+# output dir
+getwd()
+outDir = "~/git/Data/pyrazinamide/input/structure/"
+
+outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
+write.pdb(my_pdb, outFile)
+
+#********************************
+# Add the 3rd histogram and density plots for comparisons
+#********************************
+# Plots continued...
+# 3: hist and density of replaced B-factors with DUET Scores
+hist(d$b
+     , xlab = ""
+     , main = "repalced-B")
+
+plot(density(d$b)
+     , xlab = ""
+     , main = "replaced-B")
+
+# graph titles
+mtext(text = "Frequency"
+      , side = 2
+      , line = 0
+      , outer = TRUE)
+
+mtext(text = "DUET_stability"
+      , side = 3
+      , line = 0
+      , outer = TRUE)
+#********************************
+
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+# NOTE: This replaced B-factor distribution has the same
+# x-axis as the PredAff normalised values, but the distribution
+# is affected since 0 is overinflated. This is because all the positions
+# where there are no SNPs have been assigned 0.
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+
+
+
+
+#######################################################################
+#====================== end of section 1 ==============================
+#######################################################################
+
+
+
+
+
+#=========================================================
+# Processing P2: Replacing  B values with PredAff Scores
+#=========================================================
+# clear workspace 
+rm(list = ls())
+
+###########################
+# 2: Read file: average stability values
+# or mcsm_normalised file, output of step 4 mcsm pipeline
+###########################
+
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
+
+my_df <- read.csv(inFile
+#                  , row.names = 1
+#                  , stringsAsFactors = F
+                  , header = T) 
+str(my_df)
+#rm(inDir, inFile)
+
+#########################
+# 3: Read complex pdb file
+# form the R script
+##########################
+
+source("read_pdb.R") # list of 8
+
+# extract atom list into a variable
+# since in the list this corresponds to data frame, variable will be a df
+d = my_pdb[[1]]
+
+# make a copy: required for downstream sanity checks
+d2 = d
+
+# sanity checks: B factor
+max(d$b); min(d$b)
+
+#*******************************************
+# plot histograms for inspection
+# 1: original B-factors
+# 2: original Pred Aff Scores
+# 3: replaced B-factors with PredAff Scores
+#********************************************
+# Set the margin on all sides
+par(oma = c(3,2,3,0)
+    , mar = c(1,3,5,2)
+    , mfrow = c(3,2))
+#par(mfrow = c(3,2))
+
+# 1: Original B-factor
+hist(d$b
+     , xlab = "" 
+     , main = "B-factor")
+
+plot(density(d$b)
+     , xlab = ""
+     , main = "B-factor")
+
+# 2: Pred Aff scores
+hist(my_df$average_PredAffR
+     , xlab = "" 
+     , main = "Norm_lig_average")
+
+plot(density(my_df$average_PredAffR)
+     , xlab = ""
+     , main = "Norm_lig_average")
+
+# 3: After the following replacement
+#********************************
+
+#=================================================
+# Processing P2: Replacing  B values with ratioPredAff scores
+#=================================================
+# use match to perform this replacement linking with "position no"
+# in the pdb file, this corresponds to column "resno"
+# in my_df, this corresponds to column "Position"
+
+#=========
+# step 0_P2: DONT RUN once you have double checked the matched output
+#=========
+# sanity check:  match and assign to a separate column to double check
+# colnames(my_df)
+# d$ratioPredAff = my_df$average_PredAffR[match(d$resno, my_df$Position)] #1384, 17
+
+#=========
+# step 1_P2: BE BRAVE and replace in place now (don't run step 0)
+#=========
+# this makes all the B-factor values in the non-matched positions as NA
+d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)]
+
+#=========
+# step 2_P2
+#=========
+# count NA in Bfactor
+b_na = sum(is.na(d$b)) ; b_na
+
+# count number of 0's in Bactor
+sum(d$b == 0)
+#table(d$b)
+
+# replace all NA in b factor with 0
+d$b[is.na(d$b)] = 0
+
+# sanity check: should be 0
+sum(is.na(d$b))
+
+if (sum(d$b == 0) == b_na){
+  print ("Sanity check passed: NA's replaced with 0's successfully")
+} else {
+  print("Error: NA replacement NOT successful, Debug code!")
+}
+
+max(d$b); min(d$b)
+
+# sanity checks: should be True
+if (max(d$b) == max(my_df$average_PredAffR)){
+  print("Sanity check passed: B-factors replaced correctly")
+} else {
+  print ("Error: Debug code please")
+}
+
+if (min(d$b) == min(my_df$average_PredAffR)){
+  print("Sanity check passed: B-factors replaced correctly")
+} else {
+  print ("Error: Debug code please")
+}
+
+#=========
+# step 3_P2
+#=========
+# sanity check: dim should be same before reassignment
+# should be TRUE
+dim(d) == dim(d2)
+
+#=========
+# step 4_P2
+#=========
+# assign it back to the pdb file
+my_pdb[[1]] = d 
+
+max(d$b); min(d$b)
+
+#=========
+# step 5_P2
+#=========
+
+# output dir
+outDir = "~/git/Data/pyrazinamide/input/structure/"
+outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile
+write.pdb(my_pdb, outFile)
+
+#********************************
+# Add the 3rd histogram and density plots for comparisons
+#********************************
+# Plots continued...
+# 3: hist and density of replaced B-factors with PredAff Scores
+hist(d$b
+     , xlab = ""
+     , main = "repalced-B")
+
+plot(density(d$b)
+     , xlab = ""
+     , main = "replaced-B")
+
+# graph titles
+mtext(text = "Frequency"
+      , side = 2
+      , line = 0
+      , outer = TRUE)
+
+mtext(text = "Lig_stability"
+      , side = 3
+      , line = 0
+      , outer = TRUE)
+
+#********************************
+
+###########
+# end of output files with Bfactors
+##########
--- a/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R
+++ b/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R
@ -0,0 +1,257 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
+getwd()
+
+#########################################################
+# 1: Installing and loading required packages           #
+#########################################################
+
+source("Header_TT.R")
+#source("barplot_colour_function.R")
+
+##########################################################
+#           Checking: Entire data frame and for PS      #
+##########################################################
+
+###########################
+#2) Read file: combined one from the script
+###########################
+source("combining_two_df.R")
+
+# df with NA:
+# merged_df2
+# merged_df3:
+
+# df without NA:
+# merged_df2_comp:
+# merged_df3_comp:
+
+######################
+# You need to check it
+# with the merged_df3
+########################
+
+#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+my_df = merged_df3
+#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+#clear variables
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# should be true
+identical(my_df$Position, my_df$position)
+
+#################################
+# Read file: normalised file
+# output of step 4 mcsm_pipeline
+#################################
+
+
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
+
+mcsm_data <- read.csv(inFile
+                  , row.names = 1
+                  , stringsAsFactors = F
+                  , header = T)
+str(mcsm_data)
+my_colnames  = colnames(mcsm_data)
+
+#====================================
+# subset my_df to include only the columns in mcsm data
+my_df2 = my_df[my_colnames]
+#====================================
+# compare the two
+head(mcsm_data$Mutationinformation)
+head(mcsm_data$Position)
+
+head(my_df2$Mutationinformation)
+head(my_df2$Position)
+
+# sort mcsm data by Mutationinformation
+mcsm_data_s = mcsm_data[order(mcsm_data$Mutationinformation),] 
+head(mcsm_data_s$Mutationinformation)
+head(mcsm_data_s$Position)
+
+# now compare: should be True, but is false....
+# possibly due to rownames!?!
+identical(mcsm_data_s, my_df2)
+
+# from library dplyr
+setdiff(mcsm_data_s, my_df2)
+
+#from lib compare
+compare(mcsm_data_s, my_df2) # seems rownames are the problem
+
+# FIXME: automate this
+# write files: checked using meld and files are indeed identical
+#write.csv(mcsm_data_s, "mcsm_data_s.csv", row.names = F)
+#write.csv(my_df2, "my_df2.csv", row.names = F)
+
+
+#====================================================== end of section 1
+
+
+
+##########################################################
+#             Checking: LIG(Filtered dataframe)          #
+##########################################################
+
+# clear workspace
+rm(list = ls())
+
+###########################
+#3) Read file: combined_lig from the script
+###########################
+source("combining_two_df_lig.R")
+
+# df with NA:
+# merged_df2 :
+# merged_df3:
+
+# df without NA:
+# merged_df2_comp:
+# merged_df3_comp:
+
+######################
+# You need to check it
+# with the merged_df3
+########################
+
+#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+my_df = merged_df3
+#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+#clear variables
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# should be true
+identical(my_df$Position, my_df$position)
+
+#################################
+# Read file: normalised file
+# output of step 4 mcsm_pipeline
+#################################
+
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
+
+mcsm_data <- read.csv(inFile
+                      , row.names = 1
+                      , stringsAsFactors = F
+                      , header = T)
+str(mcsm_data)
+
+###########################
+# 4a: Filter/subset data: ONLY for LIGand analysis
+# Lig plots < 10Ang
+# Filter the lig plots for Dis_to_lig < 10Ang
+###########################
+# sanity checks
+upos = unique(mcsm_data$Position)
+
+# check range of distances
+max(mcsm_data$Dis_lig_Ang)
+min(mcsm_data$Dis_lig_Ang)
+
+# Lig filtered: subset data to have only values less than 10 Ang
+mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10)
+
+rm(mcsm_data) #to avoid confusion
+
+table(mcsm_data2$Dis_lig_Ang<10)
+table(mcsm_data2$Dis_lig_Ang>10)
+
+max(mcsm_data2$Dis_lig_Ang)
+min(mcsm_data2$Dis_lig_Ang)
+
+upos_f = unique(mcsm_data2$Position); upos_f
+
+# colnames of df that you will need to subset the bigger df from
+my_colnames  = colnames(mcsm_data2)
+#====================================
+# subset bigger df i.e my_df to include only the columns in mcsm data2
+my_df2 = my_df[my_colnames] 
+
+rm(my_df) #to avoid confusion
+#====================================
+# compare the two
+head(mcsm_data2$Mutationinformation)
+head(mcsm_data2$Position)
+
+head(my_df2$Mutationinformation)
+head(my_df2$Position)
+
+# sort mcsm data by Mutationinformation
+mcsm_data2_s = mcsm_data2[order(mcsm_data2$Mutationinformation),] 
+head(mcsm_data2_s$Mutationinformation)
+head(mcsm_data2_s$Position)
+
+# now compare: should be True, but is false....
+# possibly due to rownames!?!
+identical(mcsm_data2_s, my_df2)
+
+# from library dplyr
+setdiff(mcsm_data2_s, my_df2)
+
+# from library compare
+compare(mcsm_data2_s, my_df2) # seems rownames are the problem
+
+#FIXME: automate this
+# write files: checked using meld and files are indeed identical
+#write.csv(mcsm_data2_s, "mcsm_data2_s.csv", row.names = F)
+#write.csv(my_df2, "my_df2.csv", row.names = F)
+
+
+##########################################################
+#  extract and write output file for SNP posn: all     #
+##########################################################
+
+head(merged_df3$Position)
+
+foo = merged_df3[order(merged_df3$Position),]
+head(foo$Position)
+
+snp_pos_unique = unique(foo$Position); snp_pos_unique
+
+# sanity check: 
+table(snp_pos_unique == combined_df$Position)
+
+#=====================
+# write_output files
+#=====================
+outDir = "~/Data/pyrazinamide/input/processed/"
+
+
+outFile1 = paste0(outDir, "snp_pos_unique.txt"); outFile1
+print(paste0("Output file name and path will be:","", outFile1))
+
+write.table(snp_pos_unique
+            , outFile1
+            , row.names = F
+            , col.names = F)
+            
+##############################################################
+#  extract and write output file for SNP posn: complete only #
+##############################################################
+head(merged_df3_comp$Position)
+
+foo = merged_df3_comp[order(merged_df3_comp$Position),]
+head(foo$Position)
+
+snp_pos_unique = unique(foo$Position); snp_pos_unique 
+
+# outDir = "~/Data/pyrazinamide/input/processed/" # already set
+
+outFile2 = paste0(outDir, "snp_pos_unique_comp.txt")
+print(paste0("Output file name and path will be:", outFile2))
+
+write.table(snp_pos_unique
+            , outFile2
+            , row.names = F
+            , col.names = F)
+#============================== end of script
+
+
--- a/meta_data_analysis/.Rhistory
+++ b/meta_data_analysis/.Rhistory
@ -0,0 +1,512 @@
+, stringsAsFactors = F)
+x = as.numeric(grepl(i,raw_data$all_muts_pza))
+# DV: pyrazinamide 0 or 1
+y = as.numeric(raw_data$pyrazinamide)
+table(y,x)
+# run glm model
+model = glm(y ~ x, family = binomial)
+#model = glm(y ~ x, family = binomial(link = "logit"))
+summary(model)
+#**********
+# extract relevant model output
+#**********
+# extract log OR i.e the Beta estimate of the logistic model for a given snp
+my_logor = summary(model)$coefficients[2,1]
+print(paste0('Beta:', my_logor))
+# extract SE of the logistic model for a given snp
+my_se = summary(model)$coefficients[2,2]
+print(paste0('SE:', my_se))
+# extract Z of the logistic model for a given snp
+my_zval = summary(model)$coefficients[2,3]
+print(paste0('Z-value:', my_zval))
+# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
+#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
+my_or = exp(summary(model)$coefficients[2,1])
+print(paste0('OR:', my_or))
+# sanity check : should be True
+log(my_or) == my_logor
+# extract P-value of the logistic model for a given snp
+my_pval = summary(model)$coefficients[2,4]
+print(paste0('P-value:', my_pval))
+# extract confint interval of snp (2 steps, since the output is a named number)
+ci_mod = exp(confint(model))[2,]
+my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
+print(paste0('CI:', my_ci))
+#*************
+# Assign the regression output in the original df
+# you can use ('=' or '<-/->')
+#*************
+#pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i] = my_logor
+my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i]
+my_logor
+pnca_snps_or$Mutationinformation == i
+View(pnca_snps_or)
+#===============
+# Step 4: Calculate for one snp
+# using i, when you run the loop, it is easy
+#===============
+i = "pnca_p.trp68gly"
+pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
+, stringsAsFactors = F
+, header = T) #2133
+# uncomment as necessary
+pnca_snps_or = pnca_snps_or[1:5,]
+pnca_snps_or = pnca_snps_or[c(1:5),]
+#===============
+pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
+, stringsAsFactors = F
+, header = T) #2133
+pnca_snps_or = pnca_snps_or[1:5,]
+pnca_snps_or = pnca_snps_or[c(1:5),]
+pnca_snps_or = pnca_snps_or[1:5]
+pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
+, stringsAsFactors = F
+, header = T) #2133
+pnca_snps_or = pnca_snps_or[1:5]
+pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
+, stringsAsFactors = F
+, header = T) #2133
+foo = pnca_snps_or[c(1:5,)]
+foo = pnca_snps_or[c(1:5),]
+foo = as.data.frame(pnca_snps_or[c(1:5),])
+View(foo)
+# create an empty dataframe
+pnca_snps_or = as.data.frame(pnca_snps_or[c(1:5),])
+# IV: corresponds to each unique snp (extracted using grep)
+x = as.numeric(grepl(i,raw_data$all_muts_pza))
+# DV: pyrazinamide 0 or 1
+y = as.numeric(raw_data$pyrazinamide)
+table(y,x)
+# run glm model
+model = glm(y ~ x, family = binomial)
+#model = glm(y ~ x, family = binomial(link = "logit"))
+summary(model)
+my_logor = summary(model)$coefficients[2,1]
+print(paste0('Beta:', my_logor))
+# extract SE of the logistic model for a given snp
+my_se = summary(model)$coefficients[2,2]
+print(paste0('SE:', my_se))
+# extract Z of the logistic model for a given snp
+my_zval = summary(model)$coefficients[2,3]
+print(paste0('Z-value:', my_zval))
+# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
+#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
+my_or = exp(summary(model)$coefficients[2,1])
+print(paste0('OR:', my_or))
+# sanity check : should be True
+log(my_or) == my_logor
+# extract P-value of the logistic model for a given snp
+my_pval = summary(model)$coefficients[2,4]
+print(paste0('P-value:', my_pval))
+# extract confint interval of snp (2 steps, since the output is a named number)
+ci_mod = exp(confint(model))[2,]
+my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
+print(paste0('CI:', my_ci))
+#*************
+# Assign the regression output in the original df
+# you can use ('=' or '<-/->')
+#*************
+#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
+my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
+my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
+my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
+my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
+my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
+my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
+#===============
+# Step 4: Iterate through this unique list
+# and calculate OR, but only for one snp
+# this is test before you apply it all others
+#===============
+pnca_snps_or$mutation == i
+View(pnca_snps_or)
+# create an empty dataframe
+pnca_snps_or = data.frame(mutation = i)
+my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
+my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
+my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
+my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
+my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
+my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
+View(pnca_snps_or_copy)
+#===============
+# Step 4: Iterate through this unique list
+# and calculate OR, but only for one snp
+# this is test before you apply it all others
+#===============
+#reset original df so you don't make a mistake
+pnca_snps_or = pnca_snps_or_copy
+for (i in pnca_snps_unique){
+print(i)
+}
+pnca_snps_or = pnca_snps_or_copy #2133, 1
+#........................................
+# create an empty dataframe : uncomment as necessary
+pnca_snps_or = data.frame(mutation = c(i, "blank_mut")
+#........................................
+# create an empty dataframe : uncomment as necessary
+pnca_snps_or = data.frame(mutation = c(i, "blank_mut"))
+#........................................
+# create an empty dataframe : uncomment as necessary
+pnca_snps_or = data.frame(mutation = c(i, "blank_mut"))
+View(pnca_snps_or)
+# IV: corresponds to each unique snp (extracted using grep)
+x = as.numeric(grepl(i,raw_data$all_muts_pza))
+# DV: pyrazinamide 0 or 1
+y = as.numeric(raw_data$pyrazinamide)
+table(y,x)
+# run glm model
+model = glm(y ~ x, family = binomial)
+#model = glm(y ~ x, family = binomial(link = "logit"))
+summary(model)
+#**********
+# extract relevant model output
+#**********
+# extract log OR i.e the Beta estimate of the logistic model for a given snp
+my_logor = summary(model)$coefficients[2,1]
+print(paste0('Beta:', my_logor))
+# extract SE of the logistic model for a given snp
+my_se = summary(model)$coefficients[2,2]
+print(paste0('SE:', my_se))
+# extract Z of the logistic model for a given snp
+my_zval = summary(model)$coefficients[2,3]
+print(paste0('Z-value:', my_zval))
+# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
+#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
+my_or = exp(summary(model)$coefficients[2,1])
+print(paste0('OR:', my_or))
+# sanity check : should be True
+log(my_or) == my_logor
+# extract P-value of the logistic model for a given snp
+my_pval = summary(model)$coefficients[2,4]
+print(paste0('P-value:', my_pval))
+# extract confint interval of snp (2 steps, since the output is a named number)
+ci_mod = exp(confint(model))[2,]
+my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
+print(paste0('CI:', my_ci))
+#*************
+# Assign the regression output in the original df
+# you can use ('=' or '<-/->')
+#*************
+#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
+my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
+my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
+my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
+my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
+my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
+my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
+View(pnca_snps_or)
+pnca_snps_or = pnca_snps_or_copy #2133, 1
+for (i in pnca_snps_unique){
+print(i)
+#*************
+# start logistic regression model building
+#*************
+# set the IV and DV for the logistic regression model
+# IV: corresponds to each unique snp (extracted using grep)
+x = as.numeric(grepl(i,raw_data$all_muts_pza))
+# DV: pyrazinamide 0 or 1
+y = as.numeric(raw_data$pyrazinamide)
+table(y,x)
+# run glm model
+model = glm(y ~ x, family = binomial)
+#model = glm(y ~ x, family = binomial(link = "logit"))
+summary(model)
+#**********
+# extract relevant model output
+#**********
+# extract log OR i.e the Beta estimate of the logistic model for a given snp
+my_logor = summary(model)$coefficients[2,1]
+print(paste0('Beta:', my_logor))
+# extract SE of the logistic model for a given snp
+my_se = summary(model)$coefficients[2,2]
+print(paste0('SE:', my_se))
+# extract Z of the logistic model for a given snp
+my_zval = summary(model)$coefficients[2,3]
+print(paste0('Z-value:', my_zval))
+# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
+#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
+my_or = exp(summary(model)$coefficients[2,1])
+print(paste0('OR:', my_or))
+# sanity check : should be True
+log(my_or) == my_logor
+# extract P-value of the logistic model for a given snp
+my_pval = summary(model)$coefficients[2,4]
+print(paste0('P-value:', my_pval))
+# extract confint interval of snp (2 steps, since the output is a named number)
+ci_mod = exp(confint(model))[2,]
+my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
+print(paste0('CI:', my_ci))
+#*************
+# Assign the regression output in the original df
+# you can use ('=' or '<-/->')
+#*************
+#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
+my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
+my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
+my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
+my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
+my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
+my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
+}
+warnings()
+View(pnca_snps_or)
+View(pnca_snps_or_copy)
+#sanity check
+pnca_snps_or$mutation == i1
+#sanity check
+pnca_snps_or[pnca_snps_or$mutation == i1]
+pnca_snps_or[pnca_snps_or$mutation == i2]
+pnca_snps_or[pnca_snps_or$mutation == i2,]
+pnca_snps_or1 = unique(pnca_snps_or)
+write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv")
+# you only need it for the unique mutations
+pnca_snps_or = unique(pnca_snps_or) #2133, 1
+for (i in pnca_snps_unique){
+print(i)
+#*************
+# start logistic regression model building
+#*************
+# set the IV and DV for the logistic regression model
+# IV: corresponds to each unique snp (extracted using grep)
+x = as.numeric(grepl(i,raw_data$all_muts_pza))
+# DV: pyrazinamide 0 or 1
+y = as.numeric(raw_data$pyrazinamide)
+table(y,x)
+# run glm model
+model = glm(y ~ x, family = binomial)
+#model = glm(y ~ x, family = binomial(link = "logit"))
+summary(model)
+#**********
+# extract relevant model output
+#**********
+# extract log OR i.e the Beta estimate of the logistic model for a given snp
+my_logor = summary(model)$coefficients[2,1]
+print(paste0('Beta:', my_logor))
+# extract SE of the logistic model for a given snp
+my_se = summary(model)$coefficients[2,2]
+print(paste0('SE:', my_se))
+# extract Z of the logistic model for a given snp
+my_zval = summary(model)$coefficients[2,3]
+print(paste0('Z-value:', my_zval))
+# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
+#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
+my_or = exp(summary(model)$coefficients[2,1])
+print(paste0('OR:', my_or))
+# sanity check : should be True
+log(my_or) == my_logor
+# extract P-value of the logistic model for a given snp
+my_pval = summary(model)$coefficients[2,4]
+print(paste0('P-value:', my_pval))
+# extract confint interval of snp (2 steps, since the output is a named number)
+ci_mod = exp(confint(model))[2,]
+my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
+print(paste0('CI:', my_ci))
+#*************
+# Assign the regression output in the original df
+# you can use ('=' or '<-/->')
+#*************
+#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
+my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
+my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
+my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
+my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
+my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
+my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
+}
+View(pnca_snps_or)
+2.290256e+01
+1.561132e+06
+3.242285e-04
+#sanity check
+pnca_snps_or[pnca_snps_or$mutation == i1]
+pnca_snps_or[pnca_snps_or$mutation == i2,]
+write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv")
+my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
+, stringsAsFactors = FALSE) #11374, 19
+View(my_data)
+# remove the first column
+my_data = my_data[-1] #11374, 18
+# check if first col is 'id': should be TRUE
+colnames(my_data)[1] == 'id'
+# sanity check
+snps_all = unique(my_data$mutation)# 337
+pnca_snps_or = snps_all
+pnca_snps_or = as.data.frame(snps_all)
+View(pnca_snps_or)
+snps_all[-"true_wt"]
+pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
+View(pnca_snps_or)
+snps_all = as.data.frame(snps_all)
+View(snps_all)
+#remove true_wt entry
+w1 = which(rownames(snps_all) == "true_wt")
+View(snps_all)
+#remove true_wt entry
+w1 = which(snps_all$snps_all == "true_wt")
+rm(pnca_snps_or)
+pnca_snps_or = snps_all[-w1]
+pnca_snps_or = snps_all[,-w1]
+pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
+#remove true_wt entry
+w1 = which(snps_all) == "true_wt"
+pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
+my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
+, stringsAsFactors = FALSE) #11374, 19
+# remove the first column
+my_data = my_data[-1] #11374, 18
+# check if first col is 'id': should be TRUE
+colnames(my_data)[1] == 'id'
+# sanity check
+snps_all = unique(my_data$mutation)# 337
+snps_all = as.data.frame(snps_all)
+snps_all[-c(1,1)]
+pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
+pnca_snps_or = as.data.frame(snps_all[, -c(1,1)])
+#remove true_wt entry
+#w1 = which(snps_all) == "true_wt"
+pnca_snps_or = snps_all
+pnca_snps_or = pnca_snps_or_copy
+#remove true_wt entry
+#w1 = which(snps_all) == "true_wt"
+pnca_snps_or = snps_all
+pnca_snps_or -> pnca_snps_or_copy
+#===============
+# Step 4: Iterate through this unique list
+# and calculate OR for each snp
+# and assign to the pnca_snps_or df that has
+# each row as a unique snp
+#===============
+# reset original df so you don't make a mistake: IMPORTANT
+pnca_snps_or = pnca_snps_or_copy #2133, 1
+# you only need it for the unique mutations
+pnca_snps_or = unique(pnca_snps_or) #337, 1
+for (i in pnca_snps_unique){
+print(i)
+#*************
+# start logistic regression model building
+#*************
+# set the IV and DV for the logistic regression model
+# IV: corresponds to each unique snp (extracted using grep)
+x = as.numeric(grepl(i,raw_data$all_muts_pza))
+# DV: pyrazinamide 0 or 1
+y = as.numeric(raw_data$pyrazinamide)
+table(y,x)
+# run glm model
+model = glm(y ~ x, family = binomial)
+#model = glm(y ~ x, family = binomial(link = "logit"))
+summary(model)
+#**********
+# extract relevant model output
+#**********
+# extract log OR i.e the Beta estimate of the logistic model for a given snp
+my_logor = summary(model)$coefficients[2,1]
+print(paste0('Beta:', my_logor))
+# extract SE of the logistic model for a given snp
+my_se = summary(model)$coefficients[2,2]
+print(paste0('SE:', my_se))
+# extract Z of the logistic model for a given snp
+my_zval = summary(model)$coefficients[2,3]
+print(paste0('Z-value:', my_zval))
+# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
+#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
+my_or = exp(summary(model)$coefficients[2,1])
+print(paste0('OR:', my_or))
+# sanity check : should be True
+log(my_or) == my_logor
+# extract P-value of the logistic model for a given snp
+my_pval = summary(model)$coefficients[2,4]
+print(paste0('P-value:', my_pval))
+# extract confint interval of snp (2 steps, since the output is a named number)
+ci_mod = exp(confint(model))[2,]
+my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
+print(paste0('CI:', my_ci))
+#*************
+# Assign the regression output in the original df
+# you can use ('=' or '<-/->')
+#*************
+#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
+my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
+my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
+my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
+my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
+my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
+my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
+}
+getwd()
+#setwd("~/Documents/git/LSHTM_Y1_PNCA/meta_data_analysis") # work
+setwd("~/git/LSHTM_Y1_PNCA/meta_data_analysis") # thinkpad
+#setwd("/Users/tanu/git/LSHTM_Y1_PNCA/meta_data_analysis") # mac
+getwd()
+#===============
+# Step 1: read raw data
+#===============
+raw_data<-read.csv("../Data_original/original_tanushree_data_v2.csv"
+,stringsAsFactors = F)[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]#19265, 4
+raw_data<-raw_data[!is.na(raw_data$pyrazinamide),]#12511, 4
+# combine the two mutation columns
+raw_data$all_mutations_pyrazinamide<-paste(raw_data$dr_mutations_pyrazinamide, raw_data$other_mutations_pyrazinamide)#12511, 5
+head(raw_data$all_mutations_pyrazinamide)
+# create yet another column that contains all the mutations but in lower case
+raw_data$all_muts_pza = tolower(raw_data$all_mutations_pyrazinamide) #12511, 6
+table(grepl("pnca_p",raw_data$all_muts_pza))
+#FALSE  TRUE
+#10603  1908
+pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
+, stringsAsFactors = F
+, header = T) #2133
+# subset a snall section to test
+#pnca_snps_or_copy = pnca_snps_or
+#pnca_snps_or = pnca_snps_or_copy
+pnca_snps_unique = unique(pnca_snps_or$mutation) #293
+i2 = "pnca_p.trp68gly" # Should exist
+grep(i2, pnca_snps_unique)
+my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
+, stringsAsFactors = FALSE) #11374, 19
+# remove the first column
+my_data = my_data[-1] #11374, 18
+# check if first col is 'id': should be TRUE
+colnames(my_data)[1] == 'id'
+# sanity check
+head(my_data$mutation)
+my_data = unique(my_data$mutation)
+my_data[!duplicated(my_data$mutation)]
+my_data_unique = my_data[!duplicated(my_data$mutation),]
+my_data[!duplicated('mutation'),]
+my_data_unique = my_data[!duplicated(my_data[,'mutation']),]
+my_data_unique = my_data[!duplicated(my_data['mutation']),]
+getwd()
+setwd("/git/LSHTM_analysis/meta_data_analysis")
+getwd()
+getwd()
+setwd("/git/github/LSHTM_analysis/meta_data_analysis")
+getwd()
+#===============
+# Step 1: read GWAS raw data stored in Data_original/
+#===============
+infile = read.csv("../Data_original", file.choose(), stringsAsFactors = F))
+c = file.choose()
+c = file.choose(../Data_original)
+c = read.csv(file.choose(), stringsAsFactors = F)
+#===============
+# Step 1: read GWAS raw data stored in Data_original/
+#===============
+infile = read.csv(file.choose(), stringsAsFactors = F))
+c = read.csv(file.choose(), stringsAsFactors = F)
+#===============
+# Step 1: read GWAS raw data stored in Data_original/
+#===============
+infile = read.csv(file.choose(), stringsAsFactors = F)
+#===============
+# Step 1: read GWAS raw data stored in Data_original/
+#===============
+infile = read.csv(file.choose(), stringsAsFactors = F)
+raw_data = infile[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]
+outdir = paste0("../mcsm_analysis",drug,"/Data/")
+# define output variables
+drug  = 'pyrazinamide'
+outdir = paste0("../mcsm_analysis",drug,"/Data/")
+outdir = paste0("../mcsm_analysis/",drug,"/Data/")
+outFile = "meta_data_with_AFandOR.csv"
+output_filename = paste0(outdir, outFile)
+output_filename
--- a/meta_data_analysis/pycache/reference_dict.cpython-37.pyc
+++ b/meta_data_analysis/pycache/reference_dict.cpython-37.pyc
--- a/meta_data_analysis/init_data_dirs.py
+++ b/meta_data_analysis/init_data_dirs.py
@ -0,0 +1,7 @@
+#!/usr/bin/python3
+# Initialise a blank 'Data' directory and drug subdirs etc.
+# TODO:
+# - Read base dir from config file
+# - Create eg: '~/git/Data/{original,processed}
+# - Create eg: '~/git/Data/processed/' + drug (for each drug)
+# - Create eg: '~/git/Data/output/' + drug + '{plots, structure}'
--- a/meta_data_analysis/pnca_AF_and_OR_calcs.R
+++ b/meta_data_analysis/pnca_AF_and_OR_calcs.R
@ -0,0 +1,241 @@
+getwd()
+setwd("/git/github/git/LSHTM_analysis/meta_data_analysis")
+getwd()
+
+#===============
+# Step 1: read GWAS raw data stored in Data_original/
+#===============
+infile = read.csv(file.choose(), stringsAsFactors = F)
+
+raw_data = infile[,c("id"
+                     , "pyrazinamide"
+                     , "dr_mutations_pyrazinamide"
+                     , "other_mutations_pyrazinamide")]
+
+#####
+# 1a: exclude na
+#####
+raw_data = raw_data[!is.na(raw_data$pyrazinamide),]
+
+total_samples = length(unique(raw_data$id))
+print(total_samples)
+
+# sanity check: should  be true
+is.numeric(total_samples) 
+
+#####
+# 1b: combine the two mutation columns
+#####
+raw_data$all_mutations_pyrazinamide = paste(raw_data$dr_mutations_pyrazinamide
+                                            , raw_data$other_mutations_pyrazinamide)
+head(raw_data$all_mutations_pyrazinamide)
+
+#####
+# 1c: create yet another column that contains all the mutations but in lower case
+#####
+raw_data$all_muts_pnca = tolower(raw_data$all_mutations_pyrazinamide) 
+
+# sanity checks
+table(grepl("pnca_p",raw_data$all_muts_pnca))
+
+# sanity check: should be TRUE
+sum(table(grepl("pnca_p",raw_data$all_muts_pnca))) == total_samples
+
+# set up variables: can be used for logistic regression as well
+i  = "pnca_p.ala134gly" # has a NA, should NOT exist
+table(grepl(i,raw_data$all_muts_pnca))
+
+i = "pnca_p.trp68gly"
+table(grepl(i,raw_data$all_muts_pnca))
+
+mut = grepl(i,raw_data$all_muts_pnca)
+dst = raw_data$pyrazinamide
+table(mut, dst)
+
+#chisq.test(table(mut,dst))
+#fisher.test(table(mut, dst))
+#table(mut)
+
+###### read list of muts to calculate OR for (fname3 from pnca_data_extraction.py)
+pnca_snps_or = read.csv(file.choose()
+                        , stringsAsFactors = F
+                        , header = T)
+
+# extract unique snps to iterate over for AF and OR calcs
+# total no of unique snps
+# AF and OR calculations
+
+pnca_snps_unique = unique(pnca_snps_or$mutation) 
+
+# Define OR function
+x = as.numeric(mut)
+y = dst
+or = function(x,y){
+  tab = as.matrix(table(x,y))
+  a = tab[2,2]
+  if (a==0){ a<-0.5}
+  b = tab[2,1]
+  if (b==0){ b<-0.5}
+  c = tab[1,2]
+  if (c==0){ c<-0.5}
+  d = tab[1,1]
+  if (d==0){ d<-0.5}
+  (a/b)/(c/d)
+  
+  }
+
+dst = raw_data$pyrazinamide
+ors = sapply(pnca_snps_unique,function(m){
+  mut = grepl(m,raw_data$all_muts_pnca)
+  or(mut,dst)
+})
+
+ors
+
+pvals = sapply(pnca_snps_unique,function(m){
+  mut = grepl(m,raw_data$all_muts_pnca)
+  fisher.test(mut,dst)$p.value
+})
+
+pvals
+
+afs = sapply(pnca_snps_unique,function(m){
+  mut = grepl(m,raw_data$all_muts_pnca)
+  mean(mut)
+})
+
+afs
+
+# check ..hmmm
+afs['pnca_p.trp68gly']
+afs['pnca_p.gln10pro'] 
+afs['pnca_p.leu4ser'] 
+
+#plot(density(log(ors)))
+#plot(-log10(pvals))
+#hist(log(ors)
+#     ,breaks = 100
+#     )
+
+# subset df cols to add to the calc param df
+pnca_snps_cols = pnca_snps_or[c('mutation_info', 'mutation', 'Mutationinformation')] 
+pnca_snps_cols = pnca_snps_cols[!duplicated(pnca_snps_cols$mutation),]
+
+rownames(pnca_snps_cols) = pnca_snps_cols$mutation
+head(rownames(pnca_snps_cols))
+#snps_with_AF_and_OR
+
+# combine
+comb_AF_and_OR = data.frame(ors, pvals, afs)
+head(rownames(comb_AF_and_OR))
+
+# sanity checks: should be the same
+dim(comb_AF_and_OR); dim(pnca_snps_cols)
+table(rownames(comb_AF_and_OR)%in%rownames(pnca_snps_cols))
+
+table(rownames(pnca_snps_cols)%in%rownames(comb_AF_and_OR))
+
+# merge the above two df whose dim you checked
+snps_with_AF_and_OR = merge(comb_AF_and_OR, pnca_snps_cols
+                            , by = "row.names"
+#                            , all.x = T
+                            )
+
+#rm(pnca_snps_cols, pnca_snps_or, raw_data)
+
+#===============
+# Step 3: Read data file where you will add the calculated OR 
+# Note: this is the big file with one-many relationship between snps and lineages
+# i.e fname4 from 'pnca_extraction.py'
+#===============
+my_data = read.csv(file.choose()
+                   , row.names = 1
+                   , stringsAsFactors = FALSE)
+
+head(my_data)
+length(unique(my_data$id))
+
+# check if first col is 'id': should be TRUE
+colnames(my_data)[1] == 'id'
+
+# sanity check
+head(my_data$mutation)
+
+# FILES TO MERGE:
+# comb_AF_and_OR: file containing OR
+# my_data = big meta data file 
+# linking column: mutation
+
+head(my_data)
+merged_df = merge(my_data # big file
+                  , snps_with_AF_and_OR # small (afor file)
+                  , by = "mutation"
+                  , all.x = T) # because you want all the entries of the meta data 
+
+# sanity checks: should be True 
+# FIXME: I have checked this manually, but make it so it is a pass or a fail!
+comb_AF_and_OR[rownames(comb_AF_and_OR) == "pnca_p.gln10pro",]$ors  
+merged_df[merged_df$Mutationinformation.x == "Q10P",]$ors
+
+merged_df[merged_df$Mutationinformation.x == "Q10P",]
+
+# sanity check: very important!
+colnames(merged_df)
+
+table(merged_df$mutation_info.x == merged_df$mutation_info.y)
+
+#FIXME: what happened to other 7 and FALSE
+table(merged_df$Mutationinformation.x == merged_df$Mutationinformation.y)
+
+# problem
+identical(merged_df$Mutationinformation.x, merged_df$Mutationinformation.y)
+
+#merged_df[merged_df$Mutationinformation.x != merged_df$Mutationinformation.y,]
+
+#throw away the y because that is a smaller df
+d1 = which(colnames(merged_df) == "mutation_info.y") #21
+d2 = which(colnames(merged_df) == "Mutationinformation.y") #22
+
+merged_df2 = merged_df[-c(d1, d2)] #3093 20
+colnames(merged_df2)
+
+# rename cols 
+colnames(merged_df2)[colnames(merged_df2)== "mutation_info.x"] <- "mutation_info"
+colnames(merged_df2)[colnames(merged_df2)== "Mutationinformation.x"] <- "Mutationinformation"
+
+colnames(merged_df2)
+
+# should be 0
+sum(is.na(merged_df2$Mutationinformation))
+
+# count na in each column
+na_count = sapply(merged_df2, function(y) sum(length(which(is.na(y))))); na_count
+# only some or and Af should be NA
+#Row.names           ors               pvals               afs 
+#81                  81                81                  81 
+
+
+colnames(merged_df2)[colnames(merged_df2)== "ors"] <- "OR"
+colnames(merged_df2)[colnames(merged_df2)== "afs"] <- "AF"
+colnames(merged_df2)[colnames(merged_df2)== "pvals"] <- "pvalue"
+
+colnames(merged_df2)
+
+# add log OR and neglog pvalue
+merged_df2$logor = log(merged_df2$OR)
+is.numeric(merged_df2$logor)
+
+merged_df2$neglog10pvalue = -log10(merged_df2$pvalue) 
+is.numeric(merged_df2$neglog10pvalue)
+
+# write file out
+#write.csv(merged_df, "../Data/meta_data_with_AFandOR_JP_TT.csv")
+
+# define output variables
+drug  = 'pyrazinamide'
+out_dir = paste0("../mcsm_analysis/",drug,"/Data/")
+outFile = "meta_data_with_AFandOR.csv"
+output_filename = paste0(outdir, outFile)
+
+write.csv(merged_df2, output_filename
+          , row.names = F)
--- a/meta_data_analysis/pnca_data_extraction.py
+++ b/meta_data_analysis/pnca_data_extraction.py
@ -0,0 +1,626 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Aug  6 12:56:03 2019
+
+@author: tanu
+"""
+
+# FIXME: include error checking to enure you only
+# concentrate on positions that have structural info?
+
+#%% load libraries
+###################
+# load libraries
+import os, sys
+import pandas as pd
+#import numpy as np
+
+#from pandas.api.types import is_string_dtype
+#from pandas.api.types import is_numeric_dtype
+
+# to create dir
+#my_dir = os.path.expanduser('~/some_dir')
+#make sure mcsm_analysis/ exists
+#or specify the output directory
+
+#%%
+#%%
+#%%
+#========================================================
+# TASK: extract ALL pncA mutations from GWAS data
+#========================================================
+#%%
+####################
+# my working dir
+os.getcwd()
+homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
+os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
+os.getcwd()
+#%%
+from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
+#%%
+#NOTE: Out_dir MUST exis
+# User defined dir strpyrazinamide
+drug = 'pyrazinamide'
+gene = 'pnca'
+out_dir = homedir + '/git/LSHTM_analysis/mcsm_analysis/'
+# = out_dir + drug
+data_dir = homedir + '/git/Data'
+
+if not os.path.exists(data_dir):
+    print('Error!', data_dir, 'does not exist. Please ensure it exists and contains the appropriate raw data')
+    os.makedirs(data_dir)
+    die()
+
+if not os.path.exists(out_dir):
+    print('Error!', out_dir, 'does not exist. Please create it')
+    exit()
+    
+#if not os.path.exists(work_dir):
+#    print('creating dir that does not exist', 'dir_name:', work_dir)
+#    os.makedirs(work_dir)
+else:
+    print('Dir exists: Carrying on')
+
+# now create sub dir structure within work_dir
+# pyrazinamide/mcsm_analysis
+
+# we need three dir
+# Data
+# Scripts
+    # Plotting
+# Results
+    # Plots
+    
+# create a list of dir names
+#dir_names = ['Data', 'Scripts', 'Results']
+
+
+#for i in dir_names:
+#    this_dir = (work_dir + '/' + i)
+#    if not os.path.exists(this_dir):
+#        print('creating dir that does not exist:', this_dir)
+#        os.makedirs(this_dir)
+#else:
+#    print('Dir exists: Carrying on')
+      
+# Create sub dirs
+# 1)        
+# Scripts
+    # Plotting
+#subdir_plotting = work_dir + '/Scripts/Plotting'
+#if not os.path.exists(subdir_plotting):
+#      print('creating dir that does not exist:', subdir_plotting)
+#      os.makedirs(subdir_plotting)
+#else:
+#    print('Dir exists: Carrying on')
+ 
+# 2)    
+# Results
+    # Plots
+#subdir_plots = work_dir + '/Results/Plots'        
+#if not os.path.exists(subdir_plots):
+#        print('creating dir that does not exist:', subdir_plots)
+#        os.makedirs(subdir_plots)    
+#else:
+#    print('Dir exists: Carrying on')
+         
+# clear varaibles
+#del(dir_names, drug, i, subdir_plots, subdir_plotting)
+
+#exit()
+#%%
+#==============================================================================
+############
+# STEP 1: Read file original_tanushree_data_v2.csv
+############
+data_file = data_dir + '/input/original/original_tanushree_data_v2.csv'
+meta_data = pd.read_csv(data_file, sep = ',') 
+
+# column names
+list(meta_data.columns)
+
+# extract elevant columns to extract from meta data related to the pyrazinamide
+meta_data = meta_data[['id'
+       ,'country'
+       ,'lineage'
+       ,'sublineage'
+       ,'drtype'
+       , 'pyrazinamide'
+       , 'dr_mutations_pyrazinamide'
+       , 'other_mutations_pyrazinamide'
+        ]] 
+
+# checks
+total_samples = meta_data['id'].nunique() # 19265
+
+# counts NA per column
+meta_data.isna().sum()
+
+# glance
+meta_data.head()
+
+# equivalent of table in R
+# pyrazinamide counts
+meta_data.pyrazinamide.value_counts() 
+
+#%%
+############
+# STEP 2: extract entries containing selected genes: 
+# pyrazinamide: pnca_p.
+# in the dr_mutations and other mutations"
+# as we are interested in the mutations in the protein coding region only 
+# (corresponding to a structure)
+# and drop the entries with NA
+#############
+meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
+meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
+
+del(meta_pza)
+
+##########################
+# pyrazinamide: pnca_p.
+##########################
+meta_data_pnca = meta_data[['id'
+       ,'country'
+       ,'lineage'
+       ,'sublineage'
+       ,'drtype'
+       , 'pyrazinamide'
+       , 'dr_mutations_pyrazinamide'
+       , 'other_mutations_pyrazinamide'
+        ]] 
+
+del(meta_data)
+
+# sanity checks
+
+# dr_mutations only
+meta_pnca_dr = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
+meta_pnca_dr['id'].nunique() 
+del(meta_pnca_dr)
+
+# other mutations
+meta_pnca_other = meta_data_pnca.loc[meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
+meta_pnca_other['id'].nunique() 
+del(meta_pnca_other)
+
+# Now extract "all" mutations
+meta_pnca_all = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*') | meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*') ]
+
+meta_pnca_all['id'].nunique() 
+pnca_samples = len(meta_pnca_all)
+pnca_na = meta_pnca_all['pyrazinamide'].isna().sum() 
+comp_pnca_samples = pnca_samples - pnca_na 
+
+#=#=#=#=#=#=#
+# COMMENT: use it later to check number of complete samples from LF data
+#=#=#=#=#=#=#
+
+# sanity checks
+meta_pnca_all.dr_mutations_pyrazinamide.value_counts()
+meta_pnca_all.other_mutations_pyrazinamide.value_counts()
+
+# more sanity checks 
+# !CAUTION!: muts will change depending on your gene
+
+# dr muts : insert
+meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Gln10Pro')] # 
+meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Phe106Leu')] # empty
+meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Val139Leu')]
+
+meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists #  rows
+m = meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists #  rows
+
+# other_muts
+meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains('pncA_p.Gln10Pro*')] # empty
+meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains('pncA_p.Phe106Leu')]
+
+#=#=#=#=#=#=#=#=#=#
+# FIXME
+# COMMENTS: both mutations columns are separated by ; 
+# CHECK if there are mutations that exist both in dr and other_muts!
+# doesn't make any sense for the same mut to exist in both, I would have thought!
+#=#=#=#=#=#=#=#=#=#
+
+# remove not required variables
+del(meta_data_pnca)
+
+############
+# STEP 3: split the columns of 
+# a) dr_mutation_... (;) as 
+# the column has snps related to multiple genes.
+# useful links
+# https://stackoverflow.com/questions/17116814/pandas-how-do-i-split-text-in-a-column-into-multiple-rows
+# this one works beautifully
+# https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows
+############
+
+# sanity check: counts NA per column afer subsetted df: i.e in meta_pza(with pncA_p. extracted mutations)
+meta_pnca_all.isna().sum()
+
+#=#=#=#=#=#=#=#=#=#
+# COMMENT: no NA's in dr_mutations/other_mutations_columns
+#=#=#=#=#=#=#=#=#=#
+# define the split function
+def tidy_split(df, column, sep='|', keep=False):
+    """
+    Split the values of a column and expand so the new DataFrame has one split
+    value per row. Filters rows where the column is missing.
+
+    Params
+    ------
+    df : pandas.DataFrame
+        dataframe with the column to split and expand
+    column : str
+        the column to split and expand
+    sep : str
+        the string used to split the column's values
+    keep : bool
+        whether to retain the presplit value as it's own row
+
+    Returns
+    -------
+    pandas.DataFrame
+        Returns a dataframe with the same columns as `df`.
+    """
+    indexes = list()
+    new_values = list()
+    #df = df.dropna(subset=[column])#<<<<<<-----see this incase you need to uncomment based on use case
+    for i, presplit in enumerate(df[column].astype(str)):
+        values = presplit.split(sep)
+        if keep and len(values) > 1:
+            indexes.append(i)
+            new_values.append(presplit)
+        for value in values:
+            indexes.append(i)
+            new_values.append(value)
+    new_df = df.iloc[indexes, :].copy()
+    new_df[column] = new_values
+    return new_df
+
+########
+# 3a: call tidy_split() on 'dr_mutations_pyrazinamide' column and remove leading white spaces
+#https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas
+########    
+meta_pnca_WF0 = tidy_split(meta_pnca_all, 'dr_mutations_pyrazinamide', sep = ';') 
+
+# remove leading white space else these are counted as distinct mutations as well
+meta_pnca_WF0['dr_mutations_pyrazinamide'] = meta_pnca_WF0['dr_mutations_pyrazinamide'].str.lstrip() 
+
+########
+# 3b: call function on 'other_mutations_pyrazinamide' column and remove leading white spaces
+######## 
+meta_pnca_WF1 = tidy_split(meta_pnca_WF0, 'other_mutations_pyrazinamide', sep = ';') 
+
+# remove the leading white spaces in the column
+meta_pnca_WF1['other_mutations_pyrazinamide'] = meta_pnca_WF1['other_mutations_pyrazinamide'].str.strip() 
+
+##########
+# Step 4: Reshape data so that all mutations are in one column and the 
+# annotations for the mutation reflect the column name
+# LINK: http://www.datasciencemadesimple.com/reshape-wide-long-pandas-python-melt-function/
+
+# data frame “df” is passed to melt() function
+# id_vars is the variable which need to be left unaltered
+# var_name are the column names so we named it as 'mutation_info'
+# value_name are its values so we named it as 'mutation'
+##########
+meta_pnca_WF1.columns
+
+meta_pnca_LF0 = pd.melt(meta_pnca_WF1
+                      , id_vars = ['id', 'country', 'lineage', 'sublineage', 'drtype', 'pyrazinamide']
+                      , var_name = 'mutation_info'
+                      , value_name = 'mutation') 
+
+# sanity check: should be true
+if len(meta_pnca_LF0) == len(meta_pnca_WF1)*2:
+    print('sanity check passed: Long format df has the expected length')
+else:
+    print("Sanity check failed: Debug please!")
+
+###########
+# Step 5: This is still dirty data. Filter LF data so that you only have
+# mutations corresponding to pnca_p. 
+# this will be your list you run OR calcs 
+###########
+meta_pnca_LF1 = meta_pnca_LF0[meta_pnca_LF0['mutation'].str.contains('pncA_p.*')] 
+
+# sanity checks
+# unique samples
+meta_pnca_LF1['id'].nunique()
+if len(meta_pnca_all) == meta_pnca_LF1['id'].nunique():
+    print("Sanity check passed: No of samples with pncA mutations match")
+else:
+    print("Sanity check failed: Debug please!")
+
+# count if all the mutations are indeed in the protein coding region 
+# i.e begin with pnca_p
+meta_pnca_LF1['mutation'].str.count('pncA_p.').sum() # 3093
+
+# should  be true.
+# and check against the length of the df, which should match
+if len(meta_pnca_LF1) == meta_pnca_LF1['mutation'].str.count('pncA_p.').sum():
+    print("Sanity check passed: Long format data containing pnca mutations indeed correspond to pncA_p region")
+else:
+    print("Sanity check failed: Debug please!")
+
+###########
+# Step 6: Filter dataframe with "na" in the drug column
+# This is because for OR, you can't use the snps that have the
+# NA in the specified drug column
+# it creates problems when performing calcs in R inside the loop
+# so best to filter it here
+###########
+# NOT NEEDED FOR all snps, only for extracting valid OR snps
+del (meta_pnca_WF0, meta_pnca_WF1, meta_pnca_LF0, meta_pnca_all)
+
+###########
+# Step 7: count unique pncA_p mutations (all and comp cases)
+###########
+meta_pnca_LF1['mutation'].nunique() 
+meta_pnca_LF1.groupby('mutation_info').nunique()
+
+meta_pnca_LF1['id'].nunique()  
+meta_pnca_LF1['mutation'].nunique() 
+meta_pnca_LF1.groupby('id').nunique()
+
+###########
+# Step 8: convert all snps only (IN LOWERCASE)
+# because my mcsm file intergated has lowercase
+###########
+# convert mutation to lower case as it needs to exactly match the dict key
+#meta_pnca_LF1['mutation'] = meta_pnca_LF1.mutation.str.lower() # WARNINGS: suggested to use .loc
+meta_pnca_LF1['mutation'] = meta_pnca_LF1.loc[:, 'mutation'].str.lower()
+
+###########
+# Step 9 : Split 'mutation' column into three:  wild_type, position and
+# mutant_type separately. Then map three letter code to one from the 
+# referece_dict imported pncaeady. First convert to mutation to lowercase
+# to allow to match entries from dict 
+###########
+#=======
+# Step 9a: iterate through the dict, create a lookup dict i.e
+# lookup_dict = {three_letter_code: one_letter_code}.
+# lookup dict should be the key and the value (you want to create a column for)
+# Then use this to perform the mapping separetly for wild type and mutant cols.
+# The three letter code is extracted using a regex match from the dataframe and then converted
+# to 'pandas series'since map only works in pandas series
+#=======
+# initialise a sub dict that is a lookup dict for three letter code to one
+lookup_dict = dict()
+for k, v in my_aa_dict.items():
+    lookup_dict[k] = v['one_letter_code']
+    wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
+    meta_pnca_LF1['wild_type'] = wt.map(lookup_dict)   
+    mut = meta_pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
+    meta_pnca_LF1['mutant_type'] = mut.map(lookup_dict)
+
+# extract position info from mutation column separetly using regex
+meta_pnca_LF1['position'] = meta_pnca_LF1['mutation'].str.extract(r'(\d+)') 
+
+# clear variables
+del(k, v, wt, mut, lookup_dict)
+
+#=========
+# Step 9b: iterate through the dict, create a lookup dict that i.e
+# lookup_dict =  {three_letter_code: aa_prop_water} 
+# Do this for both wild_type and mutant as above.
+#=========
+# initialise a sub dict that is lookup dict for three letter code to aa prop
+lookup_dict = dict()
+
+for k, v in my_aa_dict.items():
+    lookup_dict[k] = v['aa_prop_water']
+    #print(lookup_dict)
+    wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
+    meta_pnca_LF1['wt_prop_water'] = wt.map(lookup_dict)   
+    mut = meta_pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
+    meta_pnca_LF1['mut_prop_water'] = mut.map(lookup_dict)
+    
+# added two more cols
+
+# clear variables
+del(k, v, wt, mut, lookup_dict)
+
+#========
+# Step 9c: iterate through the dict, create a lookup dict that i.e
+# lookup_dict =  {three_letter_code: aa_prop_polarity} 
+# Do this for both wild_type and mutant as above.
+#=========
+# initialise a sub dict that is lookup dict for three letter code to aa prop
+lookup_dict = dict()
+
+for k, v in my_aa_dict.items():
+    lookup_dict[k] = v['aa_prop_polarity']
+    #print(lookup_dict)
+    wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
+    meta_pnca_LF1['wt_prop_polarity'] = wt.map(lookup_dict)   
+    mut = meta_pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
+    meta_pnca_LF1['mut_prop_polarity'] = mut.map(lookup_dict)
+    
+# added two more cols
+    
+# clear variables
+del(k, v, wt, mut, lookup_dict)
+
+########
+# Step 10: combine the wild_type+poistion+mutant_type columns to generate 
+# Mutationinformation (matches mCSM output field)
+# Remember to use .map(str) for int col types to allow string concatenation
+#########
+meta_pnca_LF1['Mutationinformation'] = meta_pnca_LF1['wild_type'] + meta_pnca_LF1.position.map(str) + meta_pnca_LF1['mutant_type']
+
+#=#=#=#=#=#=#
+# Step 11:
+# COMMENT: there is more processing in the older version of this script
+# consult if necessary
+# possibly due to the presence of true_wt
+# since this file doesn't contain any true_wt, we won't need it(hopefully!)
+#=#=#=#=#=#=#
+
+#%%
+###########
+# Step 12: Output files for only SNPs to run mCSM
+###########
+
+#=========
+# Step 12a: all SNPs to run mCSM
+#=========
+snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique()) 
+pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique()) 
+
+# assign meaningful colnames 
+#snps_only.rename({0 : 'all_pnca_snps'}, axis = 1, inplace = True)
+#list(snps_only.columns)
+snps_only.isna().sum() # should be 0
+
+# output csv: all SNPS for mCSM analysis
+# specify variable name for output file
+gene = 'pnca'
+#drug = 'pyrazinamide'
+my_fname1 = '_snps_'
+nrows = len(snps_only) 
+
+#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
+#output_file_path = work_dir + '/Data/'
+output_file_path = data_dir + '/input/processed/' + drug + '/'
+
+if not os.path.exists(output_file_path):
+    print( output_file_path, 'does not exist. Creating')
+    os.makedirs(output_file_path)
+    exit()
+
+output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
+print(output_filename) #<<<- check
+
+# write to csv: without column or row names
+# Bad practice: numbers at the start of a filename
+snps_only.to_csv(output_filename, header = False, index = False)
+
+#=========
+# Step 12b: all snps with annotation
+#=========
+# all snps, selected cols
+#pnca_snps_ALL = meta_pnca_LF1[['id','country','lineage', 'sublineage'
+#                               , 'drtype', 'pyrazinamide'
+#                               , 'mutation_info', 'mutation', 'Mutationinformation']] 
+
+#len(pnca_snps_ALL) 
+
+# sanity check
+#meta_pnca_LF1['mutation'].nunique() 
+
+# output csv: WITH column but WITHOUT row names(all snps with meta data)
+# specify variable name for output file
+#gene = 'pnca'
+#drug = 'pyrazinamide'
+#my_fname2 = '_snps_with_metadata_'
+#nrows = len(pnca_snps_ALL) 
+
+#output_file_path = work_dir + '/Data/'
+#output_filename = output_file_path + gene + my_fname2 + str(nrows) + '.csv'
+#print(output_filename)  #<<<- check
+
+# write out file
+#pnca_snps_ALL.to_csv(output_filename, header = True, index = False)
+
+#=========
+# Step 12c: comp snps for OR calcs with annotation
+#=========
+# remove all NA's from pyrazinamide column from LF1
+    
+# counts NA per column
+meta_pnca_LF1.isna().sum()
+
+# remove NA
+meta_pnca_LF2 = meta_pnca_LF1.dropna(subset=['pyrazinamide'])
+
+# sanity checks
+# should be True
+len(meta_pnca_LF2) == len(meta_pnca_LF1) - meta_pnca_LF1['pyrazinamide'].isna().sum()
+
+# unique counts
+meta_pnca_LF2['mutation'].nunique() 
+
+meta_pnca_LF2.groupby('mutation_info').nunique() 
+
+# sanity check
+meta_pnca_LF2['id'].nunique() 
+
+# should be True
+if meta_pnca_LF2['id'].nunique() == comp_pnca_samples:
+    print ('sanity check passed: complete numbers match')
+else:
+    print('Error: Please Debug!')
+
+# value counts
+meta_pnca_LF2.mutation.value_counts()
+#meta_pnca_LF2.groupby(['mutation_info', 'mutation']).size()
+
+# valid/comp snps
+# uncomment as necessary
+pnca_snps_COMP  = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
+len(pnca_snps_COMP) 
+
+# output csv: WITH column but WITHOUT row names (COMP snps with meta data)
+# specify variable name for output file
+
+gene = 'pnca'
+#drug = 'pyrazinamide'
+my_fname3 = '_comp_snps_with_metadata_'
+nrows = len(pnca_snps_COMP) 
+
+#output_filename = output_file_path + gene + my_fname3 + str(nrows) + '.csv'
+#print(output_filename) #<<<-check
+
+# write out file
+#pnca_snps_COMP.to_csv(output_filename, header = True, index = False)
+
+
+#=========
+# Step 12d: comp snps only
+#=========
+# output csv: comp SNPS for info (i.e snps for which OR exist)
+# specify variable name for output file
+
+snps_only = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
+
+gene = 'pnca'
+#drug = 'pyrazinamide'
+my_fname1 = '_comp_snps_'
+nrows = len(snps_only) 
+
+output_filename = output_file_path + gene + my_fname1  + str(nrows) + '.csv'
+print(output_filename) #<<<- check
+
+# write to csv: without column or row names
+snps_only.to_csv(output_filename, header = False, index = False)
+
+
+#=#=#=#=#=#=#=#
+# COMMENT: LF1 is the file to extract all unique snps for mcsm 
+# but you have that already in file called pnca_snps...
+# LF2: is the file for extracting snps tested for DS and hence OR calcs
+#=#=#=#=#=#=#=#
+
+###########
+# Step 13 : Output the whole df i.e 
+# file for meta_data which is now formatted with
+# each row as a unique snp rather than the original version where
+# each row is a unique id
+# ***** This is the file you will ADD the AF and OR calculations to *****
+###########
+
+# output csv: the entire DF
+# specify variable name for output file
+gene = 'pnca'
+#drug = 'pyrazinamide'
+my_fname4 = '_metadata'
+#nrows = len(meta_pnca_LF1)
+output_filename = output_file_path + gene + my_fname4  + '.csv'
+print(output_filename) #<<<-check
+
+# write out file
+meta_pnca_LF1.to_csv(output_filename) 
--- a/meta_data_analysis/reference_dict.py
+++ b/meta_data_analysis/reference_dict.py
@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Jun 18 11:32:28 2019
+
+@author: tanushree
+"""
+############################################
+#load libraries
+import pandas as pd
+import os
+#############################################
+
+#!#########################!
+# REQUIREMNETS:
+# Data_original/ must exist
+# containing GWAS data
+#!#########################!
+
+print(os.getcwd()) 
+homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
+os.chdir(homedir + '/git/Data/input/original') 
+print(os.getcwd())
+#==========
+#read file
+#==========
+my_aa = pd.read_csv('aa_codes.csv') #20, 6
+#assign the one_letter code as the row names so that it is easier to create a dict of dicts using index
+#my_aa = pd.read_csv('aa_codes.csv', index_col = 0) #20, 6  #a way to it since it is the first column
+my_aa = my_aa.set_index('three_letter_code_lower') #20, 5
+
+#=========================================================
+#convert file to  dict of dicts
+#=========================================================
+#convert each row into a dict of dicts so that there are 20 aa and 5 keys within
+#with your choice of column name that you have assigned to index as the "primary key". 
+#using 'index' creates a dict of dicts
+#using 'records' creates a list of dicts
+my_aa_dict = my_aa.to_dict('index') #20, with 5 subkeys
+
+#================================================
+#dict of aa with their corresponding properties
+#This is defined twice
+#================================================
+#7 categories: no overlap
+qualities1 = { ('R', 'H', 'K'): 'Basic'
+             , ('D', 'E'): 'Acidic'
+             , ('N', 'Q'): 'Amidic'
+             , ('G', 'A', 'V', 'L', 'I', 'P'): 'Hydrophobic'
+             , ('S', 'T'): 'Hydroxylic'
+             , ('F', 'W', 'Y'): 'Aromatic'
+             , ('C', 'M'): 'Sulphur'
+}
+
+#9 categories: allowing for overlap
+qualities2 = { ('R', 'H', 'K'): 'Basic'
+             , ('D', 'E'): 'Acidc'
+             , ('S', 'T', 'N', 'Q'): 'Polar'
+             , ('V', 'I', 'L', 'M', 'F', 'Y', 'W'): 'Hydrophobic'
+             , ('S', 'T', 'H', 'N', 'Q', 'E', 'D', 'K', 'R'): 'Hydrophilic'
+             , ('S', 'G', 'A', 'P'): 'Small'
+             , ('F', 'W', 'Y', 'H'): 'Aromatic'
+             , ('V', 'I', 'L', 'M'): 'Aliphatic'
+             , ('C', 'G', 'P'): 'Special'
+}
+
+qualities_taylor = { ('R', 'H', 'K'): 'Basic'
+             , ('D', 'E'): 'Acidc'
+             , ('S', 'T', 'N', 'Q', 'C', 'Y', 'W', 'H', 'K', 'R', 'D', 'E'): 'Polar'
+             , ('V', 'I', 'L', 'M', 'F', 'Y', 'W', 'C', 'A', 'G', 'T', 'H'): 'Hydrophobic'
+             #, ('S', 'T', 'H', 'N', 'Q', 'E', 'D', 'K', 'R'): 'Hydrophilic', #C, W, y MISSING FROM POLAR!
+             , ('S', 'G', 'A', 'P', 'C', 'T', 'N', 'D', 'V'): 'Small' 
+             , ('F', 'W', 'Y', 'H'): 'Aromatic'
+             , ('V', 'I', 'L', 'M'): 'Aliphatic' #although M is not strictly in the circle!
+             , ('C', 'G', 'P'): 'Special'
+}
+
+qualities_water = { ('D', 'E', 'N', 'P', 'Q', 'R', 'S'): 'hydrophilic'
+                   , ('A', 'C', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'T', 'V', 'W', 'X', 'Y'): 'hydrophobic'
+}
+
+qualities_polarity = { ('D', 'E'): 'acidic'
+                      , ('H', 'K', 'R'): 'basic'
+                      , ('C', 'G', 'N', 'Q', 'S', 'T', 'Y'): 'neutral'
+                      , ('A', 'F', 'I', 'L', 'M', 'P', 'V', 'W'): 'non-polar'    
+}
+
+#==============================================================================                
+#adding amino acid properties to my dict of dicts                      
+for k, v in my_aa_dict.items():
+    #print (k,v)
+    v['aa_prop1'] = str() #initialise keys 
+    v['aa_prop2'] = list() #initialise keys (allows for overalpping properties)
+    v['aa_taylor'] = list() #initialise keys (allows for overalpping properties)
+    v['aa_prop_water'] = str() #initialise keys
+    v['aa_prop_polarity'] = str() #initialise keys
+    
+    for group in qualities1:
+        if v['one_letter_code'] in group:
+            v['aa_prop1']+= qualities1[group] # += for str concat   
+
+    for group in qualities2:
+        if v['one_letter_code'] in group:
+            v['aa_prop2'].append(qualities2[group]) # append to list
+ 
+    for group in qualities_taylor:
+        if v['one_letter_code'] in group:
+            v['aa_taylor'].append(qualities_taylor[group]) # append to list           
+            
+    for group in qualities_water:
+        if v['one_letter_code'] in group:
+            v['aa_prop_water']+= qualities_water[group] # += for str concat          
+
+    for group in qualities_polarity:
+        if v['one_letter_code'] in group:
+            v['aa_prop_polarity']+= qualities_polarity[group] # += for str concat 
+             
+#COMMENT:VOILA!!! my_aa_dict is now a dict of dicts containing all associated properties for each aa
+#==============================================================================
+            
+  
--- a/mk-drug-dirs.sh
+++ b/mk-drug-dirs.sh
@ -0,0 +1,40 @@
+#!/bin/bash
+# Create a drug directory structure for processing
+#
+#
+# Structure:
+#
+# $DATA_DIR/$DRUG/input
+#                 |- original
+#                 |- processed
+#                 |- structure
+#                 
+# $DATA_DIR/$DRUG/output
+#                 |- plots
+#                 |- structure
+
+DATA_DIR=~/git/Data
+
+if [[ $1 == '' ]]; then
+    echo "usage: mk-drug-dirs.sh <drug name>";
+    exit;
+else
+    DRUG=$1
+    echo Creating structure for: $DRUG
+
+    if [ -d $DATA_DIR ]
+    then
+        echo Doing creation in $DATA_DIR
+        mkdir -p $DATA_DIR/$DRUG/input/original
+        mkdir -p $DATA_DIR/$DRUG/input/processed
+        mkdir -p $DATA_DIR/$DRUG/input/structure
+        mkdir -p $DATA_DIR/$DRUG/output/plots
+        mkdir -p $DATA_DIR/$DRUG/output/structure
+        
+    else
+        echo "Error: $DATA_DIR does not exist. Did you check it out of git?"
+        exit
+    fi
+
+fi
+