From bccfe68192b50ea9089f03bfc26dca775f14a30a Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 8 Jan 2020 16:15:33 +0000
Subject: [PATCH] import commit

---
 README.md                                     |  59 ++
 mcsm.conf                                     |  12 +
 mcsm_analysis/pyrazinamide/scripts/.Rhistory  | 512 ++++++++++++++
 .../pyrazinamide/scripts/Header_TT.R          | 129 ++++
 .../scripts/barplot_colour_function.R         |  27 +
 .../pyrazinamide/scripts/combining_two_df.R   | 299 +++++++++
 .../scripts/combining_two_df_lig.R            | 348 ++++++++++
 .../mcsm/step0_check_duplicate_SNPs.sh        |  25 +
 ...ep1_mCSMLig_curl_submit_store_outputurl.sh |  72 ++
 ...step2_mCSM_LIG_batch_outputurls_results.sh |  59 ++
 ...step3a_mCSM_LIG_regex_output_formatting.sh |  52 ++
 .../scripts/mcsm/step3b_format_results.py     |  29 +
 .../scripts/mcsm/step3c_data_cleaning.R       | 207 ++++++
 .../scripts/mcsm/step4_normalise.R            | 252 +++++++
 .../scripts/mcsm_mean_stability.R             | 131 ++++
 .../pyrazinamide/scripts/plotting/.RData      | Bin 0 -> 43777 bytes
 .../pyrazinamide/scripts/plotting/.Rhistory   |   0
 .../plotting/OR_PS_Ligand_combined_plot.R     | 250 +++++++
 .../scripts/plotting/barplots_2colours_LIG.R  | 154 +++++
 .../scripts/plotting/barplots_2colours_PS.R   | 149 +++++
 .../plotting/barplots_subcolours_LIG.R        | 202 ++++++
 .../scripts/plotting/barplots_subcolours_PS.R | 192 ++++++
 .../scripts/plotting/basic_barplots_LIG.R     | 215 ++++++
 .../scripts/plotting/basic_barplots_PS.R      | 211 ++++++
 .../scripts/plotting/corr_plots_v3_PS.R       | 175 +++++
 .../scripts/plotting/corr_plots_v3_lig.R      | 187 ++++++
 .../scripts/plotting/lineage_basic_barplot.R  | 227 +++++++
 .../scripts/plotting/lineage_dist_LIG.R       | 233 +++++++
 .../scripts/plotting/lineage_dist_PS.R        | 212 ++++++
 mcsm_analysis/pyrazinamide/scripts/read_pdb.R |  27 +
 .../pyrazinamide/scripts/replaceBfactor_pdb.R | 386 +++++++++++
 .../pyrazinamide/scripts/source_data_checks.R | 257 +++++++
 meta_data_analysis/.Rhistory                  | 512 ++++++++++++++
 .../__pycache__/reference_dict.cpython-37.pyc | Bin 0 -> 2203 bytes
 meta_data_analysis/init_data_dirs.py          |   7 +
 meta_data_analysis/pnca_AF_and_OR_calcs.R     | 241 +++++++
 meta_data_analysis/pnca_data_extraction.py    | 626 ++++++++++++++++++
 meta_data_analysis/reference_dict.py          | 121 ++++
 mk-drug-dirs.sh                               |  40 ++
 39 files changed, 6837 insertions(+)
 create mode 100644 README.md
 create mode 100644 mcsm.conf
 create mode 100644 mcsm_analysis/pyrazinamide/scripts/.Rhistory
 create mode 100644 mcsm_analysis/pyrazinamide/scripts/Header_TT.R
 create mode 100644 mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R
 create mode 100644 mcsm_analysis/pyrazinamide/scripts/combining_two_df.R
 create mode 100644 mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R
 create mode 100755 mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh
 create mode 100755 mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh
 create mode 100755 mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh
 create mode 100755 mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh
 create mode 100755 mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py
 create mode 100644 mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_data_cleaning.R
 create mode 100644 mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R
 create mode 100644 mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R
 create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/.RData
 create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/.Rhistory
 create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/OR_PS_Ligand_combined_plot.R
 create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R
 create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R
 create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R
 create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R
 create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R
 create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R
 create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R
 create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R
 create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R
 create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R
 create mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
 create mode 100644 mcsm_analysis/pyrazinamide/scripts/read_pdb.R
 create mode 100644 mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R
 create mode 100644 mcsm_analysis/pyrazinamide/scripts/source_data_checks.R
 create mode 100644 meta_data_analysis/.Rhistory
 create mode 100644 meta_data_analysis/__pycache__/reference_dict.cpython-37.pyc
 create mode 100755 meta_data_analysis/init_data_dirs.py
 create mode 100644 meta_data_analysis/pnca_AF_and_OR_calcs.R
 create mode 100755 meta_data_analysis/pnca_data_extraction.py
 create mode 100755 meta_data_analysis/reference_dict.py
 create mode 100755 mk-drug-dirs.sh

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..1fe45f7
--- /dev/null
+++ b/README.md
@@ -0,0 +1,59 @@
+mCSM Analysis
+=============
+
+This repo does mCSM analysis using Python (Pandas, numpy), bash and R.
+
+Requires an additional 'Data' directory (Supplied separately). Batteries not included.
+
+ # directory struc
+
+ # Assumptions
+
+ 1. git repos are cloned to `~/git`
+ 2. The script
+
+## LSHTM\_analysis: Repo
+
+ subdirs
+
+ needs to call `../Data/input/original/`
+
+ meta\_data\_analysis/
+ 	*.R
+ 	*.py
+	
+ needs to call `../Data/input/processed/<drug>`
+ needs to output `../Data/output/results/<drug>`
+
+ mcsm\_analysis/
+	<drug>/ (generated by `meta_data_analysis/pnca_data_extraction.py`. To be replaced with command line args or config option "soon")
+		scripts/ (changed from `Scripts/`)
+		*.R
+		*.py
+			mcsm/
+			*.sh
+			*.py
+			*.R
+			plotting/
+			*.R
+
+
+Data: Repo:
+
+ # subdirs
+
+	input/
+		original/
+		processed/
+	
+	output/
+	    <drug>/ (generated by `meta_data_analysis/pnca_data_extraction.py`. To be replaced with command line args or config option "soon")
+    		results/
+        		*.csv
+        		*.xlsx
+        		*.doc
+        		*.txt
+                plots/
+                structure/
+
+More docs here as I write them.
diff --git a/mcsm.conf b/mcsm.conf
new file mode 100644
index 0000000..5310227
--- /dev/null
+++ b/mcsm.conf
@@ -0,0 +1,12 @@
+# This is not yet used, but will be soon :)
+[DEFAULT]
+mcsm_home = /home/tanu/git/github/LSHTM_analysis
+
+[pyrazinamide]
+  gene = 'pnca'
+  server = 'http://mcsm.melb.ac.uk/'
+
+[monkeyamide]
+  gene = 'abcs'
+  server = 'http://myserver.local/'
+
diff --git a/mcsm_analysis/pyrazinamide/scripts/.Rhistory b/mcsm_analysis/pyrazinamide/scripts/.Rhistory
new file mode 100644
index 0000000..8055405
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/.Rhistory
@@ -0,0 +1,512 @@
+###########################
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+# uncomment as necessary
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+my_df  = merged_df3
+#my_df = merged_df3_comp
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+# quick checks
+colnames(my_df)
+str(my_df)
+###########################
+# Data for bfactor figure
+# PS average
+# Lig average
+###########################
+head(my_df$Position)
+head(my_df$ratioDUET)
+# order data frame
+df = my_df[order(my_df$Position),]
+head(df$Position)
+head(df$ratioDUET)
+#***********
+# PS: average by position
+#***********
+mean_DUET_by_position <- df %>%
+group_by(Position) %>%
+summarize(averaged.DUET = mean(ratioDUET))
+#***********
+# Lig: average by position
+#***********
+mean_Lig_by_position <- df %>%
+group_by(Position) %>%
+summarize(averaged.Lig = mean(ratioPredAff))
+#***********
+# cbind:mean_DUET_by_position and mean_Lig_by_position
+#***********
+combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
+# sanity check
+# mean_PS_Lig_Bfactor
+colnames(combined)
+colnames(combined) = c("Position"
+, "average_DUETR"
+, "Position2"
+, "average_PredAffR")
+colnames(combined)
+identical(combined$Position, combined$Position2)
+n = which(colnames(combined) == "Position2"); n
+combined_df = combined[,-n]
+max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
+max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
+#=============
+# output csv
+#============
+outDir = "~/Data/pyrazinamide/input/processed/"
+outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
+print(paste0("Output file with path will be:","", outFile))
+head(combined_df$Position); tail(combined_df$Position)
+write.csv(combined_df, outFile
+, row.names = F)
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+source("../Header_TT.R")
+#source("barplot_colour_function.R")
+require(data.table)
+require(dplyr)
+########################################################################
+#		 Read file: call script for combining df for PS		   	   #
+########################################################################
+source("../combining_two_df.R")
+###########################
+# This will return:
+# df with NA:
+# merged_df2
+# merged_df3
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+###########################
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+###########################
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+# uncomment as necessary
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+my_df  = merged_df3
+#my_df = merged_df3_comp
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+# quick checks
+colnames(my_df)
+str(my_df)
+###########################
+# Data for bfactor figure
+# PS average
+# Lig average
+###########################
+head(my_df$Position)
+head(my_df$ratioDUET)
+# order data frame
+df = my_df[order(my_df$Position),]
+head(df$Position)
+head(df$ratioDUET)
+#***********
+# PS: average by position
+#***********
+mean_DUET_by_position <- df %>%
+group_by(Position) %>%
+summarize(averaged.DUET = mean(ratioDUET))
+#***********
+# Lig: average by position
+#***********
+mean_Lig_by_position <- df %>%
+group_by(Position) %>%
+summarize(averaged.Lig = mean(ratioPredAff))
+#***********
+# cbind:mean_DUET_by_position and mean_Lig_by_position
+#***********
+combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
+# sanity check
+# mean_PS_Lig_Bfactor
+colnames(combined)
+colnames(combined) = c("Position"
+, "average_DUETR"
+, "Position2"
+, "average_PredAffR")
+colnames(combined)
+identical(combined$Position, combined$Position2)
+n = which(colnames(combined) == "Position2"); n
+combined_df = combined[,-n]
+max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
+max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
+#=============
+# output csv
+#============
+outDir = "~/git/Data/pyrazinamide/input/processed/"
+outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
+print(paste0("Output file with path will be:","", outFile))
+head(combined_df$Position); tail(combined_df$Position)
+write.csv(combined_df, outFile
+, row.names = F)
+# read in pdb file complex1
+inDir = "~/git/Data/pyrazinamide/input/structure"
+inFile = paste0(inDir, "complex1_no_water.pdb")
+# read in pdb file complex1
+inDir = "~/git/Data/pyrazinamide/input/structure/"
+inFile = paste0(inDir, "complex1_no_water.pdb")
+complex1 = inFile
+my_pdb = read.pdb(complex1
+, maxlines = -1
+, multi = FALSE
+, rm.insert = FALSE
+, rm.alt = TRUE
+, ATOM.only = FALSE
+, hex = FALSE
+, verbose = TRUE)
+#########################
+#3: Read complex pdb file
+##########################
+source("Header_TT.R")
+# list of 8
+my_pdb = read.pdb(complex1
+, maxlines = -1
+, multi = FALSE
+, rm.insert = FALSE
+, rm.alt = TRUE
+, ATOM.only = FALSE
+, hex = FALSE
+, verbose = TRUE)
+rm(inDir, inFile)
+#====== end of script
+inDir = "~/git/Data/pyrazinamide/input/structure/"
+inFile = paste0(inDir, "complex1_no_water.pdb")
+complex1 = inFile
+complex1 = inFile
+my_pdb = read.pdb(complex1
+, maxlines = -1
+, multi = FALSE
+, rm.insert = FALSE
+, rm.alt = TRUE
+, ATOM.only = FALSE
+, hex = FALSE
+, verbose = TRUE)
+inFile
+inDir = "~/git/Data/pyrazinamide/input/structure/"
+inFile = paste0(inDir, "complex1_no_water.pdb")
+complex1 = inFile
+#inFile2 = paste0(inDir, "complex2_no_water.pdb")
+#complex2 = inFile2
+# list of 8
+my_pdb = read.pdb(complex1
+, maxlines = -1
+, multi = FALSE
+, rm.insert = FALSE
+, rm.alt = TRUE
+, ATOM.only = FALSE
+, hex = FALSE
+, verbose = TRUE)
+rm(inDir, inFile, complex1)
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
+getwd()
+source("Header_TT.R")
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
+getwd()
+########################################################################
+# 				Installing and loading required packages 			               #
+########################################################################
+source("Header_TT.R")
+#########################################################
+# TASK: replace B-factors in the pdb file with normalised values
+# use the complex file with no water as mCSM lig was
+# performed on this file. You can check it in the script: read_pdb file.
+#########################################################
+###########################
+# 2: Read file: average stability values
+# or mcsm_normalised file, output of step 4 mcsm pipeline
+###########################
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
+my_df <- read.csv(inFile
+#                  , row.names = 1
+#                  , stringsAsFactors = F
+, header = T)
+str(my_df)
+source("read_pdb.R") # list of 8
+# extract atom list into a variable
+# since in the list this corresponds to data frame, variable will be a df
+d = my_pdb[[1]]
+# make a copy: required for downstream sanity checks
+d2 = d
+# sanity checks: B factor
+max(d$b); min(d$b)
+par(oma = c(3,2,3,0)
+, mar = c(1,3,5,2)
+, mfrow = c(3,2))
+#par(mfrow = c(3,2))
+#1: Original B-factor
+hist(d$b
+, xlab = ""
+, main = "B-factor")
+plot(density(d$b)
+, xlab = ""
+, main = "B-factor")
+# 2: DUET scores
+hist(my_df$average_DUETR
+, xlab = ""
+, main = "Norm_DUET")
+plot(density(my_df$average_DUETR)
+, xlab = ""
+, main = "Norm_DUET")
+# Set the margin on all sides
+par(oma = c(3,2,3,0)
+, mar = c(1,3,5,2)
+, mfrow = c(3,2))
+#par(mfrow = c(3,2))
+#1: Original B-factor
+hist(d$b
+, xlab = ""
+, main = "B-factor")
+plot(density(d$b)
+, xlab = ""
+, main = "B-factor")
+# 2: DUET scores
+hist(my_df$average_DUETR
+, xlab = ""
+, main = "Norm_DUET")
+plot(density(my_df$average_DUETR)
+, xlab = ""
+, main = "Norm_DUET")
+#=========
+# step 1_P1
+#=========
+# Be brave and replace in place now (don't run sanity check)
+# this makes all the B-factor values in the non-matched positions as NA
+d$b = my_df$average_DUETR[match(d$resno, my_df$Position)]
+#=========
+# step 2_P1
+#=========
+# count NA in Bfactor
+b_na = sum(is.na(d$b)) ; b_na
+# count number of 0's in Bactor
+sum(d$b == 0)
+# replace all NA in b factor with 0
+d$b[is.na(d$b)] = 0
+# sanity check: should be 0
+sum(is.na(d$b))
+# sanity check: should be True
+if (sum(d$b == 0) == b_na){
+print ("Sanity check passed: NA's replaced with 0's successfully")
+} else {
+print("Error: NA replacement NOT successful, Debug code!")
+}
+max(d$b); min(d$b)
+# sanity checks: should be True
+if(max(d$b) == max(my_df$average_DUETR)){
+print("Sanity check passed: B-factors replaced correctly")
+} else {
+print ("Error: Debug code please")
+}
+if (min(d$b) == min(my_df$average_DUETR)){
+print("Sanity check passed: B-factors replaced correctly")
+} else {
+print ("Error: Debug code please")
+}
+#=========
+# step 3_P1
+#=========
+# sanity check: dim should be same before reassignment
+# should be TRUE
+dim(d) == dim(d2)
+#=========
+# step 4_P1
+#=========
+# assign it back to the pdb file
+my_pdb[[1]] = d
+max(d$b); min(d$b)
+#=========
+# step 5_P1
+#=========
+# output dir
+getwd()
+outDir = "~/git/Data/pyrazinamide/output/"
+getwd()
+outFile = paste0(outDir, "complex1_BwithNormDUET.pdb")
+outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
+outDir = "~/git/Data/pyrazinamide/input/structure"
+outDir = "~/git/Data/pyrazinamide/input/structure/"
+outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
+write.pdb(my_pdb, outFile)
+hist(d$b
+, xlab = ""
+, main = "repalced-B")
+plot(density(d$b)
+, xlab = ""
+, main = "replaced-B")
+# graph titles
+mtext(text = "Frequency"
+, side = 2
+, line = 0
+, outer = TRUE)
+mtext(text = "DUET_stability"
+, side = 3
+, line = 0
+, outer = TRUE)
+#=========================================================
+# Processing P2: Replacing  B values with PredAff Scores
+#=========================================================
+# clear workspace
+rm(list = ls())
+#=========================================================
+# Processing P2: Replacing  B values with PredAff Scores
+#=========================================================
+# clear workspace
+rm(list = ls())
+###########################
+# 2: Read file: average stability values
+# or mcsm_normalised file, output of step 4 mcsm pipeline
+###########################
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
+my_df <- read.csv("../Data/mean_PS_Lig_Bfactor.csv"
+#                  , row.names = 1
+#                  , stringsAsFactors = F
+, header = T)
+str(my_df)
+#=========================================================
+# Processing P2: Replacing B factor with mean ratioLig scores
+#=========================================================
+#########################
+# 3: Read complex pdb file
+# form the R script
+##########################
+source("read_pdb.R") # list of 8
+# extract atom list into a vari
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
+my_df <- read.csv(inFile
+#                  , row.names = 1
+#                  , stringsAsFactors = F
+, header = T)
+str(my_df)
+# extract atom list into a variable
+# since in the list this corresponds to data frame, variable will be a df
+d = my_pdb[[1]]
+# make a copy: required for downstream sanity checks
+d2 = d
+# sanity checks: B factor
+max(d$b); min(d$b)
+par(oma = c(3,2,3,0)
+, mar = c(1,3,5,2)
+, mfrow = c(3,2))
+#par(mfrow = c(3,2))
+# 1: Original B-factor
+hist(d$b
+, xlab = ""
+, main = "B-factor")
+plot(density(d$b)
+, xlab = ""
+, main = "B-factor")
+# 2: Pred Aff scores
+hist(my_df$average_PredAffR
+, xlab = ""
+, main = "Norm_lig_average")
+plot(density(my_df$average_PredAffR)
+, xlab = ""
+, main = "Norm_lig_average")
+# 3: After the following replacement
+#********************************
+par(oma = c(3,2,3,0)
+, mar = c(1,3,5,2)
+, mfrow = c(3,2))
+#par(mfrow = c(3,2))
+# 1: Original B-factor
+hist(d$b
+, xlab = ""
+, main = "B-factor")
+plot(density(d$b)
+, xlab = ""
+, main = "B-factor")
+# 2: Pred Aff scores
+hist(my_df$average_PredAffR
+, xlab = ""
+, main = "Norm_lig_average")
+plot(density(my_df$average_PredAffR)
+, xlab = ""
+, main = "Norm_lig_average")
+# 3: After the following replacement
+#********************************
+#=========
+# step 1_P2: BE BRAVE and replace in place now (don't run step 0)
+#=========
+# this makes all the B-factor values in the non-matched positions as NA
+d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)]
+#=========
+# step 2_P2
+#=========
+# count NA in Bfactor
+b_na = sum(is.na(d$b)) ; b_na
+# count number of 0's in Bactor
+sum(d$b == 0)
+# replace all NA in b factor with 0
+d$b[is.na(d$b)] = 0
+# sanity check: should be 0
+sum(is.na(d$b))
+if (sum(d$b == 0) == b_na){
+print ("Sanity check passed: NA's replaced with 0's successfully")
+} else {
+print("Error: NA replacement NOT successful, Debug code!")
+}
+max(d$b); min(d$b)
+# sanity checks: should be True
+if (max(d$b) == max(my_df$average_PredAffR)){
+print("Sanity check passed: B-factors replaced correctly")
+} else {
+print ("Error: Debug code please")
+}
+if (min(d$b) == min(my_df$average_PredAffR)){
+print("Sanity check passed: B-factors replaced correctly")
+} else {
+print ("Error: Debug code please")
+}
+#=========
+# step 3_P2
+#=========
+# sanity check: dim should be same before reassignment
+# should be TRUE
+dim(d) == dim(d2)
+#=========
+# step 4_P2
+#=========
+# assign it back to the pdb file
+my_pdb[[1]] = d
+max(d$b); min(d$b)
+#=========
+# step 5_P2
+#=========
+write.pdb(my_pdb, "Plotting/structure/complex1_BwithNormLIG.pdb")
+# output dir
+getwd()
+# output dir
+outDir = "~/git/Data/pyrazinamide/input/structure/"
+outFile = paste0(outDir, "complex1_BwithNormLIG.pdb")
+outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile
+write.pdb(my_pdb, outFile)
diff --git a/mcsm_analysis/pyrazinamide/scripts/Header_TT.R b/mcsm_analysis/pyrazinamide/scripts/Header_TT.R
new file mode 100644
index 0000000..2069fa7
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/Header_TT.R
@@ -0,0 +1,129 @@
+#########################################################
+### A) Installing and loading required packages
+#########################################################
+
+#if (!require("gplots")) {
+#  install.packages("gplots", dependencies = TRUE)
+#  library(gplots)
+#}
+
+if (!require("tidyverse")) {
+  install.packages("tidyverse", dependencies = TRUE)
+  library(tidyverse)
+}
+
+if (!require("ggplot2")) {
+  install.packages("ggplot2", dependencies = TRUE)
+  library(ggplot2)
+}
+
+if (!require("cowplot")) {
+  install.packages("copwplot", dependencies = TRUE)
+  library(ggplot2)
+}
+
+if (!require("ggcorrplot")) {
+  install.packages("ggcorrplot", dependencies = TRUE)
+  library(ggcorrplot)
+}
+
+if (!require("ggpubr")) {
+  install.packages("ggpubr", dependencies = TRUE)
+  library(ggpubr)
+}
+
+if (!require("RColorBrewer")) {
+  install.packages("RColorBrewer", dependencies = TRUE)
+  library(RColorBrewer)
+}
+
+if (!require ("GOplot")) {
+  install.packages("GOplot")
+  library(GOplot)
+}
+
+if(!require("VennDiagram")) {
+  
+  install.packages("VennDiagram", dependencies = T)
+  library(VennDiagram)
+}
+
+if(!require("scales")) {
+  
+  install.packages("scales", dependencies = T)
+  library(scales)
+}
+
+if(!require("plotrix")) {
+  
+  install.packages("plotrix", dependencies = T)
+  library(plotrix)
+}
+
+if(!require("stats")) {
+  
+  install.packages("stats", dependencies = T)
+  library(stats)
+}
+
+if(!require("stats4")) {
+  
+  install.packages("stats4", dependencies = T)
+  library(stats4)
+}
+
+if(!require("data.table")) {
+  library(stats4)
+}
+
+if (!require("PerformanceAnalytics")){
+  install.packages("PerformanceAnalytics", dependencies = T)
+  library(PerformaceAnalytics)
+}
+
+if (!require ("GGally")){
+  install.packages("GGally")
+  library(GGally)
+}
+
+if (!require ("corrr")){
+  install.packages("corrr")
+  library(corrr)
+}
+
+if (!require ("psych")){
+  install.packages("psych")
+  library(psych)
+}
+
+if (!require ("dplyr")){
+  install.packages("dplyr")
+  library(psych)
+}
+
+
+if (!require ("compare")){
+  install.packages("compare")
+  library(psych)
+}
+
+if (!require ("arsenal")){
+  install.packages("arsenal")
+  library(psych)
+}
+
+
+####TIDYVERSE
+# Install
+#if(!require(devtools)) install.packages("devtools")
+#devtools::install_github("kassambara/ggcorrplot")
+
+library(ggcorrplot)
+
+
+###for PDB files
+#install.packages("bio3d") 
+if(!require(bio3d)){
+  install.packages("bio3d")
+  library(bio3d)
+}
diff --git a/mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R b/mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R
new file mode 100644
index 0000000..a3cc403
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R
@@ -0,0 +1,27 @@
+#########################################################
+# 1b: Define function: coloured barplot by subgroup
+# LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar
+#########################################################
+
+ColourPalleteMulti <- function(df, group, subgroup){
+  
+  # Find how many colour categories to create and the number of colours in each
+  categories <- aggregate(as.formula(paste(subgroup, group, sep="~" ))
+                          , df
+                          , function(x) length(unique(x)))
+  #  return(categories) }
+  
+  category.start <- (scales::hue_pal(l = 100)(nrow(categories))) # Set the top of the colour pallete
+  
+  category.end  <- (scales::hue_pal(l = 40)(nrow(categories))) # set the bottom
+  
+  #return(category.start); return(category.end)}
+  
+  # Build Colour pallette
+  colours <- unlist(lapply(1:nrow(categories),
+                           function(i){
+                             colorRampPalette(colors = c(category.start[i]
+                                                         , category.end[i]))(categories[i,2])}))
+  return(colours)
+}
+#########################################################
\ No newline at end of file
diff --git a/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R b/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R
new file mode 100644
index 0000000..57e476a
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R
@@ -0,0 +1,299 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/")
+getwd()
+
+#########################################################
+# TASK: To combine mcsm and meta data with af and or
+#########################################################
+
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("Header_TT.R")
+#require(data.table)
+#require(arsenal)
+#require(compare)
+#library(tidyverse)
+
+#################################
+# Read file: normalised file
+# output of step 4 mcsm_pipeline
+#################################
+
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
+
+mcsm_data = read.csv(inFile
+                     , row.names = 1
+                     , stringsAsFactors = F
+                     , header = T) 
+rm(inDir, inFile)
+
+str(mcsm_data)
+
+table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
+
+# spelling Correction 1: DUET
+mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising'
+mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising'
+
+# checks: should be the same as above
+table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
+head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)
+
+# spelling Correction 2: Ligand
+table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
+
+mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising'
+mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising'
+
+# checks: should be the same as above
+table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
+head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)
+
+# count na in each column
+na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count
+
+# sort by Mutationinformation
+mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),]
+head(mcsm_data$Mutationinformation)
+
+# get freq count of positions and add to the df
+setDT(mcsm_data)[, occurrence := .N, by = .(Position)] 
+
+pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence)
+
+###########################
+# 2: Read file: meta data with AFandOR
+###########################
+
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile2 = paste0(inDir, "meta_data_with_AFandOR.csv"); inFile2
+
+meta_with_afor <- read.csv(inFile2
+                      , stringsAsFactors = F
+                      , header = T)
+
+rm(inDir, inFile2)
+
+str(meta_with_afor)
+
+# sort by Mutationinformation
+head(meta_with_afor$Mutationinformation)
+meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),]
+head(meta_with_afor$Mutationinformation)
+
+# sanity check: should be True for all the mentioned columns
+#is.numeric(meta_with_afor$OR)
+na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue")
+
+c1 = NULL
+for (i in na_var){
+  print(i)
+  c0 = is.numeric(meta_with_afor[,i])
+  c1 = c(c0, c1)
+  if ( all(c1) ){
+    print("Sanity check passed: These are all numeric cols")
+  } else{
+    print("Error: Please check your respective data types")
+  }
+}
+
+# If OR, and P value are not numeric, then convert to numeric and then count
+# else they will say 0
+na_count = sapply(meta_with_afor, function(y) sum(length(which(is.na(y))))); na_count
+str(na_count)
+
+# compare if the No of "NA" are the same for all these cols
+na_len = NULL
+for (i in na_var){
+  temp = na_count[[i]]
+  na_len = c(na_len, temp)
+}
+
+# extract how many NAs: 
+# should be all TRUE 
+# should be a single number since 
+# all the cols should have "equal" and "same" no. of NAs
+
+my_nrows = NULL
+for ( i in 1: (length(na_len)-1) ){
+  #print(compare(na_len[i]), na_len[i+1])
+  c = compare(na_len[i], na_len[i+1])
+  if ( c$result ) {
+    my_nrows = na_len[i] }
+  else { 
+  print("Error: Please check your numbers") 
+  }
+}
+
+my_nrows
+
+#=#=#=#=#=#=#=#=#
+# COMMENT: AF, OR, pvalue, logor and neglog10pvalue
+# these are the same 7 ones
+#=#=#=#=#=#=#=#=#
+
+# sanity check
+#which(is.na(meta_with_afor$OR)) 
+
+# initialise an empty df with nrows as extracted above
+na_count_df = data.frame(matrix(vector(mode = 'numeric'
+#                                       , length = length(na_var)
+                                       )
+                                , nrow = my_nrows
+#                                , ncol = length(na_var)
+                              ))
+
+# populate the df with the indices of the cols that are NA
+for (i in na_var){
+  print(i)
+  na_i = which(is.na(meta_with_afor[i]))
+  na_count_df = cbind(na_count_df, na_i)
+  colnames(na_count_df)[which(na_var == i)] <- i
+}
+
+# Now compare these indices to ensure these are the same
+c2 = NULL
+for ( i in 1: ( length(na_count_df)-1 ) ) {
+#  print(na_count_df[i] == na_count_df[i+1])
+  c1 = identical(na_count_df[[i]], na_count_df[[i+1]])
+  c2 = c(c1, c2)
+  if ( all(c2) ) {
+    print("Sanity check passed: The indices for AF, OR, etc are all the same")
+  } else {
+    print ("Error: Please check indices which are NA")
+  }
+}
+
+rm( c, c0, c1, c2, i, my_nrows
+    , na_count, na_i, na_len
+    , na_var, temp
+    , na_count_df
+    , pos_count_check )
+
+###########################
+# 3:merging two dfs: with NA
+###########################
+
+# link col name  = Mutationinforamtion
+head(mcsm_data$Mutationinformation)
+head(meta_with_afor$Mutationinformation)
+
+#########
+# merge 1a: meta data with mcsm
+#########
+merged_df2 = merge(x = meta_with_afor
+                  ,y = mcsm_data
+                  , by = "Mutationinformation"
+                  , all.y = T)
+
+head(merged_df2$Position)
+
+# sort by Position
+head(merged_df2$Position)
+merged_df2 = merged_df2[order(merged_df2$Position),]
+head(merged_df2$Position)
+
+merged_df2v2 = merge(x = meta_with_afor
+                   ,y = mcsm_data
+                   , by = "Mutationinformation"
+                   , all.x = T) 
+#!=!=!=!=!=!=!=!
+# COMMENT: used all.y since position 186 is not part of the struc,
+# hence doesn't have a mcsm value
+# but 186 is associated with with mutation
+#!=!=!=!=!=!=!=!
+
+# should  be False
+identical(merged_df2, merged_df2v2)
+table(merged_df2$Position%in%merged_df2v2$Position)
+
+rm(merged_df2v2)
+
+#########
+# merge 1b:remove duplicate mutation information
+#########
+
+#==#=#=#=#=#=#
+# Cannot trust lineage, country from this df as the same mutation
+# can have many different lineages
+# but this should be good for the numerical corr plots
+#=#=#=#=#=#=#=
+merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),] 
+head(merged_df3$Position); tail(merged_df3$Position) # should be sorted
+
+# sanity checks
+# nrows of merged_df3 should be the same as the nrows of mcsm_data
+if(nrow(mcsm_data) == nrow(merged_df3)){
+  print("sanity check: Passed")
+} else {
+  print("Error!: check data, nrows is not as expected")
+}
+
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+# uncomment as necessary
+# only need to run this if merged_df2v2 i.e non structural pos included
+#mcsm = mcsm_data$Mutationinformation
+#my_merged = merged_df3$Mutationinformation
+
+# find the index where it differs
+#diff_n = which(!my_merged%in%mcsm)
+
+#check if it is indeed pos 186
+#merged_df3[diff_n,]
+
+# remove this entry
+#merged_df3 = merged_df3[-diff_n,]]
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+###########################
+# 3b :merging two dfs: without NA
+###########################
+
+#########
+# merge 2a:same as merge 1 but excluding NA
+#########
+merged_df2_comp = merged_df2[!is.na(merged_df2$AF),] 
+
+#########
+# merge 2b: remove duplicate mutation information
+#########
+merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),] 
+
+# alternate way of deriving merged_df3_comp
+foo = merged_df3[!is.na(merged_df3$AF),]
+# compare dfs: foo and merged_df3_com
+all.equal(foo, merged_df3)
+
+summary(comparedf(foo, merged_df3))
+
+#=============== end of combining df
+#clear variables
+rm(mcsm_data
+   , meta_with_afor
+   , foo)
+
+#rm(diff_n, my_merged, mcsm)
+
+#=====================
+# write_output files
+#=====================
+# output dir
+outDir = "~/git/Data/pyrazinamide/output/"
+getwd()
+
+outFile1 = paste0(outDir, "merged_df3.csv"); outFile1
+write.csv(merged_df3, outFile1)
+
+#outFile2 = paste0(outDir, "merged_df3_comp.csv"); outFile2
+#write.csv(merged_df3_comp, outFile2)
+
+rm(outDir
+   , outFile1
+#   , outFile2
+)
+#============================= end of script
+
diff --git a/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R b/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R
new file mode 100644
index 0000000..1dccd65
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R
@@ -0,0 +1,348 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/")
+getwd()
+
+#########################################################
+# TASK: To combine mcsm and meta data with af and or
+# by filtering for distance to ligand (<10Ang)
+#########################################################
+
+#########################################################
+# Installing and loading required packages
+#########################################################
+
+#source("Header_TT.R")
+#require(data.table)
+#require(arsenal)
+#require(compare)
+#library(tidyverse)
+
+#################################
+# Read file: normalised file
+# output of step 4 mcsm_pipeline
+#################################
+
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
+
+mcsm_data = read.csv(inFile
+                     , row.names = 1
+                     , stringsAsFactors = F
+                     , header = T) 
+rm(inDir, inFile)
+
+str(mcsm_data)
+
+table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
+
+# spelling Correction 1: DUET
+mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising'
+mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising'
+
+# checks
+table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
+head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)
+
+# spelling Correction 2: Ligand
+table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
+
+mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising'
+mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising'
+
+# checks: should be the same as above
+table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
+head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)
+
+########################### !!! only for mcsm_lig
+# 4: Filter/subset data 
+# Lig plots < 10Ang
+# Filter the lig plots for Dis_to_lig < 10Ang
+###########################
+
+# check range of distances
+max(mcsm_data$Dis_lig_Ang)
+min(mcsm_data$Dis_lig_Ang)
+
+# count
+table(mcsm_data$Dis_lig_Ang<10)
+
+# subset data to have only values less than 10 Ang
+mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10)
+
+# sanity checks
+max(mcsm_data2$Dis_lig_Ang)
+min(mcsm_data2$Dis_lig_Ang)
+
+# count no of unique positions
+length(unique(mcsm_data2$Position))
+
+# count no of unique mutations
+length(unique(mcsm_data2$Mutationinformation))
+
+# count Destabilisinga and stabilising
+table(mcsm_data2$Lig_outcome) #{RESULT: no of mutations within 10Ang}
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT: so as not to alter the script
+mcsm_data = mcsm_data2
+#<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+#############################
+# Extra sanity check:
+# for mcsm_lig ONLY
+# Dis_lig_Ang should be <10
+#############################
+
+if (max(mcsm_data$Dis_lig_Ang) < 10){
+  print ("Sanity check passed: lig data is <10Ang")
+}else{
+  print ("Error: data should be filtered to be within 10Ang")
+}
+
+# clear variables
+rm(mcsm_data2)
+
+# count na in each column
+na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count
+
+head(mcsm_data$Mutationinformation)
+mcsm_data[mcsm_data$Mutationinformation=="Q10P",]
+mcsm_data[mcsm_data$Mutationinformation=="L4S",]
+
+# sort by Mutationinformation
+mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),]
+head(mcsm_data$Mutationinformation)
+
+# check
+mcsm_data[grep("Q10P", mcsm_data$Mutationinformation),]
+mcsm_data[grep("A102T", mcsm_data$Mutationinformation),]
+
+# get freq count of positions and add to the df
+setDT(mcsm_data)[, occurrence := .N, by = .(Position)]
+
+pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence)
+
+###########################
+# 2: Read file: meta data with AFandOR
+###########################
+
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile2 = paste0(inDir, "meta_data_with_AFandOR.csv"); inFile2
+
+meta_with_afor <- read.csv(inFile2
+                      , stringsAsFactors = F
+                      , header = T)
+
+str(meta_with_afor)
+
+# sort by Mutationinformation
+head(meta_with_afor$Mutationinformation)
+meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),]
+head(meta_with_afor$Mutationinformation)
+
+# sanity check: should be True for all the mentioned columns
+#is.numeric(meta_with_afor$OR)
+na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue")
+
+c1 = NULL
+for (i in na_var){
+  print(i)
+  c0 = is.numeric(meta_with_afor[,i])
+  c1 = c(c0, c1)
+  if ( all(c1) ){
+    print("Sanity check passed: These are all numeric cols")
+  } else{
+    print("Error: Please check your respective data types")
+  }
+}
+
+# If OR, and P value are not numeric, then convert to numeric and then count
+# else they will say 0
+
+# NOW count na in each column: if you did it before, then 
+# OR and Pvalue column would say 0 na since these were not numeric
+na_count = sapply(meta_with_afor, function(y) sum(length(which(is.na(y))))); na_count
+str(na_count)
+
+# compare if the No of "NA" are the same for all these cols
+na_len = NULL
+na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue")
+for (i in na_var){
+  temp = na_count[[i]]
+  na_len = c(na_len, temp)
+}
+
+my_nrows = NULL
+
+for ( i in 1: (length(na_len)-1) ){
+  #print(compare(na_len[i]), na_len[i+1])
+  c = compare(na_len[i], na_len[i+1])
+  if ( c$result ) {
+    my_nrows = na_len[i] }
+  else { 
+    print("Error: Please check your numbers") 
+  }
+}
+
+my_nrows
+
+#=#=#=#=#=#=#=#=#
+# COMMENT: AF, OR, pvalue, logor and neglog10pvalue
+# all have 81 NA, with pyrazinamide with 960
+# and these are the same 7 ones
+#=#=#=#=#=#=#=#=#
+
+# sanity check
+#which(is.na(meta_with_afor$OR)) 
+
+# initialise an empty df with nrows as extracted above
+na_count_df = data.frame(matrix(vector(mode = 'numeric'
+#                                      , length = length(na_var) 
+                                      )
+                                , nrow = my_nrows
+#                                , ncol = length(na_var)
+                                ))
+
+# populate the df with the indices of the cols that are NA
+for (i in na_var){
+  print(i)
+  na_i = which(is.na(meta_with_afor[i]))
+  na_count_df = cbind(na_count_df, na_i)
+  colnames(na_count_df)[which(na_var == i)] <- i
+} 
+
+# Now compare these indices to ensure these are the same
+c2 = NULL
+for ( i in 1: ( length(na_count_df)-1 ) ) {
+  #  print(na_count_df[i] == na_count_df[i+1])
+  c1 = identical(na_count_df[[i]], na_count_df[[i+1]])
+  c2 = c(c1, c2)
+  if ( all(c2) ) {
+    print("Sanity check passed: The indices for AF, OR, etc are all the same")
+  } else {
+    print ("Error: Please check indices which are NA")
+  }
+}
+
+rm( c, c1, c2, i, my_nrows
+    , na_count, na_i, na_len
+    , na_var, temp
+    , na_count_df
+    , pos_count_check )
+
+###########################
+# 3:merging two dfs: with NA
+###########################
+
+# link col name  = Mutationinforamtion
+head(mcsm_data$Mutationinformation)
+head(meta_with_afor$Mutationinformation)
+
+#########
+# merge 1a: meta data with mcsm
+#########
+merged_df2 = merge(x = meta_with_afor
+                  , y = mcsm_data
+                  , by = "Mutationinformation"
+                  , all.y = T)
+
+head(merged_df2$Position)
+
+# sort by Position
+head(merged_df2$Position)
+merged_df2 = merged_df2[order(merged_df2$Position),]
+head(merged_df2$Position)
+
+merged_df2v2 = merge(x = meta_with_afor
+                   ,y = mcsm_data
+                   , by = "Mutationinformation"
+                   , all.x = T) 
+
+#!=!=!=!=!=!=!=!
+# COMMENT: used all.y since position 186 is not part of the struc,
+# hence doesn't have a mcsm value
+# but 186 is associated with with mutation
+#!=!=!=!=!=!=!=!
+
+# should  be False
+identical(merged_df2, merged_df2v2)
+table(merged_df2$Position%in%merged_df2v2$Position)
+
+rm(merged_df2v2)
+
+#########
+# merge 1b:remove duplicate mutation information
+#########
+
+#==#=#=#=#=#=#
+# Cannot trust lineage, country from this df as the same mutation
+# can have many different lineages
+# but this should be good for the numerical corr plots
+#=#=#=#=#=#=#=
+merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),] 
+head(merged_df3$Position) ; tail(merged_df3$Position) # should be sorted
+
+# sanity checks
+# nrows of merged_df3 should be the same as the nrows of mcsm_data
+if(nrow(mcsm_data) == nrow(merged_df3)){
+  print("sanity check: Passed")
+} else {
+  print("Error!: check data, nrows is not as expected")
+}
+
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+# uncomment as necessary
+# only need to run this if merged_df2v2 i.e non structural pos included
+#mcsm = mcsm_data$Mutationinformation
+#my_merged = merged_df3$Mutationinformation
+
+# find the index where it differs
+#diff_n = which(!my_merged%in%mcsm)
+
+#check if it is indeed pos 186
+#merged_df3[diff_n,]
+
+# remove this entry
+#merged_df3 = merged_df3[-diff_n,] 
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+###########################
+# 3b :merging two dfs: without NA
+###########################
+
+#########
+# merge 2a:same as merge 1 but excluding NA
+#########
+merged_df2_comp = merged_df2[!is.na(merged_df2$AF),]
+
+#########
+# merge 2b: remove duplicate mutation information
+#########
+merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),]
+
+# FIXME: add this as a sanity check. I have manually checked!
+
+# alternate way of deriving merged_df3_comp
+foo = merged_df3[!is.na(merged_df3$AF),]
+
+# compare dfs: foo and merged_df3_com
+all.equal(foo, merged_df3)
+
+summary(comparedf(foo, merged_df3))
+
+#=============== end of combining df
+#clear variables
+rm(mcsm_data
+   , meta_with_afor
+   , foo)
+
+#rm(diff_n, my_merged, mcsm)
+
+#===============end of script
+
+#=====================
+# write_output files
+#=====================
+ 
+# Not required as this is a subset of the "combining_two_df.R" script
+
diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh
new file mode 100755
index 0000000..22fada7
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+#*************************************
+# need to be in the correct directory
+#*************************************
+##: comments for code
+#: commented out code
+
+#**********************************************************************
+# TASK: Text file containing a list of SNPs; SNP in the format(C2E)
+# per line. Sort by unique, which automatically removes duplicates.
+# sace file in current directory
+#**********************************************************************
+infile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2.csv"
+outfile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2_unique.csv"
+
+# sort unique entries and output to current directory
+sort -u ${infile} > ${outfile}
+
+# count no. of unique snps mCSM will run on
+count=$(wc -l < ${outfile})
+
+# print to console no. of unique snps mCSM will run on
+echo "${count} unique mutations for mCSM to run on"
+
diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh
new file mode 100755
index 0000000..87f4265
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+#*************************************
+#need to be in the correct directory
+#*************************************
+##: comments for code
+#: commented out code
+
+#**********************************************************************
+# TASK: submit requests using curl: HANDLE redirects and refresh url. 
+# Iterate over mutation file and write/append result urls to a file
+# result url file: stored in the /Results directory
+# mutation file: one mutation per line, no chain ID
+# output: in a file, should be n urls (n=no. of mutations in file)
+# NOTE: these are just result urls, not actual values for results
+#**********************************************************************
+## iterate over mutation file; line by line and submit query using curl
+filename="../Data/pnca_mis_SNPs_v2_unique.csv"
+
+## some useful messages
+echo -n -e "Processing $(wc -l < ${filename}) entries from ${filename}\n"
+COUNT=0
+while read -r line; do
+((COUNT++))
+    mutation="${line}"
+#    echo "${mutation}"
+pdb='../Data/complex1_no_water.pdb'
+mutation="${mutation}"
+chain="A"
+lig_id="PZA"
+affin_wt="0.99"
+host="http://biosig.unimelb.edu.au"
+call_url="/mcsm_lig/prediction"
+
+##=========================================
+##html field_names names required for curl
+##complex_field:wild=@
+##mutation_field:mutation=@
+##chain_field:chain=@
+##ligand_field:lig_id@
+##energy_field:affin_wt
+#=========================================
+refresh_url=$(curl -L \
+     -sS \
+     -F "wild=@${pdb}" \
+     -F "mutation=${mutation}" \
+     -F "chain=${chain}" \
+     -F "lig_id=${lig_id}" \
+     -F "affin_wt=${affin_wt}" \
+     ${host}${call_url} | grep "http-equiv")
+
+#echo $refresh_url
+#echo ${host}${refresh_url}
+
+#use regex to extract the relevant bit from the refresh url
+#regex:sed -r 's/.*(\/mcsm.*)".*$/\1/g'
+
+#Now build: result url using host and refresh url and write the urls to a file in the Results dir
+result_url=$(echo $refresh_url | sed -r 's/.*(\/mcsm.*)".*$/\1/g')
+sleep 10
+
+echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${filename})..."
+
+echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_mCSM_lig_complex1_result_url.txt
+#echo -n '.'
+done < "${filename}"
+
+echo
+echo "Processing Complete"
+
+##end of submitting query, receiving result url and storing results url in a file
+
diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh
new file mode 100755
index 0000000..e250fe8
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+#*************************************
+#need to be in the correct directory
+#*************************************
+##: comments for code
+#: commented out code
+
+#********************************************************************
+# TASK: submit result urls and fetch actual results using curl
+# iterate over each result url from the output of step1 in the stored
+# in file in /Results.
+# Use curl to fetch results and extract relevant sections using hxtools
+# and store these in another file in /Results 
+# This script takes two arguments:
+# 	input file: file containing results url
+#				In this case: 336_mCSM_lig_complex1_result_url.txt
+# 	output file: name of the file where extracted results will be stored
+#				In this case : it is 336_mCSM_lig_complex1_output_MASTER.txt
+#*********************************************************************
+
+#if [ "$#" -ne 2 ]; then
+  #if [ -Z $1 ]; then
+#  echo "
+#  Please provide both Input and Output files.
+
+#  Usage: batch_read_urls.sh INFILE OUTFILE
+#  "
+#  exit 1
+#fi
+
+# First argument: Input File
+# Second argument: Output File
+#infile=$1
+#outfile=$2
+
+infile="${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_result_url.txt"
+outfile="${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_output_MASTER.txt"
+
+echo -n "Processing $(wc -l < ${infile}) entries from ${infile}"
+echo
+COUNT=0
+while read -r line; do
+#COUNT=$(($COUNT+1))
+((COUNT++))
+  curl --silent ${line} \
+    | hxnormalize -x \
+    | hxselect -c div.span4 \
+    | hxselect -c div.well \
+    | sed -r -e 's/<[^>]*>//g' \
+    | sed -re 's/ +//g' \
+    >> ${outfile}
+  #| tee -a ${outfile}
+#  echo -n '.'
+echo -e "Processing entry ${COUNT}/$(wc -l < ${infile})..."  
+  
+done < "${infile}"
+
+echo
+echo "Processing Complete"
diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh
new file mode 100755
index 0000000..78dbdf5
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+#*************************************
+#need to be in the correct directory
+#*************************************
+##: comments for code
+#: commented out code
+
+#********************************************************************
+# TASK: Intermediate results processing
+# output file has a convenient delimiter of ":" that can be used to 
+# format the file into two columns (col1: field_desc and col2: values)
+# However the section "PredictedAffinityChange:...." and 
+# "DUETstabilitychange:.." are split over multiple lines and 
+# prevent this from happening.Additionally there are other empty lines
+# that need to be omiited. In order ensure these sections are not split
+# over multiple lines, this script is written.
+#*********************************************************************
+
+infile="../Results/336_mCSM_lig_complex1_output_processed.txt"
+
+#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${infile} \
+# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${infile}
+
+# Outputs records separated by a newline, that look something like this:
+# PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing
+# Mutationinformation:
+# Wild-type:L
+# Position:4
+# Mutant-type:W
+# Chain:A
+# LigandID:PZA
+# Distancetoligand:15.911&Aring;
+# DUETstabilitychange:-2.169Kcal/mol
+# 
+# PredictedAffinityChange:-1.538log(affinityfoldchange)-Destabilizing
+# (...etc)
+
+# This script brings everything in a convenient format for further processing in python.
+# bear in mind, this replaces the file in place, so make sure you retain a copy for your records
+sed -i '/PredictedAffinityChange/ {
+N
+N
+N
+N
+s/\n//g
+}
+/DUETstabilitychange:/ {
+N
+N
+s/\n//g
+}
+/^$/d' ${infile}
diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py
new file mode 100755
index 0000000..a780576
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py
@@ -0,0 +1,29 @@
+#!/usr/bin/python
+import pandas as pd
+from collections import defaultdict
+
+#file = r'../Results/322_mCSM_lig_complex1_output_processed.txt'
+
+outCols=[
+        'PredictedAffinityChange',
+        'Mutationinformation',
+        'Wild-type',
+        'Position',
+        'Mutant-type',
+        'Chain',
+        'LigandID',
+        'Distancetoligand',
+        'DUETstabilitychange'
+        ]
+
+lines = [line.rstrip('\n') for line in open('../Results/336_mCSM_lig_complex1_output_processed.txt')]
+
+outputs = defaultdict(list)
+
+for item in lines:
+	col, val = item.split(':')
+	outputs[col].append(val)
+
+dfOut=pd.DataFrame(outputs)
+
+pd.DataFrame.to_csv(dfOut,'../Results/336_complex1_formatted_results.csv', columns=outCols)
diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_data_cleaning.R b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_data_cleaning.R
new file mode 100644
index 0000000..4876b5e
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_data_cleaning.R
@@ -0,0 +1,207 @@
+getwd()
+#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
+setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
+#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
+getwd()
+
+#=======================================================
+#TASK: To tidy the columns so you can generate figures
+#=======================================================
+####################
+#### read file #####: this will be the output from python script (csv file)
+####################
+data = read.csv("336_complex1_formatted_results.csv"
+              , header = T
+              , stringsAsFactors = FALSE)
+dim(data)
+#335, 10
+str(data)
+
+###########################
+##### Data processing #####
+###########################
+
+# populate mutation information columns as currently it is empty
+head(data$Mutationinformation)
+tail(data$Mutationinformation)
+
+# should not be blank: create muation information
+data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.type)
+
+head(data$Mutationinformation)
+tail(data$Mutationinformation)
+#write.csv(data, 'test.csv')
+##########################################
+# Remove duplicate SNPs as a sanity check
+##########################################
+#very important
+table(duplicated(data$Mutationinformation))
+#FALSE   
+#335
+
+#extract duplicated entries
+dups = data[duplicated(data$Mutationinformation),] #0
+
+#No of dups should match with the no. of TRUE in the above table 
+#u_dups = unique(dups$Mutationinformation) #10
+sum( table(dups$Mutationinformation) ) #13
+
+rm(dups)
+
+#***************************************************************
+#select non-duplicated SNPs and create a new df
+df = data[!duplicated(data$Mutationinformation),] #309, 10
+#***************************************************************
+#sanity check
+u = unique(df$Mutationinformation)
+u2 = unique(data$Mutationinformation)
+table(u%in%u2)
+#TRUE 
+#309 
+#should all be 1, hence 309 1's
+sum(table(df$Mutationinformation) == 1)
+
+#sort df by Position
+#MANUAL CHECKPOINT:  
+#foo <- df[order(df$Position),]
+#df <- df[order(df$Position),]
+
+rm(u, u2, dups)
+
+####################
+#### give meaningful colnames to reflect units to enable correct data type
+####################
+
+#=======
+#STEP 1
+#========
+#make a copy of the PredictedAffinityColumn and call it Lig_outcome
+df$Lig_outcome = df$PredictedAffinityChange #335, 11
+
+#make Predicted...column numeric and outcome column categorical
+head(df$PredictedAffinityChange)
+df$PredictedAffinityChange = gsub("log.*"
+                                  , ""
+                                  , df$PredictedAffinityChange)
+
+#sanity checks
+head(df$PredictedAffinityChange)
+
+#should be numeric, check and if not make it numeric
+is.numeric( df$PredictedAffinityChange )
+#change to numeric
+df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
+#should be TRUE
+is.numeric( df$PredictedAffinityChange )
+
+#change the column name to indicate units
+n = which(colnames(df) == "PredictedAffinityChange"); n
+colnames(df)[n] = "PredAffLog"
+colnames(df)[n]
+
+#========
+#STEP 2
+#========
+#make Lig_outcome column categorical showing effect of mutation
+head(df$Lig_outcome)
+df$Lig_outcome = gsub("^.*-"
+                  , "",
+                  df$Lig_outcome)
+#sanity checks
+head(df$Lig_outcome)
+#should be factor, check and if not change it to factor
+is.factor(df$Lig_outcome) 
+#change to factor
+df$Lig_outcome = as.factor(df$Lig_outcome)
+#should be TRUE
+is.factor(df$Lig_outcome) 
+
+#========
+#STEP 3
+#========
+#gsub
+head(df$Distancetoligand)
+df$Distancetoligand = gsub("&Aring;"
+                           , ""
+                           , df$Distancetoligand)
+#sanity checks
+head(df$Distancetoligand)
+#should be numeric, check if not change it to numeric
+is.numeric(df$Distancetoligand)
+#change to numeric
+df$Distancetoligand = as.numeric(df$Distancetoligand)
+#should be TRUE
+is.numeric(df$Distancetoligand)
+
+#change the column name to indicate units
+n = which(colnames(df) == "Distancetoligand")
+colnames(df)[n] <- "Dis_lig_Ang"
+colnames(df)[n]
+
+#========
+#STEP 4
+#========
+#gsub
+head(df$DUETstabilitychange)
+df$DUETstabilitychange = gsub("Kcal/mol"
+                              , ""
+                              , df$DUETstabilitychange)
+#sanity checks
+head(df$DUETstabilitychange)
+#should be numeric, check if not change it to numeric
+is.numeric(df$DUETstabilitychange)
+#change to numeric 
+df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
+#should be TRUE
+is.numeric(df$DUETstabilitychange)
+
+#change the column name to indicate units
+n = which(colnames(df) == "DUETstabilitychange"); n
+colnames(df)[n] = "DUETStability_Kcalpermol"
+colnames(df)[n]
+
+#========
+#STEP 5
+#========
+#create yet another extra column: classification of DUET stability only
+df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0
+                         , "Stabilizing"
+                         , "Destabilizing")  #335, 12
+
+table(df$Lig_outcome)
+#Destabilizing   Stabilizing 
+#281             54 
+
+table(df$DUET_outcome)
+#Destabilizing   Stabilizing 
+#288             47 
+#==============================
+#FIXME
+#Insert a venn diagram
+
+#================================
+
+
+#========
+#STEP 6
+#========
+# assign wild and mutant colnames correctly
+
+wt = which(colnames(df) == "Wild.type"); wt
+colnames(df)[wt] <- "Wild_type"
+colnames(df[wt])
+
+mut = which(colnames(df) == "Mutant.type"); mut
+colnames(df)[mut] <- "Mutant_type"
+colnames(df[mut])
+
+#========
+#STEP 7
+#========
+#create an extra column: maybe useful for some plots
+df$WildPos = paste0(df$Wild_type, df$Position) #335, 13
+
+#clear variables
+rm(n, wt, mut)
+
+################ end of data cleaning
diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R b/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R
new file mode 100644
index 0000000..4721e29
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R
@@ -0,0 +1,252 @@
+getwd()
+#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
+setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
+#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
+getwd()
+
+#=======================================================
+#TASK:read cleaned data and perform rescaling
+  # of DUET stability scores
+  # of Pred affinity
+#compare scaling methods with plots
+#output normalised file
+#=======================================================
+
+####################
+#### read file #####: this will be the output of my R script that cleans the data columns
+####################
+source("../Scripts/step3c_data_cleaning.R")
+##This will outut two dataframes:
+##data: unclean data: 335, 10
+##df : cleaned df 335, 13
+## you can remove data if you want as you will not need it
+rm(data)
+
+colnames(df)
+
+#===================
+#3a: PredAffLog
+#===================
+n = which(colnames(df) == "PredAffLog"); n
+group = which(colnames(df) == "Lig_outcome"); group 
+
+#===================================================
+# order according to PredAffLog values
+#===================================================
+# This is because this makes it easier to see the results of rescaling for debugging
+head(df$PredAffLog)
+
+#ORDER BY PredAff scrores: negative  values at the top and positive at the bottoom
+df = df[order(df$PredAffLog),] 
+head(df$PredAffLog)
+
+#sanity checks
+head(df[,n]) #all negatives
+tail(df[,n]) #all positives
+
+#sanity checks
+mean(df[,n])
+#-0.9526746
+
+tapply(df[,n], df[,group], mean)
+#Destabilizing   Stabilizing 
+#-1.2112100      0.3926667 
+#===========================
+#Same as above: in 2 steps
+#===========================
+
+#find range of your data
+my_min = min(df[,n]); my_min #-3.948
+my_max = max(df[,n]); my_max #2.23
+
+#===============================================
+# WITHIN GROUP rescaling 2: method "ratio"
+# create column to store the rescaled values
+# Rescaling separately (Less dangerous) 
+#       =====> chosen one:as Nick prefers
+#===============================================
+df$ratioPredAff = ifelse(df[,n] < 0
+                      , df[,n]/abs(my_min)
+                      , df[,n]/my_max
+                      )#335 14
+#sanity checks
+head(df$ratioPredAff)
+tail(df$ratioPredAff)
+
+min(df$ratioPredAff); max(df$ratioPredAff)
+
+tapply(df$ratioPredAff, df$Lig_outcome, min)
+#Destabilizing   Stabilizing 
+#-1.000000000   0.005381166 
+
+tapply(df$ratioPredAff, df$Lig_outcome, max)
+#Destabilizing   Stabilizing 
+#-0.001266464   1.000000000
+
+#should be the same as below (281 and 54)
+sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0)
+
+table(df$Lig_outcome)
+#Destabilizing   Stabilizing 
+#281              54
+
+#===============================================
+# Hist and density plots to compare the rescaling 
+# methods: Base R
+#===============================================
+#uncomment as necessary
+my_title = "Ligand_stability"
+#my_title = colnames(df[n])
+
+# Set the margin on all sides
+par(oma = c(3,2,3,0)
+    , mar = c(1,3,5,2)
+    , mfrow = c(2,2))
+
+hist(df[,n]
+     , xlab = ""
+     , main = "Raw values"
+)
+
+hist(df$ratioPredAff
+     , xlab = ""
+     , main = "ratio rescaling"
+)
+
+# Plot density plots underneath
+plot(density( df[,n] )
+     , main = "Raw values"
+)
+
+plot(density( df$ratioPredAff )
+     , main = "ratio rescaling"
+)
+
+# titles
+mtext(text = "Frequency"
+       , side = 2
+       , line = 0
+       , outer = TRUE)
+
+mtext(text = my_title
+      , side = 3
+      , line = 0
+      , outer = TRUE)
+
+
+#clear variables 
+rm(my_min, my_max, my_title, n, group)
+
+#===================
+# 3b: DUET stability
+#===================
+dim(df) #335, 14
+
+n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10
+group = which(colnames(df) == "DUET_outcome"); group #12
+
+#===================================================
+# order according to DUET scores
+#===================================================
+# This is because this makes it easier to see the results of rescaling for debugging
+head(df$DUETStability_Kcalpermol)
+
+#ORDER BY DUET scores: negative values at the top and positive at the bottom
+df = df[order(df$DUETStability_Kcalpermol),] 
+
+#sanity checks
+head(df[,n]) #negatives
+tail(df[,n]) #positives
+
+#sanity checks
+mean(df[,n])
+#[1] -1.173316
+
+tapply(df[,n], df[,group], mean)
+#Destabilizing   Stabilizing 
+#-1.4297257     0.3978723
+
+#===============================================
+# WITHIN GROUP rescaling 2: method "ratio"
+# create column to store the rescaled values
+# Rescaling separately (Less dangerous) 
+#       =====> chosen one:as Nick prefers
+#===============================================
+#find range of your data
+my_min = min(df[,n]); my_min #-3.87
+my_max = max(df[,n]); my_max #1.689
+
+df$ratioDUET = ifelse(df[,n] < 0
+                      , df[,n]/abs(my_min)
+                      , df[,n]/my_max
+                    ) #335, 15
+#sanity check
+head(df$ratioDUET)
+tail(df$ratioDUET)
+
+min(df$ratioDUET); max(df$ratioDUET)
+
+#sanity checks
+tapply(df$ratioDUET, df$DUET_outcome, min)
+#Destabilizing   Stabilizing 
+#-1.00000000    0.01065719
+
+tapply(df$ratioDUET, df$DUET_outcome, max)
+#Destabilizing   Stabilizing 
+#-0.003875969   1.000000000 
+
+#should be the same as below (267 and 42)
+sum(df$ratioDUET < 0); sum(df$ratioDUET > 0)
+
+table(df$DUET_outcome)
+#Destabilizing   Stabilizing 
+#288             47
+
+#===============================================
+# Hist and density plots to compare the rescaling 
+# methods: Base R
+#===============================================
+#uncomment as necessary
+
+my_title = "DUET_stability"
+#my_title = colnames(df[n])
+
+# Set the margin on all sides
+par(oma = c(3,2,3,0)
+    , mar = c(1,3,5,2)
+    , mfrow = c(2,2))
+
+hist(df[,n]
+     , xlab = ""
+     , main = "Raw values"
+)
+
+hist(df$ratioDUET
+     , xlab = ""
+     , main = "ratio rescaling"
+)
+
+# Plot density plots underneath
+plot(density( df[,n] )
+     , main = "Raw values"
+)
+
+plot(density( df$ratioDUET )
+     , main = "ratio rescaling"
+)
+
+# graph titles
+mtext(text = "Frequency"
+      , side = 2
+      , line = 0
+      , outer = TRUE)
+
+mtext(text = my_title
+      , side = 3
+      , line = 0
+      , outer = TRUE)
+
+#===================
+# write output as csv file
+#===================
+write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE) #335, 15
diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R b/mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R
new file mode 100644
index 0000000..877215a
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R
@@ -0,0 +1,131 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+#source("barplot_colour_function.R")
+require(data.table)
+require(dplyr)
+
+########################################################################
+#		 Read file: call script for combining df for PS		   	   #
+########################################################################
+
+source("../combining_two_df.R")
+
+###########################
+# This will return:
+
+# df with NA:
+# merged_df2 
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+###########################
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+###########################
+# you need merged_df3 
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+my_df  = merged_df3 
+#my_df = merged_df3_comp
+#%%%%%%%%%%%%%%%%%%%%%%%%
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+###########################
+# Data for bfactor figure
+# PS average 
+# Lig average
+###########################
+
+head(my_df$Position)
+head(my_df$ratioDUET)
+
+# order data frame 
+df = my_df[order(my_df$Position),]
+
+head(df$Position)
+head(df$ratioDUET)
+
+#***********
+# PS: average by position
+#***********
+
+mean_DUET_by_position <- df %>%
+  group_by(Position) %>%
+  summarize(averaged.DUET = mean(ratioDUET))
+
+#***********
+# Lig: average by position
+#***********
+mean_Lig_by_position <- df %>%
+  group_by(Position) %>%
+  summarize(averaged.Lig = mean(ratioPredAff))
+
+
+#***********
+# cbind:mean_DUET_by_position and mean_Lig_by_position
+#***********
+
+combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
+
+# sanity check
+# mean_PS_Lig_Bfactor
+
+colnames(combined)
+
+colnames(combined) = c("Position"
+                       , "average_DUETR"
+                       , "Position2"
+                       , "average_PredAffR")
+
+colnames(combined)
+
+identical(combined$Position, combined$Position2)
+
+n = which(colnames(combined) == "Position2"); n
+
+combined_df = combined[,-n]
+
+max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
+
+max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
+
+#=============
+# output csv
+#============
+outDir = "~/git/Data/pyrazinamide/input/processed/"
+outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
+print(paste0("Output file with path will be:","", outFile))
+
+head(combined_df$Position); tail(combined_df$Position)
+
+write.csv(combined_df, outFile
+          , row.names = F)
diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/.RData b/mcsm_analysis/pyrazinamide/scripts/plotting/.RData
new file mode 100644
index 0000000000000000000000000000000000000000..9ebc62b0d6fbe54e858e3c9da53c9878c18035b4
GIT binary patch
literal 43777
zcmV)QK(xOfiwFP!000001MNKrd=$mkbLovVdJnya^g{6x2~7fq5GkUDBe{^1G%gp4
z6)X1Md+!AmJ30{+yHXXf&{PmX5o}1wx3l|Z_wC&7l@!GPm;8P&H#0l)=Djy>`plcc
zyqPHxGb0Eg0VFUehy(;m&Y%G4pM?aGaOpR!a!K)G+hSL(<c?<HIf(@Rbn$}fYN}bB
zHWFOS{1OC9aJK|KB?ysVsRY9$xE|uLbiF0GOoB@!SR_Fx;5bWy^Ch@kf}s-FB)CTc
zM&JqwMo5q+!P62<mEd*>E|p-h1Y;#QTY{@47%#yq5}YnUssx=RxJiN}37(f=js$%q
zXfHuG2~LyXR0-NjkS#%m1j8kGMgqpUOM+wx91={B;9d#3O0Z0VAt0msB{)ffyCf)+
z;5-S=WwMmuSqZEXFqsdMV2T6}Nl+od6%ve;;5i8%kYJ1i*GX`V1P@D4EWt$*luJ-3
zfm4Dw2}&i1mmpt)t0Z_rf)^!NBEhQ?+$cd031&!;BbON=k4SK(1Sd-nCBbM3Zjs;u
z3DPB)F2M^D^phZ0f-@yZkl+*v9wba(2#JxPT7q5@L`v|I1d}D0C&A+qTx+09OKgiP
zyy+6?9EvyCFkVz*p{=U4x-zlWZf7!(^d>-(iRES{->yMn8DDPiHs|xd3Lk#<{`cSM
zch%J8H};7A{F<20v#0&Oe9{@&7yXm_{mLVYN5@&uUH4r3Gs5=`9Z+^(K;_I!zdbeY
z{;vw_-gvuBz^K0m-TKhi)4#qm`|yWnTyb#X-i%q#{=MMR>#utF?eW789QbWa-jVQY
zLZ7<E@$H+FZrd=b<j_CIrrtXC>za4&|6*pJ{`JG&T;8j8_p~;vb}o2lR>}kWr~P<R
z?X_(tZab;%;eX0Iew;AtqoG&d_WtnY@sobA_AI<7`oY@{Y)B8S{4u-W@g<{P{JboH
z3>=kv-jJmI(G$mQczos+A38%`-FWx*`x_3`MqIyS$BXx$Uo`xzv>BfSFP?mT%A4P>
zdvnRh_0F91Ay@Xe{=!Mm-*$U(ZO*6Njs>2)@B3acLCa!OM<4lR;Jm(fc1w9-$dFUQ
zx{SW?;%~lRzGG|cU-!IxBxpo_yYR~&>hj{SqwzaK{@VH34Lz23xOme=n|>TQ^TC@p
z&j@+2eDM8$-`4hqg>mohJ@wrMZ%<n{^zi4UTR%Pe_#+J|M_;N~{riG;aa&fs@<+e=
zRYlfsMm@Z`Px$hJ4YOC)Uh;nIjk(qP>z8jl=f0;lOm{r|<YRZweSXqY_QSt^`plzq
zuOAhB)%)Aad!P5=o?q78KV|Ub%eM^(z0fiE7ke!IuxnBMC)>Z;KIpHsLyPYo`}t$B
zfBpT=_;YTY)?vg+0d2?ZO6&h+-m97OS3H|JV(oj8Ba`=>(>L+7@>hy~P1tz-pLKmc
zN?1|-*v^_WFT7~X>W}u%d};A@-wvgRUJu@Q^+#{~_Qn_Omaoi6Sl@8y$p=rHzVGyk
zi#8>$df=4J-DCR4+`e)2$H%^3ob+1i3tjI1uIJ)Ir(`}g!T#4D3-7)0y%(m}58U}z
z#()<)t=trJ-PHNxM|PUqz4(*YZt3pqG-C9$(=WSb$JNEX!UBJMW%%bc7k9Vay!8Cl
z$tnNbv+c>-_gxTn1_^&{a_q>Mu(KlWUst*JvcetJQ>?=twJ+Uw#w~}Yrc~7>4ZC#4
zlwGHV_Kkk*{Lgj=?3?lPwOuA$S#j2<oigfv9+%VM^#^;rG9fjkWAH;iox1dvPx{a6
zIP~kaZ9-Q3-M6~O%!sKCZR$wj$}d*GR{iDWorW*1v<JR>_I-BSk8@_P`S_-;iKo4^
zdh?6LML(BKYk&I250-V?`_kN=J#Sf67jkV()K^31?uc2t=&dE))?`-BJpH>lQDvQW
z{y6d6^AB4(@30ShqTlmtr{B8fx3#BNzi`QA-EN5A`+DFb|Li?DujbV&-dgwJ^0Zl9
z$J{kI?~5P$-aPxl4tpX)o|^yt4HtF%BlWRK`xAE5bXqy=-2(+TT~WK^ns0N;fBN#h
z{L?zWc4YL3PY(P!e<Dddt(WV?Kc5M!?Y{MmYj1vG&CKTx@4e~0i*6bCK-{_;M*lWv
z#wX9`9}HNMaNZB}V)AO$?t6>LUV7*6qh1()uCsXZcQsdCv!QR?!$Y#xl~>#qG3UUV
zOzXClSsj15Wpzl0ixW!*)_)tkbJXfyXWTIH(eHY9nY<?Kl92PGe&|<m&(^6I#9KC9
zxOC9FQ`!Zba`wRIb6+0O5On35l#CS>oo9!q?a6uNrWLP7&KbFBeg4t-+qcCGUG&fm
zb<3Wh7Z1B8VE7X&KfC8j=c8Y`_TPBU4SBy-^_qF`-p5At?l`jM#-hce3Zom^P0HxE
z>YC(pw*8R&*LfYv-d+>=)N2nNnSRZ{<N;%&{`m5Pm7Q<-C}ruYkLpj~@p0Oi%yU0X
zJNLCW2F4D$Au#EMy(@|yDoVP$;HE{}suQkW?f5i(Y5Q%Vv45_*d~Dvp0qfU2^Zk24
zEAL)7@xa4XHzwsgy5W)Pm&#uI=#KMGC*kQgWYKBomEUw_yU%~$d9wZK+2qV|nJdp(
zyS3og+`!&{#@##N=eSd%3jTiLj3=j$T6f3UZ;<o8Yg07-+e5oYub%%;@E)?}>7>a+
zQ+hr0`<}={H~sZc(VT9}>`~A5FWmM|aroSjOYeH(s%tOKZ}>3zywr>PrF^k;P0jGQ
z5AuEtJ#*8U`}dx+;y_hW&9Xf!pT6_{qA|toPJZ?3GapF5<*AiJcD%kOU`6-2cL#j2
zyWNy!pN75u`H{y0EeG0%hb?_(#rMD8@b$PWuKnfq5pRF9zTmS9$5l8_t-s~m)e8!f
zzBsn+m6Dr2-C0@iz{^SJUi`#@_}#PaP1!s3@oz>&eNyPmd%s;x#f|;@PP**9tin6)
z=<@!myVewr-Z1L#wt2UFM9#bTv((V~&2McUWtm%e!?KgE>HW(orD1D#et6B=m%DX)
zJ~-p1on@s<?|o{2yS|@~9cVu@dc=d*ytn4ElnoVo=ftnLHhppYn`iIZwkG}hPv3rh
z*tYF;$9jL3bmhQ`rxrx*Od2=ptlhB(E?aZ<{!8Lh^J+GnKGRNjjJV;2%l9W-m%s9s
z4OQ(D&nK(SE!j3_`TA?mdp`Tfnu2$q>78-(oF~4?q1W8lammV~vG1ikwEXd$1K(z?
z=;2yFA>{Y7-&i?0CicErckKNuGOcKw?W_2A_t!p98NBt=wmav~>9Lor8l4g{F#Y`Y
z&y4OeeAKA;2It)3X-OaaesbR>Ma#3S{jb?Fjl6S1(2IW_dHnO3TRKg6_vAlP-yk=Q
z+%YOMbb7&~r^ZfO@Y+aM*N`g%PRkl}`y;Mtwy-xocx>3umPalbcE@Skez>M#`>41(
z_F2z<WzBgHk7;=Rtjpfr{!;JlS6mT$#iWC;@BXaa{8tNho$<(lldiht^oR=!#-x7K
z{oDO#t^RFV)XcQIzFpkorCly(SgdXSWr?Nrv%VgC(*<u`)9}pTv~JIQShn2pTfw}?
zOOo#BS^CfBKX1C$F>PMKv*#yI-9CNc{)xM8Nt#{z=$vgq&%Zn8P<gu-E*dv*MyG>E
zpDa6V)kjyQ^nM_VOuQ-Tr;jd}Kl{(+mCs!K=gV*WwRe5>T_b`HmG|C!T})d38K>vo
z_t|^Bx@3;{dw>0*g<J3IIXr0opxYnb*CX=B(}(SPveOsmbhv0o=VwySnEp?Z^MdQ<
zE*(hzx_`s^+{><*xcZIW2ZJ80{`2K;=UeW7dj9CYg96%qdF9MsN-q2~vERnpGgn<Y
z<EG_5TyXm4f@``>YQHvMW!Eh^dv1U7+n$dWU-sk&U3Q+dyu-1fXN-+H|JKKE`{4cG
zuU}qV_s-{M-o54UtUIne^T-3|TxHGs@QK7(=f!otZC~JxKg<aVv8+icz5efIH$<*j
z>s<cJbX%JqX#q(a$6lA$@%pDaToN|x!uR_>YB{>TYj$RO{I2_M8nxi5Ip=jA^3jh$
zryTy{ffb(~+;r{fIg1lc%`CgNb4S+)iJv5I9`*bK&prF;EkDh;sQ-r#Z2jktV@EST
z{N%fLZ~N?rLAHN>9B^0GpI0y1zVy<Xn}5h&w_;pk!>TK<eEO@eGT!U;=ER$Sbe4Yi
z{j~3&YM<Dpw>7lk^-aIMJp9kI9{oFT#h7VhPtBS!cge+z*BrU&-9ypW-ao{3(aL+4
z#@_VtsMTN7H*;<e-(8>3qr>b4pT4#A?t7!Rbr|r<Mc))eR=@Ylt?M7W`<3UvI(o@d
z&OJ}w|HBmlzdoG&-RUDA?im&Nh;z&3h37pI{P>$4Y`^7oYoGDL!)I>jePo*bo%d%~
zuNnAzj4;*<md1po(qKA}4a}uUKwul?S00E4DuY^Gf{+UPLc2VKjFn?3p@DO4rF_OR
zuezGML#ym{E~l+R%(SG!R?AVSh|6pxu4*>+K8DD!04YF5Xal4QaL-tuSy+Ql_ysmq
z{G73nYNxGgo}Ken<EWa?zbvs=R8%kWB{x0uRkljIC_l?$?pOkjNZVpZZE;C;MRlE1
zz!2`NUX;kOFh&BFO8*)f8jhL}6H#h&*%Hf~QW67#Ec2lF!{Ij?AcFp8J#XKQ<+XJ8
z4|8&_%*?0%oU?uWbM4Nh8>6QOb-KBZ?ytWjzUuaC>CqtpQ4`-di#EtV>DDj%emlI&
z{cqD>M%O(2Rm5m|K#J{Jc{a$M&C%Xg=0z4Z4`TDiJ0*8#2{IHq$gPsQo4!mer*aAA
zCMTs7U{4|T6k*SF&6Apzhdtx5rvQ6UnzYnB>=}<e1=v%BJs3g8SnR=RGH{v<lqMq?
zc`}p7Vow3~6k!ielf|P2<)&m6sqWMy9w{(4b*vh!(a78d$X$fo(^Yq7b`f%``Ho3W
z%__v6Sv=O5WL4VCU5MN>kXwtNnNo<{JUlorIYkqv<jEe7JyWnp#U*)ik$Wt1k4J8l
zI5i^|d&XmrE*hdMEqyHZXt}4V$`F*7mYs{-W08A2a!)~S4NtmOUU?a*xyX&f$0PRy
z)t#mC5SW*ht2xJNP8309acpvOGS~i~vB~M<wGWx8I?r_E(Mo!3YNq;OTyj!IKK4w*
z9#ur+l2gYZ_eA8*NA79Jt))%R%13T3PFC_5<i_Dz+U&G3$eoYe(~w(bXMA!}dLH%^
zVh@g%g}$Vy$Rtmys+O`R4|`Cw4E!ZS{Sq`jIWrfz^N_m`xwZHyX?e(9h}<}iDv|M-
zS-HqP2Dzspw-!G;a}09lA$K8iPeX21OcRn*GpArrA@-<sV1m{dGxrqaR@<xz>6*$)
z?kUKP<4i^FLgd!+O3y@iGEv%0lr|HkMRaDOyfRT<nJBMJlvh?t9&)S9PfSi#HC^_o
zt?tBBO^qct4-cG}K239Kp&8>eXTIi~tU0x087P@rxD&IJrXjaV$D}m14Poxd$UOzQ
zwfNauPb0Y}Bli^Ko{HRs$gOsF`N_#zPs%)nnkOZx0DI7vbX91wX9n_MZ0TzL%#)43
zWMgbv$Csa+nNol~C|Xv^IP95*J!;>VpR8uX+$#V1DVpX<ZdDffsp$pC&BFup)w(P>
zHSAd^c9zyuO-{~CpNKt^uqPjTim*o=LrhMds5vKTPPM$JBx`zFke-%{-1*2o6}h$e
z87cY5U5MN`PLbx$M)9*z{A?6o8@Uu@W~%Y0CMQim?gHdir8PA<Ejb^13b03wR+yZk
zePN!d$fNa|%!8t(C1Fn%@}#GY#~xm&frYAcf~O@XX_1(xQ1fJHBU|Pv(ma{jYVSHN
zS!?Shx61ys)D$gFdiFTv*5YKOBe&YTFgFU%MB$k@TvgI(nQ9LlT$G%Yl#4wRv8O=u
zB&SWr9`q$OeIoYgzD(0RX(_qbGYNYpV-G@*foaP~(tRmJo=ikRMwZSq1$!`pEJRzT
zCa9t`t+SWh6Onrga%=H3v>vD^Gh-rhYvEa$Du>gPQ?%g>^Nhou@yL^fzht2=shU)n
zXDs&UqM=MvlgD8Xj+TwFW$V7=V-LcXo}P<6<FN-v%S6$%k+kd?k3A@#EX-V1(m3oH
zk3BezHe#KglAMd&<B?k}n(1j;^C-EsIBDr9T;(coy2^&+9H%)^go&C{BPClUD{y+Y
zhC&nZjO1j!Cm(wXu}5uJXJlvtM9DoKx$}{`5V=)FpOu`Xj`?Lz9`>ls@+@uk&fIF&
zv(mKD{j3ZWCtK2JUl)QR0N6%yJp6KZJW&>*z^_{*Y6loCT#yz{i-i+n;Wfd+3(3O8
zW8utMcu84!17zW1vvAF|@QQBXHQ2&?77LfFg-gZ4+j$G`t1Y~twaoHGDuheL+i?T-
z?P4UwSbz}0&44Yl#VPLpA5v7MV(|@hQn6@KIbK->7M$)=X3FAVPU7Rg-uvRP=$!sr
z-(L9PitL<$ciqn%IRmC_?yzh~Ku+&V<wd`oq?=ROU(S#Swpw^cPQQEE+TbEJUEj~v
z{r2SHT^w(H^dGCPKV#sZ_0jU@ne}bT<iGkTc};OqebjU9*fL~&l)Uy>P#?vD7k*S9
z3E`3Qn&X1{Na^dTTSxjRoB$Ky@P+0!dG=bDZLXujagL*E9t1{CLqRQFOf5qRJJiD#
zT^A7LfJ&wWl|(u46A0g`1WGREtt?w%)!eCRnma8Oxid6(MzZG4#O^Gvgt;l)8IYT*
z2Bs#dP8D5XZe}*ez}6di@R;N@)tSlZ3(QMaNoDJZ$epdZQ&D(Yx|%dqB_uB`TXm*u
zp&8s6kf)N7m!%?OOKCiKY;w95oSBN;D%E3CGkGvu9^=8|lGO~yrK{PD%TiH~%T`N(
ztwSMqmgY`DaZ*z>cZMpd@yVI0Q;i)sUMr&U*(!Z(g@y-DPzyb9LVB|1)FNbRp;>Cy
zY-xiBPfX=80w$)bE>!{(vsE>il%|q1DO(jZTMW?LDM^}JC5X9Eob+tXttv|po1Sy@
z(u`bnrl`C~^KMRGzA7VWdd&kTtNF1xGmkJu(*tR;t2#B=OH)+UnW<(bO+-01HrwQG
zHorvfRLz~1gxp!0Tg@||Q00|P88vqXa;w!VP?{F1PE~0Fr=@EVGSW3?hUQdRn5L?C
z5Sztm?qn3M$~1^gx-@qNim#$%?kvr%NlTgvsZO=M36v&3sxwQ)!6rD!ou#?e7Kgc$
zkvm&+r=zs#2&dW{1hH8Ga;u6yT`jplX}GUBRsETssk&4lOQUy=i;cf|@C;21Y|PAq
zXQ}OR;4D>V0%v7tq1oxaclUJdl1H;9-2VanD7!H0>fqn#mJNfiTy#?#`toV)ZyH_m
zcSF`^6~pMa*UrA~r{AOK*0b-5>oM~=>$~41ez{=&YxMJn=bb<9iwL?d=c`W>23?@y
z-B08{`s?aVD~?nyqra47U-M(??R4M9vb-xSZRoy7Ub$(@r1^B;;*+2K>eAtKU$4a<
z52*N&?!Bjb*Pa~*)4gXfM_ao0^b0cHfBbueqj1hqG;kDA;Ix<KmX+mK^B%M9<T{tl
z<*2T5RFzdbD`h8q3ZLPqC@pp^sp0#+VFlH-4iq7hrLA(|D8XaOZ4UlDEZ;HDR#iHY
zk6$D69JR$2j(NqoYKPq^@3gT+&}qAt6i+I#Rn*v>mDLqIiG-rKy3SQnUCFbEVo8Ls
zP)4dm9gpH<d;qVU8##=nRb&&vWW&4vw<Ne(D)_!cInR2B<nAN^8w#)~58I>dYN91>
z+sg9kUiKFRzXwowcW^f|==auZ>m(scfq{sKL-5PTPoeYX)l^iwc=HmRnwXrJq#$b3
z;F);+Nr+mh%;Y_IGjiHiRaX_)fa(-O^Z-U&OoI?LsMcP=&NLqeR3Ye_<y@kSPjMjQ
z!ddOAhEK<o-?o4^u0g~x+W?1L>X3@?RL6`W_BRUbg#sn$*r3Oz@mA)jDn*#F4p|l>
z`2n)d4W1EqX)r^!q^bJjsH&-R>5E9t6%|W6Or5F=byXFPT9?t63R_K0#S-m{$W5FX
zHvzL}8-zyKq|tx5-Q_6pq<P`xwpvBul*%A7qmQhj^nuerTNPr)Tk-U4Abv={q@udY
z?z{BcbGM1|cSmtON*(hTR#sP4A~*H&H~mSOT11S21p2^lFU)lS@mn7y2^tC^h)dL~
zIxh4_>7?(y8;sap?9Oh2QFcN7998{LfeV@T6B`6m9o7lUt;MUTp64iG=ey)j@(l*-
ze!~BJ)iO&}W9oN>y=tDT+y{Yr$~t16-DTJf7*@*=si$YHk$36>Mhz1r91L}zNA2KG
zRq#@$hl;Gak>e;c3LQOK3mpu26LH>hi4yJ42zQ9n?y7TEdCHwPn~4p?tET7#9%w5q
zH7}P$4|+1(s8m87wTV@A6%_`Y3dxgABsxs2IAJwT`@&+IJm~PXgGArCL<X4ThuPp2
zJ>3J{ZuEuAYFDKU2tBACbEh<Tj$@>r>2B&G6f0)VQ_+iM6;)}kt+maw7cX+!YKVKU
z8##?JmLN?O5(X%rGa3w{B@IY$)F#*}oOWC3k_4%+YVB37#Dv_^Qb*Oi1Y1?L)ap4C
zYJI{rXu!~fMdgl?@`Oc>ii!lMy{5ueVo#7TB-j^AV-1E;#+2wqdHm`9EG7OI3Ldhh
zH@Fj`braeUDN)owgJx5mZ}3C15pp5a<})n9AQ-Ctym`dET1T+qXEB-?MDxts1CeYp
z#4g&g2~kI)G(z2pvi`|_*^Gc)X5^z_3(<Bpjwy<zRtx6;9kdO7*MwkSsCHrEx3FSI
zm82jJm&x})d(}eWlhLn4-bhByK?rLPECvCK?Nqet+2>I>ej8XG4qB?kLkrSw^T1g9
zVy5b~(x9n!VpXY~59xi#cXUhS%ke}4o&ev#pFCLH(P*+5Bj~eE0YfY8m2>S*)@Z{7
zSwA3=8p|kgJ{VHW&&UVsvC1W7F{lwjUzV82+J(c!y?Px-tsjZM6CfV>M+^Q|b^dqM
z*_TO46PjcBR)NHJV2FT9U6NT)S6y4@R2Q|_f?QpNP0;D6#IaQi9nR{iN~u?s{6|#<
zfr@kiF&HqaOPRWXl)9iNRMl!{sog2xdRDO(>E#eE%9Kr$j8{?Xs-UrF$-S(N>1)4K
zOg4GV1aB4^wJkv<6^<Gq-+*d7KK5l!6jI@+bTC^40d0wH5~gD~Kx4RR>n&`V4KLC0
z;kKvcH+?ZVYKzO9cC@%5M-_ynUtCb`@esq+w<t;4_KM;fo6}Z_=lH^SHyhIRg9QdY
zDk>Dm!J&&2Av^c!=Mryem*BLfMHXKxO@wNS^@Mw!Q-;bT#^S~1pOzSW3Q+|ELeTXk
zOzNzC>P0m33#<e>5T#vY1=@uu9a%@B^)F%Y%W~)qX<1(r3b=6GDEMsyaibw_B*Yy+
zc&*pYocP#B4HNqy&UK8-X>(K+m)M*xd#%G(C7`kB(e%b>_z0JPE=m5$f?un_FkRx`
zh=zmfpdl+BSeR-OPn*stWzF>rB~27iO*TQ*QHa-8@K56G8ujQtu%HcOs_R?^Nj#7u
zvY?HY=1TLj`NeF%$AB0vH-CxsK*^bAl!V%xT=r(!G>Sp_&Y;|d88FfahQf^)@_<Kc
zZEI|`E<v?sH#QSh6>9A@@F^ArTM|7mX-2d+;rvdtp#*%-5*w00{rA%aK4ysVgNS*t
zvhTqjz7zK<Hra%)erEjYgtpj(X<>vfGBr~qI@AkOVa>x-kxxu5L6OAYwg7?p<-8!?
zHiLUMxYh2h4FEeR<!P$Kepw0f=~4$^oPBEtf|y8@?vCN<MzpStcV!)jHpRecVt~<Z
z5SJaJ91U?#0lrxLlTiAOfRE+T34S{hrTb*vaZjRHSn3aPIlhh%mrecVLcERw?snj2
za_B{x)ZI=M;|u-j8D1K{$zpn;);3E=Wl-BiKF;)1+kL3rB41~jD7?o$%Af)_wy!ES
z%3rMTK2&g#uQ|TTJKQ%ueCnbYz6gCfn~-j{SbmZI<ujN2V@V^sCJQM0LB;LMdP5?P
ze;n62Cbwk1p(tO5XCmYu0RAAz8~kmR1u>-}b6P?LT7*Bp=(0ei<Lf$D?!k)xEbb5T
zO@32x{N|FM1@!44<imc$l<+w`JQRL~MnW56^;NeL-G+37U6nP1iF+fVn`JMwRn*y0
zW<(>dxIv$3i7zdX<hM5%<FoPb2;YQ08^>>MrRBYP7r@gFc;S;%o(EUnNW~xCfC`S&
z6b>|b`SXj;SYpsJS)kcE2n<+IgizVQh^!5$%*ss4%#LNt%g95zQcvl)9L9ni0(^u}
z1pEq5ID$w6h(^HE#2|=65D(BEK|F$venK~Z9wyKYK^qPHUfAUgwFQRrs3FVZ{s5vi
z96r>g8MvWo?z$8~+&h2($cuLdK|~uV9@PA8g@$=D)USTHenkj$VjU9xnpVda6RoxN
zL8vnuQ2Y}?sG)*T!}LNe`d5Uy1ZCO_GUXWsA>dtHsE@$n8nqBc5Tb|X*lN)q9Gz<`
zneTGis%qIzNU_1vHD28<#8Xyb$=Yc1ULJkGNVKrQ3n}3$8D=L}*(qS(h;ctJA_+)c
zkcLS3D4GK!;~c3y6eLzYIo#+fb5>XC1;E$Zr(vc8h+Y6*X<gzwt@9i7#8osH6Lgv%
z1UYc&s=EQ2vec+MTR8UF@LMV7Bw?u%F{>-!wF~MTwdHKZ)KTT&>v`4QR3@y{QR}i*
zl?W=^52<XCkLv4w0;<xsxVY5#sBJV0FSgf}R5(iQ>UzDW@;CW@n#iG;lV!LT=!8W4
zlw%P>zQ90K-Wma;%`y2q+6dDf0tg;cUF|H@5R736;;`n*mk+A!HmCG~b&S7x*f$Gt
z-^$Q^571Lv(9rxiYYcEURe=_uFeh?1mul1+;P<K#LJQ&(e*yn79)+@<YErhI%Fx@k
zE9Xa*n=d>GviBzA7NJ3q%{&v?v`Aks@Nb%jfh_#WN8qvCghz*uJaSz>8-+DjFYM2-
zpt1$>Z|o>Q?7#i#>#OmviEfydjfb|VwR~Eze1tkR-G_E#26E3t?peUA?o@|KnVxPU
zQ=hidJm|_)6vnrE__o1h^i6DYCK<J-2=AFFEl$sIpM~12siJ&Cj89N=oW%|LxJLNr
zc%hHE!!ceIhi^~J_7IoJSYE%2CBniU%20>cfS4?l0WaVg28<gOz+{BGAI?Wd&|-at
zJd>G?$fU?L6Y(UPhwja<s^CFwTdPBBb@;DT2a#JJfFE%;I@+F#dK_KG(ehk$3_3FB
z`77Ydk?@@=s)vy0hz6y1bMbhs=h&OJBK$vkj@_4o#03o|s})4UF%`w}I`dN3Bp2D7
zRk8^tw>mjaUd2r)u~n4RRmj#R5~`|Q369E|3OhT{ZZA!6R3*5|9kmH1teT8WkQl71
zaMdPM+Lk2DwI?icR#(kS6xP~;C43c$?i_H+T4dge$DK$*O&Me2Z^F@Z+g$eEM1WaY
z6hc_Os3WGw;C1-hIPH@DIE&?@lM9Wr(M#M_ZkG@2dYPdX8|g<-zw^%Pr-hpy0uNu3
zs2veC>Ygy8p*2oNrOmm-;8S?5y`;LTRQYJo-EsBM2&}M|xdfskk%$C(L$qsxD!<Vz
zsr$W5_YO)7<atZs=OMM!B=2<0D+l71ijp&>Bu6F0cS_Wp5Gke_6b|Akn`f>)=LwC6
z)+O4N+Fz_7L0O74DR&eS=knvd$IzltBNmGm8|5)sX`^(zPfcyAmavRWS|`6&lH6Jb
zTWyWK#AUKA6(X~OFLNLq<$}=g3dht2<1|`9C^s)EY)h&|TOqjBT2jsL?yGH}K~XF-
zQuGKV2YF*Hz>H3Bt*S$4QELRAeOhhyrt@?C(0PyhI-!dbD{W`1_q0Mx?$ldO(gFvg
z2x3Ee6&8xx6cFw2m;v&jx+;gu2b^N938|9m$U^bLjwenXRc4%Cjoh9Xyw(o}uki&>
z6Pa4#srY<l>bhoP>bmA*s_7+leKRq2eWRGt+LU>9j#7K^Jg1|S=sKBev3m1-gu>8|
zm@2DF9c4>A+p`K&z8c;%<KLDL?*r;^N7X_{tz&M5!F@R2#&qRPYoYKk?tBl$qfxxr
zkD;4xd?tQG<2ql4Zh@}q8sg{R$lEY&u|Xi@6JRn*3z=K(a`BTF%{NRlqD0Foz<8YI
zMqFsAz04+!REYbK#8a{;e%VsK25HzOdz;i?ybDoRY&QLLyKW8J{J@9FO>3jvtP&>q
zUT>&Lt`_g%|BKu2|K|2{OK<PCZmoKdznA=5a_g>jd#-gqLYLwH&;1A{8r7^ssQWQB
z=;XMS2k}@TU!*}Vvl^b5bvwZdhb4_nCwLlMwr(|O(7M$?JnM=!#tSe#5GjuFf=o8Z
z{22M+EwU**&sM!{)mtAdm-P)m>4FwlIM|d5*&Wk=Ut;)O?(Kt#UeEDNLW*;zmpBYh
zK97TWQMPj2L(FD6L8P&kj7%tV&s&F&*I#Z&34CD1Ntk+q7{PE8kj8evPl179E*AkJ
zU3cQ!Z6BS*;K#lvL+VBfK$!5{)1Xak?nMI|qvD>_U|!63HlSTP!{W^shhwdsuNJSh
z5p!$AWGyo=kLIaUZq5$Z?0kVYI{X3s!dh*)(@{0wm}`FOp-E=dqD!hPYwBEf!@VuH
zy(e!=w~-2#Vz7bSRQ4lFowh~FpuJW<k=7){zj8X4A)2-2=f%b9%MEVdmbiVp%m_zt
z3Ea6k>@J-1%ml?MWCl?x<rbn<N`|<KXnhV-P*#j*Ls$nWK5o?|gt!|}V~t)Y#N`k7
zwFCDgqTbUb0}*~4w=MW%fu12mxzo(UI6ejwffx3TTd~0wX8IG&`b-ytv^)*}?FqjL
zkhTY;?S**m198~liQW*8rDJ#N`+|EQg!hH?gACFS^&$NbNIwjv9}el0;Wq_-Qz1<f
zq)9VKlL^0B@S6>3G9b+eNW<hj3c^l;Febkop!sAEzgWIwB*+8#c}vIm&xJh4BHG78
zoQXj5DeyZ9WR(y8$>5&?{!_sp3;qJ|=YfAJ_zOV>6Cm$tMAH+-`)CNW0}a!`UkbF$
zlwcOPXT!IX1U$^m>d#rkuR(yn7K&1Y_?L-;glMAQX(8y_F#$JkL``VXb|jR;55@A^
zJ_3$FJ&yB5`h)b}MT<S>dC4=8=V?SgwsK@(jY!PvNx^?<g8^P*V)e)<f59BtaFKVx
zT2Y#i|5HZ!d(xe7;<3D*@>LfwPQg-<sSdCU(DDfw{b#(HWF{(4ED45>#O$9D$-$!$
zF+U^rqI_g81h#MWsoM~}f<+U*6zo<{3MN(pPh7<K#l=D)b*#WeoPS)L+gx14`^ZJ0
zt)fCm+E$-bW4O}hayk|ZvD@imXNyYL-UM6NId*6DcxUxo69zhX7TmLf|8_6+`vs%#
zET4|VV?NIb{tM0UbRr)6j>j6)UUc@4wu>4ogD#%)5oLXcKl$Kz>Pk~pNp%I5Lr$0~
z1KnVJ?h<G9BKY8`HNwnvCBA8X*g3_93f)bgez~a7-SywI3=1}g&&iFxb@K2n*2A}M
z=HHMaxu;9;@GZ>4x85GU5#1k@!HW&LI6B-f@o&DCV-Mn6IY!Qt8j<2fj#5{7t$?B@
zF=@%zdBDUHA$EdE>~NOB42EAw(2F#_=`iBBw;ta$-kJ>jEtcTwrA*9?^5~<_<5f|+
z`})VzMMgY%;;|p`tyH7rQZ2W$<Jw}e_SfThMdV=sF)qjmhI4{lI{~hy80eX6kDZMX
zmxGAMl5`t9#uDF3Z?GQk>muhv{Nv*ee@kyD@vZb^Nf*n8<^&9a**Uk?6H6kbw#HV%
zkIoy%U}Lah;;TE2!6A;dag1Pj^~o!Wxy?X^#~3OOPeqjIV-d-%S~2WZU<AX7?<Evi
zT~>CyJ~P^qh)4YvN7MNxqv_+$10m)GmP|Yr*fOz>rx4@Tt<5}_ZEl6#eNZFPq+MY^
zF`nnysIk-@Rr-5b{2HpY%+lX$@c9{#H=jvz`M)V9x16JVSmCW9P4yrgGeuzO(mZ?@
zh06V6{sc||8HB%K60XC8tTW0hN72N+3%9tNpgo`avBgGTP3HzFetCK*{XX45TE=uP
zrz^^Ut^lB&uL3ig`fL2j<$^3ZKFRr2wiuL^B7aQn29xIB;))XR1RyZ^^D7?<3<3@U
zPHu#e8)sy{;Y2g;$ji$DmFpn-h6mIvJW`NrwBo;&``h>_-!{!CUmx;4uO;%ez~G1B
zje_4eCH!0-9_y!EV*e$%G-tjR7}7JmF^YdV_qRoKjUsw0$ZXYw>1}_YuNTnF=x1#J
zUoGzCA#Nzd<<@+8IMd%M$gel>!OHAx2ouvY`l7w$#p#Lxd}eX6Mdx^s1GH_~(5Ak?
z<<p+%^{Rv7zr8__D;#OKko!9+{>RMv1Qw^W66YeG2IPJRZqG+8CAz;r0SM7{7lRR)
z#T6|q^XWUcjf&fwD5ACf5kh)o(by|pF$m1!qCdZMTa_(Zh_JvSg#{KQEHLt6GUnyY
z`Yo|OD*5UA+U_WiaK7Nf7kBtV4qvq4<;xdiI8A(!B}ZeD(;(Jy)Cc)n$5(J3yDI)y
zxW6ajHH^3)!Qu6T2)rjK{#S8d?duq&7t!hmuNN&bM&UTambxQq3SQChGM86x;FbLb
z3UUhu-g_(lYs~6?tUv=_4B)(QW6VDMHHs&Hbf6DuybkmwIvu#^zor8gm=iHxdcbf0
zM#^!3;(xs{avVssK8W`{|0Ow^)g>-3PjzV!(dv>9<0o~zJ%}{l_$ikBck1h4(s(@^
zLUej|$A3-FEU-wyc<D#9atjyM*7VDY=HK{XUsxK{F9Vt<a|+)Sf`%yszQG9`s`%9>
z9EKBjMT3PU_Blz3o)2a7CQp{A_*Zg&iUAsJEy#x&^gPCN9A`c+w?w{D{I`5;IbLg_
z{NMI5|7ZRsxxe`@<-4MV<V~T5EaUi-;a6B?6juvN1;5)WpRn2}zJDa9=XVS3fJe8Q
zXo2r$_;`N4;`4*9Wda_pl^DDbo`~`RtKr%z!dj21<E6W0sS?KLIH;7>xghuBxvaOh
zKk(Zgc<zels2qPZ0yZCw1K{!4{ID$m^S1-QuLgN~6{sr$o)*6Z)WZYxM$oqrFc`s*
zMnEEhBp)FiL57bo62WKyD}r1ffy;Lc0$<ZI7>8iOafTuUCjp!WFv9>&M{ov$(-2J4
zz~8%&`eXE!HgD#u@X>rv*9i>GK!rJK6BpSOqc;J_)D^@_HcAuXa?F*@$mkF97vxSE
zo3BJ2*PyYZf+sS9?UgmIC4SHlTr$^DRjMUmq`cSQJ$Iw4?%d44dqT$JcwBMO0O52P
z<s|^4?Ex0gkJ~2lch>NAC+@e_g2zdNS(k=@Usz+AsK#VH+#Y-w#cRyPlt(brA?7CT
zbvOdN;0Yi)p}gY*jg8U#Rk+YiwCr6!6bj>h-v`{QQJjBWX+4yksca3(u~oh%@)dOY
zQ4^hh%tY>wBli=?jW1F?iQGQjof68<$TkT-OM96QykZ<rAsj{)zW9BJrv;hfIQ&S-
zr6|r+vFr?VBMsAFqNi?j8u2B>RS)SKS-J9%hg&+-=o{tpEW%+Vk6T4~1PF$@1bokV
z!FR5o_?}1jmLqpIa=#$RzN5g~?ZS7n_?-3&h3{N1)wA6m%1^{~k&k(ETJJE64|Q4I
z)spkqcDq=(yvt=&=7Ky|n8-6r$D44hSqbZ85Eg3?*3CUkILxZVOE3rvGYIQ#5T*t-
zmt0;(vUml#Uqx<HJwf=cG1Czi7E9b$kdOJ6*F`ul6CPeO;o)@yyl!>FfCsg`@<O*K
z{CD_3_Zvofi1XB+s?>ea9Srp+jmzMR?f|HJlDUn@#_)@p#zY`*dJ_mbjo2)~1`V}p
z&7mdpHsOC6;(w)~Lg1r~&%<zrE%Uw%Znot58n{`P_zbuiS8sxwEfcQr(Dh)cVD~co
z3+ZMf_Z!Hq#u0|R$93oh^tTSbJn$vNZ5@6QwQd;=7g~p3t-~+*uXXs<I{a!Kezgw2
zPVnJZ0CBGdUVzc)_w#At@)G$JrH-0@04&U8+(Skc6y?M%LVm#GwHi}vvGWN3Rby&g
zSG<`Cd$^%-`z~7NQ>(rlprb2hivDpS1*PG;K^|0HMHl6A+8kA?9WP9>+58C4p-kCo
zuZTicgR6-CKwKoK0i)wGqHNv=5zV~UL=Xi#Y|snB!yr5yh4%)$tm+K_yd8*g7=hzs
zV-seEX(Wh|<;xBroJy3c91nE1gI`8>f1;WA>VR-@8rVewc6or^U1uXjc9G)@$hQc>
zIuOmA7t4oV-0DFzGhU4T1n~EQ-`<cvKXh0C^l}>dAb$E0?I7nR0zU%<eg+V|nJ>%_
zI~z9)_+bYVY6O0aOnh;El89#N3-gmg^d`TC13#A{exitWxgmxqy1@=M_|a@?y=<BK
z^o;l^Lh3>7PGyH(7sG*(d&<<J|88#nqSEFnDKF-a#;EB7h%wQ@a(rVV!~;{G34VFg
ziV%I5>mj{(gh-cO{Kdov>hZc6i3-wx?<{@~)_)g`NIh;uhcAU9bg<?&g%%etv^i^q
z)Pm_NZS3HX8dYbl-cS@vAlSlkyB!;@|1R39d0Zq<oHU87OMe55ikRgk*t`liQWa68
zMtHm!uTq5)I&DmD{@CJ~LUO@2RR>WdZ9IsKxNGD(BX*^~&pl}f%O{!`mt~~fs}<X7
zYATi(NC^4Yh@8hFG{JHgX+WztEjFSxhIp<`vBYz2iX$Ft(-V!YP4UFX+O))|Hnr8K
zUMpg6M?6;^!Jw5`d7fyl%F`T`rh|`?f=V+>NG{mMa;r3g%`0z}W@BS3O=o>R>%DY0
z8~m*y8-3ACZ@(7kq%A-YpD;-NFStK|_+)U)WB+LZKR?LK@OAZ!&treekGEQuDq#rE
z7dRcihKl`N^>}r7%qD;qDDZ<soj5(t{rqYVzjgu>u(kX<>k0XUjnyk(4ej!4EkQ&Z
zATfOW%6&MZBbpeRX^`tEpR`3I9{D8N?6|V<*Qm*9IMH9f<~*~JIKHyX<p#5>aIBA<
z_VA_dL>c_~d$@iLFC#8ne$Bp(@_ilu4(18gbAL4PX%hY!j(;2{|7%(z-&kVODEUCp
zx_?O!kGDj=7HDi4-Z-M!H(<Xp#N7@-b5#0vM0bDxi4(2TTc8JHG{zInh5-8o+W{N-
z_YOp}7vN7N>fDTF=tI66{-u03{!95j(GvM`JXZ;L0tMiAW4NA%AYkF#wnI3AU=zUi
zWOyBkAzA|#hoG$&z;|Id4V@8mLcr}ya6i5~)4h=pCqN$nc0Z;cKz{@Sya2va!}%GC
z0N<bC{21M*8Qw_1w2C?a;O}ZN2i>x3g#123nX|ge!^08299Hr546wFP=Cs#n!G>oG
zj%i_ojZiRCp9|{j+81%V=sk?d090a`3KKyzk!6l5M{T(uH)70WV<r#rvG*$){C~WI
z<516hpCD(wKjv<%o&N0^=t?78R#f(0;PKJZuaMgx<=lka{-~pR52cCsPntpQo=DWT
zg_PA7-D7!XRXx0<+~%~ExYRxSXp|{&&*T9wgz2lPR7+ik&zfVAX{#adj*J1O&?Vw0
zUoJDmiHag92R^yh8M*n2*>?tMjPk(>#q-p?56XuDoSlFXBM2YNLYdW6EO829!OjY0
zRT867+vrN$V)xv+I<GUJ9LM_(etUUpg?*IL)+R`)$UsWl;M;ci-66!;>7g2L^uqU6
zANh_$O}D1baywUHo9p&qQcJhy=a{uHY)s6?HEf)CfEb@X;3p3FHYMMl3nAL8M=TD1
z?I;|@i6q(<E&FAyLvNytMSBvZC1Yu$AZ;8<3p?HFHg6cD?Eq;z678iU=I;#tF5vG9
z@w*Z2MI+|#0sfwlcRPc;Lx_H>v=_wbW565VN*zG7ZBfiyKM%YOLHRHk>IZMkKg<hn
zgAMZbrEG^2ZOfI#NrE^jC{7y0$spRRN=9Xy?t%XiC?A$~B;p^J9p`@(_(y~PBrp7D
z8|2M-%Y`^O5a(nA-p0c3IQShe@RsL+w^L9)jJ8Qa+2%Kqw~0dD?TBg{Q$Bj&X)A{4
zznfc|R^DVTl^+<z2!!owZPFo)tV@dx3~$R#96>3L2^a&2ZzRPmmReA-Fkh^lU*Twk
ztz<V->%vxYiuhgFN*0&o&k_9nFk6^haaBrW-ADwyn>P``V<N3)8_Y3aQrcEpTI{S|
zRO<na^tX%q)uBO6buH$C=>N>97k00`QZMXg8&W?{d1%HAjafmqWB~>s^U|@1>=34x
z>N!{*YN;TG`{^QCKM4MxMY2?cPGY#R?hWByQ-m%Ph69ixcFd>`w$XikQ7RtS<+;@^
zSGCyA2>bhP)j67YuFf%@rR1MB<PhDTx9S}0Sqg3yLfCiT?J2(W_Y3%cY9NOD>skp9
zGh)Wiwn^B2_sWd09lys@{1&gqIXYH-sK(If$hEKuwRW$c>BUOAj~A_Qc;Q#ELV0Mc
z8dHZo6!Z@!1@xm)0h{6X=Zr(l64pk@uNo9r*vrH-Q^Hw?S4BM(&N{d?5W*P+Z+ceL
zoc+CUoWN_H5{?TP_LwEr6<+%L{kYvwNhkV0Gm?c@w!D(uogCL@!0+NQZ3dXu@tZI|
zH<)12^`mo%;R9IS>ihw-I^WYjJo^MZjqSS=h;Q{W7S&6q!P`u|i06vgn|Q96eLS=8
zU&-eX-JiFL+1IlS+_YSH9qTtQJ@-Me=k8A$*Sve;L3mBeD-Q$72~<;abgcSNQ=LSs
zsf?RJq|qJb0Y5u!!wP(yD=rZ_Tnlu_{IHe}d*6@Y_|14u<Ohxy@Vw6bfyDTH2tNZ7
z0CB1yZ5zmw@o+YxKZNMs&^d?Whj(h^kzq7(FLLpG2<I^X0siCbV>rl9S?uDpDIJG0
z#1{|9`BFxq%~(c0<h!dS@>R<O3Dg3OBIZ@uwNM%)e(291S%o(v&9TI6(r<z0$zKcI
zuow0Na4k1roZ9BJ(9LoCOJAg(zP&9--(IttYJo;lls+_y7HBi^$<Q{Wc^el{RJC!5
zAqI_0j>h}H(761<X~PdC`BPEi$Z>5{emNfb#v)u?rQ>m#(fz8x&1gYo<`*saP-Htl
z%c5N~$|RN;HCl6hlvaBW(lVm5Dbi}I<ozm_LPyfPwHS_Js<pVY;x9UZh6wvPZ5<T<
zZ`|LN_@_Z#4a%kkYxDk=$d}>kLbPX_{NcZQGtv<&#M4#>{OwMA5M6hApar|r1C8nK
zd+O=i*HZMcyt)zXVX5Y<<-Li%miHkBwLC{7{9mZ$2RUsCM0?!CAMNf(j%)2ccs%lr
zMYy;=_9y1Oj}a|R>3#YV?Qtf5%3@$M$|RN;wKPs2r8UTdw2Y{1inInGd<Wd|>A;t#
zAB>(U^cCU}v^&m_fS{KN^hMCmPZ)$?@NtGj1iqAQk_n_DNcR)65M&=`up-DYfiVd3
z`~>`b#KhwalM%#e(3GnOcQxXWyBuzSOQj)@pJr8FD}eL7^XAn!t6kOnJnX!A?pQb`
ztAr_l%`AA!$=hW=DP?Z_)IjTV0-C!y&E20U*Bn}Hs<zrxZMCV|YExARYqhEBO^-a(
z<yM=jexz|3d}+0*+G<m^Wo)Xl{A9x;jEf*u56Yd6s`=oHE~&1psdL$jfj8A69lR}v
zuQ|zjT1(t(7KT>ir!BT(_g^-C>IrnXxSAqsynaj;87gj%_d#AGT2-=DC51d#T$y%Q
z^k<XPz5#HTIhdHfk!ZRQA;i<QycVJ5RZ}zT()UBZkjryM*mX}kV)NAOBYxUZB>4fO
z@lR`2XsrtWZtF~rqS}H<>t1FnaaB797F5?dToQdl6RMr1cBj2GVRS;#Ad~83L3LUE
z7|tgQX!H^yTk@?{YoTI%07#WaIyH`}d7KSbxxLcPM?EFg5*dyvn@f!zT!z>TtEs4V
z72B&8YWcuDa4fO_Bn5~kiAPGQQLjyNl<>f|GAGJ!B;eV@<Jjs-9sh&w@RD--LT7b_
zD&E!re*=O`t4rtETRs1?o}e<RL27mT<PI*ct}3-1XHRJPk~+!f_}}pcoo%<*w4ghp
z!d6w9;H+L~cQz>`w8BwUQe9Eyo43;LEU9xYX^c0d%28WutNM@mLTjq4Tn<}FGk8O4
z>*m^=b^mRjt9nVbHb89k{SWyf7FJixuU%xDXRm4v`5z3?Plp0*Y0cn@u%#s|sI$3T
z&gv5+B%=`_@g>RWNl6J(OIPQdYpY5~SyXzQzl4=kI~UqrE_*ZiW9G{_xoqXt_JlMw
zn`Q|PskBu%=GHaQ7g=I++7{}B@ZS!KE49s+8q<UtN6GxU8YHj(l@F1nbxvE=yy|~7
zsIAjhnINmB8-4!|KSkRY*Vs$tr9Y&J{)9$Jp%v9myAxqO9$_(M5=ZU~ojAdvm36f)
zo4UaHZ+U}D919&K$JrB7TU~{gfm^^A>XLc_XLVJJd*kOh*^IZO-0rNNZ^zx(fAdQu
zYY^?a8GNII+Em#VI_BA2)lSqP_#PNjSzTA<vN@{k`kLJw!6CK^m!s0wyuM1?V&8nx
zH8xi{v)-tqsp+8+O0m{B9gCYeB&tkmUu#`bZZ@siJq}@!j>;OR{T$l~7nJ6#tE$oo
zzlDQ>OKg?uT37RWqGV4(ZFPk-==_(%LZyjDbwx#Ucss~zktNPLN3A4DX^tulOf-3V
z{u-yOTu5`1lIlv^3I8E*p3Bh)XYkq5l+JdXJrPy*MG155c2|YH#X~}BY_+vESBv<R
zL%n;fM?PhLpYh)uqWr19A!YKP`WvQ??{7NwR}7DZ-}b5UpWAN-_znlRRlgc8{KCO)
zQ#@VZdmx1KUrtXHe2)UZ`W?a}fPQbk(I&sKfTzu!?C&o1HwN<Izwwa2_}fnS^))`{
zza!*l7T?!=-S7c_?ZMA~IlpGVJY72;F6agSF8+%7n8{hp#|u8|As>H*df-+cMEw$f
z;iQ=Q%k@UAPvUPd@GJgu{p9+j@@-O{YW)%A1>tdC%GIoV#q#vVca-oOCj9bnv3~LV
zItcBFS-op-lE1fK3-HN*{Y@{HQzX#Af5mbV^K)ybOyuKlI*_B89=qwas5fSMXvQD^
zC0>4^f3P0&u|A0MhV&vHQ+z^tzx%f+B_2@@^zD&`b6)^#0>FMx-!=*EO$Om&d@+ug
zm(e#dy%|nVagD+~(QhUPGrtkdVjkXP=?RA?K0TF}H@=Ky;!oN5A~R3vyy5Zpdt-Ik
z3|C|7gFkfpK|imrBuIb|1fITO2x|hE<!hFgH!#YhsWcklGs@E&4c_P!<?Ra`kMa`J
z`I-+-XB5^Le$DdrMwdU|f=qZ7c`(9hgo~U1FiXe3d!jLxXxsE*wmMh!z>4%id=6b{
ztE#hA46Jky$rz+dV_P`LXB=V*vls?Agj{Pg{LNKMVTQB9G=)i?Y0c#~rO0axAGm5Z
zDjVl_lUnMDMPG$TUlSziU7x-AO)+@m!rvUdrT4Cz{`2IAWPO%oQ%l>s<h}WupdF1a
zZT~f}36k}LZePm0Ir3>jjcbDLohVZFqaSIsRyTKh;{|`S_qG1ER!z_b(@~Ar_RekW
z<TY3SQ@D+4nysw?d`+qqO>Y4kkF(YfPMWoLHsffZIocU-HO_b5&2`J?6sDuY=F=U6
zF>}lGN&b&D8Z+K+1|9W2i#TpFYl7bT+yeSB6ZDj;eg<tRYnmgi<sHrzbNG^b6XnzF
zj^iNk6U5QPf;;i`@C4DQ|DtA|XqCFzGd)4-pkIy?L=OP?KnK7774e=Rd>t2uwr3Vk
z6eBk6;5U}&O{~YlH)e4phG^!aW00T4VQ0}I;g{i#g76sh8-wEUxD0OxlwTbBjRoL&
zMj|}12rthw8bLV9rwu?j(F`2%-zXH`2Kgg}c#$Z6w2*HbA%7gkabW(iJg{8YiMLKf
zd8(!(xVwPA6O<3f(_ZkmMbHj_=OvaImk*~E^A=^oFXpK|<kJy==ZopUGU8_YIW64H
zY3KUY4)JEh4|j_?6$|;a7XZr!hhu$<2ibC2cY|MEZX7?B9}jDfaCd+>y-;`uAzzM%
z8PMvE;&J{s&25RkF2)h9zIF%bNt6d>x&U+~nyz#Me`f)(ZgDwsJ&A+#9f@A&a9PFy
zUY>7v_~r8J0KolmNCsF};!&QwjH8f@`8OWM<-p}0h2n6(SWf(VJjw^>7boBl({dSb
z{&*R8gkL5%R@Mms-Ql-4(Vl<c^5HV;iOP%XS$9-korqqRTsOEZxa@d&aQ(&YFy@!@
z*&Xor1RA-na2`7&S@1fD^{N}u>k!v1tS?;zoL$i`uD?7FkuFg-U5TdW_#2lclRc|P
z{ot3&gXhKTV_Ou?%Y)Y;#>-HHa1I?&oIxmFHz8g(^o;{63w4HGbHY2+vA5kV#NE+c
zm`i=gtGOJ$@O3=1XdFGhq&g0qp7;)T^=n+55`Q8e2Zt|J#)ATQsGc6g(sH~vjvil<
zHx6NQ3E_Ck@c*;Jg93P{DZXf(hpOjSx7FBwSwDgZW!BhjC9eNnZ>6KO)^U#ACvRYF
z-NHsVLsM$)HkW1?t_fbhvh{~Guih%hLd|yD3F!?{OilbtzDDYXvGd<$r3D%Lx7dxS
zt(&W__^l!ShBR+`;=%jxvUwcz{rEbYu>)+1_h0I|nl8U)t3OTfHiH%%uW~+726jys
zZ_9T}CkXq^==~q-kc^*+4ROt_C@$sOnt_s!)5soqNGx;ODj{hEOUkxT0+veu8X6ip
z5dHHIk&rTq9lK(WPSiQo!G3UMb*Wu_&!W^(Tf;p;OYAmvN>H*yjpK6Ks`$$gfu&AW
zr%PN*xU+l_cLtQH=}IbWwfr3|{u~BeXC?<wcz5FdM1=Zah5M(-dG_kc;u@PIk6MFg
zX$UcXM<ck(mguY`x(7croVDfEixMmB^Xyfn9M@6>nZY|DD4EOVYC0+I4`eXBo~32>
zhKY)!4SMoSDty!DjpLBp>`__u0p2D6!{3JGAE+@F!|?)RZm$6aO;ZE`>G@+o$Jj5s
zm-ibneazS|zNE0;5au(U025pRq*1bA59`D<xQFK=UUga&#bh(F)L!LsNEOlZH}n$I
zQ4eK_-xX`%@me2KQla%RCE`|}8`DS~YtYx*wy2I7J!p;}(N<;K734dcc-Bew%jyr;
z%OHpw0^h>mHynN=P#U)H&Gu^}A$=%O_F|&|4?lNMPLzG@VSqapaKyuJTS(9LaEl;r
z2T05CbOE0D9`rz>>^Jv<{G%YO3kqZW4<O2(ZZAYjF9_%77rG&y*<Lf-(@%ikUO-n*
zqV4<kA=<t_^Y?=|y+OVMh+lPD)uqlQ<@S>K#ZLQzI)~F<T5Pk|y2|Y?M@g-Kz@kUc
zQK<B7#YRAF^e}4pc1*w>pvRph5+A7l9xmi1E2k0OAkXP$;dDmngNd=wB5SK`^X<ia
zC?$%)6H1}&9cIJ|jvs15LbTIfS-sF+T<MV70eM8`2}>C9Sa`F{3Qrvq8L<*hJeOL8
z=S&5Su>_ixVx(VqG~#2bs;jts1mzR6(a>9hjL_rw>WhkHjw*-C4qx#*jb^zpW}=A4
z@}DINv5h`{2XFDC_3_t=-(!gHRRX7rMd_G7jx?@NBII$o-NoL-bC;CaFQz2kvjnwK
zs?9l1Y{c3U&nnwa|2-RzBH8!$#JDyFFQ}`QRl!566vvTt&>^uz3Jk<~YC?z)trqV_
zxxbU1=xsuN+f6HfX=zQx5@RA9Vxm}_upoEJ*nEMi&Uz|(V*~+6T89lXH0#(rs6m`j
zJP!)HOz;mest9Hk(F0ylQ12PxWq+*VTCDpC{_|87jMOm9<y<nx<|--o5-+s0I<dr7
zQDGd-=en#;X89xX?tEDy%$m<&yVF_iBp#I_r-@3j*prra93xY2la2Y8*&G%2(ga6U
z!W=_X-n#|EiZF1{oDmVeNjb+%%Au9gP~SF>cQlaS5}rR=*6<`EtpUdCR%!|(PJC&Q
zy~VQ-OMua|FO+E0FGl@D_|B)OL68Qz)V9j<gEB_oQ>oCUoHM-y;IC2B!VW-J7r@yC
zXle&E@nL)*xbgQ8^gS54IgMe!6Y!X&@T7FX2Z=iUbZ~!=;-7_i?yywPWS|EiL^G9S
z1kE9@FycPVWBm0;eDkslmi<ai@?jd@wgIJU?rC_KAV-5gzwk<Y1#@~sedEu8<2U#6
zhOhN2jA%M0@etC|^(&s}>sLFVX(Gsl*D)N<XJKabDV*r)ld3mL%c!&&%pa-vLotu>
zOZ80gr5CXt@$lxVKN1~bK(DF?A@JKqS0>63o`pvvIfk3aQIruc=NO{3HDa3*F37;-
zuTkY4LEQC_%a<)};C3PyXjSlQ<rt^PD;(1tu~bWEA^K&rrqIpuiz1pnF+Xe|WOIKz
z_-zlr9hC2JP=_LTym%#i8~1l2+BFb$k|K?;od)6R*qbHnNc?M1I+EsVP}~UPV|$UH
zK`4_zAfz+VE{O!{7ZO!1i2yvEiDym03x+{Nn?><OG`?WS7YbQz=ZkbK4YU&C1lij{
zna6pBxGrX}mer_vhQXRw2Qgn^ZR@zrJKWdBENw*E^ov}UXfKOe-d2WUjA<9=7M55(
z!xomiTBj9ShH*j$e$FE-@m^LZyvzm^{YRtVA>7aM=50zeF>edX0bd&s3abZ25XS4G
zhtafL52A^?-tz`CdZ~uXQ_sSUj^ZHyMgU4%!e>Sns7HLnEB2t`2%B%H81<kb3chGO
z()M*k7)o?Au9%h{5t?^sT2C56ntuksM-V>zHHj~yev!3LLcf?J_KPs<QD>yQ{CgAq
zMp|!(+a2V@HilTg$I4OMTni)mzED_M_tqCSwcT^yMrx+^&xd}uS=vGOe%ZJ^w1BL5
z-H!ELR*jihE9&L{#M(km{E4+FI<@TmZP&n`1MON|OSNkj&?O7#2Jef+6`X)3LbyBl
zdsx7&@EgywGWO#5It%i|bO#wgC;|h%%-~<+m*bh;h_VcA0<=L8Lp0+T9OAqHkJk=?
zXi%Xu(JtX*qZM6*Fwr1`2>9}PPcP8hH)zCR7`AE<V0NfQYgFuuT}~U(H~uj2RXfEH
zG#Ny@nN{|RVzkp3g7#EHP|TQl^J<*cuIl2lYG<Xb0!8ON70h}p;&C&z2QnmJp0m2H
z#+(34`v%jTG%V^yj*QJ~_R7Ev!iFPvf9}Qw6quNps3dFGprW(Dd{ztdl8uTkwASt_
zb(EF4NAbe8g}`q|BLSoAfg?*>gu_4!jL8eGtCBR)J%Xw1qh#GvkE@yxrr;97nGO#y
z(QhO5%|Zg|Y)+f9Vu}P3?h7B0rS>vgU4;uyT8MJR8WC)uW{mbL8q5#G`f><Xe+t-X
zmybxZ`bPBUGf?vo%9|6S_`6E}z_zMiJ0(m^nxBxmD*HlPg$WT+?2JflxzTA~Z>Q_L
zFe*0_zTm?rWygq<48{rTOim`or}g**f-lX-K^(pi7ecfSJ|4b%T9D(D57t3Mp}5fy
zCl-F=AZ`rNEGhB0+{y~?WI7Vf(h8sabcXcp4bpR~FC2P6+-|^20`Y57G+g|ypW%@A
zg^7siVKrd_lPu!(W#VCkS6C=CJued?*GlokjM)*m;M$rBhs%I9oLIjO=yi?Ky@BwB
zKVdP^l)qqhd|zZwSSjo-%o2Nu90_OMgq1~00xy)ttStJmBHB?<B2X$N{{Za_QJjEh
zjFHj=5Mv>m#WOhDCkWlrOO%_i(x53FyHO%<bl}*+AvtmVRvTJW*%txA!{Vpvic<7R
z<iUqk-r*+ULWQ>UF{;pJ71`ek>_M?Nnb4<=4W*&Tf;xLibw#Z%rKPVYE%2o(g}R?Q
zMaepQBg`_;I6{AugyJE^O<p7(?x}>l{Tkp~ZG_Jto_ILZtc1Q2<;uE0ycX?ActKUp
z>P0^AhrX+FRdJL$t7{;x(R!CTF4H`E2{P&-<{36^;^cz-iQ{JZkS}fv^ao(w_mZOZ
za`O3fV}x_AqpCEq%u!)-a;=w<R4q{d#T^S@wq=`hxee_a=vJAul`cz99*%bte4(>B
z&)fLBN*97?u|R*;GOKiohN7LJ?%W^dlf1O_VJ#ppPrC;!y?*yVkrN)+!-}IE$Cq98
zo*?_~P*y#GuLPp)xLb%@zwdEyY%mx;uHzbUsoMhh#j}VW$~XbXZ>}=t`t3uVjBBC%
zb*tZP1$+Srj8<#Cz`wZ)-HXdh*~PT<=Ke?nc^lO&ZUPn6tv=cWS{XUvdKLpXQ%K8B
zpz*{x62)l)abk!zfo5@8uQ-xu=5f7Ep!sB((ITAGZ{`WG@p$=PoB(eU^AS$B^b_d!
zIC?gi^@?+&{$hT@1lDBuY#AV?6XvdNc!jyDPorpqxm~l2rVZwR9!J#%vp|odYxj|2
zV@BDQfxhtSUY3?YJf1J3Y5&KN3LXE#7_y&#V@RcLH1`<N0%eWHYW<DIYW|*yg;Nmx
zOzbAyC${97Sbyn^(9x;rLQB$#`cGBP&`+)Q)Sr6N+ep&<W8^rZAKO|Xe};pvFff^(
zLA3iCe1VwJoI|v}3Z45^b*&AOfd%R~=OIqu0srx_lMjE5+I3D@s?SFRJRS=XAXNCy
zz-)&-3PH3N;5*E15r|JE7{I?aR`fx4d$pwQ`ZBL<&U_`jgE?Nf7ZHv1-0ZYrZ=&c_
zZ(@GhFk10%<FsIU%+9y#S6`0znLJOW-KH6(&heZKcv`x~axgk4Vg`QJSU12A5_Izp
ze%E*eYzEdvOlxGeiAQGBeSh{y`EU?#nt#mQIMbLZ)vg3bWle>>(q84VmnvxFb6?^S
z5=2fQr=6%_<gP~SdCTGM?7AB-Qw%l1nmrsq0dK&(d8R@pRtXWeo7EhaoDA)438t_N
ziBX}|+)+#QH~(pKN1i~3D~KP+(3#-!zn18am^iA+?M{a)Q91HraKywdNf0xW2j#*3
z4nRCs*}D72ms!gg>Q?)amBZlLa$61egp^cQRM$BL(!xrewnfGB?Muw-JhuF#v7}f}
zG;2Tc`7kw2l%qDWq}=ATmAK@p58pfDb_3@D<ZC<!xMwu1!4#1tr9q!gMchF?BUE9N
znjtxh;#!we+t~DkNdysdL`}7$%2hkwSv?oND(es12>&w((+M`PMmFT&fe2mnfO$0n
z2|@E}Y+@+~N@XuZ3$|6%l-qnL1yye_!y;|jNI-f$fx%t~#4$-fvZ0aq!^)~9c`Lez
z*|Q8WC>_gS^Bj^e1==ab8ke51T8#<IN#+b8@kzQ>J>PDCJT%emldKpys;(Gv99Xfa
z)Gt~rIT~x0+y?L2;r2y68W*C*PgUc`VttxhVJn%>BU%4eBVhxc*lkb=)X_qSrv}Wu
zol}%dr`Lk;eLKbHsZ;UaUgmhE(TDV4FH;^4>TE~`aOGUg9ZQJ*;3lTove+naiB@2t
zB7WaH2GdM=V=soQ4Yc?{*Jf}v;#B?rP$%5(1j<8&Al`-xe6#QoZWA2fKZrL?f*gW)
zqlsh?#2ZQ>EJsVP%Ezr)iI4{ZVAIE&LDxd$eo}KQmL$9!r$<8rOR|SnOV{T%t>?F-
z`kIeH-|lw$UQ(QDXDK>)kFh=pzvYMm-X|0qaHL_KH?K;u&LdW)2*t=U8(3%@rMYut
z84qFoj5|~HdyzJfu%tH_#=(OPONJ81lBsBdB@6rp9lDx+gl8JW4qee@FE~vG38pwl
z0nX7HPNRNOO^=2E7ORk#m%diT;cs8d=sdZhW%JNMh`D)XziPEXjr@<+Y_(oAu`XLC
z81<d4wYRnQHec-x(C5u+oVUIjP2Uw6d7rZX7izCohs2SML6|{ZF`8EUQ&-gVCUwPZ
z>guVkG<G`dYh5v#?D|_*8a=D@qpmc0=Bl>7(R_JJZGpW^_3)I>?`HtQ*_5vUGBW)D
zSR~#oiDo(uao_;}%NQQ7bw#{&MZ6IY0O+qL*45gJ;ofE~@!j};2v|eoI@(^rEz8wO
zJnJIY!wy8*LFT6tLy5b=e-<~4cs}*W;zS$7i6eR=|M5TvEALdI8CAn4AUhE4E<B@g
zEW~GcM?<)SC|3x_5bZ&C9*3Xc=^?>%h<6gC<8<|eG`--rH{_iFdG|qa`$3!m@H-IV
z_J=quU$&6RmWkPE))Jzec40V*APm#Z9+|H|ai&8Urm-EkS=kLIezg;(_k|+$R@gBL
z{b5l{CoziveHP*3cm1JIuLsQ8>XZD3n<k7;I4mk2nsgYYH^L*_n=}<o6c-IF7-1Yj
zGu>U}T{t-EW+Kju)qjt&LE-sE;RXjx#j7{k3YN3NCO%Q1b%>1dVwa<IiI0-?a`kYs
zC<U{thw5u)?#Vr6FPutsJN+!2LiOqTuctZ|?U7ygJGU_=?8e$^i|5TNX4apJx%Fpn
zeEUOZtOq*Hl+=j+IO3@SdM+Pd`C$3PdzPy<YnR$A%{l|d&{mJZ?Wu)!=sWYbC;n7N
zGQgWAcJL@E<97m#@N^_58lyZ7WBMYD@UDQzrzs??g=Wo2XB{5#47PETi_>=@P1ih6
z^mT0lpCUey9}9GO;#qLk%J^a@8J`ypAv#M}{9Lno2MSKQ$;~_;jYcgcx<5Yh3`8LO
z9hWoDL3$n>nC6x^^K5}`PCFB?-3JUod_)u9EK{BVJQN|`c==!pi~I~bFBeV+=k<75
zqio9g&v3<Wc>YuE`o#~rZoj3^e_Ehl;5QQ_e*yRVc#?fGj^CU&Fkm%Zy%B;><9e8*
zI&l2vq(?agZ<)&R$M`9qm}aC0^BGMv>)=McRp9JV!-%e-EyWEpOB$#6?MefzA1a^@
zsK`@C;SCROw0JxVGT_dS*y4(2)R34NMu)g+!vPoe2H*yxr3|Atwju9;WMVlEhUYEN
z0kgd075_Bu$9B~Dk{OpjZ@gL98Nd^6yg5Ie5Fb1*acRfLOCgPtzdv?CJ1aEG11f)-
zl+L92sxZgX+;%}3on28`Fu#5v=vI$Q^?s%7cs=I$2=T8y@P>!;?*#rmEcK~t9UQ{E
zfG-zxViClfKsy8-OrSFWj~5AGbo9T62jFpdI)*F33-m?M-v<~BFbF{(0PY_GFbn~Y
z!~H`LAd{EPbx4_QgM^!#r&R0@`eu>Rg2TMnK~&BBjcYKVOTgQ~f4pewEN{7-ur?+6
zVxwP1B=g|+G$Y}m{2F0IlG|kTWH(Z||6-i)iE1NKnb{b~$lNAobqzC(Elm(KW0Il@
zG}VZtqrok$Wje3bL_0x={qiPz>*l)hXAsu9xvo$DzrMMyI~GoaqqfAS!?6Ok(}Wsk
zwhd`?K+UMO`FMNJtb6wPwjoatsOMhpV;jp4KPzs=8}CATpi=IsFIhR>I1eSCU{w3X
z76v!DbJ1zn?(PKO6NP%KR0d6$QD33<s{xwTCqBUDr|`x!2*jE+T@#RCf^a^F9>iRc
z({2|w$iqs?9TlZcdzFGjH9n_PiFx^sT4fi>GQ+GfJtiOyFSk{dR@jTHY)!KY%%}=B
zD041WSe#Z4ZX&j`4C0z~?c(=P_nygJyk?9mV?wy{=?~}3z09oEv)L`oT1gr4%@k8V
zqM2f1arlENArObnu)0E=a3KzVe55<*0-Fi3SrWHt%r@$yiM9g6(!>#MgPpm{A#Dca
z5l^(k1l@rrmX5WpMG&_G!qWxPbphT70=>P!-3__<fr9}MzZar|hjU~8J&1MyA%W<P
z@biO2y@+N$t{>43G4R6*+{^|)u+SG|F_`#uprBc7D|^Z>K>uCb1RNyB7fu_v(I=c9
z(5kzmHnGfB;*uXc#tDRD0^Ten>t1ma88tXEplx<nRoEBWb%tP@D_i#0hUizl%*Yc?
zEV!BY5>71WOgN$NY?JU8ywEww=({I+g%b{<e#{c-7tSVl+$uCUo8Y#EEF3oQxW$Xp
z;X?=phX}l#1PS*=PDc60c$AkmvE}h&_3^XB@4{h#Z1H=%?z<(@m-fJY>)tJ$=~i$%
z&*`4@qJM^4@)$opB4Dt^@3V2+;L-}ks<r_UV&yh|-V;+L91sxAajQ~pSwq`iTXz#f
z+s3Ve2RvdeFEqiMViGRau4TSxmszXP*j5@BFsuFkJn@M%JlaQE#1>?nF)jKhJB?@+
z!&Svh^wDOu<;1hld<MrKgy`|rB62XI(E^3cX>3l5$moQ~nVjAbqS+Bbau~$F2N3N9
z13%e-_OxM>Oxpm(d+q*iB!YFe<F=95l#N=Dm%33Kt;jVT)7(-Uwdh2K`hvBRPJ6*R
zj%XXCd}CVMVAkH<j8W35FIaaVzFYWGyCA+(?N3E)PnxeHx)J8j3)UTp_ZO_A08ct;
zUK_yepd=>(URHamh*H}I!S^=sJsL`@9OxN<7%zgbvx)I(dQTHi3vuiIPtXT(eCy^7
zCTSkKdxJSziuhfab2X~_Z-(RCEjAo*p9s0X!XQk1nss<^;`Qi?uJIld_k_?}FT*{k
z1<D+q{Zr{v&i<LdO0AuR)a*5e6aPmM&?EC#oyWD*D8h%6ViA~+=00YUzEtDi2!arV
z2;j-mVgv$w1cQ%`q7mR@8L<G|&(p;bW1Fca;P;UOQLc?GPpQ0$v96RgLB1TY{31+%
zf9J<c+7NeH#~73smuV~lZ}R2wIuQL0Fb4dXN>>1U)P>*v>Q3}lFguw*PXxWZKwtl%
zF*gO>Ek??wihQ$$hY_h3BZG-2TSmI9yjmgpe_<Y|)v!*h`3=)NTFq~qpyoGPPXWQ7
zL0Ic4AbtA(2d99v9yFLA;d9y-)H$5?QuzoUoV%&2t}4b?!Qpl|emUCR1Rb2j(5%xM
z>M4g-vlxw=K>d%K#SrvWJEvthPBl8Y)#$MqlWb3rZYnWfOfG<5HiBm7Iry?0(^fVW
zVw%dPsG}k6DS(TmIf-aXh-?aimk^^MA3k;B=Qd)AZ*Oh)CdcckFpJv~Uw}TnmnAQy
zcOgVy!}xbkbP7v}W><*)${*v`FF9&GG5_x0F%V-u&3<t{LJKGE&yN@B!b_4qEI~dg
z_{)vt;m(hKNz{mLVfk@_klxEUpJUN4Qfgxw{#`hY(b1E12+R+RsEzUuPmvyvGIkPA
zad^igYM(N`azFhckGBXPZ@Hh^(8fxfpIHCs7-1%tIREfu8*7C!UNboT;YPRql!w>+
zMw-#x&Obb6bdTb6>kUCzMvHWJ(0@Nk{4Q+Bc%vuDxE3m7Q6qi@!w$7#{(`{<H~IvF
z4BqI=F`+Nlr1fwcj}qc0>yx>x1>+5Fyma%zOZR`77qefyiJ**MmHN*&J<#!kR36^+
z;|EF}EqVL^$+|>nC&l}ld?U_|`Q!5OfXA>mHN#_pUJY&5lyiS5@ozIKlpNRbdSOfC
zYXP0tHn)`Cm~ZIB5>LG`r#hY(-(nVLXl&ff&u52#pP#|zT!a(dvu1Ly?yr_dBF-$(
zv$y2-n6Gq2>FFvuQRr%;qpRgw1q+OZIDH+6vEDGf@I`O<T#ETwzF3bUiSDpYCeBwh
ziz4bP#={Q*#VB~P8>5f0dijnxQOZ3|PgizJ(Ip;|^5M^4y~=88`NosRmv39W97mog
z<=c+@&y?>-TrOP<_~+{`?STJ8qLeL;!;b~9urP?jdpU-0I8oY-p+u8S8-lH~XuYMP
zTgqtM0;6w!sO}))2O*Uaqy<KdES!zDqlx=Sk$vlgWZ7k@Mgw09;=s4ZgNZt#)2(>o
zZF%v?1K)ygLRpN$Wf9{?S;PWu!;yUXVbiXHe7l>-ubYYdxZGm=kQ<j(6J&8Ru2UVB
zl9u}Om#z#<SA6r))u?tM74kPLlVAj)CcyPL5&;hj7Jwhm!N*KuJODmE(%w5Xakbk0
z8AD`6-2ZbM`Wvn_)wK?0Y=LO=o&yk<ao63LDvsxv57ri#+w6r^(F#LjfZ_M`O|J0`
zS{itVi}B#k(+S>r2{C_I#S~9~39bNletnUL-v;Yr3qTLl$1%Ns2b|1&z0E^8vN+s)
z{ahZeb)T-q=KcTWeL7y3o1-oA=SI0$m${Dd8+UNi&Ju;{rMixUdDC5>GieBnD!gzg
zL_H<1YY0NnDfkd#i5Dv!#NpdIaRM`Z@sgiNA0pt!#-4=30m=zw9#<Ypp;O8RG+-Ru
zZ~$+RxGWqET7k@J&>F?K1_7Usjh<-wr!mG90t{r}&E}C%M#jbvymWR!ARm7f_&7XW
z?t|!kdfMEG9V4={O~hn)U^R^QOHDbnYH6O1Uw06O%e(cNbJE%?HnCUq*6_K_58Ov<
zO{*<p_bd*zed)GYoWD^`In<!X`<aL4LKRYYO+~d!e#C(I^y2s`^X6RN9P7JdOPSEr
z8@^&5$*g+RRh7W>$^UG0y)`!9;}H)g&xFLzM&!1U$YqjCM9)8XBhIi3>7OTIN#L9(
zRMyqH5}bBdowF*zmVk5c+Orr8aN7A=Vg{<IQW(b@-CCE;<*0QzN_-m`cp45i_vmbv
zNqHMFegx%n7jip1q&e&*e=~jamj4w7`Oh*?BY$unGsDTt#8Y9%^TNh2Rrpfcf8x?+
zU5J&)O}eEgF<+wOEgy?7nvcRq^_T?M(r!4U?Iz%3-Fp;@!;joW6aS9qb%*f&KvM_!
z#*hCo-0W5`TSh(^!WfQ`L^HtK5&UdPH5+Ixf;5;uOn(oeY@;!E0#R-)O#(c9Ku&!j
zZ=9|l(8}BcAl+!lqnao;n@fRzW_Y&(!cIc6bP(-`BcpvN$RQDahr@3Y{3b)5!%&(O
z_?8CWQVq%?1L9=CZ#Mjn0NzJJ+Du403Z*>>;^sgcE5yk~vdjfO$3XshAUF2z*oRO)
z;~?%RqS;0mFQl6Q>6lDkM&EdTOlGSA-z1=KGSGJ_{1(9PRG@bX$}eBQ^^qX|X^{3b
z_?-?oicp-H@a=T?HVeXLNbtRZJebU9L%bQlA2ZB;0OFiU6un_`VMk$$C71(pA4RmG
zYda_>CY!ks2d66)(urk14}Qym?lR(D)@MVU3Wzh`AWjv;sew4vLL6Skte!~iS2G@~
z^?Dn7XanFphQdANAE5s(-k={K#upCuXp7I<+k?9JL4MJ%j4DO?86GdkYX9_v$L!Y^
z`3O@nuksOY{`<wRH#!-9;n=R}TUUX#cJa)t{^YG@jhlZLF8+%a$_AJfO{9l(?7QbP
zv~DVFc!JoINQASs=FfkX*eXiuDr}0GSd1Z>7%O$0qqbN}Bs_U2io}St7|*1+X-}*^
zzG!sLExvFJ*X_y0c>VXzf~YIZinOigbd~-VQ9I(fl@s1m>nwinp#Sc6+}qPJTK2t@
zhwsF_Rvt2v3jr&(-uU&@nh2LH%q}OScq(`H7faAGHMgEF62da{bbgn8@9N>ZyI_nT
zR`VtpBf9iwbv0(%)=KrwUR!J{EiE?bQ9K{xb?XV@j7fI8xZqZ|dia+Q+J<H88uqG%
zUPRE7_}4Zh=;AA{Uis4&_41sbN`DBu(C*em_SUEF?v2_$`uKvmL(3Gwf2vSQ+>(&P
z8&`cjb2Uh$tDjzqe!o!IpLlM`2I#*hd7_8?v93ovg5_&#GSCYxgFMi}z6&ON2McAg
zz@+9hiRK~1_ww?Tmp^$9^&?Lff0#c0PVsx9{(D~|xgq?6#q`68apM>2sB+oo$vX>f
zBJjb;ldr#+lq4M~%KKU#SQ%tCU`QsORWZec<$(VEg_x=Om_C~rOVg*j`gkZ-Ix$x)
z^RB~9s*N`9<qM4&El4B7pXm{gZ}z9n_+t38JkT%F$>L`dPaTyTUs|_US5>8~rF)`g
zgrC%m)WuhxVl-=Fjq)4iXMUrJaXkxX11dgH^pybdy9L?7RJPcZ^#*i7X+HO}bpn5n
zEC-=HEHF1d5sp&BG21^|nl1~BqxeNIe&L7bsa@1jChz>BPGjw`!d6ar1InB8)sl8t
zoAM%Zq@HF~AK`k9?h(f*frkKqKm<+U#{x5G?LwKQiu<F9Pg~goaQx=l%C^7^onP<a
z*HHLX2YzjVUjyM+T=+3`3(SDgQTGJy??AL8@%)+rI>N19RO>>tBkx8HQqXMAkGv9M
zGRqA>xBI$4M15aoS=gxQvl-%8vjh^m)jUBr;(xQEoN`y2mAIB}?mVcW1M_hlPjU=s
z$$Wh+haS3eu++3%^R7Kro#3Z@5}Hvy3|~(ro%%3(Z$Ihj-2&w$-rVX9W!0OQ-`wh>
z_)p*m%l)Z+%_&um*8+n$hQF8MAISav{G_j6^U^nn<7JCD(ZqdD#=Z@J-~P(C!92}i
z_#Fhl1L1dw5^pDuH%#&O<^DvUc-CZ=*o-{;AnU;`nJ+ifD%LPuvv{jGT#<vb0hbM$
zi*nk+mJ)A&ez#6TmAuv0zLR{ComOT^Eg(A&Wj3fK@@4o^lys`e>{Q?Aap3sPNlzO5
zveh0VonYUDH4!hCAC&a~KJ+36I0;eP3puViZsRozUvM+JGvQZQFY$|hj)&uIEUh9Q
z=4a~~{?vhNay;vR#nlq|GJIJI{;3@QNI&H;vYF`_q2RCM_(%Im&*)~RXOxD|(o?_}
zfFKA#h!@}^F&>}e!}l-SAf98qKwAW6cPp`ZZGN}2GlF;kZkoFz05|8&O?mV9-4Ss5
zdmu<a!1}>H0DTegbo~$vKrj$sFarL{JoocBeFYfS2uL-7bOf1x!e|6m6F3<`F2ERo
zJb<wX#(MysZUVqW1pM1606aV&V2Y2BkDw4>tN=wmz$_D(fk2eUYykfK3<PHaoCRP*
zz{7alkpOcMlmOTf@GzF9%nLXW%r}8b1l1<60Kq5?8goMsA8pnh1#VMmt3hYJ8+QsA
z2GQ~<$L4;|hYhBga`IJIwlWrJ@SWu!4SBG}a5&@<3v|T8Z(E{0F3WMTe3OX&geFhd
zfw&v;#5AyQ-Vk<$I9&|l@J6yH#OVQX5+Dvs(;IkVVQdn~8oxe33v0|+Bh(Mc)CTbf
zAo<M|_!tEF4uQOfLfXNQ|1cy=CbQ8H#u|oX2s;VI>j-o(TGQb-18}l|D>s&!Mf4|q
zvk{*o4fq@dd1n*tG;kl_CkMWtgz&I9xscZw_|1dgvG6+%XgHa;>&tlHjk$RP83=s8
z4frPs_$LECe!4dsWKIQ{pGvfo!R)sX(iK2Cu}06+W+Ksl|Cio}Qto3AExzgYSrGBG
zedc~@#mgyEpH6RLd?CcMy!<;Y>i_9pFQ-M__}3p1Rd(|{#TSl@Hp{78BRL93Ouh1<
zKXU4wk2W6oVBe!X^C{}Z5u<XF=;-RJ-p2UJZ>*kQzt4undCbq!N6ar?A7AaXBWhhv
zM@?~QwcDw^w)zxaPqwzxb5!LEN1iwoPMEsY#18uJZfB*06Q<%>sXoRf?)|J3!!4Xn
zO%{0ddM3@&sZt_b|7G!oL!oNvg;v=YB`OCO@klJB#^$ukHXm8yu4akn+G_2_ryh-H
z77l>=YNlJf%4|~8jKf*J-95`n+aq8@PkuWUBj|w;Fn>=yLZ9Dt4qZ^^D4AbcW<qCz
z9<6wc#jQs6@<@i)^OibY6Xi{a_wwxRL9QrVxKv;{&#avKcuv>HTb<~uORsFy;_v!%
zpv`rP*u7l%EW)>Zl+8y^d<o}E-P$qXOsHE=Hps(w;(Jfl&EJuTaMo1pVa^ld3+F`D
zYQqnRyPUSF+OleAWwBg^d}um+EK<Enn*Cx@h2y1KyIE3KrCldCVw9LP?v@konM`HK
zeURv%$+VyYIB-Zx9^&DhV!ofG9AwfCj44xV?Z6m5$mGioba*rJ>F<dFI5^ef2f|oh
z+(u%QpYn-n0r^A_clq$;6D}*hUK~QOePQtdZKYwcz>rwngEv~gZmJx?F0^_ABi2uO
z#x|oo8NL{Wp2Zx0JfbH?kaIcE#bnnJ<k21I?hmvUA=<g@+aezD;k0&s^4_i)dFOay
z1U&dJ&;-z*zb9|{a9-k+{M3DReo<c>sIzY!2)6&KwXw<}Im;66?}FsX_fPoYMtq2o
zx1DfU?p&<<AxnwbpTE4G1MQM(C#N&f>KyY6gKuwpoJYXV_fd6&0uOi$2kT~dEa-9i
zp2Ypoj@je%XdKOw_CQ4uO?)#=&wlxeFM+tBL_2t8f$_Hw+kC2;C6E*2fK)FeT2XZW
zy#rD$Ayp6gj&6y3Ii6_W<=G_pT42!0_<+sOaPIFzw1a0-xNM6ERp>I4QMw(cC&aBG
zC^KnL4AbFbv~1Sq?_jJS(GA8<h-0*U$??=vt<`CXe0?p4{<?B#xnr~}FMR0F0<%%(
zSN5AM7jXX&-{f~5j^CVvyuk|o3pxIwe$q3v1<KPG{cX;CIi8OIPk;#oArKFwgbLpo
zh(_)iL?Phc+aQQR5a$Kj0mKO4%}6*t$kN3Gx*_Oc0=*FQ@c?}R_yLvv2=GCb{sIj4
z0*MHQ10(^YAV@-x<^_xnpo~B;$^&>iq=FB!oZJZD`hpLuOh9l7f_yJ91>h6`3cQ0q
zbxUtgQGQmJz0q`h4CEOZJ{+WlX;<q*+|3oc8G?g&`2Fm!kQ=}G{6d31-5?RyF?YKB
zFpy^%rtxVx9r5hRK;Iz~e*DR6w!e8Hc^b+1NdvjCazE1G7N)GqA<K?>3yfusDo1U(
zb{MC&amIkX2m@OYv~>4T7;csk1;&Oo==`WgaZ7{hqY3|ItCE&EC?7YglFd*lF#cw;
z8mYtSkJZQ^oYp|WwfSw0Q+}(?s9?PnAU3<2X3m6a23KTWr}(N-t`eIIf@4u|Sf#C|
z#!)qIgh7(vk_xHL`Ix_m*b-q#oaIs@;=*dHZ1e4obL`r;6cjJamu8ITAupyGSd|wE
z3>cB1QFWQG4XhEHtZe6s-6kPsY+Eiznfb`3zU6EYHV>ZwZ0hTlKWw_G8>*DJAz~G{
z(3c7v%^PX?<;mp6bvw<}EE_&2;H^&=pBczP=}ev+(6+h_aJxwjXbbX%p2m+&44ZtB
zA5Y^HpLp@E0Vyr70Y)9BKP5iNL{h$RZ}=)gaK8N5fM+A`EG2)Jr<7xEeSx6m7lxZN
zUydgd@C5h<{!SQW+JlrVcuhhp|BGEt8(6v&D`TYcvBu`KRYHD5iSj(pKay*0l`YX(
zY2>ji<_`XnL2zxkt;Vh%ny9X*uH*V1RLjOA;4X7iRPZFWikfm8Pg3i0R?p|rxOoPC
zxq!b4B2I7IUiSbhRkjZ`A+cP&<-}qFqQ#M#4NGE!HYq#WpfzQID6L!JhAgr<7V-y3
z)VCOd760E2>IhkW%8S2z(r8wOa^yTAoZ{%K)0>iVd&&GnMQ`+xb#7gvNF@>1ClARO
zq5tUUSSmB{kI;&;c<NIy5sG*l#rV<2CiK1F7<f<-<sCLRc`S90AHgfgJniAYNad|=
zCfQoGE=luu{#icVsQ)aVb%bsPCM5h+uT-r1$l6nbfX`7?Zg)CdvI6id%BvS8R@mp+
zt4g)PVNZJg1zMRail>d-)lICDh^s1rHz(v4UKO;`7s?;t$v>s@4OWVpZ`6BRUzgLM
z?yJ4nJ$a%EMYLRpM*swV12hA&hrrKwy7><HVelWuqYk2t82CZ?zwu~+z%L)76Zj2K
zF9s;Tysi-KSJ%HZv7h2?Bdd#zsv<s?l9EYlDK);7jzB3L^}T8+X<CKec%dIY9#!LT
zYbiCYlpIh>estG97LB8|Xqr|ucrfd$(En}~dXt2%(7{_RVmxQ@b#ztOS{O|$j5$hS
zC^G;bg+8ZM=uH#4!8F%r;SX&Uev^c6IJft;?f<7$_>C97A+#<WH%^xS!wbiF-0a^H
zmoKZKW}wR<$hZpou}i{Nz0IDqm5&a$^-fk}fZ=cKbGyfzme>aUj?zsHq_tt-@md=O
zuRBVju31sNf<<7<jSXlKI9MN_mBGmk+A^=YuiM&5wd$h3gBi2#gzv9MOp`B^=d)8*
zzHhSM(Ap$w@mkf;r%m>@N>e8{eQiyeNp;oOXtbn`u@oB!T1wWSUhkzrZHpSzpg0)p
z1Zz1lO;fFm7#!z`;}pa9-VK^$`rt+yzGEERDn>8NbpY{NVOwh#X{}wyw~?18Hxi1#
z5CK&HPq?72y0*^ACmx~921#9oO^KbyVjop7jpYiXZuFEoYHKPWo90I!SiWe9CW3Gg
zF+O)=GIgt~V)uTDHvb^RbY8+R$Y<T~av3Uud=YzqYzf7O!kf&S!2o@@XO5bC4s@z8
z2fCxR!`I@qHc4JPe0I?39-%8{VfPw@P2yo5PCwo0B~D*n=Yok*y>kyS8+-&cy7rk+
z=AO3>A79o9sc=*}T>SJ<h$LxSRVi-;;VEE#RH>xGQKON@MnAWE8U5TRjD99VyhgGb
zqFhO1vHLYB1<l+=4N$<xPy;Cx+N1{a0==^VHJ*kAnJ<hRTSrPuwRo+~l3TNMr%4y>
z)98s;&FC@J)y~o}HmA#8>#*@Nf5CES6P8??3mDB0co>R!>gFF3y7{|XyLl~MYkl&m
zn}5Jd*KFF=8g%mpbNzCsqiVjEn)P;fdFk!`5PCZ!3Lj|gNtC}<&2Qd0H}5sV$Jt8k
zt|@ier42d2TZeHJ4azWS>g*W@Dsx`n25q`n;e05GPQ8azusuX|1PY66h9}~KUFG&l
zK1Yu%VI85P%H~p|2bUpM!)hw3UB%2avYO9adqu6u0sxZ)#FIoMrPQcb*$%5*QtWcL
zD(r$dIu%GFwL7X3YF)OujtYlsNy5Oq)5aDJ64D1Sb7}HQN3;T#Q|WKk^Y-0XUQ2iX
zFemrQ%zXOKIoroS*X~@pF?xDXr<?2O{`yPet8TxR9vu=8HSvwJXoLKdZvC?Fx5K;K
z|2F+)bj`zGMU18gq}Z;NXVc%5?{q`<%-Fe;*IRd7aOmsaYkr{XmM6S-_oF-Mk-3xC
z{r2P$`m2;kN=d)`Y4L`m2VbH4rGH}|{KWdnu8OP2+<Fl`2*1aM$%N2vR-P?k>Z8U#
z8o~&8fF6~zUtryF|Jw^cT#-!=K>ROc?@X#>6G8uGsm4A;{|rm+|I^})^vEBpu0Lbo
zpVl9NzC$BRPw)0j82#yE#_%2W7KzHkyDI5^#?=M?Sbt-&=A2A_1$;Z1R8HMmZ&7L=
z-F}!6e4BNJ!Z+PBSw=v2%K=m9mn`<kQu@1skA5SI>q5HuPWE?KeP}yb@YWZ8zO2V<
zQ}<doO=XTkdhpkKUzAv)TOr@WO!(*hV*N%E=Cr@QQ00A;k(Ya=^+38@pw@LDhkt;M
z1B(3XEs*{#hBc@&{o%^YNvj{<Pk&zhILnEyk)svS^?!Z7xx=y{^p|CF!P9?$?yslG
zALwR~|9VLGkD@=;Z*MGjE>C#Bo-l6C*-rlf`Rq~nuP4vRjM7bz&pMF*pECKGbhVP7
z_0fx123_e-P!38q(N+rO`nwXZo+#mTJrl?0o9QvBRHdS&KQihAqUgt<7k?<<t&ed%
z1${oO&`W>5RnD+JNYO+3&8O_|GkRDlSGr*zi`a`EW?Y=!o$gljhyMARTu}8EM*XA@
z=rKh<tv`ai0~9%1-*w3fOb>uO_bGZt|5otUTcDgbG4<SVjP8)tYa|6cbUUkU3qPX!
z;P(*JpZ!X8tPfD=r-x-VPPV=)aUo?v8)S0E)rTnZtq<tI2<}-QB8%q~`UCLyAQQuT
z_tXc<V(Lhb$;spC_s}jJN|pcU7FHY;S6g2K{D(mwj)2^iY8yh2Kt20AmR0zt=ub-h
zv2IuLtq+EB`waM50qxGmpbx*CB`Y%h1?u}=`2O1@Srh2XpdX(p^`9P8^u)Ro<nR|O
z#*g2rCrW*y2Y`-0K>vOPJiNW*<+&E*u@2-FsnlQV2GEC}Kwox%ejQchN%sIhzbSs}
zHmLW2FUJDq^}Sp?1M0(oj_>8-JD(l|Is60d#BWObLU)3EcPjOpeh=-!F{Rw;KfK+8
z`nLzlZHLk>TR*u>Ca6A=*9)jOL4cpreNe9M0rZ%n&-5=vU#$xIne~B6J+@u}?cs~i
zz8sK0$I=f$zDI$t{j55yexg1c+S$FJ*Sn$J`Ul#n)j;n7ki(B<@*kCR@4c4(&a{8p
z-}Dcb{OIrXF;KoorppzXw`=q_S!}1P@@Rl|?jI<p15i)Hpx-(IzXw5IzX5$cptQqu
z6RQq)KW5#h^pEus5Pu`IM{DJ3kVpRky?Ih;$E{B(`CHe3JhnkQ*C02C6L^1GPn7<(
zK0+4XTq`f%KcQcMdQ9JhcI$0LUi2`uyFWwu{ix7o-LBMgx(CYn0LbST@Pj_ngHT_-
z1i9~mb}vvVH|w{+-+R#C?t*^tu%e&!fn($Z^qOawz?RT0(9VAc<#`pze-GsUry@uC
zhthysZ-;XFL8(X9mlgS1A5`jTJ%RRSKePk%Jh_3SzkwWHgm&==)Pvuk9r{+G&-yy+
ze%BpP`~7X8Z(E^#y#dA_2bK1Y9s~V22<`edXy>_nVZ2gLpuB$qI>VImwSKAStMyCh
zSN16FBmG%vFY5`kW4}W^cmw=LmHwhW1nM8}*M4B#@vryML!d7QfZv~BoCEdQ`V#cd
z8<cud9|Us$8^&Y2pZFWvlW$}JEusgYz1t7<Yd`4g3rhW>-$DKV8``g}N;_E}1ntcs
zXg42+a&!T{f0Xj74+XtFsPIuAtng+18T9CHr970$i{(_JJZ^Q1<Yq?fZj#_m$=z9k
z45AqKyH#>`li(x?n31&`q~ER*^pxOa$$zu-n=SoLli)7N&5YumA^l1dGmA8VwOENf
z3AQjpbafIuPh?|!7G{v|ISFo*+{2{b>C$hy1k3=OG)A^CBXu*S-#nsN*?vX>cBS=c
z2{uaMqls+n&C*Nyy^1Kt2Var=sYEgAcC`feNWkW27fMh8<L;*<cz`I6biFUZLqwT4
zvJpKS*{_svFoTqZ(r>c#yND>S2i`0BbBOYo*EJG6D8ZLRF*sOHWb4V6ED3TY_v_Lx
zdo1iS>6d-~T!Q-~cvkYqNWU);dE#ne1|MIO;1UTgBFY0{Otwi9bdcOnN^r5{W)n^J
zNZMHvoJJJmg3=7d!hYY7e#a7df^Qio{Yo<%OM>)!o&={8`87mK52Cz|$P5zBlHfKe
zj4dYIEy4K`R7w8+5-=IOCH;;eicv#m0CKVfpAcnHVz~rQNbrsXYbE$zf>$MYRDyRU
zSSG;+2^LDQT7u7rGP!1x^0`DY<aoIR+laE5QAw0X*(OSGg#<4X#USA$5=gSPFrHtO
zV2k8tbgU!F>xa@jifoeL4hf2h^6;IcUlzVF@~h-#crK8B6N&P08=L7qE`gIM55ldG
zzOhAvKc!zrV~GSiB)2qcv#>>*Jg6TpNWUpWF^2diQH+(c-k8+~_RD5?t0dSZ!SzJ3
zUVelq#vC^jWpQG^1dA0>$ge=YC&BF!{7sa_oZS-alfJXLVQ(ToFlS+PPpYScjo?{5
zVr3`QGYk8EtpwjmZnmhy9@b+ERX-5ffTm?QQ4A`w`pfFEBzp_1C#?MTO0Zpm4~SyC
z@+YDgNW5QybBMBt!WK&YAc~PpR(B=&StPvVItg|X#VB*N1ZyPND#2q!HVkWFbxW$h
zvLQ%m=4@g7e@e91OV5_VSRG|;$Ztfpu5V#7O_Jaj3H~CAHU9My{74kzkBk>qzdw?I
z9lV(%!KFm8cqFwG7AAXEhuB&JTk2+YpB;!{ZO4Za?2+OyoUHslmY_z0LlXQf!Ics`
zOq7lNucUY^zu%=_rt_@ce=E6}Y|fS78zMh=Xkl`d>XL=&>(|QcJ``3GBwc|;h}`6)
zlmhH2#GWGTnXY+K)AF!qJoXe|4@#4knuk5(v8Mogim(SG$QX+~I86pllY!D?BqL8|
z@>uLCz@8%P!D+I1w4mIStRmH&n#3ao=BAESgEbnNy8yY1kbAo7&de@CZZ+RA$*Ea|
z*fWdA8k4L_o4E^-dj@iA@iS8jk(-AH=Ow3T;*>nu<FRK7_NcfdPcCwgMegy)jS{D3
z<YLcw?9oL-bfu+_#U3s9R8<**^3t+%k$Wt1k4Nq)$gSZ?*UBp|BQ+Pfark)To}jw3
zR2~BJvT`-&Sj~wds4R|6PEO|9A2c>OeZ2M|GgarAjyzgPk4?=~Ka5LG%E-r_Y1pHR
zXk2pY804Oa-1*2o4Y{?n=~?;6t;NYo9)sLCTuYmsHU_!#k$W0)tL%(VPD;<io<i)w
z(X!B&6cw4|NmbQS_T*s?ik5-DWT;<)#wTayB6l8g7b3S7KP4>>xeJjS$5ACRJ~Jy9
zxyK;)H00LeXJ?K<?mXl!MDA(Gt%_+va%$!j>?y<^wGK?s8e`_3g4}AGH6dM7S;;*G
zxpADS$X$rsT3+dy2u~(Tn~Bn9qO^$4Oq5q9$}1D)m5K7oO36cRmHCOusj8;S9<|k-
zn5wC<<mTal6Vs<@PAxQJoaW5eoRc-DmMjA$Qww)ucG5KDR_U0OrnVu>JsG*DAh#Aj
zTkB~g_hjUrg4|P)yAZk6?k+z$S?fuer%>~xBo$x}`jV~+P4>({9*iwr&7XO)@t16j
zP3!pblQUBaum?rUN*RYe)38VF`|^|3Y?xc+KR-p&Jjt!fB0n{~0J(X1V7^+HC8vfx
z3&qaTnySgknduX;XA<`0V^0zGsAGu9$rCl_B+aRo_mpH!PYcr1a*;b9xu+tx7C$2;
zAGr&W8^<Zq+}S98Hj1B(;%g(9g3L@c{?z283CLZ5+^V#uCZ{FmV^0D0sL=|OQ?xJ4
zGZlHXUXyuHw6rAb$wHp=wDH)(3pKD%l}_-q<RmQ;^Au{H3~gk~JVlx(Gh6LlrzLA`
zz2sKepO%`U#YxW|hum76jCAByn-}Ir;h88r6NjrxIxSP}frE>Zlag|=XCn3#XrAP>
z$=HLwq^3{A9^IE|nkOwK7kegQ&t&XD2r@8j8A-Y?g~*eMD9Fgtd8S|wMv#SQ%hUu_
zl%{p|l6xX@PeE=ieumZq6=h~jL~boSD^ulgdUA?3oME1E*fSn^vhbHI^d(i33iFJ`
z9$hq)X=?H~?7`8pF}7^omwfC&*wWK;v1dH?;AojBnl_S_J>#(l<&%Y(%SsxDJ>#(l
zr_n~N(^Hajk$XIHt3@+CO=}({w-zTY9fhl01x{DlkeuT*CyFppb84hyt7HXE&(=_A
zBA$_)toP(&Pa*cG?dpsSZGb4b$0K(>au*`Es_3(llhiT4?8(C(wOO8}&EA<?&3aav
zHoBjcf#PIK8f|hNpv+d{s%DE+$Iv1zTkv3u5Aj5MF(V$pBKX@0{<g>;EVywT3(;N}
zk3jwigETlkzs<x?I`Q;vgfyW<JA1{$LjiXvKnRKx4!@D$<|lPR1RR`Z&RbiAI~?f1
zXTCW7{IpOggtb9=#u><f(-@4x@i&$x5YZ9<ZXO5UF^fYo2}XSIbhw<jd^oN+gpZ#9
z3l{j~z`|mXJR<>D8^9Nd(s9{ZAWjSl<L5Z}t5ku2FA(t;P1GkV)e~vaKz|6r6Hl~b
zMqCE*fIkHA^0>S#V&OLg=;Cq+ggm)E@HBApOa6@mcO>B9ddSm9gFhDV^L!$K_Bcdu
zEb?>QZBQPZ2F_Ou(d$JFiib|9sV^UKUSlACo^KGyh11Sy;W#)gV!8414@Yrue6IIA
zjPoCa<iUBud2+l#@GSu7<>efT@bElC3BMhth4C`s{wPEPFRO4Q=XOGTu5+Q_#^v2k
z3B!xDQkc@pT6pJT;q9}9H_jG5I<WAL%fh=13vci(y!W#3L79bjB^KVBTlnC|!bde0
zKH9eMzQ)3P3JV|mTKN3M!Uvz0S-$iXg!dFo;;c1Yoq04=fBgSRQmGV*td+9H5ZTQT
zAwm?g%_v*4O!h1@LyCkX+eenMt8CdB43ceRkFkwW3^T?$W-K#i{Ct1EbH0E4&bjBj
z&wbr{-tYH0_uPB#`}uyJ=fl}SOGodEX<TcBJk;U`R3Q4vw9QYq1rLWu@z>7#UKbQ=
zSPulU#dkHlnyS&74Q}Qtf-5y$MPD|U?s*nkv))q5zVc*CdCr@$XmtY`-#4q2kXsv9
z<D(|_0`~4Jw4nc)x0c-HNS`v?Nm5EAw4q*|Yi$1HK7&o?oKbk*!R?Ov(-aXNbwN?@
z*=V2YoU<ym>Jbi-!mgR@+gEwK<NsY<@Z<bNP^r=8yAF}m5_nsy;vL_~?*D}?5NDcd
z^m0eHXD#GvpX=!noy(Zj+s0~_F{J1Fq9<yag()PC*9Gh=)4HIPQKl~6;@-_0P(0=i
zlB4GLsm!U4;juLKs+Fh&yX=4{i*$gO67t=aZKOra@L7s$Xw8+di%S~sUO4=?evPle
zI-GNBkSKP>Gd3~-8Lbj<4JMe{$zd?BnbP?E>2$5nOzidSZkd?FSq`U%lLG7(HpaH6
zT5Ecd+)=htzv9`N6HjLT*!R1L90i}H*3^|xxH>r5>djlQwcj3!*R{PfN`;*n$vmgN
zdEx+lmJlpqIbWKkw^U(y6{6iYf8qdmqQ>F|X7|b>!S&OF8(V$$Q==WT@&=<j=DM@I
zpGcerqoV0jvttgYs5KPih1Y_KfQxQf>H>!zCs-#7yya@-{YyaQA@8H0w(M=?b5HF`
zPXH#a4aY|Y8}41c#sGg>zVu0kJL#mz!(4AI9pM)qbu|_LLOb>rFCN*SZsDD?`~<!6
z77}kbTWvsQ{>Vx)`S%Hl;JL?A?;H$Y(`Dy5N{peldUTWIPlLpYe@SF3-J84kHa3<^
z7g*h)wYeRAO@Vt+;_(o-MUqi^wt_ZqqSOXdi*gwK*Fgo7g0JR&qh0uRKO(Rb-gR+P
zFoSdZ68ql>mAO-+AKbc(WFy`Vday+-iJsYzHtsU&6$x>8o3%22s{3!vic#w^LpJ6`
z;pB@{$t%sxt)V)9>$sw`VxNU_U1Yvc;eWAwP|tzTDj1ah;r%?$rss9wO!^CD1#{tc
zmrj~|!i&v7!%Jqvvw54Z>ppJ(DujR)B6^<dk(#_`E>*NjJZj>6@sH5AvrNmAA9a_C
za2MgZ^H;*&JU|eW(%V@VaQcRQ&KZO4(9A8$g&SWsDErw)z<WIhe>HpXmgjY;JAsa>
ze$tR3Epf|6{hALqzJPa<RRF?6Ac*Axo%fU{$9q*qqwX;k2b6ndu$6^KXqd?TId-;U
z<bMv<o66E3FLdsE?acnaN`dp#;i)Uxx#aPewGp8jQkKoyDdUK;h=H3P3r-ZD^lbLW
zOAb03ZGbcYVd*CBHl(NjD(`g&>NhE*D&nf48Fo+kVDhnf#_a>j@o!H*r5Ks5Nbpah
zw;iv>J*A{*cModPzF344#w^WK{l6IxW{i(R>mt6XNHHHW0lj=Bj;AmWK4t(+2{)xP
zn*C!$lA#iWqPLig-6xu_)k^_>Wf3isQhLqjN+{Dd%@5PYZJW=18=F32zXf9DjOX}Y
zYT>iV8MkisPD_bMj36nMmPRyse3V6^+xr?n_|HtpzII6)c>djyFU{0k79;{nQd<)}
z@OL(kHR2CDfI_k&$TA*NWfa2oyGLReH_3|Dw1F@UB}}pET}ZBZF#;W7XPU-ST4)-6
zFqS3j0S0C(fEBVW%+*Sx2V5@pA@cm2@0Q|K4*ZK`JwC<`#F4EJA5e;=tpSYNDS+^A
z`=J`t)2@vpXZ=qJMCabq@Q15|Ek9-tvldnk{o91+Y|%x<Ju3$Wn2@D6{$a8iLr-9<
z>DM7O<5a8WlmCqyHJ5)~y@Yky*gSdQulcwt9pF?(Sq|$XU*=gk_=e91exqGH6cNV?
zgYoL1)<X@QGe)WYOMw&$fnqz$d2q$i!rHtwg24fc%fMt<S?jEX_0c|O0I+-tYOI`<
z-buqEP`#tkTWR_)+JB4vtxo8{ujx`NYS}oM!$n;7-#i{|U}=0tsl7#dU(cDkV7C3W
zBFC`72Z?FWvhNYQmJ08dZ)$&C>Rt3Z2+$!lnxU7zjGrMMhl~?+7S7xkp9o_o^w3^U
zjhkp}N2s@HBZj1^xM1K<0J~+X?4y6Tiuy9vCc^M-2Iyb8;{>Gxt3n~<BB|67ZGEsp
ziynw4(|(W%rui>5{0_DcU#kPK3O;Fc5E~{d!;5lo7W^@3Aek(j!Gq=d*xv3xZQngi
z$i0TQ+bG3ktjva&+}nFZ@PBFk*8hwHp{2_IjE(&9o|Ij4Q8^}lK+hYmT=|H&L|NKa
z&uImJU2lADeXjz_K_QqHX*>v`xJ_j*Y4O91ANlf>QT>2T^Byb-$g<7BdZ{DOmX_O?
zm5d>T)jNi9Nk;_e{H4WaxC}=WBd)5RrSdO<&QVs@HL9SFzK$0hEtF>m8vmOJj@{9C
zdOUPP^mpJrrp3lBVWmf;zy1%+QWDdaT%&StN*ika*^CywlP}ABATGn|+*z7k)4+wv
zFu9<JF)d7EP{1ka#)^qRM^IBlC~T2tzn+orXytttT#0l<HqRY@ptL$@zzSVY+t<L>
z!=_xyKFlh0`_0ToB)mkEU`l$)=phUdG+$TWQShvC9J3k7$`J2G<~f?!Ec&5B)idJl
zEg}(O;{Ud{Jz#bI-&%k|2{-F3dpKy87&kIq`dt!T`d|p52jh1vNP4?vOt1=p72VjR
zzv|vMgsBVWpW$+0|BGoV1v)BW9*JKtRQY-BNU_`45cc2mr4r!7W`2lxiDOSCPxQdl
zGp8k1^>IlAB7Qb7Mz)ba*1s$vK+FGxc>jE`5NObF`H_=&!Jj?)#sBI1{d30>ph0sv
zSiC^+@>d?q9xhtvt5D!mQqpR4YH$(ISV`=Wco8uG_UU<19`N)MUr~bAYBWEyisz(@
zNYp3br?`scWV$Y43<irFYsYd)uj|3sM#R|&ylnd%9s7on&%F!ge<n6YUOLG=;=*1i
zbEW&WUfX>oUQ1{c^@<|uUQ_a=yx3O~Y9B`5gq<LmG+ko<#Le!)ci!2%h&>@Sa`fyB
zTTkf6D5W<p&aP3yE9$2*Hj+ATv0O}aN_sn9|5EBe@NbE8x+oiUKK*H_=>Cd7!Bga&
zDZ{99s?V}%sfzW+??a(8|LW<Hppyv#B2kA)k;y!)@TbW<I_0U{I_G+Bb-cfH+^Ag`
z=ngo=<Vuh_<cMH$^rZfQXv|!j_&RcZVe{fr!0806$gw=zA=rOiamN32f~crfg7k>K
zv&!N{!EW`FhSx4cB`4}TJCdM^yxVGOHEf3RtWz$x5Yd)#?VYdtAiA-1Zo`JAWC@x;
z0_O0H{}V*#i!P+ID}hH5yv;Sj;7Y}cJO7G3VpZ1dQA6&FlQx|rmO@5i@tww;M>?k|
zTBmy|F1$(p5`C>b+5}e0)ojV7_`&aDf|m%}qBd8D@f5qF(9gsgkw-H+VQdMZB5ci~
zY|KZAF;Nk_0WQvSr%wxRPVrEtc>L#idhWwHCCuAp*M*Mt<6eOC?RUa<Gu(#uyC-1b
zY4CR&oD%YL`xC>DK=4z{{i_K{uNE(Z^mP`F+2gNVq?}#dcmzXcI0y|~)W1@S%iDZ{
z(+h^3!TK;9C!vKq;pzH`=FJnD!BJxVg*Ko!+VGPS3LxoqWhOf{TKLidJM~|<B79ee
zIJ<Z4D972=M`{7j9@fl}5aSaY{9Nk~a{UpaA-Y;laoM1eecs2JKStPT`jlZ;gUq@l
z?i4LUxO@7nKcQz3&4KRt?;NMgbE<>lE>-@|W5wM9p}Pk{3HcG{4<pX^d`F*x;ZD#S
zIWca^Tbw;uqWc0z0&4_txlsFiayB3;H*o}5&gB9occ>4beD5p<$_u9b?LjnM()3+A
z-h1|^%KYg~%6B>v97CM!@dBP??z1J_iEH1~58c7<=_bv;qqm%;23#f*WEJNrnp+1#
z#4a!0l>y9QB#@gqwD?GjM^`6;ZS|Ph(O&r&UZEy)Sk@l$=F&~Y8h1_!rOB1f(+Oj@
z`TZn9=bk3f%!=pAp;4ERjj6kHCYPB%&}CH9$&fVvhRRcgTH?~Q9(sPk@MKa4aa_vz
zHj%rSW#9QUB}h~d;J{;e4ttzB&3|C0CQbLmo$d)%m0^AfO8MCyq%62Hd*TpYm|7at
zVzn`u;~e25B}h6DOE}47JAPhr%!Z)GF@$|Sta`&5(}fcFiLhMF4{9$!OmZx6Em{D5
zbvgG1mt|U&@Q3{Z2!X|u``b9-ect1E$Zz|<$5bSJT*mG;Jj@cpXML<MT6?7p32bX7
zEXY_u$Y2-h90b+c|4k-nPyj7FE~b(mlCqCFFYO!CXqGFQK+G-7WF}_Eys=N>s=Fsw
z_Y=W>g4Oa$#NH!V`7KV`PB+y-^7jj2<<ot>ft)`ZWeWsLTuzCMcMSV}$ma5#N=yTP
zO8@)Ja9x&dh{Jd4yh1QvBJR~0E)gkqU@+sG+ETyoi+5~8zy*({X`id^5lL6e2Drj_
z-#>q-!1-eET|}6kpfV=oWQVUjr(PJm@B?m6NUu&<@Ek&Hr0dn~lV2DoDtE38^LKO-
zou#TWw|`0JWp-;^bxk{0qs{qGS!}AJNMvD@(@!Ju_ONkZM!)aP<2B9iyl0?KkI#*C
zWBkwcNwdLM-aL%qz`^p$>Gx2;N9NXBS|YBsY?OWu?a@eOcB03fniFOT5cl8Jr(K7j
z!?&Uz#zY~DYuTzebbD?OJy}R0%w|d(N~f^fa&=(OEcH`d)e_S0<*rHoyLNiuWTv7^
zicb{OaL?QLxZ637v4lu!PX4*6id(_aNmp}R-5|#N9I?e0SDyDFt{>a5M0vD>-UMD*
zNW=A+aDsg<LeKX3sP?_>7C*Orbv+;woF06|(wDO#s&ya|az%=txT>oH1F&gou?^k$
z1QkscZciFaumYdlcaK`@)kiDq?mkrGi^@>elT7Y#xjpLpUTLw$@{w;!<SEH<^#bCB
zft0rNI|4*E0-keAQE2)+Ah?!xZVSo1KP~?#Snd<<<1>1f@4aA&nZfMsN519dDUlhK
z*M?u`<&5I4xZ=geezRrPY{f-J_XkK?QhiR2V0^zju|>>YXMMYvetP3+pBnE^i_7*a
z@y*s}+%2sFN=rO9>YgkOjm%x`|Dwg$QoA4ZV~&60T%VaW$K#YIr5_X&)s(YboOdt(
zfAH~OzW)o%F4(FIyzBYh#29Sfu=)^tPq|6?n#C;%fa(37hg_6}>VA^_D|cD{vM0y7
z<5K5uk!w6+8RhQk3QmLWj_Du592@LU{N51r_wd}{=a;&BKfvSbXCY+vH_eHusKxpl
zLdc4--rHl_3uj*DbD0}kOlI4^eJHPDI`}2if3cJAacDwh9O=7hTE{PM>+_Pu-<BR?
zY&ZuKke(Z<HwP<tf6YmohcztQq}_U-DX~c0Ntw7YY+)JNZ(&UuuyspC7N0V=*>O?*
z`TDKQ7cu^#vi`Fw(*mwvGcUV-&onc$(Reml-;^~ye((S1k7xFc&0X!Qu&L_s?EhX@
z`ptw-UEV4FQ*xz-vPvoDr;2r3^nVkwgE<L5tus$4Nj(0d#nEkRdFL3hK3Q}ufN4zZ
zO8$_3e5*|Qm_U9`^MBCmHVz`o=7t;IgE=3^N+hyh23-7Ae(n0P=rlg@nL9f-z2tup
z<QBs=+jqkCwO_|bo^qFX&oyi{yVzm(edy#6X5?P&I1ChKid6_z@%qUWA4#g=uC2Vl
zO7xvp+{fHktIr1@dsro-cM)o?FdL-=c=-An8BW*v;H$%^Y!XvO)}k3^|1!9j+YLd8
z*YgL9==XVKe>7d?*$8Lgx-nyVeFt11H`IOy*4J}?@@^xpg<5NiC<d@*8V5tm-VoKw
zPWXaQA)-#>a`od~;k_fa%JYJLkYy{B>bQ^zP|z2qvlKTlix)C>yK`^81dpRmv+!!L
z2l!R^ODx^?*ujHv5F1fF_+Cw&rL`~;wA}K72G3Fgus=Bians7VGXgE|6re^3`7$SS
zNKQ3$rXL2(&*DPwJ`DaXM0fkE(e7q{h8b*lKyZM@)%pq=8kWz$QpQDe{j-~X*Gydm
zSFtYoZwv<-JzR?HTy!CQv2bTKs|CIda#&DK<AZ1>!v7v_mu-&&i#D&IZ#|pRZ5;4t
z#8!T09i=^e3Hv&{)`CzE5NGaJoA%;W%jKBT)S4{ldVm7VZAEy6D-EHm<~_4N1K9Hz
zPm4E)EzAtbP-zgvP+Z&i{gA<&2#mM~&>eP-RX%^$6k3Zu>W&4?ZO`36T_+5TLhV&8
zWmg!dnI(&ARqd_8@w7rINwaT&*)j}ccAx-M4&ryk*#xp?7%grNWT`!-9EG4ox{~LT
zbNk2`#T-Tp=YrO?$d7q~zJU3Id0~k47`%;P`ltmX7qz$`6HlUCW%A_?fZ{3zr=?!4
zB<E0fAgCrVjCcX%23f;$DfAaX6*k3a6q+)TTnbUWyWJs(x)DLQv*M@M|LVT`c+UZZ
z4(y))gs~v*D#$z*_+6gw@YFAYDZt!Uv*|#N`haRq^yOD+gaavKp%SEcXg{Z3`|8ly
zJ4j#y@YzxPkW_`5g-F{=e5(R=u~W~22l6KhxEr3FI<7}njxQAvYdDld{T*+{wvQay
zp?n;q&MdOic^^mr(xbM3RuZRzH{+=sJ%+xQV5=j=on1W8JYmE%YriOfZU==kQa%8^
z_=q!O;*x8ELZ<C3LV|qDo}(^E<5fSgkZ^^rKc6Mec|~fhF=icpgOz!inl)}QA3*)4
zK_%497lL2C9j`*NcKW#%;d$a_hX-xYkJLg4=Xwh*Zu}o{w?*~xT3d-cPLo#LD4rXV
z8Prw2Tbd+dZ64AW(e44uSFJOeZ3GpP=Mguz59EhOhhs~1#O8OUP#Sw^7rEQXfxVAx
zen3XB3UD2bDUI05u;mKq7`$e5!boiY-_P+blGaCKnU!9mhZ>kVTg}NcDj%m`(0d3M
za0>9ZqwJf7jEMTbJ5vC`w)=Y!-S+GuuXk%XhDGNL8cmAr16&kvoD`f=xQ|}D-O8A_
zPx^yiR^Z6Q;ysXh9qKKZ`l|iodIl{YzX$LPVWmkBLoBKUA2a>5VTit-ceYQ&in6K%
z{R7q#*r|hurZzC@cKjq4`w_MVs7_RC2UdrYpFG%rg5jG#a8;vF4A@hxW={##9~>$*
zOUNPhJ`~N?DMx~K4?`GSnj{{piuwH+IQ{lk0C!pCaDB0bl2VW3eGp=^(qXSvBNLwA
zmjahf8~X?48LvO*B&F3xlQ8_01C+xaokQj8a1&jf!n>wzr`&T))ClsKEtQL;z;Jw)
z<Zhb`veIROoL?EZ2#HIp)RbfXBlJJR^5hgWgtTfyQpwv9bgSECrympN)g}LnuP761
zCnHm!W(DHwC}@8rLzdWLMZUY;PomBix2z9mJKP6V|49EQj0%=~yk=T~WNOS#E@HGl
z4x^BL2x=R{#+>ypM~!Il@NmbxVABc9$k_pfeyOZtsjpTY3w;iGx#ez1qqTy~`I(gu
zgl>V65YrM<d1GnVX2YsmQUyE?-WVw}HyFrG+zYdbXuDwFj$-X0z+|SpQ}>tVkr!mI
zVJAg6kr7WE=B@Y8Ve&f3W2;(7d*dzg3{PUCEKZmoNSe%T(FBFtsW4Y@G^>xaNv|b(
z@EM#8c2j@y-6d>$GHS=4%rSy457+pYuiOrb1zyhx;G#PnM&i@8H9u578}ytU3{1Ka
zLS8;2Ub1paM;~-ZV9hDW+zK41-(!siEkP$UF5pu2^{t2~x8i`ImMsMo{T(c;0D#on
zJlOwJOW&k__H`u!!g77fg%0+=_}M;I=!JsO=<oAsB@JF)pjO>gnUCKoQ&)y%N5JHb
zU{5Qv-$i8Bce^8^!*<}_y^<~QG3b9n`28zRM~?eg{j*}Fa+&B|k{MR%?s_N)w$9A8
zia%@#8tAhO*jdJo>R6GPqf<OYxrg2kE~I0Vm(Jhc|F9%1rv;7Sdk+#;Jogd*q762?
z`cKss@yw~&z5o3MQ<oPyb)^mv;t^3}7<T+-&gy~?=~>gR<QY9I?oJTl2K`nUoaEUm
zHXeSW<-up;Mj&o`G^Q+joQQ2u0!Zele1uyaV6bUb<;7*0n%XUs^zMW_<@tjJZF=<+
zGc-3GS=n+>CI>mvN`Wgh#%_M2w}MT4m=UL8gcvpAbVz-nxN#c}<aL5H#Jp=+A6-6~
zL~5q-0I3=<m4&)leCwe(J?iZ(xtkeq*hGt($ok$zTqr5cXvjbNo?PKm_CF(PMjQQ4
z>z=8?7#j*z!T5{LPDNW5Ah@rOu;vdQCdFrR<WR3Y!A+E+F*y*1T83SGd>@bhsW~G{
z?i%Gr$UHn#JD*1l4-8Mrlv9{}om%;94Id%b2C?b2AkD0ocWyvo&eOw4?S>T@`XG0?
zo})R|q@vd;%e)jSv;Lk|t%e>xi{96($5&;Tw83<*R*3ID-Pj$At}`6_2VztflaX`H
zO<i<gpqB#qYsnamHzWj#H^V>czDeV8L`SsNnEB?}jwIQIJOUes$gO{&wOv4Hi_$u>
zCPN7DThZ_vjY-&1i=t7hxvq6$gTuPjuyV4mh_&dC;<|R4pH0FzC?Aj3A~fLR?-7&y
z+Z&KTw#BqPz?3viWq4$8?=WKfXA-EGyadlnhu4LnpHQM-!y151jC3CJtNVAIwhui&
z-wYYuv<sQ^6UGHHAARek8?8&JDg`~=z6v83#gChahb2|I;07E$9d{O%&2_?ACfFa(
zX5VF%@3moVR{eM$pQRycwSAk@t(tQJ$2O<8@|HDGcB_Y#xr!MIE?u9_GhQUm`8MV|
zECn+RQRHIp$y;Nz&w+fqAX9`ZZs1U3((JMa4(>g&5}4w6RQi3<0*pjBwRv_YW8A12
zpVz$L58HA!W}l{qyemGB+v}54^T|{}Za73xY9UUV*&F2g{YCc6ps}{Sz|EQ3MQ7CB
zx#bt8d%1AE9!3(vVy`t|D~2TQQyQMzwA?msl`*2T$4c8<nzTo)j0s->DJgy4(4ZwF
z%wbDV`Z&M^%FtolqK}vEeBQKMF93~Z0Gk=FE-*>o<ba}D@;R))iP)6h--3n+3^90h
z!?tA{d9UiTqV|K3qdM|R4i|O|m&JnXco}Hom`|qB6|uIXxTLzP)wG8Ez`dOK`G;De
zY&aOB{5&3ZD2Jo{o&!w=IvIwf<-#U^`dN%R1@J--U6+kf<fSqm+EWm6<K<3=Mp^7|
zLCd{f*cQuZbIhreqY=vwHVff26xpEAu2OpyWY`;G86)DeN0}_lhs)I1?`dr%M~4@f
ze}7vbLN$xqRlqW2<`T~8_0YojM;)z9_|agkWeNza-wwfUe~G;Uzt@iNSSSbKO$zWx
zt%Ep@Zh`SlJE|?90Kd|Hu<q3MoH*3GiD1zIABaP44k?vRNTU#N;Pe63251ywqZkF$
z54i%Gmti7bqLjC*YUC)uVl5WEyT)x~BCt297j0ZVZxO=ZTDO_4p=P|8VJm*<{S1n7
z*~)<`Xa+wL0yXkj>{oo+I2|4k_HzO-D#|Nxh8aGx-hRbtVlnAKAOnUU{w@~4g&V2~
z4_3$JQpi6krc&eUOr!3h@HS<f*!HSjwFRJL_QH|iUW~*CU&dVm3A>3iXoUSu^|#k=
z#NG7JZInO8HfJJP4;!6cXz-%ixp407S~9Ma1A$o;O|H9#GJ#}>ZK}?M)H`2XZL+q;
zA`hxQ_x@p`uT$9Vt<C>L1HH_sGl!%_(qwg2MGIs#wjnsDh1F=wOPYK?!__O+oCy!9
zgD*`jjT1w}#b56)<(ZN~0En%vVUOGSc%5lqYOb)+*rqRwRFs7f(r>Q~g001*byHj3
z<ky2vwi>kyw?7}CF<F1$El3(|H<61J^6VSlQO8@2x!XH_r5L?^g1L)UZt!J2lVGZ@
z*W_?)rX9k39pLkO>l1h=8hZ5aUt2ykYk05qPx$Xk-*hn9b6(p*^g%u^h1F&kO{V*D
z_8XyZ5T^U#n~hER4}}mYu<Wb$iYEQ8+{S<v3G0LgG7~Zb>hW7`9O?Kyc3cguK`-ht
zIM3nfR<>v007`W8l%u}}?gkdA6F9KGH^0C1IWcg+Iy^>udoCr=l(3)m@tnPBV=CN_
zSlu5LIFRiSBL*}TLOh&ppd+&eNa+tgK1*u#SHs<TZ=HQ?1)e1p^Q8wDWT}NDE9&T}
zf%fJ_83E$OSzdtbknrWrz<_ylxVdIss_&3cvY*vI&%l6HrQX3wabigh)@{Aj_a?5c
zl@;aFaCHXy;=n~e%x$E~cb#{3#W;62yeq^dz3w0x9<&?o$BpbCh1=1zwxpSq73=jf
z8x!V;a<k|#yh@aL=O7#BV{>^FU{}9!C(HNHmF$&I*z;;1PTUBk4-EzuY9HyO`R=cI
z(Xp3{1G~;*J3(-*duq3b#S~?>gx2@ALn@htN3($(a_Ef8JYj{fh~20?t<m$vdYc~y
zWQv4)2Gd^t;zeDrna$4@02YxU-;!16qA1q1egM>1%NPq8C%l-{8rvV<ov$tv&^E^+
zL)|E<p1JktNK5|v*u5c%f%G{&?4I)n<xiDKa-XF&O|Y<FqlJ+{fkzKKmDT3t6A_b$
z?Fp_Hu_CXl;kBp6>$$IXS+#4t9&OxCGTrfN20vIwJ0_v?yn-F2JrBDA&9J7B?9t^5
zs1l#7q4c68lE5Pny6-||6ZF?6HNb8&hgu5(#<x*+P3Y66<@r;FAP0KS{<8ph3DORn
z^`~lCnNAwWSTlLOYe^?bG#Y=VC&N(-(Z1Zk7e=6<?63z|omyfk(>JW*&Xzvi^UtTY
z9l0DTpPhwx9`eVU_}4<@AM`ACchQ~RfB3-G=sq<MX~44&+pW)}F5>Mt8mzu#G^W|-
z&614Hdy3-_ulSk7vy)q5h2k4@^?lS%ZizUWaCRC<wW0q#LDt8sOOEjjfX=lae(@s3
zmd+SL=pNe4g=i;>bAo?z;T?a%0wDCLa&fhBp1d|Eo$$dVXwPs_nHy8OaC0-<;d?b?
zY)=m`-(IkX&J-ve+XF<Pq!&g#sXCL_mQ+Xfd{OHthQjMOG=q3wJn_HjbTrv9Ia_1N
z6}7r@DFh1iy6u$Kve=2%xkLOOwc@+hwq}Nsz5ZgU)Z*?)2+R4yE+5RO!0`KZYL0U6
zWIbLn4Xx}C{yp1JNfniD3<*n7k^{jbCr$sPwT`Dx)-La+i`B76n&0BGP00+AOzLkj
zt%{^3>w?=~#u#EvE6EP?MX71{K6C0i8R_(cfeaze3bgu&=7gVxwNT=}3THT(;l)G%
zmXp!wZn~X+WaXR=?I+OT%2*m?a1fHcCF|6W!m>c4YCK6{IcVw)Sjye_uH`Yce8>5w
z!{*wLlgRt{uZ8;D`WuB+156Tw8q^Gim_X9Pki+4*`Vh_oG$mZsi=8N!4?}!eyqzb9
zQo8@O0H9Rw5H10;9Qz1J87>BhkoVl2mQ4c>ocvD(9<*Y$euHb{fm~Fe)+#YnAUohH
z%<#JzWR6!4$@+o9Vg`;1EFRPo4`><VrB6dLM`D38<X#9ht_2F)!eHZZaPgD@2L^13
zImY4v!USLjmF+sDuErm4oF0m&Gun8F`bkKDb+HfCJagEVltWR!o;+V&|KrASToln~
z1%wnwOiNHv0+lS;((h_c4JGP(hYJ$a8_i;sC8mV1y<2UMDj}fpmjrCsH2uq)5kI%p
z%^=SEv%5N1U$n{j_KL3Dwtv<ZzH*-rn=0^BobEbLp9es^U`+uXO`nv-8-ehJ2ew0#
zm6Izc=D&Wp2C~b5xR*JoJZq%A+rHI!yK2o1_IJ?2KN^9lN55uy1eE&>)uU^aUmD@}
zc`B-aLM|H0%5(G{Lg^CB?^SFNApwWjWw3<E7$bn+TgYQiSh(e;Lib8Y-6S$-;Hxth
zJ87LX7qmiIH!a^>4_&BIE=j1VtPE3O?#yn~RHD`WaW%zBPeD!Fabrf3jn(k2el-d7
z`*Zd1AGD>abmk=w=2zDU2Eq{U&7{`rbe3IFsDYzs)?PC*whkb~-#VskVnQuE88MOZ
zxC+>hika6W<$vm$Jiiy?h}y4aO#+U}+ZM7UkPhvuxjQqdceTnP6+%tePzNotd(ZB8
z{Ur2bxv2Jbe@i(Y9EMo`{a<7GY>yeE#i;Wfd3XO6!xV&jj>~;DyOy%y?S=FEtYYBV
zIEzx=>}MreC;VWkv&tkq%EKauyW&_n2QQvpPn3Q&V;V4kp_Cnbd)tsgc2vzML}4tg
zZPv?EbGBq6VhRiDrs|7g+HZ>6JaQuIUb<m7BP;NcSG#0xc>&)=5#8~&F6L`KU)!f`
z<m30w1?*QDCSAZ6egqRjTtL6JimNE-*(fisTzGJkzk4F6^7BiY4nsQK;54^ek;$KJ
zY4aYgElhKA*HinHiN*I;Sp(@$m+KxpB?w-+eG2q;`-&xa%QcIt#(Qx8{_gWz;c<_F
z&K=-WZUC!~l9>2(EZ+gae7B(JozEA%!ICY2aDGiIj&fqryP#EbDypw4#w1X>%D?8k
zpF65vIT<3DW|lUun`*0QAwl8Jr*Nll8-&7A+Z_8<(bLNS<t~h>(=<6p_&vpc!L^_@
zF!YwCsu4sIW15%%aCcA5>~M>{K2<`Bc;|6&-^~3)dRrI<{4C;=bnLw;cFu-xOi4?y
zLaMvw+GX=lj$9uhynbrN!|5JP<!L%wGUR%iT8@k*nDck)wb(vD7zV&qlW}%BWcRG3
z@;c+3CHO%^a-Z-dMXvC7tK94w#3a?sM{@SB0bIvg)gtVYCD_bWxW~IN)2E?UpR3*g
zUi}#EnJSv?3@EROzO_cU-!M9;+T?MGU~SznI;4u4);rz6ejb3rNV-qwC!5@8XUVod
z#DGC=T>Ae3e{gVmKhAE;?o*{^W;(c=*XK%CfoibVGUFib=|Pt*fB&h$3Qy~??bgVK
zGn1LygMK|)$~=SMMAw7bXvssxbIP-rhYWYV?my)>y$(2s1>PTiC^@98*%yf1a~38*
zE_EN?nN>&XFX^A44E*O^$Py@MH4X<_g45mHI#oxeP1DwXoJql&8icm(@DSFbTCcX}
z^i_Fds@|A{^@0ao0lAo}j+!nh=taU7o7PgtDhFtve!&u4?z-${*IX?i&2@?J&dMgZ
zotN<HVp+)AkDo0D@F;8e_UF;x;0wp@vkcj3J&^}2ifPK)#FV}P@Wu6ZDh8b4hR?Kf
z+10$IRo8JL;79bSwOf-<3tEFQ7kqyFKTG+p|M$b^lHoLeVN0;P8!fd>Q!`l_Fs^VN
z6P3(i0Ox!RJf4g~IL+#(T-I-#UaCI-8r!hI@z_t`2WkBY6Y{}%A0`G04dC~!;dj&P
zFKHM6`>wa^VrD)u4S=7nw{KuBVAdoka^;j~AA<}+`%^^=oLg^RB;2xSV02%2>3A<S
z!?HN|9=twn;(G9FvRZ+N`7>KSyV*{#Ix|ihkWoWnyqAtmK8}+NfB&3!|9U~|ANnmz
zuqe<tRlqvs{o%jTcl|M<yYw%8T<S|1dc~-l?fj^BeL?cZqwG^Z<>XV0PcKbEt{Qv|
zAqRIkiLn-#hD`%!cs`F8zRIX%-{^@r!;^b-g2$xGNr!Nl?I;S1XlkY<k4+kxIEc&m
zz>4br_k>Vw|9>m|&Bs@6RV1a}upmnmAGa?z{`c<G*G(g(K+At!62(WL+|!og5!>5(
zK<f|Hze;;cSqNjxv=2$PzFGTUbyI37y^*SMFMK&>M4$n}hEz*+cFCWfyX3A!g_89A
zU%m0aD~^L5r^7s}Xk%iYZFQyU4uJLRdq<RF9p2HoW4lu;BkmCE7+d{-d|en2>%D(T
zsF|_1eL8WkOCv&mdX%lDk6p2EVKe=w*ETB3;F@iBbAwgo&z<jO&02LB@QVM8Y=qdz
zmH>(49Odk*;Q{eO&5D0bx8s{#;;zj`7LT>Z#KaVy@E!7d@8x^nPX<t++rRSQtL%ek
gX{H}CrmnQS%%?x~=RI?zwTmctv|pEz&B6A60L6~UR{#J2

literal 0
HcmV?d00001

diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/.Rhistory b/mcsm_analysis/pyrazinamide/scripts/plotting/.Rhistory
new file mode 100644
index 0000000..e69de29
diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/OR_PS_Ligand_combined_plot.R b/mcsm_analysis/pyrazinamide/scripts/plotting/OR_PS_Ligand_combined_plot.R
new file mode 100644
index 0000000..015d45f
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/OR_PS_Ligand_combined_plot.R
@@ -0,0 +1,250 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+#source("barplot_colour_function.R")
+require(cowplot)
+
+########################################################################
+#		 Read file: call script for combining df for PS			   	   #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for OR and stability plots
+# you need merged_df3_comp
+# since these are matched 
+# to allow pairwise corr
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3_comp
+#my_df = merged_df3
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# sanity check
+# Ensure correct data type in columns to plot: need to be factor
+is.numeric(my_df$OR)
+#[1] TRUE
+
+#<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+# FOR PS Plots
+#<<<<<<<<<<<<<<<<<<<
+
+PS_df  = my_df
+
+rm(my_df)
+#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
+
+########################################################################
+#		 Read file: call script for combining df for lig		   	   #
+########################################################################
+
+getwd()
+
+source("combining_two_df_lig.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for OR and stability plots
+# you need merged_df3_comp
+# since these are matched 
+# to allow pairwise corr
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df2  = merged_df3_comp
+#my_df2 = merged_df3
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df2)
+str(my_df2)
+
+# sanity check
+# Ensure correct data type in columns to plot: need to be factor
+is.numeric(my_df2$OR)
+#[1] TRUE
+
+# sanity check: should be <10
+if (max(my_df2$Dis_lig_Ang) < 10){
+  print ("Sanity check passed: lig data is <10Ang")
+}else{
+  print ("Error: data should be filtered to be within 10Ang")
+}
+
+#<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+# FOR Lig Plots
+#<<<<<<<<<<<<<<<<
+
+Lig_df  = my_df2
+
+rm(my_df2)
+
+#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
+
+#############
+# Plots: Bubble plot
+# x = Position, Y = stability
+# size of dots = OR
+# col: stability
+#############
+
+#=================
+# generate plot 1: DUET vs OR by position as geom_points
+#=================  
+
+my_ats = 20 # axis text size
+my_als = 22 # axis label size
+
+# Spelling Correction: made redundant as already corrected at the source
+
+#PS_df$DUET_outcome[PS_df$DUET_outcome=='Stabilizing'] <- 'Stabilising'
+#PS_df$DUET_outcome[PS_df$DUET_outcome=='Destabilizing'] <- 'Destabilising'
+
+table(PS_df$DUET_outcome) ; sum(table(PS_df$DUET_outcome))
+
+g = ggplot(PS_df, aes(x = factor(Position)
+                   , y = ratioDUET))
+
+p1 = g + 
+  geom_point(aes(col = DUET_outcome
+                 , size = OR)) +
+  theme(axis.text.x = element_text(size = my_ats
+                                   , angle = 90
+                                   , hjust = 1
+                                   , vjust = 0.4)
+        , axis.text.y = element_text(size = my_ats
+                                     , angle = 0
+                                     , hjust = 1
+                                     , vjust = 0)
+        , axis.title.x = element_text(size = my_als)
+        , axis.title.y = element_text(size = my_als) 
+        , legend.text = element_text(size = my_als)
+        , legend.title = element_text(size = my_als) ) +
+  #, legend.key.size = unit(1, "cm")) +
+  labs(title = ""
+       , x = "Position"
+       , y = "DUET(PS)"
+       , size = "Odds Ratio"
+       , colour = "DUET Outcome") +
+  guides(colour = guide_legend(override.aes = list(size=4))) 
+
+p1 
+
+#=================
+# generate plot 2: Lig vs OR by position as geom_points
+#=================  
+my_ats = 20 # axis text size
+my_als = 22 # axis label size
+
+# Spelling Correction: made redundant as already corrected at the source
+
+#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Stabilizing'] <- 'Stabilising'
+#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Destabilizing'] <- 'Destabilising'
+
+table(Lig_df$Lig_outcome)
+
+g = ggplot(Lig_df, aes(x = factor(Position)
+                   , y = ratioPredAff))
+
+p2 = g + 
+  geom_point(aes(col = Lig_outcome
+                   , size = OR))+
+  theme(axis.text.x = element_text(size = my_ats
+                                   , angle = 90
+                                   , hjust = 1
+                                   , vjust = 0.4)
+        , axis.text.y = element_text(size = my_ats
+                                     , angle = 0
+                                     , hjust = 1
+                                     , vjust = 0)
+        , axis.title.x = element_text(size = my_als)
+        , axis.title.y = element_text(size = my_als) 
+        , legend.text = element_text(size = my_als)
+        , legend.title = element_text(size = my_als) ) +
+  #, legend.key.size = unit(1, "cm")) +
+  labs(title = ""
+       , x = "Position"
+       , y = "Ligand Affinity"
+       , size = "Odds Ratio"
+       , colour = "Ligand Outcome"
+       ) +
+  guides(colour = guide_legend(override.aes = list(size=4))) 
+
+p2
+
+#======================
+#combine using cowplot
+#======================
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots"
+getwd()
+
+svg('PS_Lig_OR_combined.svg', width = 32, height = 12) #inches
+#png('PS_Lig_OR_combined.png', width = 2800, height = 1080) #300dpi
+theme_set(theme_gray()) # to preserve default theme
+
+printFile = cowplot::plot_grid(plot_grid(p1, p2
+                             , ncol = 1
+                             , align = 'v'
+                             , labels = c("A", "B")
+                             , label_size = my_als+5))
+print(printFile)
+dev.off()
+
diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R
new file mode 100644
index 0000000..30b9981
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R
@@ -0,0 +1,154 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+
+########################################################################
+#		 Read file: call script for combining df for lig		   	   #
+########################################################################
+
+source("../combining_two_df_lig.R") 
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for Lig plots
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+my_df  = merged_df3
+#my_df = merged_df3_comp
+#%%%%%%%%%%%%%%%%%%%%%%%%
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+#############################
+# Extra sanity check:
+# for mcsm_lig ONLY
+# Dis_lig_Ang should be <10
+#############################
+
+if (max(my_df$Dis_lig_Ang) < 10){
+  print ("Sanity check passed: lig data is <10Ang")
+}else{
+  print ("Error: data should be filtered to be within 10Ang")
+}
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Plot: Barplot with scores (unordered)
+# corresponds to Lig_outcome
+# Stacked Barplot with colours: Lig_outcome @ position coloured by 
+# Lig_outcome. This is a barplot where each bar corresponds 
+# to a SNP and is coloured by its corresponding Lig_outcome.
+#============================
+
+#===================
+# Data for plots
+#===================
+
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+df  = my_df 
+#%%%%%%%%%%%%%%%%%%%%%%%%
+
+rm(my_df)
+
+# sanity checks
+upos = unique(my_df$Position)
+
+# should be a factor
+is.factor(df$Lig_outcome)
+#TRUE
+
+table(df$Lig_outcome)
+
+# should be -1 and 1: may not be in this case because you have filtered the data
+# FIXME: normalisation before or after filtering?
+min(df$ratioPredAff) #
+max(df$ratioPredAff) #
+
+# sanity checks
+tapply(df$ratioPredAff, df$Lig_outcome, min)
+tapply(df$ratioPredAff, df$Lig_outcome, max)
+
+#******************
+# generate plot
+#******************
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+my_title = "Ligand affinity"
+
+# axis label size
+my_xaxls = 13
+my_yaxls = 15
+
+# axes text size
+my_xaxts = 15
+my_yaxts = 15
+
+# no ordering of x-axis
+g = ggplot(df, aes(factor(Position, ordered = T)))
+g + 
+  geom_bar(aes(fill = Lig_outcome), colour = "grey") +
+  theme( axis.text.x = element_text(size = my_xaxls
+                                    , angle = 90
+                                    , hjust = 1
+                                    , vjust = 0.4)
+         , axis.text.y = element_text(size = my_yaxls 
+                                      , angle = 0
+                                      , hjust = 1
+                                      , vjust = 0)
+         , axis.title.x = element_text(size = my_xaxts)
+         , axis.title.y = element_text(size = my_yaxts ) ) +
+  labs(title = my_title
+       , x = "Position"
+       , y = "Frequency")
+
+# for sanity and good practice
+rm(df)
+#======================= end of plot
+# axis colours labels
+# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
+# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R
new file mode 100644
index 0000000..169bdaf
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R
@@ -0,0 +1,149 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 		Installing and loading required packages and functions		   #	
+########################################################################
+
+source("../Header_TT.R")
+
+########################################################################
+#		 Read file: call script for combining df for PS			   	   #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#==========================
+
+###########################
+# Data for DUET plots
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3 
+#my_df  = merged_df3_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+# sanity check
+is.factor(my_df$DUET_outcome)
+my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
+is.factor(my_df$DUET_outcome)
+#[1] TRUE
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Plot 2: Barplot with scores (unordered)
+# corresponds to DUET_outcome
+# Stacked Barplot with colours: DUET_outcome @ position coloured by 
+# DUET outcome. This is a barplot where each bar corresponds 
+# to a SNP and is coloured by its corresponding DUET_outcome
+#============================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+df  = my_df 
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(my_df)
+
+# sanity checks
+upos = unique(df$Position)
+
+# should be a factor
+is.factor(my_df$DUET_outcome)
+#[1] TRUE
+
+table(my_df$DUET_outcome)
+
+# should be -1 and 1
+min(df$ratioDUET)
+max(df$ratioDUET)
+
+tapply(df$ratioDUET, df$DUET_outcome, min)
+tapply(df$ratioDUET, df$DUET_outcome, max)
+
+#******************
+# generate plot
+#******************
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+my_title = "Protein stability (DUET)"
+
+# axis label size
+my_xaxls = 13
+my_yaxls = 15
+
+# axes text size
+my_xaxts = 15
+my_yaxts = 15
+
+# no ordering of x-axis
+g = ggplot(df, aes(factor(Position, ordered = T)))
+g + 
+  geom_bar(aes(fill = DUET_outcome), colour = "grey") +
+  
+  theme( axis.text.x = element_text(size = my_xaxls
+                                    , angle = 90
+                                    , hjust = 1
+                                    , vjust = 0.4)
+         , axis.text.y = element_text(size = my_yaxls 
+                                      , angle = 0
+                                      , hjust = 1
+                                      , vjust = 0)
+         , axis.title.x = element_text(size = my_xaxts)
+         , axis.title.y = element_text(size = my_yaxts ) ) +
+  labs(title = my_title
+       , x = "Position"
+       , y = "Frequency")
+
+# for sanity and good practice
+rm(df)
+#======================= end of plot
+# axis colours labels
+# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
+# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R
new file mode 100644
index 0000000..a5d9361
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R
@@ -0,0 +1,202 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") 
+getwd()
+
+########################################################################
+# 		Installing and loading required packages and functions		   #	
+########################################################################
+
+source("../Header_TT.R")
+source("../barplot_colour_function.R")
+
+########################################################################
+#		 Read file: call script for combining df for lig		   	   #
+########################################################################
+
+source("../combining_two_df_lig.R") 
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for Lig plots
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3
+#my_df = merged_df3_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+# sanity check
+is.factor(my_df$Lig_outcome)
+my_df$Lig_outcome = as.factor(my_df$Ligoutcome)
+is.factor(my_df$Lig_outcome)
+#[1] TRUE
+
+#############################
+# Extra sanity check:
+# for mcsm_lig ONLY
+# Dis_lig_Ang should be <10
+#############################
+
+if (max(my_df$Dis_lig_Ang) < 10){
+  print ("Sanity check passed: lig data is <10Ang")
+}else{
+  print ("Error: data should be filtered to be within 10Ang")
+}
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Plot: Barplot with scores (unordered)
+# corresponds to Lig_outcome
+# Stacked Barplot with colours: Lig_outcome @ position coloured by 
+# stability scores. This is a barplot where each bar corresponds 
+# to a SNP and is coloured by its corresponding Lig stability value.
+# Normalised values (range between -1 and 1 ) to aid visualisation
+# NOTE: since barplot plots discrete values, colour = score, so number of
+# colours will be equal to the no. of unique normalised scores 
+# rather than a continuous scale
+# will require generating the colour scale separately.
+#============================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+df  = my_df 
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(my_df)
+
+# sanity checks
+table(df$Lig_outcome)
+
+# should be -1 and 1: may not be in this case because you have filtered the data
+# FIXME: normalisation before or after filtering?
+min(df$ratioPredAff) #
+max(df$ratioPredAff) #
+
+# sanity checks
+# very important!!!!
+tapply(df$ratioPredAff, df$Lig_outcome, min)
+
+tapply(df$ratioPredAff, df$Lig_outcome, max)
+
+
+#******************
+# generate plot
+#******************
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+# My colour FUNCTION: based on group and subgroup
+# in my case;
+# df = df
+# group = Lig_outcome
+# subgroup = normalised score i.e ratioPredAff
+
+# Prepare data: round off ratioLig scores
+# round off to 3 significant digits:
+# 165 if no rounding is performed: used to generate the originalgraph
+# 156 if rounded to 3 places
+# FIXME: check if reducing precision creates any ML prob
+
+# check unique values in normalised data
+u = unique(df$ratioPredAff) 
+
+# <<<<< -------------------------------------------
+# Run this section if rounding is to be used
+# specify number for rounding
+n = 3 
+df$ratioLigR = round(df$ratioPredAff, n) 
+u = unique(df$ratioLigR) # 156
+# create an extra column called group which contains the "gp name and score" 
+# so colours can be generated for each unique values in this column
+my_grp = df$ratioLigR
+df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "")
+
+# else 
+# uncomment the below if rounding is not required
+
+#my_grp = df$ratioLig
+#df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "")
+
+# <<<<< -----------------------------------------------
+
+# Call the function to create the palette based on the group defined above
+colours <- ColourPalleteMulti(df, "Lig_outcome", "my_grp")
+my_title = "Ligand affinity"
+
+# axis label size
+my_xaxls = 13
+my_yaxls = 15
+
+# axes text size
+my_xaxts = 15
+my_yaxts = 15
+
+# no ordering of x-axis
+g = ggplot(df, aes(factor(Position, ordered = T)))
+g + 
+  geom_bar(aes(fill = group), colour = "grey") +
+  scale_fill_manual( values = colours
+                     , guide = 'none') +
+  theme( axis.text.x = element_text(size = my_xaxls
+                                    , angle = 90
+                                    , hjust = 1
+                                    , vjust = 0.4)
+         , axis.text.y = element_text(size = my_yaxls 
+                                      , angle = 0
+                                      , hjust = 1
+                                      , vjust = 0)
+         , axis.title.x = element_text(size = my_xaxts)
+         , axis.title.y = element_text(size = my_yaxts ) ) +
+  labs(title = my_title
+       , x = "Position"
+       , y = "Frequency")
+
+# for sanity and good practice
+rm(df)
+#======================= end of plot
+# axis colours labels
+# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
+# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R
new file mode 100644
index 0000000..8828e90
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R
@@ -0,0 +1,192 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 		Installing and loading required packages and functions		   #	
+########################################################################
+
+source("../Header_TT.R")
+source("../barplot_colour_function.R")
+
+########################################################################
+#		 Read file: call script for combining df for PS			   	   #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for DUET plots
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3 
+#my_df  = merged_df3_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+# sanity check
+is.factor(my_df$DUET_outcome)
+my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
+is.factor(my_df$DUET_outcome)
+#[1] TRUE
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Barplot with scores (unordered)
+# corresponds to DUET_outcome
+# Stacked Barplot with colours: DUET_outcome @ position coloured by 
+# stability scores. This is a barplot where each bar corresponds 
+# to a SNP and is coloured by its corresponding DUET stability value.
+# Normalised values (range between -1 and 1 ) to aid visualisation
+# NOTE: since barplot plots discrete values, colour = score, so number of
+# colours will be equal to the no. of unique normalised scores 
+# rather than a continuous scale
+# will require generating the colour scale separately.
+#============================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+df  = my_df 
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(my_df)
+
+# sanity checks
+upos = unique(df$Position)
+
+# should be a factor
+is.factor(my_df$DUET_outcome)
+#[1] TRUE
+
+table(df$DUET_outcome)
+
+# should be -1 and 1
+min(df$ratioDUET)
+max(df$ratioDUET)
+
+tapply(df$ratioDUET, df$DUET_outcome, min)
+tapply(df$ratioDUET, df$DUET_outcome, max)
+
+#******************
+# generate plot
+#******************
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+# My colour FUNCTION: based on group and subgroup
+# in my case;
+# df = df
+# group = DUET_outcome
+# subgroup = normalised score i.e ratioDUET
+
+# Prepare data: round off ratioDUET scores
+# round off to 3 significant digits:
+# 323 if no rounding is performed: used to generate the original graph
+# 287 if rounded to 3 places
+# FIXME: check if reducing precicion creates any ML prob
+
+# check unique values in normalised data
+u = unique(df$ratioDUET) 
+
+# <<<<< -------------------------------------------
+# Run this section if rounding is to be used
+# specify number for rounding
+n = 3 
+df$ratioDUETR = round(df$ratioDUET, n)
+u = unique(df$ratioDUETR)
+# create an extra column called group which contains the "gp name and score" 
+# so colours can be generated for each unique values in this column
+my_grp = df$ratioDUETR
+df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "")
+
+# else 
+# uncomment the below if rounding is not required
+
+#my_grp = df$ratioDUET
+#df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "")
+
+# <<<<< -----------------------------------------------
+
+# Call the function to create the palette based on the group defined above
+colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp")
+my_title = "Protein stability (DUET)"
+
+# axis label size
+my_xaxls = 13
+my_yaxls = 15
+
+# axes text size
+my_xaxts = 15
+my_yaxts = 15
+
+# no ordering of x-axis
+g = ggplot(df, aes(factor(Position, ordered = T)))
+g + 
+  geom_bar(aes(fill = group), colour = "grey") +
+  scale_fill_manual( values = colours
+                     , guide = 'none') +
+  theme( axis.text.x = element_text(size = my_xaxls
+                                    , angle = 90
+                                    , hjust = 1
+                                    , vjust = 0.4)
+         , axis.text.y = element_text(size = my_yaxls 
+                                      , angle = 0
+                                      , hjust = 1
+                                      , vjust = 0)
+         , axis.title.x = element_text(size = my_xaxts)
+         , axis.title.y = element_text(size = my_yaxts ) ) +
+  labs(title = my_title
+       , x = "Position"
+       , y = "Frequency")
+
+# for sanity and good practice
+rm(df)
+#======================= end of plot
+# axis colours labels
+# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
+# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R
new file mode 100644
index 0000000..c4826d3
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R
@@ -0,0 +1,215 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+
+#require(data.table)
+#require(dplyr)
+
+########################################################################
+#		 Read file: call script for combining df for lig		   	   #
+########################################################################
+
+source("../combining_two_df_lig.R") 
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for Lig plots
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3
+#my_df = merged_df3_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+# sanity check
+is.factor(my_df$Lig_outcome)
+my_df$Lig_outcome = as.factor(my_df$lig_outcome)
+is.factor(my_df$Lig_outcome)
+#[1] TRUE
+
+#############################
+# Extra sanity check:
+# for mcsm_lig ONLY
+# Dis_lig_Ang should be <10
+#############################
+
+if (max(my_df$Dis_lig_Ang) < 10){
+  print ("Sanity check passed: lig data is <10Ang")
+}else{
+  print ("Error: data should be filtered to be within 10Ang")
+}
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#===========================
+# Plot: Basic barplots 
+#===========================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT 
+df  = my_df 
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+rm(my_df)
+
+# sanity checks
+str(df)
+
+if (identical(df$Position, df$position)){
+  print("Sanity check passed: Columns 'Position' and 'position' are identical")
+} else{
+  print("Error!: Check column names and info contained")
+}
+
+#****************
+# generate plot: No of stabilising and destabilsing muts
+#****************
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+svg('basic_barplots_LIG.svg')
+
+my_ats = 25 # axis text size
+my_als = 22 # axis label size
+
+# uncomment as necessary for either directly outputting results or 
+# printing on the screen
+g = ggplot(df, aes(x = Lig_outcome))
+#prinfFile = g + geom_bar(
+  g + geom_bar(
+  aes(fill = Lig_outcome)
+  , show.legend = TRUE
+) + geom_label(
+  stat = "count"
+  , aes(label = ..count..)
+  , color = "black"
+  , show.legend = FALSE
+  , size = 10) + theme(
+    axis.text.x = element_blank()
+    , axis.title.x = element_blank()
+    , axis.title.y = element_text(size=my_als)
+    , axis.text.y = element_text(size = my_ats)
+    , legend.position = c(0.73,0.8)
+    , legend.text = element_text(size=my_als-2)
+    , legend.title = element_text(size=my_als)
+    , plot.title = element_blank()
+  ) + labs(
+    title = ""
+    , y = "Number of SNPs"
+    #, fill='Ligand Outcome'
+  )  + scale_fill_discrete(name = "Ligand Outcome"
+                           , labels = c("Destabilising", "Stabilising"))
+print(prinfFile)
+dev.off()
+
+#****************
+# generate plot: No of positions
+#****************
+#get freq count of positions so you can subset freq<1
+#require(data.table)
+setDT(df)[, pos_count := .N, by = .(Position)] #169, 36
+
+head(df$pos_count)
+table(df$pos_count)
+# this is cummulative
+#1  2  3  4  5  6 
+#5 24 36 56 30 18 
+
+# use group by on this
+snpsBYpos_df <- df %>%
+  group_by(Position) %>%
+  summarize(snpsBYpos = mean(pos_count)) 
+
+table(snpsBYpos_df$snpsBYpos)
+#1  2  3  4  5  6 
+#5 12 12 14  6  3
+# this is what will get plotted
+
+svg('position_count_LIG.svg')
+
+my_ats = 25 # axis text size
+my_als = 22 # axis label size
+
+g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
+prinfFile = g + geom_bar(
+  #g + geom_bar(
+  aes (alpha = 0.5)
+  , show.legend = FALSE
+) +
+  geom_label(
+    stat = "count", aes(label = ..count..)
+    , color = "black"
+    , size = 10
+  ) +
+  theme( 
+    axis.text.x = element_text(
+      size = my_ats
+      , angle = 0
+    )
+    , axis.text.y = element_text(
+      size = my_ats
+      , angle = 0
+      , hjust = 1
+    )
+    , axis.title.x = element_text(size = my_als)
+    , axis.title.y = element_text(size = my_als)
+    , plot.title = element_blank()
+  ) +
+  labs(
+    x = "Number of SNPs"
+    , y = "Number of Sites"
+  )
+print(prinfFile)
+dev.off()
+########################################################################
+#               			end of Lig barplots         			   #
+########################################################################
+
+
diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R
new file mode 100644
index 0000000..51a2812
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R
@@ -0,0 +1,211 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 		Installing and loading required packages and functions		   #	
+########################################################################
+
+source("../Header_TT.R")
+
+########################################################################
+#		 Read file: call script for combining df for PS			   	   #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#==========================
+
+###########################
+# Data for DUET plots
+# you need merged_df3
+# or
+# merged_df3_comp
+# since these have unique SNPs
+# I prefer to use the merged_df3
+# because using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3 
+#my_df  = merged_df3_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+# sanity check
+is.factor(my_df$DUET_outcome)
+my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
+is.factor(my_df$DUET_outcome)
+#[1] TRUE
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#===========================
+# Plot: Basic barplots 
+#===========================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT 
+df  = my_df
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(my_df)
+
+# sanity checks
+str(df)
+
+if (identical(df$Position, df$position)){
+  print("Sanity check passed: Columns 'Position' and 'position' are identical")
+} else{
+  print("Error!: Check column names and info contained")
+  }
+
+#****************
+# generate plot: No of stabilising and destabilsing muts
+#****************
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+svg('basic_barplots_DUET.svg')
+
+my_ats = 25 # axis text size
+my_als = 22 # axis label size
+
+theme_set(theme_grey())
+
+# uncomment as necessary for either directly outputting results or 
+# printing on the screen
+g = ggplot(df, aes(x = DUET_outcome))
+prinfFile = g + geom_bar(
+#g + geom_bar(
+  aes(fill = DUET_outcome)
+  , show.legend = TRUE
+  ) + geom_label(
+    stat = "count"
+    , aes(label = ..count..)
+    , color = "black"
+    , show.legend = FALSE
+    , size = 10) + theme(
+      axis.text.x = element_blank()
+      , axis.title.x = element_blank()
+      , axis.title.y = element_text(size=my_als)
+      , axis.text.y = element_text(size = my_ats)
+    , legend.position = c(0.73,0.8)
+    , legend.text = element_text(size=my_als-2)
+    , legend.title = element_text(size=my_als)
+    , plot.title = element_blank()
+    ) + labs(
+      title = ""
+      , y = "Number of SNPs"
+      #, fill='DUET Outcome'
+      ) + scale_fill_discrete(name = "DUET Outcome"
+                              , labels = c("Destabilising", "Stabilising"))
+
+print(prinfFile)
+dev.off()
+
+#****************
+# generate plot: No of positions
+#****************
+#get freq count of positions so you can subset freq<1
+#setDT(df)[, .(Freq := .N), by = .(Position)] #189, 36
+
+setDT(df)[, pos_count := .N, by = .(Position)] #335, 36
+table(df$pos_count)
+# this is cummulative
+#1   2   3   4   5   6 
+#34  76  63 104  40  18 
+
+# use group by on this
+snpsBYpos_df <- df %>%
+  group_by(Position) %>%
+  summarize(snpsBYpos = mean(pos_count))
+
+table(snpsBYpos_df$snpsBYpos)
+#1  2  3  4  5  6 
+#34 38 21 26  8  3 
+
+foo = select(df, Mutationinformation
+             , WildPos
+             , wild_type
+             , mutant_type
+             , mutation_info
+             , position
+             , pos_count) #335, 5
+
+getwd()
+write.csv(foo, "../Data/pos_count_freq.csv")
+
+svg('position_count_DUET.svg')
+my_ats = 25 # axis text size
+my_als = 22 # axis label size
+
+g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
+prinfFile = g + geom_bar(
+#g + geom_bar(
+  aes (alpha = 0.5)
+  , show.legend = FALSE
+  ) +
+  geom_label(
+    stat = "count", aes(label = ..count..)
+    , color = "black"
+    , size = 10
+    ) +
+  theme( 
+    axis.text.x = element_text(
+      size = my_ats
+      , angle = 0
+      )
+    , axis.text.y = element_text(
+      size = my_ats
+      , angle = 0
+      , hjust = 1
+      )
+  , axis.title.x = element_text(size = my_als)
+  , axis.title.y = element_text(size = my_als)
+  , plot.title = element_blank()
+  ) +
+  labs(
+    x = "Number of SNPs"
+    , y = "Number of Sites"
+    )
+print(prinfFile)
+dev.off()
+########################################################################
+#               			end of DUET barplots         			   #
+########################################################################
+
diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R
new file mode 100644
index 0000000..f3a507e
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R
@@ -0,0 +1,175 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 		Installing and loading required packages and functions		   #	
+########################################################################
+
+source("../Header_TT.R")
+
+#source("barplot_colour_function.R")
+
+########################################################################
+#		 Read file: call script for combining df for PS			   	   #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#==========================
+
+###########################
+# Data for PS Corr plots
+# you need merged_df3_comp
+# since these are matched 
+# to allow pairwise corr
+###########################
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3_comp 
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#===========================
+# Plot: Correlation plots
+#===========================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+df  = my_df
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(my_df)
+
+# sanity checks
+str(df)
+
+table(df$DUET_outcome)
+
+# unique positions
+length(unique(df$Position)) #{RESULT: unique positions for comp data}
+
+
+# subset data to generate pairwise correlations
+corr_data = df[, c("ratioDUET"
+#                  , "ratioPredAff"
+#                  , "DUETStability_Kcalpermol"
+#                  , "PredAffLog"
+#                  , "OR"
+                   , "logor"
+#                  , "pvalue"
+                   , "neglog10pvalue"
+                   , "AF"
+                   , "DUET_outcome"
+#                  , "Lig_outcome"
+                   , "pyrazinamide"
+                   )]
+dim(corr_data)
+rm(df)
+
+# assign nice colnames (for display)
+my_corr_colnames = c("DUET"
+#                    , "Ligand Affinity"
+#                    , "DUET_raw"
+#                    , "Lig_raw"
+#                    , "OR"
+                     , "Log(Odds Ratio)"
+#                    , "P-value"
+                     , "-LogP"
+                     , "Allele Frequency"
+                     , "DUET_outcome"
+#                    , "Lig_outcome"
+                     , "pyrazinamide")
+
+# sanity check
+if (length(my_corr_colnames) == length(corr_data)){
+  print("Sanity check passed: corr_data and corr_names match in length")
+}else{
+  print("Error: length mismatch!")
+}
+
+colnames(corr_data)
+colnames(corr_data) <- my_corr_colnames
+colnames(corr_data)
+
+###############
+# PLOTS: corr
+# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
+###############
+#default pairs plot
+start = 1
+end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column
+offset = 1
+
+my_corr = corr_data[start:(end-offset)]
+head(my_corr)
+
+#my_cols = c("#f8766d", "#00bfc4")
+# deep blue :#007d85
+# deep red: #ae301e
+
+#==========
+# psych: ionformative since it draws the ellipsoid
+# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
+# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
+#==========
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots"
+getwd()
+
+svg('DUET_corr.svg', width = 15, height = 15)
+printFile = pairs.panels(my_corr[1:4]
+             , method = "spearman" # correlation method
+             , hist.col = "grey" ##00AFBB
+             , density = TRUE  # show density plots
+             , ellipses = F # show correlation ellipses
+             , stars = T
+             , rug = F
+             , breaks = "Sturges"
+             , show.points = T
+             , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$DUET_outcome))]
+             , pch = 21
+             , jitter = T
+             #, alpha = .05
+             #, points(pch = 19, col = c("#f8766d", "#00bfc4"))
+             , cex = 3
+             , cex.axis = 2.5
+             , cex.labels = 3
+             , cex.cor = 1
+             , smooth = F
+)
+
+print(printFile)
+dev.off()
diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R
new file mode 100644
index 0000000..4e05d41
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R
@@ -0,0 +1,187 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages			   #	
+########################################################################
+
+source("../Header_TT.R")
+
+#source("barplot_colour_function.R")
+
+########################################################################
+#		 Read file: call script for combining df for lig		   	   #
+########################################################################
+
+source("../combining_two_df_lig.R") 
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for Lig Corr plots
+# you need merged_df3_comp
+# since these are matched 
+# to allow pairwise corr
+###########################
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df3_comp 
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+#############################
+# Extra sanity check:
+# for mcsm_lig ONLY
+# Dis_lig_Ang should be <10
+#############################
+
+if (max(my_df$Dis_lig_Ang) < 10){
+  print ("Sanity check passed: lig data is <10Ang")
+}else{
+  print ("Error: data should be filtered to be within 10Ang")
+}
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#===========================
+# Plot: Correlation plots
+#===========================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT 
+df  = my_df 
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(my_df)
+
+# sanity checks
+str(df)
+
+table(df$Lig_outcome)
+
+# unique positions
+length(unique(df$Position)) #{RESULT: unique positions for comp data}
+
+# subset data to generate pairwise correlations
+corr_data = df[, c(#"ratioDUET",
+                  "ratioPredAff"
+#                  , "DUETStability_Kcalpermol"
+#                  , "PredAffLog"
+#                  , "OR"
+                   , "logor"
+#                  , "pvalue"
+                   , "neglog10pvalue"
+                   , "AF"
+#                  , "DUET_outcome"
+                   , "Lig_outcome"
+                   , "pyrazinamide"
+                   )] 
+dim(corr_data)
+rm(df)
+
+# assign nice colnames (for display)
+my_corr_colnames = c(#"DUET",
+                     "Ligand Affinity"
+#                    ,"DUET_raw" 
+#                    , "Lig_raw"
+#                    , "OR"
+                     , "Log(Odds Ratio)"
+#                    , "P-value"
+                     , "-LogP"
+                     , "Allele Frequency"
+#                    , "DUET_outcome"
+                     , "Lig_outcome"
+                     , "pyrazinamide")
+                     
+# sanity check
+if (length(my_corr_colnames) == length(corr_data)){
+  print("Sanity check passed: corr_data and corr_names match in length")
+}else{
+  print("Error: length mismatch!")
+}
+
+colnames(corr_data)
+colnames(corr_data) <- my_corr_colnames
+colnames(corr_data)
+
+###############
+# PLOTS: corr
+# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
+###############
+
+# default pairs plot
+start = 1
+end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column
+offset = 1
+
+my_corr = corr_data[start:(end-offset)]
+head(my_corr)
+
+#my_cols = c("#f8766d", "#00bfc4")
+# deep blue :#007d85
+# deep red: #ae301e
+
+#==========
+# psych: ionformative since it draws the ellipsoid
+# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
+# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
+#==========
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots"
+getwd()
+
+svg('Lig_corr.svg', width = 15, height = 15)
+printFile = pairs.panels(my_corr[1:4]
+             , method = "spearman" # correlation method
+             , hist.col = "grey" ##00AFBB
+             , density = TRUE  # show density plots
+             , ellipses = F # show correlation ellipses
+             , stars = T
+             , rug = F
+             , breaks = "Sturges"
+             , show.points = T
+             , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$Lig_outcome))]
+             , pch = 21
+             , jitter = T
+#            , alpha = .05
+#            , points(pch = 19, col = c("#f8766d", "#00bfc4"))
+             , cex = 3
+             , cex.axis = 2.5
+             , cex.labels = 3
+             , cex.cor = 1
+             , smooth = F
+)
+print(printFile)
+dev.off()
+
diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R
new file mode 100644
index 0000000..1f868e4
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R
@@ -0,0 +1,227 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") 
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+#source("barplot_colour_function.R")
+
+require(data.table)
+
+########################################################################
+#		 Read file: call script for combining df		   	  		   #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#==========================
+
+###########################
+# Data for plots
+# you need merged_df2, comprehensive one
+# since this has one-many relationship
+# i.e the same SNP can belong to multiple lineages
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df2
+#my_df  = merged_df2_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+is.factor(my_df$lineage)
+my_df$lineage = as.factor(my_df$lineage)
+is.factor(my_df$lineage)
+
+#==========================
+# Plot: Lineage barplot
+# x = lineage y = No. of samples
+# col = Lineage
+# fill = lineage
+#============================
+table(my_df$lineage)
+
+#        lineage1   lineage2   lineage3   lineage4   lineage5   lineage6 lineageBOV 
+#3        104       1293        264       1311          6          6        105 
+
+#===========================
+# Plot: Lineage Barplots
+#===========================
+
+#===================
+# Data for plots
+#===================
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+df <- my_df
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+rm(my_df)
+
+# get freq count of positions so you can subset freq<1
+#setDT(df)[, lineage_count := .N, by = .(lineage)]
+
+#******************
+# generate plot: barplot of mutation by lineage
+#******************
+sel_lineages = c("lineage1"
+                 , "lineage2"
+                 , "lineage3"
+                 , "lineage4")
+
+df_lin = subset(df, subset = lineage %in% sel_lineages )
+
+#FIXME; add sanity check for numbers.
+# Done this manually
+
+############################################################
+
+#########
+# Data for barplot: Lineage barplot
+# to show total samples and number of unique mutations 
+# within each linege
+##########
+
+# Create df with lineage inform & no. of unique mutations
+# per lineage and total samples within lineage
+# this is essentially barplot with two y axis
+
+bar = bar = as.data.frame(sel_lineages) #4, 1
+total_snps_u = NULL
+total_samples = NULL
+
+for (i in sel_lineages){
+  #print(i)
+  curr_total = length(unique(df$id)[df$lineage==i])
+  total_samples = c(total_samples, curr_total)
+  print(total_samples)
+  
+  foo = df[df$lineage==i,]
+  print(paste0(i, "======="))
+  print(length(unique(foo$Mutationinformation)))
+  curr_count = length(unique(foo$Mutationinformation))
+
+  total_snps_u = c(total_snps_u, curr_count)
+}
+
+print(total_snps_u)
+bar$num_snps_u = total_snps_u
+bar$total_samples = total_samples
+bar
+
+#*****************
+# generate plot: lineage barplot with two y-axis
+#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2
+#*****************
+
+bar$num_snps_u = y1
+bar$total_samples = y2
+sel_lineages = x
+
+to_plot = data.frame(x = x
+                      , y1 = y1
+                      , y2 = y2)
+to_plot
+
+melted = melt(to_plot, id = "x")
+melted
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+svg('lineage_basic_barplot.svg')
+
+my_ats = 20 # axis text size
+my_als = 22 # axis label size
+
+g = ggplot(melted
+           , aes(x = x
+                 , y = value
+                 , fill = variable)
+           )
+
+
+printFile = g + geom_bar(
+  
+#g + geom_bar(
+  stat = "identity"
+  , position = position_stack(reverse = TRUE)
+  , alpha=.75
+  , colour='grey75'
+    ) + theme(
+    axis.text.x = element_text(
+      size = my_ats
+#      , angle= 30
+    )
+  , axis.text.y = element_text(size = my_ats
+  #, angle = 30
+  , hjust = 1
+  , vjust = 0)
+  , axis.title.x = element_text(
+    size = my_als
+    , colour = 'black'
+    )
+  , axis.title.y = element_text(
+    size = my_als
+    , colour = 'black'
+  )
+  , legend.position = "top"
+  , legend.text = element_text(size = my_als)
+  
+  #) + geom_text(
+  ) + geom_label(
+    aes(label = value)
+    , size = 5
+    , hjust = 0.5
+    , vjust = 0.5
+    , colour = 'black'
+    , show.legend = FALSE
+    #, check_overlap = TRUE
+    , position = position_stack(reverse = T)
+    #, position = ('
+
+  ) + labs(
+    title = ''
+    , x = ''
+    , y = "Number"
+    , fill = 'Variable'
+    , colour = 'black'
+  ) + scale_fill_manual(
+      values = c('grey50', 'gray75')
+      , name=''
+      , labels=c('Mutations', 'Total Samples')
+    ) + scale_x_discrete(
+      breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
+      , labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
+    )
+print(printFile)
+dev.off()
diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R
new file mode 100644
index 0000000..ebbec76
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R
@@ -0,0 +1,233 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+#source("barplot_colour_function.R")
+#require(data.table)
+
+########################################################################
+#		 Read file: call script for combining df for Lig		   	   #
+########################################################################
+
+source("../combining_two_df_lig.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+###########################
+# Data for plots
+# you need merged_df2 or merged_df2_comp
+# since this is one-many relationship 
+# i.e the same SNP can belong to multiple lineages
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df2
+#my_df  = merged_df2_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+is.factor(my_df$lineage)
+my_df$lineage = as.factor(my_df$lineage)
+is.factor(my_df$lineage)
+
+table(my_df$mutation_info)
+
+#############################
+# Extra sanity check:
+# for mcsm_lig ONLY
+# Dis_lig_Ang should be <10
+#############################
+
+if (max(my_df$Dis_lig_Ang) < 10){
+  print ("Sanity check passed: lig data is <10Ang")
+}else{
+  print ("Error: data should be filtered to be within 10Ang")
+}
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Plot: Lineage Distribution
+# x = mcsm_values, y = dist
+# fill = stability
+#============================
+
+#===================
+# Data for plots
+#===================
+
+# subset only lineages1-4
+sel_lineages = c("lineage1"
+                 , "lineage2"
+                 , "lineage3"
+                 , "lineage4")
+
+# uncomment as necessary
+df_lin = subset(my_df, subset = lineage %in% sel_lineages ) #2037 35
+
+# refactor
+df_lin$lineage = factor(df_lin$lineage)
+
+table(df_lin$lineage) #{RESULT: No of samples within lineage}
+#lineage1 lineage2 lineage3 lineage4 
+#78     961      195     803 
+
+# when merged_df2_comp is used
+#lineage1 lineage2 lineage3 lineage4 
+#77     955      194     770
+
+length(unique(df_lin$Mutationinformation))
+#{Result: No. of unique mutations the 4 lineages contribute to}
+
+# sanity checks
+r1 = 2:5 # when merged_df2 used: because there is missing lineages 
+if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) {
+  print ("sanity check passed: numbers match")
+} else{
+  print("Error!: check your numbers")
+} 
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+df <- df_lin
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(df_lin)
+
+#******************
+# generate distribution plot of lineages
+#******************
+# basic: could improve this!
+library(plotly)
+library(ggridges)
+
+fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
+names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
+
+g <- ggplot(df, aes(x = ratioPredAff)) + 
+  geom_density(aes(fill = Lig_outcome)
+               , alpha = 0.5) + 
+  facet_wrap( ~ lineage
+             , scales = "free"
+             , labeller = labeller(lineage = fooNames) ) +
+  coord_cartesian(xlim = c(-1, 1)
+#                  , ylim = c(0, 6)
+#                  , clip = "off"
+) 
+    ggtitle("Kernel Density estimates of Ligand affinity by lineage")
+
+ggplotly(g)
+
+# 2 : ggridges (good!)
+
+my_ats = 15 # axis text size
+my_als = 20 # axis label size
+
+fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
+names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+svg('lineage_dist_LIG.svg')
+
+printFile = ggplot( df, aes(x = ratioPredAff
+                          , y = Lig_outcome) ) +
+  
+  geom_density_ridges_gradient( aes(fill = ..x..)
+                                , scale = 3
+                                , size = 0.3 ) +
+  facet_wrap( ~lineage
+              , scales = "free"
+#              , switch = 'x'
+              , labeller = labeller(lineage = fooNames) ) +
+  coord_cartesian( xlim = c(-1, 1)
+#                  , ylim = c(0, 6)
+#                  , clip = "off"
+                  ) +
+
+  scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4")
+                        , name = "Ligand Affinity" ) +
+  theme( axis.text.x = element_text( size = my_ats
+                                     , angle = 90
+                                     , hjust = 1
+                                     , vjust = 0.4)
+#         , axis.text.y = element_text( size = my_ats
+#                                       , angle = 0
+#                                       , hjust = 1
+#                                       , vjust = 0)
+         , axis.text.y = element_blank()
+         , axis.title.x = element_blank()
+         , axis.title.y = element_blank()
+         , axis.ticks.y = element_blank()
+         , plot.title = element_blank()
+         , strip.text = element_text(size = my_als)
+         , legend.text = element_text(size = 10)
+         , legend.title = element_text(size = my_als)
+#         , legend.position = c(0.3, 0.8)
+#         , legend.key.height = unit(1, 'mm')
+      ) 
+
+print(printFile)
+dev.off()
+
+#=!=!=!=!=!=!
+# COMMENT: When you look at all mutations, the lineage differences disappear...
+# The pattern we are interested in is possibly only for dr_mutations
+#=!=!=!=!=!=!
+
+#===================================================
+
+# COMPARING DISTRIBUTIONS
+head(df$lineage)
+df$lineage = as.character(df$lineage)
+
+lin1 = df[df$lineage == "lineage1",]$ratioPredAff
+lin2 = df[df$lineage == "lineage2",]$ratioPredAff
+lin3 = df[df$lineage == "lineage3",]$ratioPredAff
+lin4 = df[df$lineage == "lineage4",]$ratioPredAff
+
+# ks test
+ks.test(lin1,lin2) 
+ks.test(lin1,lin3) 
+ks.test(lin1,lin4) 
+
+ks.test(lin2,lin3) 
+ks.test(lin2,lin4) 
+
+ks.test(lin3,lin4) 
+
+
+
diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
new file mode 100644
index 0000000..cc7e15a
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
@@ -0,0 +1,212 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") # thinkpad
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+#source("barplot_colour_function.R")
+#require(data.table)
+
+########################################################################
+#		 Read file: call script for combining df for PS			   	   #
+########################################################################
+
+source("../combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for plots
+# you need merged_df2 or merged_df2_comp
+# since this is one-many relationship 
+# i.e the same SNP can belong to multiple lineages
+###########################
+
+# uncomment as necessary
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+my_df  = merged_df2
+#my_df  = merged_df2_comp
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+is.factor(my_df$lineage)
+my_df$lineage = as.factor(my_df$lineage)
+is.factor(my_df$lineage)
+
+table(my_df$mutation_info)
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Plot: Lineage Distribution
+# x = mcsm_values, y = dist
+# fill = stability
+#============================
+
+#===================
+# Data for plots
+#===================
+
+# subset only lineages1-4
+sel_lineages = c("lineage1"
+                 , "lineage2"
+                 , "lineage3"
+                 , "lineage4")
+
+# uncomment as necessary
+df_lin = subset(my_df, subset = lineage %in% sel_lineages )
+
+# refactor
+df_lin$lineage = factor(df_lin$lineage)
+
+table(df_lin$lineage) #{RESULT: No of samples within lineage}
+#lineage1 lineage2 lineage3 lineage4 
+#104     1293      264     1311 
+
+# when merged_df2_comp is used
+#lineage1 lineage2 lineage3 lineage4 
+#99     1275      263     1255
+
+length(unique(df_lin$Mutationinformation))
+#{Result: No. of unique mutations the 4 lineages contribute to}
+
+# sanity checks
+r1 = 2:5 # when merged_df2 used: because there is missing lineages 
+if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) {
+  print ("sanity check passed: numbers match")
+} else{
+  print("Error!: check your numbers")
+} 
+
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+# REASSIGNMENT
+df <- df_lin
+#<<<<<<<<<<<<<<<<<<<<<<<<<
+
+rm(df_lin)
+
+#******************
+# generate distribution plot of lineages
+#******************
+# basic: could improve this!
+library(plotly)
+library(ggridges)
+
+g <- ggplot(df, aes(x = ratioDUET)) + 
+  geom_density(aes(fill = DUET_outcome)
+               , alpha = 0.5) + facet_wrap(~ lineage,
+                                           scales = "free") +
+  ggtitle("Kernel Density estimates of Protein stability by lineage")
+
+ggplotly(g)
+
+# 2 : ggridges (good!)
+
+my_ats = 15 # axis text size
+my_als = 20 # axis label size
+
+fooNames=c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
+names(fooNames)=c('lineage1', 'lineage2', 'lineage3', 'lineage4')
+
+# set output dir for plots
+getwd()
+setwd("~/git/Data/pyrazinamide/output/plots")
+getwd()
+
+svg('lineage_dist_PS.svg')
+
+printFile = ggplot( df, aes(x = ratioDUET
+                            , y = DUET_outcome) )+
+  
+  #printFile=geom_density_ridges_gradient(
+  geom_density_ridges_gradient( aes(fill = ..x..)
+                                , scale = 3
+                                , size = 0.3 ) +
+  facet_wrap( ~lineage
+              , scales = "free"
+#             , switch = 'x'
+              , labeller = labeller(lineage = fooNames) ) +
+  coord_cartesian( xlim = c(-1, 1)
+#                  , ylim = c(0, 6)
+#                  , clip = "off" 
+                ) +
+  scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4")
+                        , name = "DUET" ) + 
+  theme( axis.text.x = element_text( size = my_ats
+                                     , angle = 90
+                                     , hjust = 1
+                                     , vjust = 0.4)
+#         , axis.text.y = element_text( size = my_ats
+#                                       , angle = 0
+#                                       , hjust = 1
+#                                       , vjust = 0)
+         , axis.text.y = element_blank()
+         , axis.title.x = element_blank()
+         , axis.title.y = element_blank()
+         , axis.ticks.y = element_blank()
+         , plot.title = element_blank()
+         , strip.text = element_text(size=my_als)
+         , legend.text = element_text(size=10)
+         , legend.title = element_text(size=my_als)
+#         , legend.position = c(0.3, 0.8)
+#         , legend.key.height = unit(1, 'mm')
+        ) 
+
+print(printFile)
+dev.off()
+
+#=!=!=!=!=!=!
+# COMMENT: When you look at all mutations, the lineage differences disappear...
+# The pattern we are interested in is possibly only for dr_mutations
+#=!=!=!=!=!=!
+#===================================================
+
+# COMPARING DISTRIBUTIONS
+head(df$lineage)
+df$lineage = as.character(df$lineage)
+
+lin1 = df[df$lineage == "lineage1",]$ratioDUET
+lin2 = df[df$lineage == "lineage2",]$ratioDUET
+lin3 = df[df$lineage == "lineage3",]$ratioDUET
+lin4 = df[df$lineage == "lineage4",]$ratioDUET
+
+# ks test
+ks.test(lin1,lin2) 
+ks.test(lin1,lin3) 
+ks.test(lin1,lin4) 
+
+ks.test(lin2,lin3)
+ks.test(lin2,lin4)  
+
+ks.test(lin3,lin4)  
+
+
+
diff --git a/mcsm_analysis/pyrazinamide/scripts/read_pdb.R b/mcsm_analysis/pyrazinamide/scripts/read_pdb.R
new file mode 100644
index 0000000..41ca884
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/read_pdb.R
@@ -0,0 +1,27 @@
+#########################
+#3: Read complex pdb file
+##########################
+source("Header_TT.R")
+# This script only reads the pdb file of your complex
+
+# read in pdb file complex1 
+inDir = "~/git/Data/pyrazinamide/input/structure/"
+inFile = paste0(inDir, "complex1_no_water.pdb")
+complex1 = inFile
+
+#inFile2 = paste0(inDir, "complex2_no_water.pdb")
+#complex2 = inFile2
+
+# list of 8
+my_pdb = read.pdb(complex1
+                  , maxlines = -1
+                  , multi = FALSE 
+                  , rm.insert = FALSE
+                  , rm.alt = TRUE
+                  , ATOM.only = FALSE 
+                  , hex = FALSE
+                  , verbose = TRUE)
+
+rm(inDir, inFile, complex1)
+#====== end of script
+
diff --git a/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R b/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R
new file mode 100644
index 0000000..658eec4
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R
@@ -0,0 +1,386 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			               #
+########################################################################
+
+source("Header_TT.R")
+
+#########################################################
+# TASK: replace B-factors in the pdb file with normalised values
+# use the complex file with no water as mCSM lig was 
+# performed on this file. You can check it in the script: read_pdb file.
+#########################################################
+
+###########################
+# 2: Read file: average stability values
+# or mcsm_normalised file, output of step 4 mcsm pipeline
+###########################
+
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
+
+my_df <- read.csv(inFile
+#                  , row.names = 1
+#                  , stringsAsFactors = F
+                  , header = T)
+str(my_df)
+
+#=========================================================
+# Processing P1: Replacing B factor with mean ratioDUET scores
+#=========================================================
+
+#########################
+# Read complex pdb file
+# form the R script
+##########################
+
+source("read_pdb.R") # list of 8
+
+# extract atom list into a variable
+# since in the list this corresponds to data frame, variable will be a df
+d = my_pdb[[1]]
+
+# make a copy: required for downstream sanity checks
+d2 = d
+
+# sanity checks: B factor
+max(d$b); min(d$b)
+
+#*******************************************
+# plot histograms for inspection
+# 1: original B-factors
+# 2: original DUET Scores
+# 3: replaced B-factors with DUET Scores
+#*********************************************
+# Set the margin on all sides
+par(oma = c(3,2,3,0)
+    , mar = c(1,3,5,2)
+    , mfrow = c(3,2))
+#par(mfrow = c(3,2))
+
+ #1: Original B-factor
+hist(d$b
+     , xlab = "" 
+     , main = "B-factor")
+
+plot(density(d$b)
+     , xlab = ""
+     , main = "B-factor")
+
+# 2: DUET scores
+hist(my_df$average_DUETR
+     , xlab = "" 
+     , main = "Norm_DUET")
+
+plot(density(my_df$average_DUETR)
+     , xlab = ""
+     , main = "Norm_DUET")
+
+# 3: After the following replacement
+#********************************
+
+#=========
+# step 0_P1: DONT RUN once you have double checked the matched output
+#=========
+# sanity check:  match and assign to a separate column to double check
+# colnames(my_df)
+# d$ratioDUET = my_df$averge_DUETR[match(d$resno, my_df$Position)]
+
+#=========
+# step 1_P1
+#=========
+# Be brave and replace in place now (don't run sanity check)
+# this makes all the B-factor values in the non-matched positions as NA
+d$b = my_df$average_DUETR[match(d$resno, my_df$Position)]
+
+#=========
+# step 2_P1
+#=========
+# count NA in Bfactor
+b_na = sum(is.na(d$b)) ; b_na 
+
+# count number of 0's in Bactor
+sum(d$b == 0)
+#table(d$b)
+
+# replace all NA in b factor with 0
+d$b[is.na(d$b)] = 0
+
+# sanity check: should be 0
+sum(is.na(d$b))
+
+# sanity check: should be True
+if (sum(d$b == 0) == b_na){
+  print ("Sanity check passed: NA's replaced with 0's successfully")
+} else {
+  print("Error: NA replacement NOT successful, Debug code!")
+}
+
+max(d$b); min(d$b)
+
+# sanity checks: should be True
+if(max(d$b) == max(my_df$average_DUETR)){
+  print("Sanity check passed: B-factors replaced correctly")
+} else {
+  print ("Error: Debug code please")
+}
+
+if (min(d$b) == min(my_df$average_DUETR)){
+  print("Sanity check passed: B-factors replaced correctly")
+} else {
+  print ("Error: Debug code please")
+}
+
+#=========
+# step 3_P1
+#=========
+# sanity check: dim should be same before reassignment
+# should be TRUE
+dim(d) == dim(d2)
+
+#=========
+# step 4_P1
+#=========
+# assign it back to the pdb file
+my_pdb[[1]] = d 
+
+max(d$b); min(d$b)
+
+#=========
+# step 5_P1
+#=========
+# output dir
+getwd()
+outDir = "~/git/Data/pyrazinamide/input/structure/"
+
+outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
+write.pdb(my_pdb, outFile)
+
+#********************************
+# Add the 3rd histogram and density plots for comparisons
+#********************************
+# Plots continued...
+# 3: hist and density of replaced B-factors with DUET Scores
+hist(d$b
+     , xlab = ""
+     , main = "repalced-B")
+
+plot(density(d$b)
+     , xlab = ""
+     , main = "replaced-B")
+
+# graph titles
+mtext(text = "Frequency"
+      , side = 2
+      , line = 0
+      , outer = TRUE)
+
+mtext(text = "DUET_stability"
+      , side = 3
+      , line = 0
+      , outer = TRUE)
+#********************************
+
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+# NOTE: This replaced B-factor distribution has the same
+# x-axis as the PredAff normalised values, but the distribution
+# is affected since 0 is overinflated. This is because all the positions
+# where there are no SNPs have been assigned 0.
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+
+
+
+
+#######################################################################
+#====================== end of section 1 ==============================
+#######################################################################
+
+
+
+
+
+#=========================================================
+# Processing P2: Replacing  B values with PredAff Scores
+#=========================================================
+# clear workspace 
+rm(list = ls())
+
+###########################
+# 2: Read file: average stability values
+# or mcsm_normalised file, output of step 4 mcsm pipeline
+###########################
+
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
+
+my_df <- read.csv(inFile
+#                  , row.names = 1
+#                  , stringsAsFactors = F
+                  , header = T) 
+str(my_df)
+#rm(inDir, inFile)
+
+#########################
+# 3: Read complex pdb file
+# form the R script
+##########################
+
+source("read_pdb.R") # list of 8
+
+# extract atom list into a variable
+# since in the list this corresponds to data frame, variable will be a df
+d = my_pdb[[1]]
+
+# make a copy: required for downstream sanity checks
+d2 = d
+
+# sanity checks: B factor
+max(d$b); min(d$b)
+
+#*******************************************
+# plot histograms for inspection
+# 1: original B-factors
+# 2: original Pred Aff Scores
+# 3: replaced B-factors with PredAff Scores
+#********************************************
+# Set the margin on all sides
+par(oma = c(3,2,3,0)
+    , mar = c(1,3,5,2)
+    , mfrow = c(3,2))
+#par(mfrow = c(3,2))
+
+# 1: Original B-factor
+hist(d$b
+     , xlab = "" 
+     , main = "B-factor")
+
+plot(density(d$b)
+     , xlab = ""
+     , main = "B-factor")
+
+# 2: Pred Aff scores
+hist(my_df$average_PredAffR
+     , xlab = "" 
+     , main = "Norm_lig_average")
+
+plot(density(my_df$average_PredAffR)
+     , xlab = ""
+     , main = "Norm_lig_average")
+
+# 3: After the following replacement
+#********************************
+
+#=================================================
+# Processing P2: Replacing  B values with ratioPredAff scores
+#=================================================
+# use match to perform this replacement linking with "position no"
+# in the pdb file, this corresponds to column "resno"
+# in my_df, this corresponds to column "Position"
+
+#=========
+# step 0_P2: DONT RUN once you have double checked the matched output
+#=========
+# sanity check:  match and assign to a separate column to double check
+# colnames(my_df)
+# d$ratioPredAff = my_df$average_PredAffR[match(d$resno, my_df$Position)] #1384, 17
+
+#=========
+# step 1_P2: BE BRAVE and replace in place now (don't run step 0)
+#=========
+# this makes all the B-factor values in the non-matched positions as NA
+d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)]
+
+#=========
+# step 2_P2
+#=========
+# count NA in Bfactor
+b_na = sum(is.na(d$b)) ; b_na
+
+# count number of 0's in Bactor
+sum(d$b == 0)
+#table(d$b)
+
+# replace all NA in b factor with 0
+d$b[is.na(d$b)] = 0
+
+# sanity check: should be 0
+sum(is.na(d$b))
+
+if (sum(d$b == 0) == b_na){
+  print ("Sanity check passed: NA's replaced with 0's successfully")
+} else {
+  print("Error: NA replacement NOT successful, Debug code!")
+}
+
+max(d$b); min(d$b)
+
+# sanity checks: should be True
+if (max(d$b) == max(my_df$average_PredAffR)){
+  print("Sanity check passed: B-factors replaced correctly")
+} else {
+  print ("Error: Debug code please")
+}
+
+if (min(d$b) == min(my_df$average_PredAffR)){
+  print("Sanity check passed: B-factors replaced correctly")
+} else {
+  print ("Error: Debug code please")
+}
+
+#=========
+# step 3_P2
+#=========
+# sanity check: dim should be same before reassignment
+# should be TRUE
+dim(d) == dim(d2)
+
+#=========
+# step 4_P2
+#=========
+# assign it back to the pdb file
+my_pdb[[1]] = d 
+
+max(d$b); min(d$b)
+
+#=========
+# step 5_P2
+#=========
+
+# output dir
+outDir = "~/git/Data/pyrazinamide/input/structure/"
+outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile
+write.pdb(my_pdb, outFile)
+
+#********************************
+# Add the 3rd histogram and density plots for comparisons
+#********************************
+# Plots continued...
+# 3: hist and density of replaced B-factors with PredAff Scores
+hist(d$b
+     , xlab = ""
+     , main = "repalced-B")
+
+plot(density(d$b)
+     , xlab = ""
+     , main = "replaced-B")
+
+# graph titles
+mtext(text = "Frequency"
+      , side = 2
+      , line = 0
+      , outer = TRUE)
+
+mtext(text = "Lig_stability"
+      , side = 3
+      , line = 0
+      , outer = TRUE)
+
+#********************************
+
+###########
+# end of output files with Bfactors
+##########
diff --git a/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R b/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R
new file mode 100644
index 0000000..9f30f28
--- /dev/null
+++ b/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R
@@ -0,0 +1,257 @@
+getwd()
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
+getwd()
+
+#########################################################
+# 1: Installing and loading required packages           #
+#########################################################
+
+source("Header_TT.R")
+#source("barplot_colour_function.R")
+
+##########################################################
+#           Checking: Entire data frame and for PS      #
+##########################################################
+
+###########################
+#2) Read file: combined one from the script
+###########################
+source("combining_two_df.R")
+
+# df with NA:
+# merged_df2
+# merged_df3:
+
+# df without NA:
+# merged_df2_comp:
+# merged_df3_comp:
+
+######################
+# You need to check it
+# with the merged_df3
+########################
+
+#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+my_df = merged_df3
+#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+#clear variables
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# should be true
+identical(my_df$Position, my_df$position)
+
+#################################
+# Read file: normalised file
+# output of step 4 mcsm_pipeline
+#################################
+
+
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
+
+mcsm_data <- read.csv(inFile
+                  , row.names = 1
+                  , stringsAsFactors = F
+                  , header = T)
+str(mcsm_data)
+my_colnames  = colnames(mcsm_data)
+
+#====================================
+# subset my_df to include only the columns in mcsm data
+my_df2 = my_df[my_colnames]
+#====================================
+# compare the two
+head(mcsm_data$Mutationinformation)
+head(mcsm_data$Position)
+
+head(my_df2$Mutationinformation)
+head(my_df2$Position)
+
+# sort mcsm data by Mutationinformation
+mcsm_data_s = mcsm_data[order(mcsm_data$Mutationinformation),] 
+head(mcsm_data_s$Mutationinformation)
+head(mcsm_data_s$Position)
+
+# now compare: should be True, but is false....
+# possibly due to rownames!?!
+identical(mcsm_data_s, my_df2)
+
+# from library dplyr
+setdiff(mcsm_data_s, my_df2)
+
+#from lib compare
+compare(mcsm_data_s, my_df2) # seems rownames are the problem
+
+# FIXME: automate this
+# write files: checked using meld and files are indeed identical
+#write.csv(mcsm_data_s, "mcsm_data_s.csv", row.names = F)
+#write.csv(my_df2, "my_df2.csv", row.names = F)
+
+
+#====================================================== end of section 1
+
+
+
+##########################################################
+#             Checking: LIG(Filtered dataframe)          #
+##########################################################
+
+# clear workspace
+rm(list = ls())
+
+###########################
+#3) Read file: combined_lig from the script
+###########################
+source("combining_two_df_lig.R")
+
+# df with NA:
+# merged_df2 :
+# merged_df3:
+
+# df without NA:
+# merged_df2_comp:
+# merged_df3_comp:
+
+######################
+# You need to check it
+# with the merged_df3
+########################
+
+#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+my_df = merged_df3
+#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+#clear variables
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# should be true
+identical(my_df$Position, my_df$position)
+
+#################################
+# Read file: normalised file
+# output of step 4 mcsm_pipeline
+#################################
+
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
+
+mcsm_data <- read.csv(inFile
+                      , row.names = 1
+                      , stringsAsFactors = F
+                      , header = T)
+str(mcsm_data)
+
+###########################
+# 4a: Filter/subset data: ONLY for LIGand analysis
+# Lig plots < 10Ang
+# Filter the lig plots for Dis_to_lig < 10Ang
+###########################
+# sanity checks
+upos = unique(mcsm_data$Position)
+
+# check range of distances
+max(mcsm_data$Dis_lig_Ang)
+min(mcsm_data$Dis_lig_Ang)
+
+# Lig filtered: subset data to have only values less than 10 Ang
+mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10)
+
+rm(mcsm_data) #to avoid confusion
+
+table(mcsm_data2$Dis_lig_Ang<10)
+table(mcsm_data2$Dis_lig_Ang>10)
+
+max(mcsm_data2$Dis_lig_Ang)
+min(mcsm_data2$Dis_lig_Ang)
+
+upos_f = unique(mcsm_data2$Position); upos_f
+
+# colnames of df that you will need to subset the bigger df from
+my_colnames  = colnames(mcsm_data2)
+#====================================
+# subset bigger df i.e my_df to include only the columns in mcsm data2
+my_df2 = my_df[my_colnames] 
+
+rm(my_df) #to avoid confusion
+#====================================
+# compare the two
+head(mcsm_data2$Mutationinformation)
+head(mcsm_data2$Position)
+
+head(my_df2$Mutationinformation)
+head(my_df2$Position)
+
+# sort mcsm data by Mutationinformation
+mcsm_data2_s = mcsm_data2[order(mcsm_data2$Mutationinformation),] 
+head(mcsm_data2_s$Mutationinformation)
+head(mcsm_data2_s$Position)
+
+# now compare: should be True, but is false....
+# possibly due to rownames!?!
+identical(mcsm_data2_s, my_df2)
+
+# from library dplyr
+setdiff(mcsm_data2_s, my_df2)
+
+# from library compare
+compare(mcsm_data2_s, my_df2) # seems rownames are the problem
+
+#FIXME: automate this
+# write files: checked using meld and files are indeed identical
+#write.csv(mcsm_data2_s, "mcsm_data2_s.csv", row.names = F)
+#write.csv(my_df2, "my_df2.csv", row.names = F)
+
+
+##########################################################
+#  extract and write output file for SNP posn: all     #
+##########################################################
+
+head(merged_df3$Position)
+
+foo = merged_df3[order(merged_df3$Position),]
+head(foo$Position)
+
+snp_pos_unique = unique(foo$Position); snp_pos_unique
+
+# sanity check: 
+table(snp_pos_unique == combined_df$Position)
+
+#=====================
+# write_output files
+#=====================
+outDir = "~/Data/pyrazinamide/input/processed/"
+
+
+outFile1 = paste0(outDir, "snp_pos_unique.txt"); outFile1
+print(paste0("Output file name and path will be:","", outFile1))
+
+write.table(snp_pos_unique
+            , outFile1
+            , row.names = F
+            , col.names = F)
+            
+##############################################################
+#  extract and write output file for SNP posn: complete only #
+##############################################################
+head(merged_df3_comp$Position)
+
+foo = merged_df3_comp[order(merged_df3_comp$Position),]
+head(foo$Position)
+
+snp_pos_unique = unique(foo$Position); snp_pos_unique 
+
+# outDir = "~/Data/pyrazinamide/input/processed/" # already set
+
+outFile2 = paste0(outDir, "snp_pos_unique_comp.txt")
+print(paste0("Output file name and path will be:", outFile2))
+
+write.table(snp_pos_unique
+            , outFile2
+            , row.names = F
+            , col.names = F)
+#============================== end of script
+
+
diff --git a/meta_data_analysis/.Rhistory b/meta_data_analysis/.Rhistory
new file mode 100644
index 0000000..28b1797
--- /dev/null
+++ b/meta_data_analysis/.Rhistory
@@ -0,0 +1,512 @@
+, stringsAsFactors = F)
+x = as.numeric(grepl(i,raw_data$all_muts_pza))
+# DV: pyrazinamide 0 or 1
+y = as.numeric(raw_data$pyrazinamide)
+table(y,x)
+# run glm model
+model = glm(y ~ x, family = binomial)
+#model = glm(y ~ x, family = binomial(link = "logit"))
+summary(model)
+#**********
+# extract relevant model output
+#**********
+# extract log OR i.e the Beta estimate of the logistic model for a given snp
+my_logor = summary(model)$coefficients[2,1]
+print(paste0('Beta:', my_logor))
+# extract SE of the logistic model for a given snp
+my_se = summary(model)$coefficients[2,2]
+print(paste0('SE:', my_se))
+# extract Z of the logistic model for a given snp
+my_zval = summary(model)$coefficients[2,3]
+print(paste0('Z-value:', my_zval))
+# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
+#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
+my_or = exp(summary(model)$coefficients[2,1])
+print(paste0('OR:', my_or))
+# sanity check : should be True
+log(my_or) == my_logor
+# extract P-value of the logistic model for a given snp
+my_pval = summary(model)$coefficients[2,4]
+print(paste0('P-value:', my_pval))
+# extract confint interval of snp (2 steps, since the output is a named number)
+ci_mod = exp(confint(model))[2,]
+my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
+print(paste0('CI:', my_ci))
+#*************
+# Assign the regression output in the original df
+# you can use ('=' or '<-/->')
+#*************
+#pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i] = my_logor
+my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i]
+my_logor
+pnca_snps_or$Mutationinformation == i
+View(pnca_snps_or)
+#===============
+# Step 4: Calculate for one snp
+# using i, when you run the loop, it is easy
+#===============
+i = "pnca_p.trp68gly"
+pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
+, stringsAsFactors = F
+, header = T) #2133
+# uncomment as necessary
+pnca_snps_or = pnca_snps_or[1:5,]
+pnca_snps_or = pnca_snps_or[c(1:5),]
+#===============
+pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
+, stringsAsFactors = F
+, header = T) #2133
+pnca_snps_or = pnca_snps_or[1:5,]
+pnca_snps_or = pnca_snps_or[c(1:5),]
+pnca_snps_or = pnca_snps_or[1:5]
+pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
+, stringsAsFactors = F
+, header = T) #2133
+pnca_snps_or = pnca_snps_or[1:5]
+pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
+, stringsAsFactors = F
+, header = T) #2133
+foo = pnca_snps_or[c(1:5,)]
+foo = pnca_snps_or[c(1:5),]
+foo = as.data.frame(pnca_snps_or[c(1:5),])
+View(foo)
+# create an empty dataframe
+pnca_snps_or = as.data.frame(pnca_snps_or[c(1:5),])
+# IV: corresponds to each unique snp (extracted using grep)
+x = as.numeric(grepl(i,raw_data$all_muts_pza))
+# DV: pyrazinamide 0 or 1
+y = as.numeric(raw_data$pyrazinamide)
+table(y,x)
+# run glm model
+model = glm(y ~ x, family = binomial)
+#model = glm(y ~ x, family = binomial(link = "logit"))
+summary(model)
+my_logor = summary(model)$coefficients[2,1]
+print(paste0('Beta:', my_logor))
+# extract SE of the logistic model for a given snp
+my_se = summary(model)$coefficients[2,2]
+print(paste0('SE:', my_se))
+# extract Z of the logistic model for a given snp
+my_zval = summary(model)$coefficients[2,3]
+print(paste0('Z-value:', my_zval))
+# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
+#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
+my_or = exp(summary(model)$coefficients[2,1])
+print(paste0('OR:', my_or))
+# sanity check : should be True
+log(my_or) == my_logor
+# extract P-value of the logistic model for a given snp
+my_pval = summary(model)$coefficients[2,4]
+print(paste0('P-value:', my_pval))
+# extract confint interval of snp (2 steps, since the output is a named number)
+ci_mod = exp(confint(model))[2,]
+my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
+print(paste0('CI:', my_ci))
+#*************
+# Assign the regression output in the original df
+# you can use ('=' or '<-/->')
+#*************
+#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
+my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
+my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
+my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
+my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
+my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
+my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
+#===============
+# Step 4: Iterate through this unique list
+# and calculate OR, but only for one snp
+# this is test before you apply it all others
+#===============
+pnca_snps_or$mutation == i
+View(pnca_snps_or)
+# create an empty dataframe
+pnca_snps_or = data.frame(mutation = i)
+my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
+my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
+my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
+my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
+my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
+my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
+View(pnca_snps_or_copy)
+#===============
+# Step 4: Iterate through this unique list
+# and calculate OR, but only for one snp
+# this is test before you apply it all others
+#===============
+#reset original df so you don't make a mistake
+pnca_snps_or = pnca_snps_or_copy
+for (i in pnca_snps_unique){
+print(i)
+}
+pnca_snps_or = pnca_snps_or_copy #2133, 1
+#........................................
+# create an empty dataframe : uncomment as necessary
+pnca_snps_or = data.frame(mutation = c(i, "blank_mut")
+#........................................
+# create an empty dataframe : uncomment as necessary
+pnca_snps_or = data.frame(mutation = c(i, "blank_mut"))
+#........................................
+# create an empty dataframe : uncomment as necessary
+pnca_snps_or = data.frame(mutation = c(i, "blank_mut"))
+View(pnca_snps_or)
+# IV: corresponds to each unique snp (extracted using grep)
+x = as.numeric(grepl(i,raw_data$all_muts_pza))
+# DV: pyrazinamide 0 or 1
+y = as.numeric(raw_data$pyrazinamide)
+table(y,x)
+# run glm model
+model = glm(y ~ x, family = binomial)
+#model = glm(y ~ x, family = binomial(link = "logit"))
+summary(model)
+#**********
+# extract relevant model output
+#**********
+# extract log OR i.e the Beta estimate of the logistic model for a given snp
+my_logor = summary(model)$coefficients[2,1]
+print(paste0('Beta:', my_logor))
+# extract SE of the logistic model for a given snp
+my_se = summary(model)$coefficients[2,2]
+print(paste0('SE:', my_se))
+# extract Z of the logistic model for a given snp
+my_zval = summary(model)$coefficients[2,3]
+print(paste0('Z-value:', my_zval))
+# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
+#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
+my_or = exp(summary(model)$coefficients[2,1])
+print(paste0('OR:', my_or))
+# sanity check : should be True
+log(my_or) == my_logor
+# extract P-value of the logistic model for a given snp
+my_pval = summary(model)$coefficients[2,4]
+print(paste0('P-value:', my_pval))
+# extract confint interval of snp (2 steps, since the output is a named number)
+ci_mod = exp(confint(model))[2,]
+my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
+print(paste0('CI:', my_ci))
+#*************
+# Assign the regression output in the original df
+# you can use ('=' or '<-/->')
+#*************
+#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
+my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
+my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
+my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
+my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
+my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
+my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
+View(pnca_snps_or)
+pnca_snps_or = pnca_snps_or_copy #2133, 1
+for (i in pnca_snps_unique){
+print(i)
+#*************
+# start logistic regression model building
+#*************
+# set the IV and DV for the logistic regression model
+# IV: corresponds to each unique snp (extracted using grep)
+x = as.numeric(grepl(i,raw_data$all_muts_pza))
+# DV: pyrazinamide 0 or 1
+y = as.numeric(raw_data$pyrazinamide)
+table(y,x)
+# run glm model
+model = glm(y ~ x, family = binomial)
+#model = glm(y ~ x, family = binomial(link = "logit"))
+summary(model)
+#**********
+# extract relevant model output
+#**********
+# extract log OR i.e the Beta estimate of the logistic model for a given snp
+my_logor = summary(model)$coefficients[2,1]
+print(paste0('Beta:', my_logor))
+# extract SE of the logistic model for a given snp
+my_se = summary(model)$coefficients[2,2]
+print(paste0('SE:', my_se))
+# extract Z of the logistic model for a given snp
+my_zval = summary(model)$coefficients[2,3]
+print(paste0('Z-value:', my_zval))
+# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
+#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
+my_or = exp(summary(model)$coefficients[2,1])
+print(paste0('OR:', my_or))
+# sanity check : should be True
+log(my_or) == my_logor
+# extract P-value of the logistic model for a given snp
+my_pval = summary(model)$coefficients[2,4]
+print(paste0('P-value:', my_pval))
+# extract confint interval of snp (2 steps, since the output is a named number)
+ci_mod = exp(confint(model))[2,]
+my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
+print(paste0('CI:', my_ci))
+#*************
+# Assign the regression output in the original df
+# you can use ('=' or '<-/->')
+#*************
+#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
+my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
+my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
+my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
+my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
+my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
+my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
+}
+warnings()
+View(pnca_snps_or)
+View(pnca_snps_or_copy)
+#sanity check
+pnca_snps_or$mutation == i1
+#sanity check
+pnca_snps_or[pnca_snps_or$mutation == i1]
+pnca_snps_or[pnca_snps_or$mutation == i2]
+pnca_snps_or[pnca_snps_or$mutation == i2,]
+pnca_snps_or1 = unique(pnca_snps_or)
+write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv")
+# you only need it for the unique mutations
+pnca_snps_or = unique(pnca_snps_or) #2133, 1
+for (i in pnca_snps_unique){
+print(i)
+#*************
+# start logistic regression model building
+#*************
+# set the IV and DV for the logistic regression model
+# IV: corresponds to each unique snp (extracted using grep)
+x = as.numeric(grepl(i,raw_data$all_muts_pza))
+# DV: pyrazinamide 0 or 1
+y = as.numeric(raw_data$pyrazinamide)
+table(y,x)
+# run glm model
+model = glm(y ~ x, family = binomial)
+#model = glm(y ~ x, family = binomial(link = "logit"))
+summary(model)
+#**********
+# extract relevant model output
+#**********
+# extract log OR i.e the Beta estimate of the logistic model for a given snp
+my_logor = summary(model)$coefficients[2,1]
+print(paste0('Beta:', my_logor))
+# extract SE of the logistic model for a given snp
+my_se = summary(model)$coefficients[2,2]
+print(paste0('SE:', my_se))
+# extract Z of the logistic model for a given snp
+my_zval = summary(model)$coefficients[2,3]
+print(paste0('Z-value:', my_zval))
+# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
+#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
+my_or = exp(summary(model)$coefficients[2,1])
+print(paste0('OR:', my_or))
+# sanity check : should be True
+log(my_or) == my_logor
+# extract P-value of the logistic model for a given snp
+my_pval = summary(model)$coefficients[2,4]
+print(paste0('P-value:', my_pval))
+# extract confint interval of snp (2 steps, since the output is a named number)
+ci_mod = exp(confint(model))[2,]
+my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
+print(paste0('CI:', my_ci))
+#*************
+# Assign the regression output in the original df
+# you can use ('=' or '<-/->')
+#*************
+#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
+my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
+my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
+my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
+my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
+my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
+my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
+}
+View(pnca_snps_or)
+2.290256e+01
+1.561132e+06
+3.242285e-04
+#sanity check
+pnca_snps_or[pnca_snps_or$mutation == i1]
+pnca_snps_or[pnca_snps_or$mutation == i2,]
+write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv")
+my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
+, stringsAsFactors = FALSE) #11374, 19
+View(my_data)
+# remove the first column
+my_data = my_data[-1] #11374, 18
+# check if first col is 'id': should be TRUE
+colnames(my_data)[1] == 'id'
+# sanity check
+snps_all = unique(my_data$mutation)# 337
+pnca_snps_or = snps_all
+pnca_snps_or = as.data.frame(snps_all)
+View(pnca_snps_or)
+snps_all[-"true_wt"]
+pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
+View(pnca_snps_or)
+snps_all = as.data.frame(snps_all)
+View(snps_all)
+#remove true_wt entry
+w1 = which(rownames(snps_all) == "true_wt")
+View(snps_all)
+#remove true_wt entry
+w1 = which(snps_all$snps_all == "true_wt")
+rm(pnca_snps_or)
+pnca_snps_or = snps_all[-w1]
+pnca_snps_or = snps_all[,-w1]
+pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
+#remove true_wt entry
+w1 = which(snps_all) == "true_wt"
+pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
+my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
+, stringsAsFactors = FALSE) #11374, 19
+# remove the first column
+my_data = my_data[-1] #11374, 18
+# check if first col is 'id': should be TRUE
+colnames(my_data)[1] == 'id'
+# sanity check
+snps_all = unique(my_data$mutation)# 337
+snps_all = as.data.frame(snps_all)
+snps_all[-c(1,1)]
+pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
+pnca_snps_or = as.data.frame(snps_all[, -c(1,1)])
+#remove true_wt entry
+#w1 = which(snps_all) == "true_wt"
+pnca_snps_or = snps_all
+pnca_snps_or = pnca_snps_or_copy
+#remove true_wt entry
+#w1 = which(snps_all) == "true_wt"
+pnca_snps_or = snps_all
+pnca_snps_or -> pnca_snps_or_copy
+#===============
+# Step 4: Iterate through this unique list
+# and calculate OR for each snp
+# and assign to the pnca_snps_or df that has
+# each row as a unique snp
+#===============
+# reset original df so you don't make a mistake: IMPORTANT
+pnca_snps_or = pnca_snps_or_copy #2133, 1
+# you only need it for the unique mutations
+pnca_snps_or = unique(pnca_snps_or) #337, 1
+for (i in pnca_snps_unique){
+print(i)
+#*************
+# start logistic regression model building
+#*************
+# set the IV and DV for the logistic regression model
+# IV: corresponds to each unique snp (extracted using grep)
+x = as.numeric(grepl(i,raw_data$all_muts_pza))
+# DV: pyrazinamide 0 or 1
+y = as.numeric(raw_data$pyrazinamide)
+table(y,x)
+# run glm model
+model = glm(y ~ x, family = binomial)
+#model = glm(y ~ x, family = binomial(link = "logit"))
+summary(model)
+#**********
+# extract relevant model output
+#**********
+# extract log OR i.e the Beta estimate of the logistic model for a given snp
+my_logor = summary(model)$coefficients[2,1]
+print(paste0('Beta:', my_logor))
+# extract SE of the logistic model for a given snp
+my_se = summary(model)$coefficients[2,2]
+print(paste0('SE:', my_se))
+# extract Z of the logistic model for a given snp
+my_zval = summary(model)$coefficients[2,3]
+print(paste0('Z-value:', my_zval))
+# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
+#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
+my_or = exp(summary(model)$coefficients[2,1])
+print(paste0('OR:', my_or))
+# sanity check : should be True
+log(my_or) == my_logor
+# extract P-value of the logistic model for a given snp
+my_pval = summary(model)$coefficients[2,4]
+print(paste0('P-value:', my_pval))
+# extract confint interval of snp (2 steps, since the output is a named number)
+ci_mod = exp(confint(model))[2,]
+my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
+print(paste0('CI:', my_ci))
+#*************
+# Assign the regression output in the original df
+# you can use ('=' or '<-/->')
+#*************
+#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
+my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
+my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
+my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
+my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
+my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
+my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
+}
+getwd()
+#setwd("~/Documents/git/LSHTM_Y1_PNCA/meta_data_analysis") # work
+setwd("~/git/LSHTM_Y1_PNCA/meta_data_analysis") # thinkpad
+#setwd("/Users/tanu/git/LSHTM_Y1_PNCA/meta_data_analysis") # mac
+getwd()
+#===============
+# Step 1: read raw data
+#===============
+raw_data<-read.csv("../Data_original/original_tanushree_data_v2.csv"
+,stringsAsFactors = F)[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]#19265, 4
+raw_data<-raw_data[!is.na(raw_data$pyrazinamide),]#12511, 4
+# combine the two mutation columns
+raw_data$all_mutations_pyrazinamide<-paste(raw_data$dr_mutations_pyrazinamide, raw_data$other_mutations_pyrazinamide)#12511, 5
+head(raw_data$all_mutations_pyrazinamide)
+# create yet another column that contains all the mutations but in lower case
+raw_data$all_muts_pza = tolower(raw_data$all_mutations_pyrazinamide) #12511, 6
+table(grepl("pnca_p",raw_data$all_muts_pza))
+#FALSE  TRUE
+#10603  1908
+pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
+, stringsAsFactors = F
+, header = T) #2133
+# subset a snall section to test
+#pnca_snps_or_copy = pnca_snps_or
+#pnca_snps_or = pnca_snps_or_copy
+pnca_snps_unique = unique(pnca_snps_or$mutation) #293
+i2 = "pnca_p.trp68gly" # Should exist
+grep(i2, pnca_snps_unique)
+my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
+, stringsAsFactors = FALSE) #11374, 19
+# remove the first column
+my_data = my_data[-1] #11374, 18
+# check if first col is 'id': should be TRUE
+colnames(my_data)[1] == 'id'
+# sanity check
+head(my_data$mutation)
+my_data = unique(my_data$mutation)
+my_data[!duplicated(my_data$mutation)]
+my_data_unique = my_data[!duplicated(my_data$mutation),]
+my_data[!duplicated('mutation'),]
+my_data_unique = my_data[!duplicated(my_data[,'mutation']),]
+my_data_unique = my_data[!duplicated(my_data['mutation']),]
+getwd()
+setwd("/git/LSHTM_analysis/meta_data_analysis")
+getwd()
+getwd()
+setwd("/git/github/LSHTM_analysis/meta_data_analysis")
+getwd()
+#===============
+# Step 1: read GWAS raw data stored in Data_original/
+#===============
+infile = read.csv("../Data_original", file.choose(), stringsAsFactors = F))
+c = file.choose()
+c = file.choose(../Data_original)
+c = read.csv(file.choose(), stringsAsFactors = F)
+#===============
+# Step 1: read GWAS raw data stored in Data_original/
+#===============
+infile = read.csv(file.choose(), stringsAsFactors = F))
+c = read.csv(file.choose(), stringsAsFactors = F)
+#===============
+# Step 1: read GWAS raw data stored in Data_original/
+#===============
+infile = read.csv(file.choose(), stringsAsFactors = F)
+#===============
+# Step 1: read GWAS raw data stored in Data_original/
+#===============
+infile = read.csv(file.choose(), stringsAsFactors = F)
+raw_data = infile[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]
+outdir = paste0("../mcsm_analysis",drug,"/Data/")
+# define output variables
+drug  = 'pyrazinamide'
+outdir = paste0("../mcsm_analysis",drug,"/Data/")
+outdir = paste0("../mcsm_analysis/",drug,"/Data/")
+outFile = "meta_data_with_AFandOR.csv"
+output_filename = paste0(outdir, outFile)
+output_filename
diff --git a/meta_data_analysis/__pycache__/reference_dict.cpython-37.pyc b/meta_data_analysis/__pycache__/reference_dict.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1db3c8321b73129609403f35aa5ecc9a7fcd9421
GIT binary patch
literal 2203
zcmbtUOLN;c5C-@VDO#4~x8?MqJenyFwQT3m#N+YAj@zbfl1Xb%-C~+@7y-4|P$Ugf
zvE)N9`2+p|o{W6#p?{>OoN?``e<7!0K})nfo?a@B7K;zNi+y3YCnp^wf7+k3zs$`b
z^fzoqKR9_P|EZxNL=iz0Q!T_qQ>#vNW-xtdo}o7wBc}gW8d_98!^D<onc37>Kxc4r
zQA!&+)RZz4Y+_iUrE_kQT9T_$o0h3VCuoIE(kiW;TBIhiI-Qc(G@X&?ES;nCr)5$f
z<0Ew8)FxAOkv8bksYa&h4f+B7kbXozJ~hb93GQ#Q>Tvc9?VuC<V&!Lz%#nGv@CLDY
z+T1{FQE@lN+%3uFr?<!4oek7qVYMCf;#1O)dY?29YrHC*U|s<$CX)LaYc$a3NJks;
zp2NyM>Mygp)VO;^V^eC}Y@p#1tAhsQNy=|3`KfE<BPoAN$xmI7kEHwxo4&@Qw^IJL
z@@V>!M{kwJ%rzSCq{bbkF>^`d-99?}YadZ<7rA$jA2^RV^D{=9an#)FvF10usJVK-
zxw^W3Z*6_;esgVQ^?~C&^m|z+=IhPOk9ui`Gv@pQZT0Lu{{48dbr58&9Y6D1L6r2e
zR?LHgAo9cGitl^vn6h-aoxUau87O)o%QD6l?uGFY<H8Ig%KF6I_S2wEN?YxKs<o@u
ziN^!V<D?V6l%S(F{Xr;sdyB_iKa-rb+Y6IUkGqzu3;eynj|KkL)db!V_zPH{3H+T~
z68I~Dw*>w{;3ooqE%0+V+ZA{Z)V>sWU*MkrdL-~uVuBxSVm^;UpOcXn0o-ivcKt9U
z<*hJCI^cn@b`#bP{Ln4CIyYoyxCx{riK}r7cpG?wPXRv-qznWerEBp?;A=ptKo)?=
z_lx)(kXcFG@&#38e5|ib=a;~R>i*G96980z)WL|6TLpu;X|N>GYwnkU%!8P+pp@<Z
zj;(UR!NNCXV;n`1mOM*EW-8bDa3u%1Ln<2Ol}e+E;5<2(qs_&OT&v3<(*KnPGSSH!
zQnkWp1KsIrvXbM7^7=yJRFU&|q55t$FNn&FDzb80)l{{BDq_(GDgad=D4W1Ps>+d)
zuil9;U#e$gMXZ-``RK|~9DSBR)4Il`-qWunUpiY9xF}fBpini}&Ye|$UTIJj+t@K;
z%h#UBm%2LU)<{`mnLh|)E+)^9y(2kqxTue|%GDqnh*}(7nhx#_VR;^n+ny&%i62ov
z6<R`t7N^2Yco1cxbilIq5fw(_XC2|NK8W>FY2NC@T}A^rT-qI2Y&qtr2V=<W4m{r%
z<&<Tf8a%?vVoy$-OgQRD@x(do`C*U+EM2|8uaWu~KM#!53PgoQrVC&koEc=Sn+p6&
z;MYP=GcJrUNHbv`@VJ+VlAk0jqJJQM4;lu@`|TFQ(1MYxChU{l$9qpbUk=tm8l<f*
z%Y2W@sarst{mM9t+DwJ8oD77$(T!;@WSh{IX<rgUx3OKW;0mtdis9%^*|9WSe6~^W
dIcMwVh*s=Jm~GB$4nF@RSH(+;E?%HH@-L4f?(6^n

literal 0
HcmV?d00001

diff --git a/meta_data_analysis/init_data_dirs.py b/meta_data_analysis/init_data_dirs.py
new file mode 100755
index 0000000..8dd7446
--- /dev/null
+++ b/meta_data_analysis/init_data_dirs.py
@@ -0,0 +1,7 @@
+#!/usr/bin/python3
+# Initialise a blank 'Data' directory and drug subdirs etc.
+# TODO:
+# - Read base dir from config file
+# - Create eg: '~/git/Data/{original,processed}
+# - Create eg: '~/git/Data/processed/' + drug (for each drug)
+# - Create eg: '~/git/Data/output/' + drug + '{plots, structure}'
diff --git a/meta_data_analysis/pnca_AF_and_OR_calcs.R b/meta_data_analysis/pnca_AF_and_OR_calcs.R
new file mode 100644
index 0000000..3c84670
--- /dev/null
+++ b/meta_data_analysis/pnca_AF_and_OR_calcs.R
@@ -0,0 +1,241 @@
+getwd()
+setwd("/git/github/git/LSHTM_analysis/meta_data_analysis")
+getwd()
+
+#===============
+# Step 1: read GWAS raw data stored in Data_original/
+#===============
+infile = read.csv(file.choose(), stringsAsFactors = F)
+
+raw_data = infile[,c("id"
+                     , "pyrazinamide"
+                     , "dr_mutations_pyrazinamide"
+                     , "other_mutations_pyrazinamide")]
+
+#####
+# 1a: exclude na
+#####
+raw_data = raw_data[!is.na(raw_data$pyrazinamide),]
+
+total_samples = length(unique(raw_data$id))
+print(total_samples)
+
+# sanity check: should  be true
+is.numeric(total_samples) 
+
+#####
+# 1b: combine the two mutation columns
+#####
+raw_data$all_mutations_pyrazinamide = paste(raw_data$dr_mutations_pyrazinamide
+                                            , raw_data$other_mutations_pyrazinamide)
+head(raw_data$all_mutations_pyrazinamide)
+
+#####
+# 1c: create yet another column that contains all the mutations but in lower case
+#####
+raw_data$all_muts_pnca = tolower(raw_data$all_mutations_pyrazinamide) 
+
+# sanity checks
+table(grepl("pnca_p",raw_data$all_muts_pnca))
+
+# sanity check: should be TRUE
+sum(table(grepl("pnca_p",raw_data$all_muts_pnca))) == total_samples
+
+# set up variables: can be used for logistic regression as well
+i  = "pnca_p.ala134gly" # has a NA, should NOT exist
+table(grepl(i,raw_data$all_muts_pnca))
+
+i = "pnca_p.trp68gly"
+table(grepl(i,raw_data$all_muts_pnca))
+
+mut = grepl(i,raw_data$all_muts_pnca)
+dst = raw_data$pyrazinamide
+table(mut, dst)
+
+#chisq.test(table(mut,dst))
+#fisher.test(table(mut, dst))
+#table(mut)
+
+###### read list of muts to calculate OR for (fname3 from pnca_data_extraction.py)
+pnca_snps_or = read.csv(file.choose()
+                        , stringsAsFactors = F
+                        , header = T)
+
+# extract unique snps to iterate over for AF and OR calcs
+# total no of unique snps
+# AF and OR calculations
+
+pnca_snps_unique = unique(pnca_snps_or$mutation) 
+
+# Define OR function
+x = as.numeric(mut)
+y = dst
+or = function(x,y){
+  tab = as.matrix(table(x,y))
+  a = tab[2,2]
+  if (a==0){ a<-0.5}
+  b = tab[2,1]
+  if (b==0){ b<-0.5}
+  c = tab[1,2]
+  if (c==0){ c<-0.5}
+  d = tab[1,1]
+  if (d==0){ d<-0.5}
+  (a/b)/(c/d)
+  
+  }
+
+dst = raw_data$pyrazinamide
+ors = sapply(pnca_snps_unique,function(m){
+  mut = grepl(m,raw_data$all_muts_pnca)
+  or(mut,dst)
+})
+
+ors
+
+pvals = sapply(pnca_snps_unique,function(m){
+  mut = grepl(m,raw_data$all_muts_pnca)
+  fisher.test(mut,dst)$p.value
+})
+
+pvals
+
+afs = sapply(pnca_snps_unique,function(m){
+  mut = grepl(m,raw_data$all_muts_pnca)
+  mean(mut)
+})
+
+afs
+
+# check ..hmmm
+afs['pnca_p.trp68gly']
+afs['pnca_p.gln10pro'] 
+afs['pnca_p.leu4ser'] 
+
+#plot(density(log(ors)))
+#plot(-log10(pvals))
+#hist(log(ors)
+#     ,breaks = 100
+#     )
+
+# subset df cols to add to the calc param df
+pnca_snps_cols = pnca_snps_or[c('mutation_info', 'mutation', 'Mutationinformation')] 
+pnca_snps_cols = pnca_snps_cols[!duplicated(pnca_snps_cols$mutation),]
+
+rownames(pnca_snps_cols) = pnca_snps_cols$mutation
+head(rownames(pnca_snps_cols))
+#snps_with_AF_and_OR
+
+# combine
+comb_AF_and_OR = data.frame(ors, pvals, afs)
+head(rownames(comb_AF_and_OR))
+
+# sanity checks: should be the same
+dim(comb_AF_and_OR); dim(pnca_snps_cols)
+table(rownames(comb_AF_and_OR)%in%rownames(pnca_snps_cols))
+
+table(rownames(pnca_snps_cols)%in%rownames(comb_AF_and_OR))
+
+# merge the above two df whose dim you checked
+snps_with_AF_and_OR = merge(comb_AF_and_OR, pnca_snps_cols
+                            , by = "row.names"
+#                            , all.x = T
+                            )
+
+#rm(pnca_snps_cols, pnca_snps_or, raw_data)
+
+#===============
+# Step 3: Read data file where you will add the calculated OR 
+# Note: this is the big file with one-many relationship between snps and lineages
+# i.e fname4 from 'pnca_extraction.py'
+#===============
+my_data = read.csv(file.choose()
+                   , row.names = 1
+                   , stringsAsFactors = FALSE)
+
+head(my_data)
+length(unique(my_data$id))
+
+# check if first col is 'id': should be TRUE
+colnames(my_data)[1] == 'id'
+
+# sanity check
+head(my_data$mutation)
+
+# FILES TO MERGE:
+# comb_AF_and_OR: file containing OR
+# my_data = big meta data file 
+# linking column: mutation
+
+head(my_data)
+merged_df = merge(my_data # big file
+                  , snps_with_AF_and_OR # small (afor file)
+                  , by = "mutation"
+                  , all.x = T) # because you want all the entries of the meta data 
+
+# sanity checks: should be True 
+# FIXME: I have checked this manually, but make it so it is a pass or a fail!
+comb_AF_and_OR[rownames(comb_AF_and_OR) == "pnca_p.gln10pro",]$ors  
+merged_df[merged_df$Mutationinformation.x == "Q10P",]$ors
+
+merged_df[merged_df$Mutationinformation.x == "Q10P",]
+
+# sanity check: very important!
+colnames(merged_df)
+
+table(merged_df$mutation_info.x == merged_df$mutation_info.y)
+
+#FIXME: what happened to other 7 and FALSE
+table(merged_df$Mutationinformation.x == merged_df$Mutationinformation.y)
+
+# problem
+identical(merged_df$Mutationinformation.x, merged_df$Mutationinformation.y)
+
+#merged_df[merged_df$Mutationinformation.x != merged_df$Mutationinformation.y,]
+
+#throw away the y because that is a smaller df
+d1 = which(colnames(merged_df) == "mutation_info.y") #21
+d2 = which(colnames(merged_df) == "Mutationinformation.y") #22
+
+merged_df2 = merged_df[-c(d1, d2)] #3093 20
+colnames(merged_df2)
+
+# rename cols 
+colnames(merged_df2)[colnames(merged_df2)== "mutation_info.x"] <- "mutation_info"
+colnames(merged_df2)[colnames(merged_df2)== "Mutationinformation.x"] <- "Mutationinformation"
+
+colnames(merged_df2)
+
+# should be 0
+sum(is.na(merged_df2$Mutationinformation))
+
+# count na in each column
+na_count = sapply(merged_df2, function(y) sum(length(which(is.na(y))))); na_count
+# only some or and Af should be NA
+#Row.names           ors               pvals               afs 
+#81                  81                81                  81 
+
+
+colnames(merged_df2)[colnames(merged_df2)== "ors"] <- "OR"
+colnames(merged_df2)[colnames(merged_df2)== "afs"] <- "AF"
+colnames(merged_df2)[colnames(merged_df2)== "pvals"] <- "pvalue"
+
+colnames(merged_df2)
+
+# add log OR and neglog pvalue
+merged_df2$logor = log(merged_df2$OR)
+is.numeric(merged_df2$logor)
+
+merged_df2$neglog10pvalue = -log10(merged_df2$pvalue) 
+is.numeric(merged_df2$neglog10pvalue)
+
+# write file out
+#write.csv(merged_df, "../Data/meta_data_with_AFandOR_JP_TT.csv")
+
+# define output variables
+drug  = 'pyrazinamide'
+out_dir = paste0("../mcsm_analysis/",drug,"/Data/")
+outFile = "meta_data_with_AFandOR.csv"
+output_filename = paste0(outdir, outFile)
+
+write.csv(merged_df2, output_filename
+          , row.names = F)
diff --git a/meta_data_analysis/pnca_data_extraction.py b/meta_data_analysis/pnca_data_extraction.py
new file mode 100755
index 0000000..0eb5aab
--- /dev/null
+++ b/meta_data_analysis/pnca_data_extraction.py
@@ -0,0 +1,626 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Aug  6 12:56:03 2019
+
+@author: tanu
+"""
+
+# FIXME: include error checking to enure you only
+# concentrate on positions that have structural info?
+
+#%% load libraries
+###################
+# load libraries
+import os, sys
+import pandas as pd
+#import numpy as np
+
+#from pandas.api.types import is_string_dtype
+#from pandas.api.types import is_numeric_dtype
+
+# to create dir
+#my_dir = os.path.expanduser('~/some_dir')
+#make sure mcsm_analysis/ exists
+#or specify the output directory
+
+#%%
+#%%
+#%%
+#========================================================
+# TASK: extract ALL pncA mutations from GWAS data
+#========================================================
+#%%
+####################
+# my working dir
+os.getcwd()
+homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
+os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
+os.getcwd()
+#%%
+from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
+#%%
+#NOTE: Out_dir MUST exis
+# User defined dir strpyrazinamide
+drug = 'pyrazinamide'
+gene = 'pnca'
+out_dir = homedir + '/git/LSHTM_analysis/mcsm_analysis/'
+# = out_dir + drug
+data_dir = homedir + '/git/Data'
+
+if not os.path.exists(data_dir):
+    print('Error!', data_dir, 'does not exist. Please ensure it exists and contains the appropriate raw data')
+    os.makedirs(data_dir)
+    die()
+
+if not os.path.exists(out_dir):
+    print('Error!', out_dir, 'does not exist. Please create it')
+    exit()
+    
+#if not os.path.exists(work_dir):
+#    print('creating dir that does not exist', 'dir_name:', work_dir)
+#    os.makedirs(work_dir)
+else:
+    print('Dir exists: Carrying on')
+
+# now create sub dir structure within work_dir
+# pyrazinamide/mcsm_analysis
+
+# we need three dir
+# Data
+# Scripts
+    # Plotting
+# Results
+    # Plots
+    
+# create a list of dir names
+#dir_names = ['Data', 'Scripts', 'Results']
+
+
+#for i in dir_names:
+#    this_dir = (work_dir + '/' + i)
+#    if not os.path.exists(this_dir):
+#        print('creating dir that does not exist:', this_dir)
+#        os.makedirs(this_dir)
+#else:
+#    print('Dir exists: Carrying on')
+      
+# Create sub dirs
+# 1)        
+# Scripts
+    # Plotting
+#subdir_plotting = work_dir + '/Scripts/Plotting'
+#if not os.path.exists(subdir_plotting):
+#      print('creating dir that does not exist:', subdir_plotting)
+#      os.makedirs(subdir_plotting)
+#else:
+#    print('Dir exists: Carrying on')
+ 
+# 2)    
+# Results
+    # Plots
+#subdir_plots = work_dir + '/Results/Plots'        
+#if not os.path.exists(subdir_plots):
+#        print('creating dir that does not exist:', subdir_plots)
+#        os.makedirs(subdir_plots)    
+#else:
+#    print('Dir exists: Carrying on')
+         
+# clear varaibles
+#del(dir_names, drug, i, subdir_plots, subdir_plotting)
+
+#exit()
+#%%
+#==============================================================================
+############
+# STEP 1: Read file original_tanushree_data_v2.csv
+############
+data_file = data_dir + '/input/original/original_tanushree_data_v2.csv'
+meta_data = pd.read_csv(data_file, sep = ',') 
+
+# column names
+list(meta_data.columns)
+
+# extract elevant columns to extract from meta data related to the pyrazinamide
+meta_data = meta_data[['id'
+       ,'country'
+       ,'lineage'
+       ,'sublineage'
+       ,'drtype'
+       , 'pyrazinamide'
+       , 'dr_mutations_pyrazinamide'
+       , 'other_mutations_pyrazinamide'
+        ]] 
+
+# checks
+total_samples = meta_data['id'].nunique() # 19265
+
+# counts NA per column
+meta_data.isna().sum()
+
+# glance
+meta_data.head()
+
+# equivalent of table in R
+# pyrazinamide counts
+meta_data.pyrazinamide.value_counts() 
+
+#%%
+############
+# STEP 2: extract entries containing selected genes: 
+# pyrazinamide: pnca_p.
+# in the dr_mutations and other mutations"
+# as we are interested in the mutations in the protein coding region only 
+# (corresponding to a structure)
+# and drop the entries with NA
+#############
+meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
+meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
+
+del(meta_pza)
+
+##########################
+# pyrazinamide: pnca_p.
+##########################
+meta_data_pnca = meta_data[['id'
+       ,'country'
+       ,'lineage'
+       ,'sublineage'
+       ,'drtype'
+       , 'pyrazinamide'
+       , 'dr_mutations_pyrazinamide'
+       , 'other_mutations_pyrazinamide'
+        ]] 
+
+del(meta_data)
+
+# sanity checks
+
+# dr_mutations only
+meta_pnca_dr = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
+meta_pnca_dr['id'].nunique() 
+del(meta_pnca_dr)
+
+# other mutations
+meta_pnca_other = meta_data_pnca.loc[meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
+meta_pnca_other['id'].nunique() 
+del(meta_pnca_other)
+
+# Now extract "all" mutations
+meta_pnca_all = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*') | meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*') ]
+
+meta_pnca_all['id'].nunique() 
+pnca_samples = len(meta_pnca_all)
+pnca_na = meta_pnca_all['pyrazinamide'].isna().sum() 
+comp_pnca_samples = pnca_samples - pnca_na 
+
+#=#=#=#=#=#=#
+# COMMENT: use it later to check number of complete samples from LF data
+#=#=#=#=#=#=#
+
+# sanity checks
+meta_pnca_all.dr_mutations_pyrazinamide.value_counts()
+meta_pnca_all.other_mutations_pyrazinamide.value_counts()
+
+# more sanity checks 
+# !CAUTION!: muts will change depending on your gene
+
+# dr muts : insert
+meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Gln10Pro')] # 
+meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Phe106Leu')] # empty
+meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Val139Leu')]
+
+meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists #  rows
+m = meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists #  rows
+
+# other_muts
+meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains('pncA_p.Gln10Pro*')] # empty
+meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains('pncA_p.Phe106Leu')]
+
+#=#=#=#=#=#=#=#=#=#
+# FIXME
+# COMMENTS: both mutations columns are separated by ; 
+# CHECK if there are mutations that exist both in dr and other_muts!
+# doesn't make any sense for the same mut to exist in both, I would have thought!
+#=#=#=#=#=#=#=#=#=#
+
+# remove not required variables
+del(meta_data_pnca)
+
+############
+# STEP 3: split the columns of 
+# a) dr_mutation_... (;) as 
+# the column has snps related to multiple genes.
+# useful links
+# https://stackoverflow.com/questions/17116814/pandas-how-do-i-split-text-in-a-column-into-multiple-rows
+# this one works beautifully
+# https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows
+############
+
+# sanity check: counts NA per column afer subsetted df: i.e in meta_pza(with pncA_p. extracted mutations)
+meta_pnca_all.isna().sum()
+
+#=#=#=#=#=#=#=#=#=#
+# COMMENT: no NA's in dr_mutations/other_mutations_columns
+#=#=#=#=#=#=#=#=#=#
+# define the split function
+def tidy_split(df, column, sep='|', keep=False):
+    """
+    Split the values of a column and expand so the new DataFrame has one split
+    value per row. Filters rows where the column is missing.
+
+    Params
+    ------
+    df : pandas.DataFrame
+        dataframe with the column to split and expand
+    column : str
+        the column to split and expand
+    sep : str
+        the string used to split the column's values
+    keep : bool
+        whether to retain the presplit value as it's own row
+
+    Returns
+    -------
+    pandas.DataFrame
+        Returns a dataframe with the same columns as `df`.
+    """
+    indexes = list()
+    new_values = list()
+    #df = df.dropna(subset=[column])#<<<<<<-----see this incase you need to uncomment based on use case
+    for i, presplit in enumerate(df[column].astype(str)):
+        values = presplit.split(sep)
+        if keep and len(values) > 1:
+            indexes.append(i)
+            new_values.append(presplit)
+        for value in values:
+            indexes.append(i)
+            new_values.append(value)
+    new_df = df.iloc[indexes, :].copy()
+    new_df[column] = new_values
+    return new_df
+
+########
+# 3a: call tidy_split() on 'dr_mutations_pyrazinamide' column and remove leading white spaces
+#https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas
+########    
+meta_pnca_WF0 = tidy_split(meta_pnca_all, 'dr_mutations_pyrazinamide', sep = ';') 
+
+# remove leading white space else these are counted as distinct mutations as well
+meta_pnca_WF0['dr_mutations_pyrazinamide'] = meta_pnca_WF0['dr_mutations_pyrazinamide'].str.lstrip() 
+
+########
+# 3b: call function on 'other_mutations_pyrazinamide' column and remove leading white spaces
+######## 
+meta_pnca_WF1 = tidy_split(meta_pnca_WF0, 'other_mutations_pyrazinamide', sep = ';') 
+
+# remove the leading white spaces in the column
+meta_pnca_WF1['other_mutations_pyrazinamide'] = meta_pnca_WF1['other_mutations_pyrazinamide'].str.strip() 
+
+##########
+# Step 4: Reshape data so that all mutations are in one column and the 
+# annotations for the mutation reflect the column name
+# LINK: http://www.datasciencemadesimple.com/reshape-wide-long-pandas-python-melt-function/
+
+# data frame “df” is passed to melt() function
+# id_vars is the variable which need to be left unaltered
+# var_name are the column names so we named it as 'mutation_info'
+# value_name are its values so we named it as 'mutation'
+##########
+meta_pnca_WF1.columns
+
+meta_pnca_LF0 = pd.melt(meta_pnca_WF1
+                      , id_vars = ['id', 'country', 'lineage', 'sublineage', 'drtype', 'pyrazinamide']
+                      , var_name = 'mutation_info'
+                      , value_name = 'mutation') 
+
+# sanity check: should be true
+if len(meta_pnca_LF0) == len(meta_pnca_WF1)*2:
+    print('sanity check passed: Long format df has the expected length')
+else:
+    print("Sanity check failed: Debug please!")
+
+###########
+# Step 5: This is still dirty data. Filter LF data so that you only have
+# mutations corresponding to pnca_p. 
+# this will be your list you run OR calcs 
+###########
+meta_pnca_LF1 = meta_pnca_LF0[meta_pnca_LF0['mutation'].str.contains('pncA_p.*')] 
+
+# sanity checks
+# unique samples
+meta_pnca_LF1['id'].nunique()
+if len(meta_pnca_all) == meta_pnca_LF1['id'].nunique():
+    print("Sanity check passed: No of samples with pncA mutations match")
+else:
+    print("Sanity check failed: Debug please!")
+
+# count if all the mutations are indeed in the protein coding region 
+# i.e begin with pnca_p
+meta_pnca_LF1['mutation'].str.count('pncA_p.').sum() # 3093
+
+# should  be true.
+# and check against the length of the df, which should match
+if len(meta_pnca_LF1) == meta_pnca_LF1['mutation'].str.count('pncA_p.').sum():
+    print("Sanity check passed: Long format data containing pnca mutations indeed correspond to pncA_p region")
+else:
+    print("Sanity check failed: Debug please!")
+
+###########
+# Step 6: Filter dataframe with "na" in the drug column
+# This is because for OR, you can't use the snps that have the
+# NA in the specified drug column
+# it creates problems when performing calcs in R inside the loop
+# so best to filter it here
+###########
+# NOT NEEDED FOR all snps, only for extracting valid OR snps
+del (meta_pnca_WF0, meta_pnca_WF1, meta_pnca_LF0, meta_pnca_all)
+
+###########
+# Step 7: count unique pncA_p mutations (all and comp cases)
+###########
+meta_pnca_LF1['mutation'].nunique() 
+meta_pnca_LF1.groupby('mutation_info').nunique()
+
+meta_pnca_LF1['id'].nunique()  
+meta_pnca_LF1['mutation'].nunique() 
+meta_pnca_LF1.groupby('id').nunique()
+
+###########
+# Step 8: convert all snps only (IN LOWERCASE)
+# because my mcsm file intergated has lowercase
+###########
+# convert mutation to lower case as it needs to exactly match the dict key
+#meta_pnca_LF1['mutation'] = meta_pnca_LF1.mutation.str.lower() # WARNINGS: suggested to use .loc
+meta_pnca_LF1['mutation'] = meta_pnca_LF1.loc[:, 'mutation'].str.lower()
+
+###########
+# Step 9 : Split 'mutation' column into three:  wild_type, position and
+# mutant_type separately. Then map three letter code to one from the 
+# referece_dict imported pncaeady. First convert to mutation to lowercase
+# to allow to match entries from dict 
+###########
+#=======
+# Step 9a: iterate through the dict, create a lookup dict i.e
+# lookup_dict = {three_letter_code: one_letter_code}.
+# lookup dict should be the key and the value (you want to create a column for)
+# Then use this to perform the mapping separetly for wild type and mutant cols.
+# The three letter code is extracted using a regex match from the dataframe and then converted
+# to 'pandas series'since map only works in pandas series
+#=======
+# initialise a sub dict that is a lookup dict for three letter code to one
+lookup_dict = dict()
+for k, v in my_aa_dict.items():
+    lookup_dict[k] = v['one_letter_code']
+    wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
+    meta_pnca_LF1['wild_type'] = wt.map(lookup_dict)   
+    mut = meta_pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
+    meta_pnca_LF1['mutant_type'] = mut.map(lookup_dict)
+
+# extract position info from mutation column separetly using regex
+meta_pnca_LF1['position'] = meta_pnca_LF1['mutation'].str.extract(r'(\d+)') 
+
+# clear variables
+del(k, v, wt, mut, lookup_dict)
+
+#=========
+# Step 9b: iterate through the dict, create a lookup dict that i.e
+# lookup_dict =  {three_letter_code: aa_prop_water} 
+# Do this for both wild_type and mutant as above.
+#=========
+# initialise a sub dict that is lookup dict for three letter code to aa prop
+lookup_dict = dict()
+
+for k, v in my_aa_dict.items():
+    lookup_dict[k] = v['aa_prop_water']
+    #print(lookup_dict)
+    wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
+    meta_pnca_LF1['wt_prop_water'] = wt.map(lookup_dict)   
+    mut = meta_pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
+    meta_pnca_LF1['mut_prop_water'] = mut.map(lookup_dict)
+    
+# added two more cols
+
+# clear variables
+del(k, v, wt, mut, lookup_dict)
+
+#========
+# Step 9c: iterate through the dict, create a lookup dict that i.e
+# lookup_dict =  {three_letter_code: aa_prop_polarity} 
+# Do this for both wild_type and mutant as above.
+#=========
+# initialise a sub dict that is lookup dict for three letter code to aa prop
+lookup_dict = dict()
+
+for k, v in my_aa_dict.items():
+    lookup_dict[k] = v['aa_prop_polarity']
+    #print(lookup_dict)
+    wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
+    meta_pnca_LF1['wt_prop_polarity'] = wt.map(lookup_dict)   
+    mut = meta_pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
+    meta_pnca_LF1['mut_prop_polarity'] = mut.map(lookup_dict)
+    
+# added two more cols
+    
+# clear variables
+del(k, v, wt, mut, lookup_dict)
+
+########
+# Step 10: combine the wild_type+poistion+mutant_type columns to generate 
+# Mutationinformation (matches mCSM output field)
+# Remember to use .map(str) for int col types to allow string concatenation
+#########
+meta_pnca_LF1['Mutationinformation'] = meta_pnca_LF1['wild_type'] + meta_pnca_LF1.position.map(str) + meta_pnca_LF1['mutant_type']
+
+#=#=#=#=#=#=#
+# Step 11:
+# COMMENT: there is more processing in the older version of this script
+# consult if necessary
+# possibly due to the presence of true_wt
+# since this file doesn't contain any true_wt, we won't need it(hopefully!)
+#=#=#=#=#=#=#
+
+#%%
+###########
+# Step 12: Output files for only SNPs to run mCSM
+###########
+
+#=========
+# Step 12a: all SNPs to run mCSM
+#=========
+snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique()) 
+pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique()) 
+
+# assign meaningful colnames 
+#snps_only.rename({0 : 'all_pnca_snps'}, axis = 1, inplace = True)
+#list(snps_only.columns)
+snps_only.isna().sum() # should be 0
+
+# output csv: all SNPS for mCSM analysis
+# specify variable name for output file
+gene = 'pnca'
+#drug = 'pyrazinamide'
+my_fname1 = '_snps_'
+nrows = len(snps_only) 
+
+#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
+#output_file_path = work_dir + '/Data/'
+output_file_path = data_dir + '/input/processed/' + drug + '/'
+
+if not os.path.exists(output_file_path):
+    print( output_file_path, 'does not exist. Creating')
+    os.makedirs(output_file_path)
+    exit()
+
+output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
+print(output_filename) #<<<- check
+
+# write to csv: without column or row names
+# Bad practice: numbers at the start of a filename
+snps_only.to_csv(output_filename, header = False, index = False)
+
+#=========
+# Step 12b: all snps with annotation
+#=========
+# all snps, selected cols
+#pnca_snps_ALL = meta_pnca_LF1[['id','country','lineage', 'sublineage'
+#                               , 'drtype', 'pyrazinamide'
+#                               , 'mutation_info', 'mutation', 'Mutationinformation']] 
+
+#len(pnca_snps_ALL) 
+
+# sanity check
+#meta_pnca_LF1['mutation'].nunique() 
+
+# output csv: WITH column but WITHOUT row names(all snps with meta data)
+# specify variable name for output file
+#gene = 'pnca'
+#drug = 'pyrazinamide'
+#my_fname2 = '_snps_with_metadata_'
+#nrows = len(pnca_snps_ALL) 
+
+#output_file_path = work_dir + '/Data/'
+#output_filename = output_file_path + gene + my_fname2 + str(nrows) + '.csv'
+#print(output_filename)  #<<<- check
+
+# write out file
+#pnca_snps_ALL.to_csv(output_filename, header = True, index = False)
+
+#=========
+# Step 12c: comp snps for OR calcs with annotation
+#=========
+# remove all NA's from pyrazinamide column from LF1
+    
+# counts NA per column
+meta_pnca_LF1.isna().sum()
+
+# remove NA
+meta_pnca_LF2 = meta_pnca_LF1.dropna(subset=['pyrazinamide'])
+
+# sanity checks
+# should be True
+len(meta_pnca_LF2) == len(meta_pnca_LF1) - meta_pnca_LF1['pyrazinamide'].isna().sum()
+
+# unique counts
+meta_pnca_LF2['mutation'].nunique() 
+
+meta_pnca_LF2.groupby('mutation_info').nunique() 
+
+# sanity check
+meta_pnca_LF2['id'].nunique() 
+
+# should be True
+if meta_pnca_LF2['id'].nunique() == comp_pnca_samples:
+    print ('sanity check passed: complete numbers match')
+else:
+    print('Error: Please Debug!')
+
+# value counts
+meta_pnca_LF2.mutation.value_counts()
+#meta_pnca_LF2.groupby(['mutation_info', 'mutation']).size()
+
+# valid/comp snps
+# uncomment as necessary
+pnca_snps_COMP  = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
+len(pnca_snps_COMP) 
+
+# output csv: WITH column but WITHOUT row names (COMP snps with meta data)
+# specify variable name for output file
+
+gene = 'pnca'
+#drug = 'pyrazinamide'
+my_fname3 = '_comp_snps_with_metadata_'
+nrows = len(pnca_snps_COMP) 
+
+#output_filename = output_file_path + gene + my_fname3 + str(nrows) + '.csv'
+#print(output_filename) #<<<-check
+
+# write out file
+#pnca_snps_COMP.to_csv(output_filename, header = True, index = False)
+
+
+#=========
+# Step 12d: comp snps only
+#=========
+# output csv: comp SNPS for info (i.e snps for which OR exist)
+# specify variable name for output file
+
+snps_only = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
+
+gene = 'pnca'
+#drug = 'pyrazinamide'
+my_fname1 = '_comp_snps_'
+nrows = len(snps_only) 
+
+output_filename = output_file_path + gene + my_fname1  + str(nrows) + '.csv'
+print(output_filename) #<<<- check
+
+# write to csv: without column or row names
+snps_only.to_csv(output_filename, header = False, index = False)
+
+
+#=#=#=#=#=#=#=#
+# COMMENT: LF1 is the file to extract all unique snps for mcsm 
+# but you have that already in file called pnca_snps...
+# LF2: is the file for extracting snps tested for DS and hence OR calcs
+#=#=#=#=#=#=#=#
+
+###########
+# Step 13 : Output the whole df i.e 
+# file for meta_data which is now formatted with
+# each row as a unique snp rather than the original version where
+# each row is a unique id
+# ***** This is the file you will ADD the AF and OR calculations to *****
+###########
+
+# output csv: the entire DF
+# specify variable name for output file
+gene = 'pnca'
+#drug = 'pyrazinamide'
+my_fname4 = '_metadata'
+#nrows = len(meta_pnca_LF1)
+output_filename = output_file_path + gene + my_fname4  + '.csv'
+print(output_filename) #<<<-check
+
+# write out file
+meta_pnca_LF1.to_csv(output_filename) 
diff --git a/meta_data_analysis/reference_dict.py b/meta_data_analysis/reference_dict.py
new file mode 100755
index 0000000..bf6561b
--- /dev/null
+++ b/meta_data_analysis/reference_dict.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Jun 18 11:32:28 2019
+
+@author: tanushree
+"""
+############################################
+#load libraries
+import pandas as pd
+import os
+#############################################
+
+#!#########################!
+# REQUIREMNETS:
+# Data_original/ must exist
+# containing GWAS data
+#!#########################!
+
+print(os.getcwd()) 
+homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
+os.chdir(homedir + '/git/Data/input/original') 
+print(os.getcwd())
+#==========
+#read file
+#==========
+my_aa = pd.read_csv('aa_codes.csv') #20, 6
+#assign the one_letter code as the row names so that it is easier to create a dict of dicts using index
+#my_aa = pd.read_csv('aa_codes.csv', index_col = 0) #20, 6  #a way to it since it is the first column
+my_aa = my_aa.set_index('three_letter_code_lower') #20, 5
+
+#=========================================================
+#convert file to  dict of dicts
+#=========================================================
+#convert each row into a dict of dicts so that there are 20 aa and 5 keys within
+#with your choice of column name that you have assigned to index as the "primary key". 
+#using 'index' creates a dict of dicts
+#using 'records' creates a list of dicts
+my_aa_dict = my_aa.to_dict('index') #20, with 5 subkeys
+
+#================================================
+#dict of aa with their corresponding properties
+#This is defined twice
+#================================================
+#7 categories: no overlap
+qualities1 = { ('R', 'H', 'K'): 'Basic'
+             , ('D', 'E'): 'Acidic'
+             , ('N', 'Q'): 'Amidic'
+             , ('G', 'A', 'V', 'L', 'I', 'P'): 'Hydrophobic'
+             , ('S', 'T'): 'Hydroxylic'
+             , ('F', 'W', 'Y'): 'Aromatic'
+             , ('C', 'M'): 'Sulphur'
+}
+
+#9 categories: allowing for overlap
+qualities2 = { ('R', 'H', 'K'): 'Basic'
+             , ('D', 'E'): 'Acidc'
+             , ('S', 'T', 'N', 'Q'): 'Polar'
+             , ('V', 'I', 'L', 'M', 'F', 'Y', 'W'): 'Hydrophobic'
+             , ('S', 'T', 'H', 'N', 'Q', 'E', 'D', 'K', 'R'): 'Hydrophilic'
+             , ('S', 'G', 'A', 'P'): 'Small'
+             , ('F', 'W', 'Y', 'H'): 'Aromatic'
+             , ('V', 'I', 'L', 'M'): 'Aliphatic'
+             , ('C', 'G', 'P'): 'Special'
+}
+
+qualities_taylor = { ('R', 'H', 'K'): 'Basic'
+             , ('D', 'E'): 'Acidc'
+             , ('S', 'T', 'N', 'Q', 'C', 'Y', 'W', 'H', 'K', 'R', 'D', 'E'): 'Polar'
+             , ('V', 'I', 'L', 'M', 'F', 'Y', 'W', 'C', 'A', 'G', 'T', 'H'): 'Hydrophobic'
+             #, ('S', 'T', 'H', 'N', 'Q', 'E', 'D', 'K', 'R'): 'Hydrophilic', #C, W, y MISSING FROM POLAR!
+             , ('S', 'G', 'A', 'P', 'C', 'T', 'N', 'D', 'V'): 'Small' 
+             , ('F', 'W', 'Y', 'H'): 'Aromatic'
+             , ('V', 'I', 'L', 'M'): 'Aliphatic' #although M is not strictly in the circle!
+             , ('C', 'G', 'P'): 'Special'
+}
+
+qualities_water = { ('D', 'E', 'N', 'P', 'Q', 'R', 'S'): 'hydrophilic'
+                   , ('A', 'C', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'T', 'V', 'W', 'X', 'Y'): 'hydrophobic'
+}
+
+qualities_polarity = { ('D', 'E'): 'acidic'
+                      , ('H', 'K', 'R'): 'basic'
+                      , ('C', 'G', 'N', 'Q', 'S', 'T', 'Y'): 'neutral'
+                      , ('A', 'F', 'I', 'L', 'M', 'P', 'V', 'W'): 'non-polar'    
+}
+
+#==============================================================================                
+#adding amino acid properties to my dict of dicts                      
+for k, v in my_aa_dict.items():
+    #print (k,v)
+    v['aa_prop1'] = str() #initialise keys 
+    v['aa_prop2'] = list() #initialise keys (allows for overalpping properties)
+    v['aa_taylor'] = list() #initialise keys (allows for overalpping properties)
+    v['aa_prop_water'] = str() #initialise keys
+    v['aa_prop_polarity'] = str() #initialise keys
+    
+    for group in qualities1:
+        if v['one_letter_code'] in group:
+            v['aa_prop1']+= qualities1[group] # += for str concat   
+
+    for group in qualities2:
+        if v['one_letter_code'] in group:
+            v['aa_prop2'].append(qualities2[group]) # append to list
+ 
+    for group in qualities_taylor:
+        if v['one_letter_code'] in group:
+            v['aa_taylor'].append(qualities_taylor[group]) # append to list           
+            
+    for group in qualities_water:
+        if v['one_letter_code'] in group:
+            v['aa_prop_water']+= qualities_water[group] # += for str concat          
+
+    for group in qualities_polarity:
+        if v['one_letter_code'] in group:
+            v['aa_prop_polarity']+= qualities_polarity[group] # += for str concat 
+             
+#COMMENT:VOILA!!! my_aa_dict is now a dict of dicts containing all associated properties for each aa
+#==============================================================================
+            
+  
diff --git a/mk-drug-dirs.sh b/mk-drug-dirs.sh
new file mode 100755
index 0000000..56fb820
--- /dev/null
+++ b/mk-drug-dirs.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+# Create a drug directory structure for processing
+#
+#
+# Structure:
+#
+# $DATA_DIR/$DRUG/input
+#                 |- original
+#                 |- processed
+#                 |- structure
+#                 
+# $DATA_DIR/$DRUG/output
+#                 |- plots
+#                 |- structure
+
+DATA_DIR=~/git/Data
+
+if [[ $1 == '' ]]; then
+    echo "usage: mk-drug-dirs.sh <drug name>";
+    exit;
+else
+    DRUG=$1
+    echo Creating structure for: $DRUG
+
+    if [ -d $DATA_DIR ]
+    then
+        echo Doing creation in $DATA_DIR
+        mkdir -p $DATA_DIR/$DRUG/input/original
+        mkdir -p $DATA_DIR/$DRUG/input/processed
+        mkdir -p $DATA_DIR/$DRUG/input/structure
+        mkdir -p $DATA_DIR/$DRUG/output/plots
+        mkdir -p $DATA_DIR/$DRUG/output/structure
+        
+    else
+        echo "Error: $DATA_DIR does not exist. Did you check it out of git?"
+        exit
+    fi
+
+fi
+