From 3905a81c38345be20a5a8a84e6862e52a55a7bd7 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Mon, 6 Apr 2020 19:03:41 +0100 Subject: [PATCH] refactoring code to make it take command line args --- README.md | 25 +- .../pyrazinamide/scripts/Header_TT.R | 130 ---- .../pyrazinamide/scripts/KS_test_PS.R | 157 ---- .../scripts/barplot_colour_function.R | 27 - .../pyrazinamide/scripts/combining_two_df.R | 417 ----------- .../scripts/combining_two_df_lig.R | 330 -------- .../scripts/generate_mut_sequences.py | 215 ------ .../pyrazinamide/scripts/mcsm/run.sh | 9 - .../mcsm/step0_check_duplicate_SNPs.sh | 25 - .../scripts/mcsm/step1_lig_output_urls.sh | 104 --- .../scripts/mcsm/step2_lig_results.sh | 76 -- .../mcsm/step3a_results_format_interim.sh | 74 -- .../scripts/mcsm/step3b_results_format_df.py | 63 -- .../scripts/mcsm/step3c_results_cleaning.R | 230 ------ .../scripts/mcsm/step4_results_normalise.R | 275 ------- .../scripts/mcsm_mean_stability.R | 131 ---- .../pyrazinamide/scripts/plotting/.RData | Bin 43777 -> 0 bytes .../plotting/OR_PS_Ligand_combined_plot.R | 250 ------- .../scripts/plotting/barplots_2colours_LIG.R | 154 ---- .../scripts/plotting/barplots_2colours_PS.R | 149 ---- .../plotting/barplots_subcolours_LIG.R | 202 ----- .../scripts/plotting/barplots_subcolours_PS.R | 192 ----- .../plotting/barplots_subcolours_aa_LIG.R | 296 -------- .../plotting/barplots_subcolours_aa_PS.R | 292 -------- .../scripts/plotting/basic_barplots_LIG.R | 215 ------ .../scripts/plotting/basic_barplots_PS.R | 211 ------ .../scripts/plotting/corr_plots_v3_PS.R | 175 ----- .../scripts/plotting/corr_plots_v3_lig.R | 187 ----- .../scripts/plotting/lineage_basic_barplot.R | 227 ------ .../scripts/plotting/lineage_dist_LIG.R | 253 ------- .../scripts/plotting/lineage_dist_PS.R | 229 ------ .../scripts/plotting/logolas_logoplot.R | 250 ------- .../scripts/plotting/snp_logo_plot.R | 273 ------- .../scripts/plotting/subcols_axis_LIG.R | 208 ------ .../scripts/plotting/subcols_axis_PS.R | 208 ------ mcsm_analysis/pyrazinamide/scripts/read_pdb.R | 27 - .../pyrazinamide/scripts/replaceBfactor_pdb.R | 386 ---------- .../pyrazinamide/scripts/source_data_checks.R | 257 ------- meta_data_analysis/dssp_df.py | 5 +- mk_drug_dirs.sh | 10 +- .../data_extraction.py | 702 +++++++++++------- .../reference_dict.py | 16 +- 42 files changed, 456 insertions(+), 7206 deletions(-) delete mode 100644 mcsm_analysis/pyrazinamide/scripts/Header_TT.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/KS_test_PS.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/combining_two_df.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R delete mode 100755 mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py delete mode 100755 mcsm_analysis/pyrazinamide/scripts/mcsm/run.sh delete mode 100755 mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh delete mode 100755 mcsm_analysis/pyrazinamide/scripts/mcsm/step1_lig_output_urls.sh delete mode 100755 mcsm_analysis/pyrazinamide/scripts/mcsm/step2_lig_results.sh delete mode 100755 mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_results_format_interim.sh delete mode 100755 mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py delete mode 100644 mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_results_cleaning.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/mcsm/step4_results_normalise.R delete mode 100644 
mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/.RData delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/OR_PS_Ligand_combined_plot.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_LIG.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_PS.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/logolas_logoplot.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/snp_logo_plot.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_LIG.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_PS.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/read_pdb.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R delete mode 100644 mcsm_analysis/pyrazinamide/scripts/source_data_checks.R rename {meta_data_analysis => scripts}/data_extraction.py (57%) rename {meta_data_analysis => scripts}/reference_dict.py (95%) diff --git a/README.md b/README.md index 628cafd..dd4cfb5 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,10 @@ mCSM Analysis ============= -This repo does mCSM analysis using bash, python and R. - +This contains scripts that do the following: + 1. mCSM analysis: using bash, python and R + 2. metadata analysis: using python and R + Requires an additional 'Data' directory. Batteries not included:-) ## Assumptions @@ -19,17 +21,14 @@ subdirs within this repo *.R *.py - mcsm\_analysis/ - / - scripts/ - *.R - *.py - mcsm/ - *.sh - *.py - *.R - plotting/ - *.R + mcsm_analysis +# / + + foldx_analysis + + plotting + *.R + ``` More docs here as I write them. 
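The commit subject states the goal of this refactor: make the scripts take command-line arguments, with data_extraction.py and reference_dict.py now living under scripts/. As a minimal illustration of what such an interface could look like, the sketch below uses Python's argparse; the --drug/--gene flag names and the ~/git/Data/<drug>/ layout are assumptions inferred from paths used elsewhere in this patch, not the confirmed interface of the refactored scripts.

```python
# Hypothetical sketch only: flag names and defaults are assumptions, not
# necessarily what the refactored scripts/data_extraction.py implements.
import argparse
import os

def make_parser():
    parser = argparse.ArgumentParser(
        description='Extract gene-specific mutation data for a given drug')
    parser.add_argument('-d', '--drug', default='pyrazinamide',
                        help='drug name, used to locate ~/git/Data/<drug>/')
    parser.add_argument('-g', '--gene', default='pncA',
                        help='gene name, e.g. pncA')
    return parser

if __name__ == '__main__':
    args = make_parser().parse_args()
    homedir = os.path.expanduser('~')
    datadir = os.path.join(homedir, 'git', 'Data', args.drug)   # assumed layout
    outdir = os.path.join(datadir, 'output')
    gene_match = args.gene + '_p.'   # mutation prefix used in the metadata files
    print('Data dir:', datadir)
    print('Output dir:', outdir)
    print('Gene match:', gene_match)
```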
diff --git a/mcsm_analysis/pyrazinamide/scripts/Header_TT.R b/mcsm_analysis/pyrazinamide/scripts/Header_TT.R deleted file mode 100644 index 9eae42a..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/Header_TT.R +++ /dev/null @@ -1,130 +0,0 @@ -######################################################### -### A) Installing and loading required packages -######################################################### -#lib_loc = "/usr/local/lib/R/site-library") - -#if (!require("gplots")) { -# install.packages("gplots", dependencies = TRUE) -# library(gplots) -#} - -#if (!require("tidyverse")) { -# install.packages("tidyverse", dependencies = TRUE) -# library(tidyverse) -#} - -if (!require("ggplot2")) { - install.packages("ggplot2", dependencies = TRUE) - library(ggplot2) -} - -if (!require("plotly")) { - install.packages("plotly", dependencies = TRUE) - library(plotly) -} - -if (!require("cowplot")) { - install.packages("copwplot", dependencies = TRUE) - library(cowplot) -} - -if (!require("ggcorrplot")) { - install.packages("ggcorrplot", dependencies = TRUE) - library(ggcorrplot) -} - -if (!require("ggpubr")) { - install.packages("ggpubr", dependencies = TRUE) - library(ggpubr) -} - -if (!require("RColorBrewer")) { - install.packages("RColorBrewer", dependencies = TRUE) - library(RColorBrewer) -} - -if (!require ("GOplot")) { - install.packages("GOplot") - library(GOplot) -} - -if(!require("VennDiagram")) { - install.packages("VennDiagram", dependencies = T) - library(VennDiagram) -} - -if(!require("scales")) { - install.packages("scales", dependencies = T) - library(scales) -} - -if(!require("plotrix")) { - install.packages("plotrix", dependencies = T) - library(plotrix) -} - -if(!require("stats")) { - install.packages("stats", dependencies = T) - library(stats) -} - -if(!require("stats4")) { - install.packages("stats4", dependencies = T) - library(stats4) -} - -if(!require("data.table")) { -install.packages("data.table") - library(data.table) -} - -if (!require("PerformanceAnalytics")){ - install.packages("PerformanceAnalytics", dependencies = T) - library(PerformaceAnalytics) -} - -if (!require ("GGally")){ - install.packages("GGally") - library(GGally) -} - -if (!require ("corrr")){ - install.packages("corrr") - library(corrr) -} - -if (!require ("psych")){ - install.packages("psych") - library(psych) -} - -if (!require ("dplyr")){ - install.packages("dplyr") - library(dplyr) -} - -if (!require ("compare")){ - install.packages("compare") - library(compare) -} - -if (!require ("arsenal")){ - install.packages("arsenal") - library(arsenal) -} - - -####TIDYVERSE -# Install -#if(!require(devtools)) install.packages("devtools") -#devtools::install_github("kassambara/ggcorrplot") - -library(ggcorrplot) - - -###for PDB files -#install.packages("bio3d") -if(!require(bio3d)){ - install.packages("bio3d") - library(bio3d) -} diff --git a/mcsm_analysis/pyrazinamide/scripts/KS_test_PS.R b/mcsm_analysis/pyrazinamide/scripts/KS_test_PS.R deleted file mode 100644 index 5a827c8..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/KS_test_PS.R +++ /dev/null @@ -1,157 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") -#source("../barplot_colour_function.R") -#require(data.table) - 
-######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA for pyrazinamide: -# merged_df2 -# merged_df3 - -# df without NA for pyrazinamide: -# merged_df2_comp -# merged_df3_comp -#=========================== - -########################### -# Data for plots -# you need merged_df2 or merged_df2_comp -# since this is one-many relationship -# i.e the same SNP can belong to multiple lineages -# using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available, hence use df with NA -########################### - -# uncomment as necessary - -#%%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df2 -#my_df = merged_df2_comp -#%%%%%%%%%%%%%%%%%%%%%%%%% - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -is.factor(my_df$lineage) -my_df$lineage = as.factor(my_df$lineage) -is.factor(my_df$lineage) - -table(my_df$mutation_info); str(my_df$mutation_info) - -# subset df with dr muts only -my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") -table(my_df_dr$mutation_info) - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#========================== -# Run two times: -# uncomment as necessary -# 1) for all muts -# 2) for dr_muts -#=========================== -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT - -#================ -# for ALL muts -#================ -#plot_df = my_df - -#================ -# for dr muts ONLY -#================ -plot_df = my_df_dr - -#%%%%%%%%%%%%%%%%%%%%%%%% -#============================ -# Plot: Lineage Distribution -# x = mcsm_values, y = dist -# fill = stability -#============================ - -table(plot_df$lineage); str(plot_df$lineage) - -# subset only lineages1-4 -sel_lineages = c("lineage1" - , "lineage2" - , "lineage3" - , "lineage4") - -# uncomment as necessary -df_lin = subset(plot_df, subset = lineage %in% sel_lineages ) - -# refactor -df_lin$lineage = factor(df_lin$lineage) - -table(df_lin$lineage) #{RESULT: No of samples within lineage} -#lineage1 lineage2 lineage3 lineage4 - -length(unique(df_lin$Mutationinformation)) -#{Result: No. 
of unique mutations the 4 lineages contribute to} - -# sanity checks -r1 = 2:5 # when merged_df2 used: because there is missing lineages -if(sum(table(plot_df$lineage)[r1]) == nrow(df_lin)) { - print ("sanity check passed: numbers match") -} else{ - print("Error!: check your numbers") -} - -#%%%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -df <- df_lin -#%%%%%%%%%%%%%%%%%%%%%%%%%% - -rm(df_lin) - -# COMPARING DISTRIBUTIONS -head(df$lineage) -df$lineage = as.character(df$lineage) - -lin1 = df[df$lineage == "lineage1",]$ratioDUET -lin2 = df[df$lineage == "lineage2",]$ratioDUET -lin3 = df[df$lineage == "lineage3",]$ratioDUET -lin4 = df[df$lineage == "lineage4",]$ratioDUET - -# ks test -ks.test(lin1,lin2) -ks.test(lin1,lin3) -ks.test(lin1,lin4) - -ks.test(lin2,lin3) -ks.test(lin2,lin4) - -ks.test(lin3,lin4) - - - diff --git a/mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R b/mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R deleted file mode 100644 index a3cc403..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R +++ /dev/null @@ -1,27 +0,0 @@ -######################################################### -# 1b: Define function: coloured barplot by subgroup -# LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar -######################################################### - -ColourPalleteMulti <- function(df, group, subgroup){ - - # Find how many colour categories to create and the number of colours in each - categories <- aggregate(as.formula(paste(subgroup, group, sep="~" )) - , df - , function(x) length(unique(x))) - # return(categories) } - - category.start <- (scales::hue_pal(l = 100)(nrow(categories))) # Set the top of the colour pallete - - category.end <- (scales::hue_pal(l = 40)(nrow(categories))) # set the bottom - - #return(category.start); return(category.end)} - - # Build Colour pallette - colours <- unlist(lapply(1:nrow(categories), - function(i){ - colorRampPalette(colors = c(category.start[i] - , category.end[i]))(categories[i,2])})) - return(colours) -} -######################################################### \ No newline at end of file diff --git a/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R b/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R deleted file mode 100644 index 31a533b..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R +++ /dev/null @@ -1,417 +0,0 @@ -######################################################### -# TASK: To combine mcsm and meta data with af and or files -# Input csv files: -# 1) mcsm normalised and struct params -# 2) gene associated meta_data_with_AFandOR - -# Output: -# 1) muts with opposite effects on stability -# 2) large combined df including NAs for AF, OR,etc -# Dim: same no. of rows as gene associated meta_data_with_AFandOR -# 3) small combined df including NAs for AF, OR, etc. -# Dim: same as mcsm data -# 4) large combined df excluding NAs -# Dim: dim(#1) - no. of NAs(AF|OR) + 1 -# 5) small combined df excluding NAs -# Dim: dim(#2) - no. 
of unique NAs - 1 -# This script is sourced from other .R scripts for plotting -######################################################### -getwd() -setwd('~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/') -getwd() - -########################################################## -# Installing and loading required packages -########################################################## -source('Header_TT.R') -#require(data.table) -#require(arsenal) -#require(compare) -#library(tidyverse) - -################################# -# Read file: normalised file -# output of step 4 mcsm_pipeline -################################# -#%% variable assignment: input and output paths & filenames -drug = 'pyrazinamide' -gene = 'pncA' -gene_match = paste0(gene,'_p.') -cat(gene_match) - -#=========== -# data dir -#=========== -datadir = paste0('~/git/Data') - -#=========== -# input -#=========== -# infile1: mCSM data -#indir = '~/git/Data/pyrazinamide/input/processed/' -indir = paste0(datadir, '/', drug, '/', 'output') # revised {TODO: change in mcsm pipeline} -#in_filename = 'mcsm_complex1_normalised.csv' -in_filename = 'pnca_mcsm_struct_params.csv' -infile = paste0(indir, '/', in_filename) -cat(paste0('Reading infile1: mCSM output file', ' ', infile) ) - -# infile2: gene associated meta data combined with AF and OR -#indir: same as above -in_filename_comb = paste0(tolower(gene), '_meta_data_with_AFandOR.csv') -infile_comb = paste0(indir, '/', in_filename_comb) -cat(paste0('Reading infile2: gene associated combined metadata:', infile_comb)) - -#=========== -# output -#=========== -# Uncomment if and when required to output -outdir = paste0('~/git/Data', '/', drug, '/', 'output') #same as indir -cat('Output dir: ', outdir) -#out_filename = paste0(tolower(gene), 'XXX') -#outfile = paste0(outdir, '/', out_filename) -#cat(paste0('Output file with full path:', outfile)) -#%% end of variable assignment for input and output files - -################################# -# Read file: normalised file -# output of step 4 mcsm_pipeline -################################# -cat('Reading mcsm_data:' - , '\nindir: ', indir - , '\ninfile_comb: ', in_filename) - -mcsm_data = read.csv(infile - , row.names = 1 - , stringsAsFactors = F - , header = T) - -cat('Read mcsm_data file:' - , '\nNo.of rows: ', nrow(mcsm_data) - , '\nNo. of cols:', ncol(mcsm_data)) - -# clear variables -rm(in_filename, infile) - -str(mcsm_data) - -table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) - -# spelling Correction 1: DUET -mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising' -mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising' - -# checks: should be the same as above -table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) -head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome) - -# spelling Correction 2: Ligand -table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) - -mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising' -mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising' - -# checks: should be the same as above -table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) -head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome) - -# muts with opposing effects on protomer and ligand stability -table(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome) -changes = mcsm_data[which(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome),] - -# sanity check: redundant, but uber cautious! 
-dl_i = which(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome) -ld_i = which(mcsm_data$Lig_outcome != mcsm_data$DUET_outcome) - -cat('Identifying muts with opposite stability effects') -if(nrow(changes) == (table(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome)[[2]]) & identical(dl_i,ld_i)) { - cat('PASS: muts with opposite effects on stability and affinity correctly identified' - , '\nNo. of such muts: ', nrow(changes)) -}else { - cat('FAIL: unsuccessful in extracting muts with changed stability effects') -} - -#*************************** -# write file: changed muts -out_filename = 'muts_opp_effects.csv' -outfile = paste0(outdir, '/', out_filename) -cat('Writing file for muts with opp effects:' - , '\nFilename: ', outfile - , '\nPath: ', outdir) - -write.csv(changes, outfile) -#**************************** -# clear variables -rm(out_filename, outfile) -rm(changes, dl_i, ld_i) - -# count na in each column -na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count - -# sort by Mutationinformation -mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),] -head(mcsm_data$Mutationinformation) - -orig_col = ncol(mcsm_data) - -# get freq count of positions and add to the df -setDT(mcsm_data)[, occurrence := .N, by = .(Position)] - -cat('Added 1 col: position frequency to see which posn has how many muts' - , '\nNo. of cols now', ncol(mcsm_data) - , '\nNo. of cols before: ', orig_col) - -pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence) - -########################### -# 2: Read file: meta data with AFandOR -########################### -cat('Reading combined meta data and AFandOR file:' - , '\nindir: ', indir - , '\ninfile_comb: ', in_filename_comb) - -meta_with_afor <- read.csv(infile_comb - , stringsAsFactors = F - , header = T) - -cat('Read mcsm_data file:' - , '\nNo.of rows: ', nrow(meta_with_afor) - , '\nNo. of cols:', ncol(meta_with_afor)) - -# counting NAs in AF, OR cols -if (identical(sum(is.na(meta_with_afor$OR)) - , sum(is.na(meta_with_afor$pvalue)) - , sum(is.na(meta_with_afor$AF)))){ - cat('PASS: NA count match for OR, pvalue and AF\n') - na_count = sum(is.na(meta_with_afor$AF)) - cat('No. of NAs: ', sum(is.na(meta_with_afor$OR))) -} else{ - cat('FAIL: NA count mismatch' - , '\nNA in OR: ', sum(is.na(meta_with_afor$OR)) - , '\nNA in pvalue: ', sum(is.na(meta_with_afor$pvalue)) - , '\nNA in AF:', sum(is.na(meta_with_afor$AF))) -} - -# clear variables -rm(in_filename_comb, infile_comb) - -str(meta_with_afor) - -# sort by Mutationinformation -head(meta_with_afor$Mutationinformation) -meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),] -head(meta_with_afor$Mutationinformation) - -########################### -# 3: merging two dfs: with NA -########################### -# link col name = 'Mutationinforamtion' -head(mcsm_data$Mutationinformation) -head(meta_with_afor$Mutationinformation) - -cat('Merging dfs with NAs: big df (1-many relationship b/w id & mut)' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df2') - -######### -# merge 3a (merged_df2): meta data with mcsm -######### -merged_df2 = merge(x = meta_with_afor - ,y = mcsm_data - , by = 'Mutationinformation' - , all.y = T) - -cat('Dim of merged_df2: ' - , '\nNo. of rows: ', nrow(merged_df2) - , '\nNo. of cols: ', ncol(merged_df2)) -head(merged_df2$Position) - -# sanity check -cat('Checking nrows in merged_df2') -if(nrow(meta_with_afor) == nrow(merged_df2)){ - cat('nrow(merged_df2) = nrow (gene associated metadata)' - ,'\nExpected no. 
of rows: ',nrow(meta_with_afor) - ,'\nGot no. of rows: ', nrow(merged_df2)) -} else{ - cat('nrow(merged_df2)!= nrow(gene associated metadata)' - , '\nExpected no. of rows after merge: ', nrow(meta_with_afor) - , '\nGot no. of rows: ', nrow(merged_df2) - , '\nFinding discrepancy') - merged_muts_u = unique(merged_df2$Mutationinformation) - meta_muts_u = unique(meta_with_afor$Mutationinformation) - # find the index where it differs - unique(meta_muts_u[! meta_muts_u %in% merged_muts_u]) -} - -# sort by Position -head(merged_df2$Position) -merged_df2 = merged_df2[order(merged_df2$Position),] -head(merged_df2$Position) - -merged_df2v2 = merge(x = meta_with_afor - ,y = mcsm_data - , by = 'Mutationinformation' - , all.x = T) -#!=!=!=!=!=!=!=! -# COMMENT: used all.y since position 186 is not part of the struc, -# hence doesn't have a mcsm value -# but 186 is associated with mutation -#!=!=!=!=!=!=!=! - -# should be False -identical(merged_df2, merged_df2v2) -table(merged_df2$Position%in%merged_df2v2$Position) - -rm(merged_df2v2) - -######### -# merge 3b (merged_df3):remove duplicate mutation information -######### -cat('Merging dfs without NAs: small df (removing muts with no AF|OR associated)' - ,'\nCannot trust lineage info from this' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df3') - -#==#=#=#=#=#=# -# Cannot trust lineage, country from this df as the same mutation -# can have many different lineages -# but this should be good for the numerical corr plots -#=#=#=#=#=#=#= -merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),] -head(merged_df3$Position); tail(merged_df3$Position) # should be sorted - -# sanity check -cat('Checking nrows in merged_df3') -if(nrow(mcsm_data) == nrow(merged_df3)){ - cat('PASS: No. of rows match with mcsm_data' - ,'\nExpected no. of rows: ', nrow(mcsm_data) - ,'\nGot no. of rows: ', nrow(merged_df3)) -} else { - cat('FAIL: No. of rows mismatch' - , '\nNo. of rows mcsm_data: ', nrow(mcsm_data) - , '\nNo. of rows merged_df3: ', nrow(merged_df3)) -} - -# counting NAs in AF, OR cols in merged_df3 -# this is becuase mcsm has no AF, OR cols, -# so you cannot count NAs -if (identical(sum(is.na(merged_df3$OR)) - , sum(is.na(merged_df3$pvalue)) - , sum(is.na(merged_df3$AF)))){ - cat('PASS: NA count match for OR, pvalue and AF\n') - na_count_df3 = sum(is.na(merged_df3$AF)) - cat('No. of NAs: ', sum(is.na(merged_df3$OR))) -} else{ - cat('FAIL: NA count mismatch' - , '\nNA in OR: ', sum(is.na(merged_df3$OR)) - , '\nNA in pvalue: ', sum(is.na(merged_df3$pvalue)) - , '\nNA in AF:', sum(is.na(merged_df3$AF))) -} - -########################### -# 4: merging two dfs: without NA -########################### -######### -# merge 4a (merged_df2_comp): same as merge 1 but excluding NA -######### -cat('Merging dfs without any NAs: big df (1-many relationship b/w id & mut)' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df2_comp') - -merged_df2_comp = merged_df2[!is.na(merged_df2$AF),] -#merged_df2_comp = merged_df2[!duplicated(merged_df2$Mutationinformation),] - -# sanity check -cat('Checking nrows in merged_df2_comp') -if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count + 1)){ - cat('PASS: No. of rows match' - ,'\nDim of merged_df2_comp: ' - ,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1 - , '\nNo. of rows: ', nrow(merged_df2_comp) - , '\nNo. of cols: ', ncol(merged_df2_comp)) -}else{ - cat('FAIL: No. of rows mismatch' - ,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1 - ,'\nGot no. 
of rows: ', nrow(merged_df2_comp)) -} - -######### -# merge 4b (merged_df3_comp): remove duplicate mutation information -######### -merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),] - -cat('Dim of merged_df3_comp: ' - , '\nNo. of rows: ', nrow(merged_df3_comp) - , '\nNo. of cols: ', ncol(merged_df3_comp)) - -# alternate way of deriving merged_df3_comp -foo = merged_df3[!is.na(merged_df3$AF),] -# compare dfs: foo and merged_df3_com -all.equal(foo, merged_df3) - -summary(comparedf(foo, merged_df3)) - -# sanity check -cat('Checking nrows in merged_df3_comp') -if(nrow(merged_df3_comp) == nrow(merged_df3)){ - cat('NO NAs detected in merged_df3 in AF|OR cols' - ,'\nNo. of rows are identical: ', nrow(merged_df3)) -} else{ - if(nrow(merged_df3_comp) == nrow(merged_df3) - na_count_df3) { - cat('PASS: NAs detected in merged_df3 in AF|OR cols' - , '\nNo. of NAs: ', na_count_df3 - , '\nExpected no. of rows in merged_df3_comp: ', nrow(merged_df3) - na_count_df3 - , '\nGot no. of rows: ', nrow(merged_df3_comp)) - } -} - -#=============== end of combining df -#********************* -# writing 1 file in the style of a loop: merged_df3 -# print(output dir) -#i = 'merged_df3' -#out_filename = paste0(i, '.csv') -#outfile = paste0(outdir, '/', out_filename) - -#cat('Writing output file: ' -# ,'\nFilename: ', out_filename -# ,'\nPath: ', outdir) - -#template: write.csv(merged_df3, 'merged_df3.csv') -#write.csv(get(i), outfile, row.names = FALSE) -#cat('Finished writing: ', outfile -# , '\nNo. of rows: ', nrow(get(i)) -# , '\nNo. of cols: ', ncol(get(i))) - -#%% write_output files; all 4 files: -outvars = c('merged_df2' - , 'merged_df3' - , 'merged_df2_comp' - , 'merged_df3_comp') - -cat('Writing output files: ' - , '\nPath:', outdir) - -for (i in outvars){ -# cat(i, '\n') - out_filename = paste0(i, '.csv') -# cat(out_filename, '\n') -# cat('getting value of variable: ', get(i)) - outfile = paste0(outdir, '/', out_filename) -# cat('Full output path: ', outfile, '\n') - cat('Writing output file:' - ,'\nFilename: ', out_filename,'\n') - write.csv(get(i), outfile, row.names = FALSE) - cat('Finished writing: ', outfile - , '\nNo. of rows: ', nrow(get(i)) - , '\nNo. of cols: ', ncol(get(i)), '\n') -} - -# alternate way to replace with implicit loop -# FIXME -#sapply(outvars, function(x, y) write.csv(get(outvars), paste0(outdir, '/', outvars, '.csv'))) -#************************* -# clear variables -rm(mcsm_data, meta_with_afor, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, orig_col, outdir) -rm(pos_count_check) -#============================= end of script - diff --git a/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R b/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R deleted file mode 100644 index 361b6b6..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R +++ /dev/null @@ -1,330 +0,0 @@ -######################################################### -# TASK: To combine mcsm and meta data with af and or -# by filtering for distance to ligand (<10Ang). -# This script doesn't output anything. 
-# This script is sourced from other .R scripts for plotting ligand plots - -# Input csv files: -# 1) mcsm normalised and struct params -# 2) gene associated meta_data_with_AFandOR -######################################################### -getwd() -setwd('~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/') -getwd() - -########################################################## -# Installing and loading required packages -########################################################## - -source('Header_TT.R') -#require(data.table) -#require(arsenal) -#require(compare) -#library(tidyverse) - -################################# -# Read file: normalised file -# output of step 4 mcsm_pipeline -################################# - -#%% variable assignment: input and output paths & filenames -drug = 'pyrazinamide' -gene = 'pncA' -gene_match = paste0(gene,'_p.') -cat(gene_match) - -#=========== -# input -#=========== -# infile1: mCSM data -#indir = '~/git/Data/pyrazinamide/input/processed/' -indir = paste0('~/git/Data', '/', drug, '/', 'output') # revised {TODO: change in mcsm pipeline} -#in_filename = 'mcsm_complex1_normalised.csv' -in_filename = 'pnca_mcsm_struct_params.csv' -infile = paste0(indir, '/', in_filename) -cat(paste0('Reading infile1: mCSM output file', ' ', infile) ) - -# infile2: gene associated meta data combined with AF and OR -#indir: same as above -in_filename_comb = paste0(tolower(gene), '_meta_data_with_AFandOR.csv') -infile_comb = paste0(indir, '/', in_filename_comb) -cat(paste0('Reading infile2: gene associated combined metadata:', infile_comb)) - -#=========== -# output -#=========== -# Uncomment if and when required to output -outdir = paste0('~/git/Data', '/', drug, '/', 'output') #same as indir -cat('Output dir: ', outdir) -#out_filename = paste0(tolower(gene), 'XXX') -#outfile = paste0(outdir, '/', out_filename) -#cat(paste0('Output file with full path:', outfile)) -#%% end of variable assignment for input and output files - -################################# -# Read file: normalised file -# output of step 4 mcsm_pipeline -################################# -cat('Reading mcsm_data:' - , '\nindir: ', indir - , '\ninfile_comb: ', in_filename) - -mcsm_data = read.csv(infile - , row.names = 1 - , stringsAsFactors = F - , header = T) - -cat('Read mcsm_data file:' - , '\nNo.of rows: ', nrow(mcsm_data) - , '\nNo. of cols:', ncol(mcsm_data)) - -# clear variables -rm(in_filename, infile) - -str(mcsm_data) - -table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) - -# spelling Correction 1: DUET -mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising' -mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising' - -# checks: should be the same as above -table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) -head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome) - -# spelling Correction 2: Ligand -table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) - -mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising' -mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising' - -# checks: should be the same as above -table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) -head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome) - -# muts with opposing effects on protomer and ligand stability -# excluded from here as it is redundant. -# check 'combining_two_df.R' to refer if required. - -########################### !!! 
only for mcsm_lig -# 4: Filter/subset data -# Lig plots < 10Ang -# Filter the lig plots for Dis_to_lig < 10Ang -########################### - -# check range of distances -max(mcsm_data$Dis_lig_Ang) -min(mcsm_data$Dis_lig_Ang) - -# count -table(mcsm_data$Dis_lig_Ang<10) - -# subset data to have only values less than 10 Ang -mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10) - -# sanity checks -max(mcsm_data2$Dis_lig_Ang) -min(mcsm_data2$Dis_lig_Ang) - -# count no of unique positions -length(unique(mcsm_data2$Position)) - -# count no of unique mutations -length(unique(mcsm_data2$Mutationinformation)) - -# count Destabilisinga and stabilising -table(mcsm_data2$Lig_outcome) #{RESULT: no of mutations within 10Ang} - -############################# -# Extra sanity check: -# for mcsm_lig ONLY -# Dis_lig_Ang should be <10 -############################# - -if (max(mcsm_data2$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} - -#!!!!!!!!!!!!!!!!!!!!! -# REASSIGNMENT: so as not to alter the script -mcsm_data = mcsm_data2 -#!!!!!!!!!!!!!!!!!!!!! -# clear variables -rm(mcsm_data2) - -# count na in each column -na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count - -# sort by Mutationinformation -mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),] -head(mcsm_data$Mutationinformation) - -orig_col = ncol(mcsm_data) -# get freq count of positions and add to the df -setDT(mcsm_data)[, occurrence := .N, by = .(Position)] - -cat('Added 1 col: position frequency to see which posn has how many muts' - , '\nNo. of cols now', ncol(mcsm_data) - , '\nNo. of cols before: ', orig_col) - -pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence) - -########################### -# 2: Read file: meta data with AFandOR -########################### -cat('Reading combined meta data and AFandOR file:' - , '\nindir: ', indir - , '\ninfile_comb: ', in_filename_comb) - -meta_with_afor <- read.csv(infile_comb - , stringsAsFactors = F - , header = T) - -cat('Read mcsm_data file:' - , '\nNo.of rows: ', nrow(meta_with_afor) - , '\nNo. of cols:', ncol(meta_with_afor)) - -# clear variables -rm(in_filename_comb, infile_comb) - -str(meta_with_afor) - -# sort by Mutationinformation -head(meta_with_afor$Mutationinformation) -meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),] -head(meta_with_afor$Mutationinformation) - -########################### -# 3: merging two dfs: with NA -########################### -# link col name = 'Mutationinforamtion' -cat('Merging dfs with NAs: big df (1-many relationship b/w id & mut)' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df2') - -head(mcsm_data$Mutationinformation) -head(meta_with_afor$Mutationinformation) - -######### -# merge 3a: meta data with mcsm -######### -merged_df2 = merge(x = meta_with_afor - ,y = mcsm_data - , by = 'Mutationinformation' - , all.y = T) - -cat('Dim of merged_df2: ' - , '\nNo. of rows: ', nrow(merged_df2) - , '\nNo. of cols: ', ncol(merged_df2)) -head(merged_df2$Position) - -if(nrow(meta_with_afor) == nrow(merged_df2)){ - cat('nrow(merged_df2) = nrow (gene associated metadata)' - ,'\nExpected no. of rows: ',nrow(meta_with_afor) - ,'\nGot no. of rows: ', nrow(merged_df2)) -} else{ - cat('nrow(merged_df2)!= nrow(gene associated metadata)' - , '\nExpected no. of rows after merge: ', nrow(meta_with_afor) - , '\nGot no. 
of rows: ', nrow(merged_df2) - , '\nFinding discrepancy') - merged_muts_u = unique(merged_df2$Mutationinformation) - meta_muts_u = unique(meta_with_afor$Mutationinformation) - # find the index where it differs - unique(meta_muts_u[! meta_muts_u %in% merged_muts_u]) -} - -# sort by Position -head(merged_df2$Position) -merged_df2 = merged_df2[order(merged_df2$Position),] -head(merged_df2$Position) - -merged_df2v2 = merge(x = meta_with_afor - ,y = mcsm_data - , by = 'Mutationinformation' - , all.x = T) -#!=!=!=!=!=!=!=! -# COMMENT: used all.y since position 186 is not part of the struc, -# hence doesn't have a mcsm value -# but 186 is associated with mutation -#!=!=!=!=!=!=!=! - -# should be False -identical(merged_df2, merged_df2v2) -table(merged_df2$Position%in%merged_df2v2$Position) - -rm(merged_df2v2) - -######### -# merge 3b:remove duplicate mutation information -######### -cat('Merging dfs with NAs: small df (removing duplicate muts)' - ,'\nCannot trust lineage info from this' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df3') - -#==#=#=#=#=#=# -# Cannot trust lineage, country from this df as the same mutation -# can have many different lineages -# but this should be good for the numerical corr plots -#=#=#=#=#=#=#= -merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),] -head(merged_df3$Position); tail(merged_df3$Position) # should be sorted - -# sanity checks -# nrows of merged_df3 should be the same as the nrows of mcsm_data -if(nrow(mcsm_data) == nrow(merged_df3)){ - cat('PASS: No. of rows match with mcsm_data' - ,'\nExpected no. of rows: ', nrow(mcsm_data) - ,'\nGot no. of rows: ', nrow(merged_df3)) -} else { - cat('FAIL: No. of rows mismatch' - , '\nNo. of rows mcsm_data: ', nrow(mcsm_data) - , '\nNo. of rows merged_df3: ', nrow(merged_df3)) -} - -########################### -# 4: merging two dfs: without NA -########################### -cat('Merging dfs without any NAs: big df (1-many relationship b/w id & mut)' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df2_comp') - -######### -# merge 4a: same as merge 1 but excluding NA -######### -merged_df2_comp = merged_df2[!is.na(merged_df2$AF),] -#merged_df2_comp = merged_df2[!duplicated(merged_df2$Mutationinformation),] - -cat('Dim of merged_df2_comp: ' - , '\nNo. of rows: ', nrow(merged_df2_comp) - , '\nNo. of cols: ', ncol(merged_df2_comp)) - -######### -# merge 4b: remove duplicate mutation information -######### -merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),] - -cat('Dim of merged_df3_comp: ' - , '\nNo. of rows: ', nrow(merged_df3_comp) - , '\nNo. 
of cols: ', ncol(merged_df3_comp)) - -# alternate way of deriving merged_df3_comp -foo = merged_df3[!is.na(merged_df3$AF),] -# compare dfs: foo and merged_df3_com -all.equal(foo, merged_df3) - -summary(comparedf(foo, merged_df3)) - -#=============== end of combining df -#********************* -# write_output files -# Not required as this is a subset of the combining_two_df.R -#************************* -# clear variables -rm(mcsm_data, meta_with_afor, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, orig_col, outdir) -rm(pos_count_check) -#============================= end of script - diff --git a/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py b/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py deleted file mode 100755 index 5cc5f09..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py +++ /dev/null @@ -1,215 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Tue Jun 25 08:46:36 2019 - -@author: tanushree -""" -############################################ -# load libraries -import os -import pandas as pd -import numpy as np -from Bio import SeqIO -############################################ -#******************************************************************** -# TASK: Read in fasta files and create mutant sequences akin to a MSA, -# to allow generation of logo plots - -# Requirements: -# input: Fasta file of protein/target for which mut seqs will be created - # path: "Data//input/original/" -# output: MSA for mutant sequences - # path: "Data//input/processed/" -#*********************************************************************** -#%% -# specify input and output variables -homedir = os.path.expanduser('~') -#======= -# input -#======= -############# -# fasta file -############# -indir = 'git/Data/pyrazinamide/input/original' -in_filename_fasta = "3pl1.fasta.txt" -infile_fasta = homedir + '/' + indir + '/' + in_filename_fasta -print(infile_fasta) - -############# -# meta data -############# -# FIXME when you change the dir struc -inpath_p = "git/Data/pyrazinamide/input/processed" -in_filename_meta_data = "meta_data_with_AFandOR.csv" -infile_meta_data = homedir + '/' + inpath_p + '/' + in_filename_meta_data -print("Input file is:", infile_meta_data) - -#======= -# output -#======= -outdir = 'git/Data/pyrazinamide/output' -# filenames in respective sections - -################## end of variable assignment for input and output files -#%% -#========== -# read files -#========== - -############# -# fasta file -############# -my_fasta_o = str() -for seq_record in SeqIO.parse(infile_fasta, "fasta"): - my_seq = seq_record.seq - my_fasta_o = str(my_seq) #convert to a string - print(my_fasta_o) - print(len(my_fasta_o)) -# print( type(my_fasta) ) - -# remove non_struc positions from fasta -def remove_char(str, n): - first_part = str[:n] - last_part = str[n+1:] - return first_part + last_part -#print(remove_char('Python', 0)) - -ns_pos_o = 186 -offset = 1 # 0 based indexing -ns_pos = ns_pos_o - offset -my_fasta = remove_char(my_fasta_o, ns_pos) -print("orig length:", len(my_fasta_o)) -print("new length:", len(my_fasta)) - -############# -# SNP info and no of MSA to generate -############# -# read mutant_info file and extract cols with positions and mutant_info -# This should be all samples with pncA muts -#my_data = pd.read_csv('mcsm_complex1_normalised.csv') -my_data = pd.read_csv(infile_meta_data) -list(my_data.columns) -#my_data['OR'].value_counts() -#my_data['OR'].isna().sum() - -#FIXME: You need a 
better way to identify this -# ideally this file should not contain any non_struc pos -# remove positions not in the structure -my_data = my_data[my_data.position != ns_pos_o] - -# if multiple positions, then try the example below; -# https://stackoverflow.com/questions/29017525/deleting-rows-based-on-multiple-conditions-python-pandas -#df = df[(df.one > 0) | (df.two > 0) | (df.three > 0) & (df.four < 1)] - -# count mutations per sample -mut_info = my_data[['id', 'Mutationinformation', 'wild_type', 'position', 'mutant_type']] - -# test -foo = mut_info[mut_info.Mutationinformation.str.contains('C72Y')] - -foo = mut_info.pivot_table(values = ['Mutationinformation'] - , index = ['Mutationinformation', 'id'] -# , columns = - , aggfunc = 'count') - -# table -foo_tab = mut_info.pivot_table(values = ['Mutationinformation'] -# , index = ['Mutationinformation'] - , columns = ['id', 'Mutationinformation'] - , aggfunc = 'count' -# , margins = True) - ) -foo_tab.stack('id') - -mut_info.to_csv('mutinfo.csv') - -mut_info1 = my_data[['position', 'mutant_type']] -#%% -################ -# data cleaning -################ -# extract only those positions that have a frequency count of pos>1 -###mut_info['freq_pos'] = mut_info.groupby('Position').count()#### dodgy - -# add a column of frequency for each position -#mut_info1['freq_pos'] = mut_info1.groupby('position')['position'].transform('count') -mut_info1['freq_pos'] = mut_info1.position.map(mut_info1.position.value_counts()) - -# sort by position -mut_info2 = mut_info1.sort_values(by=['position']) - -# count how many pos have freq 1 as you will need to exclude those -mutfreq1_count = mut_info2[mut_info2.freq_pos == 1].sum().freq_pos - -# extract entries with freq_pos>1 -# should be 3093-211 = 3072 -mut_info3 = mut_info2.loc[mut_info2['freq_pos'] >1] #3072 -print("orig length:", len(mut_info1)) -print("No. of excluded values:", mutfreq1_count) -print("new length:", len(mut_info3)) -# sanity check -if ( (len(mut_info1) - mutfreq1_count) == len(mut_info3) ): - print("Sanity check passed: Filtered data correctly") -else: - print("Error: Debug you code") - -# reset index to allow iteration !!!!!!!!!! 
IMPORTANT -mut_info = mut_info3.reset_index(drop = True) - -##del(mut_info1, mut_info2, mut_info3, my_data) - -################### -# generate mut seqs -################### -mut_seqsL = [] * len(mut_info) - -# iterate -for i, pos in enumerate(mut_info['position']): - my_fastaL = list(my_fasta) - mut = mut_info['mutant_type'][i] - offset_pos = pos-1 - - print('1-index:', pos, '0-index cur:', offset_pos, my_fastaL[offset_pos], 'mut:', mut) - my_fastaL[offset_pos] = mut - print('1-index:', pos, '0-index new:', offset_pos, my_fastaL[offset_pos], 'mut:', mut) - - mut_seq = "".join(my_fastaL) -# print(mut_seq + '\n') - print('original:', my_fasta, ',', 'replaced:', my_fasta[offset_pos], 'at', pos, 'with', mut, mut_seq) - mut_seqsL.append(mut_seq) - - -############### -# sanity check -################ -len_orig = len(my_fasta) -# checking if all the mutant sequences have the same length as the original fasta file sequence -for seqs in mut_seqsL: -# print(seqs) -# print(len(seqs)) - if len(seqs) != len_orig: - print('sequence lengths mismatch' +'\n', 'mutant seq length:', len(seqs), 'vs original seq length:', len_orig) - else: - print('**Hooray** Length of mutant and original sequences match') - -del(i, len_orig, mut, mut_seq, my_fastaL, offset_pos, pos, seqs) - -############ -# write file -############ -#filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile' -#filepath = homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/gene_msa.txt' -print(outdir) -out_filename = "gene_msa.txt" -outfile_gene = homedir + '/' + outdir + '/' + out_filename -print(outfile_gene) - -with open(outfile_gene, 'w') as file_handler: - for item in mut_seqsL: - file_handler.write("{}\n".format(item)) - -#R = "\n".join(mut_seqsL) -#f = open('Columns.csv','w') -#f.write(R) -#f.close() diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/run.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/run.sh deleted file mode 100755 index 7e00fb1..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/run.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -# run all bash scripts for mcsm - -#./step0_check_duplicate_SNPs.sh -#./step1_lig_output_urls.sh -./step2_lig_results.sh -./step3a_results_format_interim.sh - diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh deleted file mode 100755 index 4c24392..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -#************************************* -# need to be in the correct directory -#************************************* -##: comments for code -#: commented out code - -#********************************************************************** -# TASK: Text file containing a list of SNPs; SNP in the format(C2E) -# per line. Sort by unique, which automatically removes duplicates. -# sace file in current directory -#********************************************************************** -infile="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2.csv" -outfile="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv" - -# sort unique entries and output to current directory -sort -u ${infile} > ${outfile} - -# count no. of unique snps mCSM will run on -count=$(wc -l < ${outfile}) - -# print to console no. 
of unique snps mCSM will run on -echo "${count} unique mutations for mCSM to run on" - diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_lig_output_urls.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_lig_output_urls.sh deleted file mode 100755 index 6361b62..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_lig_output_urls.sh +++ /dev/null @@ -1,104 +0,0 @@ -#!/bin/bash - -#********************************************************************** -# TASK: submit requests using curl: HANDLE redirects and refresh url. -# Iterate over mutation file and write/append result urls to a file -# Mutation file must have one mutation (format A1B) per line -# Requirements -# input: mutation list (format: A1B), complex struc: (pdb format) - # mutation: outFile from step0, one unique mutation/line, no chain ID - # path: "Data//input/processed/" - # structure: pdb file of drug-target complex - # path: "Data//input/structure/" -# output: should be n urls (n=no. of unique mutations in file) - # path: "Data//input/processed/" - -# NOTE: these are just result urls, not actual values for results -#********************************************************************** -############# specify variables for input and output paths and filenames -homedir="${HOME}" -#echo Home directory is ${homedir} -basedir="/git/Data/pyrazinamide/input" - -# input -inpath_mut="/processed" -in_filename_mut="/pnca_mis_SNPs_v2_unique.csv" -infile_mut="${homedir}${basedir}${inpath_mut}${in_filename_mut}" -echo Input Mut filename: ${infile_mut} - -inpath_struc="/structure" -in_filename_struc="/complex1_no_water.pdb" -infile_struc="${homedir}${basedir}${inpath_struc}${in_filename_struc}" -echo Input Struc filename: ${infile_struc} - -# output -outpath="/processed" -out_filename="/complex1_result_url.txt" -outfile="${homedir}${basedir}${outpath}${out_filename}" -#echo Output filename: ${outfile} -################## end of variable assignment for input and output files - -# iterate over mutation file (infile_mut); line by line and -# submit query using curl -# some useful messages -echo -n -e "Processing $(wc -l < ${infile_mut}) entries from ${infile_mut}\n" -COUNT=0 -while read -r line; do -((COUNT++)) -mutation="${line}" -# echo "${mutation}" -#pdb='../Data/complex1_no_water.pdb' -pdb="${infile_struc}" -mutation="${mutation}" -chain="A" -lig_id="PZA" -affin_wt="0.99" -host="http://biosig.unimelb.edu.au" -call_url="/mcsm_lig/prediction" - -#========================================= -##html field_names names required for curl -##complex_field:wild=@ -##mutation_field:mutation=@ -##chain_field:chain=@ -##ligand_field:lig_id@ -##energy_field:affin_wt -#========================================= -refresh_url=$(curl -L \ - -sS \ - -F "wild=@${pdb}" \ - -F "mutation=${mutation}" \ - -F "chain=${chain}" \ - -F "lig_id=${lig_id}" \ - -F "affin_wt=${affin_wt}" \ - ${host}${call_url} | grep "http-equiv") - -#echo Refresh URL: $refresh_url -#echo Host+Refresh: ${host}${refresh_url} - -# use regex to extract the relevant bit from the refresh url -# regex:sed -r 's/.*(\/mcsm.*)".*$/\1/g' - -# Now build: result url using host and refresh url and write the urls to a file -result_url=$(echo $refresh_url | sed -r 's/.*(\/mcsm.*)".*$/\1/g') -sleep 10 - -echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${infile_mut})..." - -# create output file with the added number of muts from file -# after much thought, bad idea as less generic! 
-#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_complex1_result_url.txt -echo -e "${host}${result_url}" >> ${outfile} -#echo -n '.' -done < "${infile_mut}" - -#FIXME: stop executing if error else these echo statements are misleading! -echo -echo Output filename: ${outfile} -echo -echo Number of urls saved: $(wc -l < ${infile_mut}) -echo -echo "Processing Complete" - -# end of submitting query, receiving result url and storing results url in a file - diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_lig_results.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_lig_results.sh deleted file mode 100755 index 51a7844..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_lig_results.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/bin/bash - -#******************************************************************** -# TASK: submit result urls and fetch actual results using curl -# Iterate over each result url from the output of step1 stored in processed/ -# Use curl to fetch results and extract relevant sections using hxtools -# and store these in another file in processed/ - -# Requirements: -# input: output of step1, file containing result urls - # path: "Data//input/processed/" -# output: name of the file where extracted results will be stored - # path: "Data//input/processed/" - -# Optional: can make these command line args you pass when calling script -# by uncommenting code as indicated -#********************************************************************* -############################# uncomment: to make it command line args -#if [ "$#" -ne 2 ]; then - #if [ -Z $1 ]; then -# echo " -# Please provide both Input and Output files. - -# Usage: batch_read_urls.sh INFILE OUTFILE -# " -# exit 1 -#fi - -# First argument: Input File -# Second argument: Output File -#infile=$1 -#outfile=$2 -############################ end of code block to make command line args - -############# specify variables for input and output paths and filenames -homedir="${HOME}" -#echo Home directory is ${homedir} -basedir="/git/Data/pyrazinamide/input" - -# input -inpath="/processed" -in_filename="/complex1_result_url.txt" -infile="${homedir}${basedir}${inpath}${in_filename}" -echo Input Mut filename: ${infile} - -# output -outpath="/processed" -out_filename="/complex1_output_MASTER.txt" -outfile="${homedir}${basedir}${outpath}${out_filename}" -echo Output filename: ${outfile} -################## end of variable assignment for input and output files - -# Iterate over each result url, and extract results using hxtools -# which nicely cleans and formats html -echo -n "Processing $(wc -l < ${infile}) entries from ${infile}" -echo -COUNT=0 -while read -r line; do -#COUNT=$(($COUNT+1)) -((COUNT++)) - curl --silent ${line} \ - | hxnormalize -x \ - | hxselect -c div.span4 \ - | hxselect -c div.well \ - | sed -r -e 's/<[^>]*>//g' \ - | sed -re 's/ +//g' \ - >> ${outfile} - #| tee -a ${outfile} -# echo -n '.' -echo -e "Processing entry ${COUNT}/$(wc -l < ${infile})..." 
- -done < "${infile}" - -echo -echo "Processing Complete" - diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_results_format_interim.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_results_format_interim.sh deleted file mode 100755 index 0861996..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_results_format_interim.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash - -#******************************************************************** -# TASK: Intermediate results processing -# output file has a convenient delimiter of ":" that can be used to -# format the file into two columns (col1: field_desc and col2: values) -# However the section "PredictedAffinityChange:...." and -# "DUETstabilitychange:.." are split over multiple lines and -# prevent this from happening. Additionally there are other empty lines -# that need to be omiited. In order ensure these sections are not split -# over multiple lines, this script is written. - -# Requirements: -# input: output of step2, file containing mcsm results as described above - # path: "Data//input/processed/" -# output: replaces file in place. -# Therefore first create a copy of the input file -# but rename it to remove the word "MASTER" and add the word "processed" -# file format: .txt - -# NOTE: This replaces the file in place! -# the output is a txt file with no newlines and formatting -# to have the following format "<:> -#*********************************************************************** -############# specify variables for input and output paths and filenames -homedir="${HOME}" -basedir="/git/Data/pyrazinamide/input" - -inpath="/processed" - -# Create input file: copy and rename output file of step2 -oldfile="${homedir}${basedir}${inpath}/complex1_output_MASTER.txt" -newfile="${homedir}${basedir}${inpath}/complex1_output_processed.txt" -cp $oldfile $newfile - -echo Input filename is ${oldfile} -echo -echo Output i.e copied filename is ${newfile} - -# output: No output perse -# Replacement in place inside the copied file -################## end of variable assignment for input and output files - -#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${newfile} \ -# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${newfile} - -# Outputs records separated by a newline, that look something like this: -# PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing -# Mutationinformation: -# Wild-type:L -# Position:4 -# Mutant-type:W -# Chain:A -# LigandID:PZA -# Distancetoligand:15.911Å -# DUETstabilitychange:-2.169Kcal/mol -# -# PredictedAffinityChange:-1.538log(affinityfoldchange)-Destabilizing -# (...etc) - -# This script brings everything in a convenient format for further processing in python. 
-sed -i '/PredictedAffinityChange/ { -N -N -N -N -s/\n//g -} -/DUETstabilitychange:/ { -N -N -s/\n//g -} -/^$/d' ${newfile} diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py deleted file mode 100755 index 0e07c0d..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/python - -################### -# load libraries -import os, sys -import pandas as pd -from collections import defaultdict -#################### - -#******************************************************************** -# TASK: Formatting results with nice colnames -# step3a processed the mcsm results to remove all newlines and -# brought data in a format where the delimiter ":" splits -# data into a convenient format of "colname": "value". -# this script formats the data and outputs a df with each row -# as a mutation and its corresponding mcsm_values - -# Requirements: -# input: output of step3a, file containing "..._output_processed.txt" - # path: "Data//input/processed/" -# output: formatted .csv file - # path: "Data//input/processed/" -#*********************************************************************** -############# specify variables for input and output paths and filenames -homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde -basedir = "/git/Data/pyrazinamide/input" - -# input -inpath = "/processed" -in_filename = "/complex1_output_processed.txt" -infile = homedir + basedir + inpath + in_filename -print("Input file is:", infile) - -# output -outpath = "/processed" -out_filename = "/complex1_formatted_results.csv" -outfile = homedir + basedir + outpath + out_filename -print("Output file is:", outfile) -################## end of variable assignment for input and output files - -outCols=[ - 'PredictedAffinityChange', - 'Mutationinformation', - 'Wild-type', - 'Position', - 'Mutant-type', - 'Chain', - 'LigandID', - 'Distancetoligand', - 'DUETstabilitychange' - ] - -lines = [line.rstrip('\n') for line in open(infile)] - -outputs = defaultdict(list) - -for item in lines: - col, val = item.split(':') - outputs[col].append(val) - -dfOut=pd.DataFrame(outputs) - -pd.DataFrame.to_csv(dfOut, outfile, columns=outCols) diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_results_cleaning.R b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_results_cleaning.R deleted file mode 100644 index c58dc8b..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_results_cleaning.R +++ /dev/null @@ -1,230 +0,0 @@ -getwd() -#setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") -getwd() - -#======================================================= -# TASK: read formatted_results_df.csv to complete -# missing info, adding DUET categories, assigning -# meaningful colnames, etc. 
- -# Requirements: -# input: output of step3b, python processing, - # path: Data//input/processed/" -# output: NO output as the next scripts refers to this -# for yet more processing -#======================================================= - -# specify variables for input and output paths and filenames -homedir = "~" -basedir = "/git/Data/pyrazinamide/input" -inpath = "/processed" -in_filename = "/complex1_formatted_results.csv" -infile = paste0(homedir, basedir, inpath, in_filename) -print(paste0("Input file is:", infile)) - -#====================================================== -#TASK: To tidy the columns so you can generate figures -#======================================================= -#################### -#### read file #####: this will be the output from python script (csv file) -#################### -data = read.csv(infile - , header = T - , stringsAsFactors = FALSE) -dim(data) -str(data) - -# clear variables -rm(homedir, basedir, inpath, in_filename, infile) - -########################### -##### Data processing ##### -########################### - -# populate mutation information columns as currently it is empty -head(data$Mutationinformation) -tail(data$Mutationinformation) - -# should not be blank: create muation information -data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.type) - -head(data$Mutationinformation) -tail(data$Mutationinformation) -#write.csv(data, 'test.csv') - -########################################## -# Remove duplicate SNPs as a sanity check -########################################## -# very important -table(duplicated(data$Mutationinformation)) - -# extract duplicated entries -dups = data[duplicated(data$Mutationinformation),] #0 - -# No of dups should match with the no. of TRUE in the above table -#u_dups = unique(dups$Mutationinformation) #10 -sum( table(dups$Mutationinformation) ) - -#*************************************************************** -# select non-duplicated SNPs and create a new df -df = data[!duplicated(data$Mutationinformation),] -#*************************************************************** -# sanity check -u = unique(df$Mutationinformation) -u2 = unique(data$Mutationinformation) -table(u%in%u2) - -# should all be 1 -sum(table(df$Mutationinformation) == 1) - -# sort df by Position -# MANUAL CHECKPOINT: -#foo <- df[order(df$Position),] -#df <- df[order(df$Position),] - -# clear variables -rm(u, u2, dups) - -#################### -#### give meaningful colnames to reflect units to enable correct data type -#################### - -#======= -#STEP 1 -#======== -# make a copy of the PredictedAffinityColumn and call it Lig_outcome -df$Lig_outcome = df$PredictedAffinityChange - - #make Predicted...column numeric and outcome column categorical -head(df$PredictedAffinityChange) -df$PredictedAffinityChange = gsub("log.*" - , "" - , df$PredictedAffinityChange) - -# sanity checks -head(df$PredictedAffinityChange) - -# should be numeric, check and if not make it numeric -is.numeric( df$PredictedAffinityChange ) - -# change to numeric -df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange) - -# should be TRUE -is.numeric( df$PredictedAffinityChange ) - -# change the column name to indicate units -n = which(colnames(df) == "PredictedAffinityChange"); n -colnames(df)[n] = "PredAffLog" -colnames(df)[n] - -#======== -#STEP 2 -#======== -# make Lig_outcome column categorical showing effect of mutation -head(df$Lig_outcome) -df$Lig_outcome = gsub("^.*-" - , "", - df$Lig_outcome) -# sanity checks 
-head(df$Lig_outcome) - -# should be factor, check and if not change it to factor -is.factor(df$Lig_outcome) - -# change to factor -df$Lig_outcome = as.factor(df$Lig_outcome) - -# should be TRUE -is.factor(df$Lig_outcome) - -#======== -#STEP 3 -#======== -# gsub -head(df$Distancetoligand) -df$Distancetoligand = gsub("Å" - , "" - , df$Distancetoligand) -# sanity checks -head(df$Distancetoligand) - -# should be numeric, check if not change it to numeric -is.numeric(df$Distancetoligand) - -# change to numeric -df$Distancetoligand = as.numeric(df$Distancetoligand) - -# should be TRUE -is.numeric(df$Distancetoligand) - -# change the column name to indicate units -n = which(colnames(df) == "Distancetoligand") -colnames(df)[n] <- "Dis_lig_Ang" -colnames(df)[n] - -#======== -#STEP 4 -#======== -#gsub -head(df$DUETstabilitychange) -df$DUETstabilitychange = gsub("Kcal/mol" - , "" - , df$DUETstabilitychange) -# sanity checks -head(df$DUETstabilitychange) - -# should be numeric, check if not change it to numeric -is.numeric(df$DUETstabilitychange) - -# change to numeric -df$DUETstabilitychange = as.numeric(df$DUETstabilitychange) - -# should be TRUE -is.numeric(df$DUETstabilitychange) - -# change the column name to indicate units -n = which(colnames(df) == "DUETstabilitychange"); n -colnames(df)[n] = "DUETStability_Kcalpermol" -colnames(df)[n] - -#======== -#STEP 5 -#======== -# create yet another extra column: classification of DUET stability only -df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0 - , "Stabilizing" - , "Destabilizing") # spelling to be consistent with mcsm - -table(df$Lig_outcome) - -table(df$DUET_outcome) - -#============================== -#FIXME -#Insert a venn diagram -#================================ - -#======== -#STEP 6 -#======== -# assign wild and mutant colnames correctly - -wt = which(colnames(df) == "Wild.type"); wt -colnames(df)[wt] <- "Wild_type" -colnames(df[wt]) - -mut = which(colnames(df) == "Mutant.type"); mut -colnames(df)[mut] <- "Mutant_type" -colnames(df[mut]) - -#======== -#STEP 7 -#======== -# create an extra column: maybe useful for some plots -df$WildPos = paste0(df$Wild_type, df$Position) - -# clear variables -rm(n, wt, mut) - -################ end of data cleaning diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_results_normalise.R b/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_results_normalise.R deleted file mode 100644 index eb24cab..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_results_normalise.R +++ /dev/null @@ -1,275 +0,0 @@ -################## -# load libraries - library(compare) -################## - -getwd() - -#======================================================= -# TASK:read cleaned data and perform rescaling - # of DUET stability scores - # of Pred affinity -# compare scaling methods with plots - -# Requirements: -# input: R script, step3c_results_cleaning.R - # path: Data//input/processed/" -# output: NO output as the next scripts refers to this -# for yet more processing -# output normalised file -#======================================================= - -# specify variables for input and output paths and filenames -homedir = "~" -currdir = "/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/mcsm" -in_filename = "/step3c_results_cleaning.R" -infile = paste0(homedir, currdir, in_filename) -print(paste0("Input file is:", infile)) - -# output file -basedir = "/git/Data/pyrazinamide/input" -outpath = "/processed" -out_filename = "/mcsm_complex1_normalised.csv" -outfile = paste0(homedir, basedir, outpath, 
out_filename) -print(paste0("Output file is:", outfile)) - -#################### -#### read file #####: this will be the output of my R script that cleans the data columns -#################### -source(infile) - -#This will outut two dataframes: -# data: unclean data: 10 cols -# df : cleaned df: 13 cols -# you can remove data if you want as you will not need it -rm(data) - -colnames(df) - -#=================== -#3a: PredAffLog -#=================== -n = which(colnames(df) == "PredAffLog"); n -group = which(colnames(df) == "Lig_outcome"); group - -#=================================================== -# order according to PredAffLog values -#=================================================== -# This is because this makes it easier to see the results of rescaling for debugging -head(df$PredAffLog) - -# ORDER BY PredAff scrores: negative values at the top and positive at the bottoom -df = df[order(df$PredAffLog),] -head(df$PredAffLog) - -# sanity checks -head(df[,n]) # all negatives -tail(df[,n]) # all positives - -# sanity checks -mean(df[,n]) -#-0.9526746 - -tapply(df[,n], df[,group], mean) - -#=========================== -# Same as above: in 2 steps -#=========================== - -# find range of your data -my_min = min(df[,n]); my_min # -my_max = max(df[,n]); my_max # - -#=============================================== -# WITHIN GROUP rescaling 2: method "ratio" -# create column to store the rescaled values -# Rescaling separately (Less dangerous) -# =====> chosen one: preserves sign -#=============================================== -df$ratioPredAff = ifelse(df[,n] < 0 - , df[,n]/abs(my_min) - , df[,n]/my_max - )# 14 cols -# sanity checks -head(df$ratioPredAff) -tail(df$ratioPredAff) - -min(df$ratioPredAff); max(df$ratioPredAff) - -tapply(df$ratioPredAff, df$Lig_outcome, min) - -tapply(df$ratioPredAff, df$Lig_outcome, max) - -# should be the same as below -sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0) - -table(df$Lig_outcome) - -#=============================================== -# Hist and density plots to compare the rescaling -# methods: Base R -#=============================================== -# uncomment as necessary -my_title = "Ligand_stability" -# my_title = colnames(df[n]) - -# Set the margin on all sides -par(oma = c(3,2,3,0) - , mar = c(1,3,5,2) - , mfrow = c(2,2)) - -hist(df[,n] - , xlab = "" - , main = "Raw values" -) - -hist(df$ratioPredAff - , xlab = "" - , main = "ratio rescaling" -) - -# Plot density plots underneath -plot(density( df[,n] ) - , main = "Raw values" -) - -plot(density( df$ratioPredAff ) - , main = "ratio rescaling" -) - -# titles -mtext(text = "Frequency" - , side = 2 - , line = 0 - , outer = TRUE) - -mtext(text = my_title - , side = 3 - , line = 0 - , outer = TRUE) - - -#clear variables -rm(my_min, my_max, my_title, n, group) - -#=================== -# 3b: DUET stability -#=================== -dim(df) # 14 cols - -n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10 -group = which(colnames(df) == "DUET_outcome"); group #12 - -#=================================================== -# order according to DUET scores -#=================================================== -# This is because this makes it easier to see the results of rescaling for debugging -head(df$DUETStability_Kcalpermol) - -# ORDER BY DUET scores: negative values at the top and positive at the bottom -df = df[order(df$DUETStability_Kcalpermol),] - -# sanity checks -head(df[,n]) # negatives -tail(df[,n]) # positives - -# sanity checks -mean(df[,n]) - -tapply(df[,n], df[,group], 
mean) - -#=============================================== -# WITHIN GROUP rescaling 2: method "ratio" -# create column to store the rescaled values -# Rescaling separately (Less dangerous) -# =====> chosen one: preserves sign -#=============================================== -# find range of your data -my_min = min(df[,n]); my_min -my_max = max(df[,n]); my_max - -df$ratioDUET = ifelse(df[,n] < 0 - , df[,n]/abs(my_min) - , df[,n]/my_max - ) # 15 cols -# sanity check -head(df$ratioDUET) -tail(df$ratioDUET) - -min(df$ratioDUET); max(df$ratioDUET) - -# sanity checks -tapply(df$ratioDUET, df$DUET_outcome, min) - -tapply(df$ratioDUET, df$DUET_outcome, max) - -# should be the same as below (267 and 42) -sum(df$ratioDUET < 0); sum(df$ratioDUET > 0) - -table(df$DUET_outcome) - -#=============================================== -# Hist and density plots to compare the rescaling -# methods: Base R -#=============================================== -# uncomment as necessary -my_title = "DUET_stability" -#my_title = colnames(df[n]) - -# Set the margin on all sides -par(oma = c(3,2,3,0) - , mar = c(1,3,5,2) - , mfrow = c(2,2)) - -hist(df[,n] - , xlab = "" - , main = "Raw values" -) - -hist(df$ratioDUET - , xlab = "" - , main = "ratio rescaling" -) - -# Plot density plots underneath -plot(density( df[,n] ) - , main = "Raw values" -) - -plot(density( df$ratioDUET ) - , main = "ratio rescaling" -) - -# graph titles -mtext(text = "Frequency" - , side = 2 - , line = 0 - , outer = TRUE) - -mtext(text = my_title - , side = 3 - , line = 0 - , outer = TRUE) - -# reorder by column name -#data <- data[c("A", "B", "C")] -colnames(df) -df2 = df[c("X", "Mutationinformation", "WildPos", "Position" - , "Wild_type", "Mutant_type" - , "DUETStability_Kcalpermol", "DUET_outcome" - , "Dis_lig_Ang", "PredAffLog", "Lig_outcome" - , "ratioDUET", "ratioPredAff" - , "LigandID","Chain")] - -# sanity check -# should be True -#compare(df, df2, allowAll = T) -compare(df, df2, ignoreColOrder = T) -#TRUE -#reordered columns - -#=================== -# write output as csv file -#=================== -#write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE) -write.csv(df2, outfile, row.names = FALSE) diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R b/mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R deleted file mode 100644 index 877215a..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R +++ /dev/null @@ -1,131 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") -#source("barplot_colour_function.R") -require(data.table) -require(dplyr) - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -########################### -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -########################### - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -########################### -# you need merged_df3 -# or -# merged_df3_comp -# 
since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#%%%%%%%%%%%%%%%%%%%%%%%% - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -########################### -# Data for bfactor figure -# PS average -# Lig average -########################### - -head(my_df$Position) -head(my_df$ratioDUET) - -# order data frame -df = my_df[order(my_df$Position),] - -head(df$Position) -head(df$ratioDUET) - -#*********** -# PS: average by position -#*********** - -mean_DUET_by_position <- df %>% - group_by(Position) %>% - summarize(averaged.DUET = mean(ratioDUET)) - -#*********** -# Lig: average by position -#*********** -mean_Lig_by_position <- df %>% - group_by(Position) %>% - summarize(averaged.Lig = mean(ratioPredAff)) - - -#*********** -# cbind:mean_DUET_by_position and mean_Lig_by_position -#*********** - -combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position )) - -# sanity check -# mean_PS_Lig_Bfactor - -colnames(combined) - -colnames(combined) = c("Position" - , "average_DUETR" - , "Position2" - , "average_PredAffR") - -colnames(combined) - -identical(combined$Position, combined$Position2) - -n = which(colnames(combined) == "Position2"); n - -combined_df = combined[,-n] - -max(combined_df$average_DUETR) ; min(combined_df$average_DUETR) - -max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR) - -#============= -# output csv -#============ -outDir = "~/git/Data/pyrazinamide/input/processed/" -outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv") -print(paste0("Output file with path will be:","", outFile)) - -head(combined_df$Position); tail(combined_df$Position) - -write.csv(combined_df, outFile - , row.names = F) diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/.RData b/mcsm_analysis/pyrazinamide/scripts/plotting/.RData deleted file mode 100644 index 9ebc62b0d6fbe54e858e3c9da53c9878c18035b4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 43777 zcmV)QK(xOfiwFP!000001MNKrd=$mkbLovVdJnya^g{6x2~7fq5GkUDBe{^1G%gp4 z6)X1Md+!AmJ30{+yHXXf&{PmX5o}1wx3l|Z_wC&7l@!GPm;8P&H#0l)=Djy>`plcc zyqPHxGb0Eg0VFUehy(;m&Y%G4pM?aGaOpR!a!K)G+hSL(E|p-h1Y;#QTY{@47%#yq5}YnUssx=RxJiN}37(f=js$%q zXfHuG2~LyXR0-NjkS#%m1j8kGMgqpUOM+wx91={B;9d#3O0Z0VAt0msB{)ffyCf)+ z;5-S=WwMmuSqZEXFqsdMV2T6}Nl+od6%ve;;5i8%kYJ1i*GX`V1P@D4EWt$*luJ-3 zfm4Dw2}&i1mmpt)t0Z_rf)^!NBEhQ?+$cd031&!;BbON=k4SK(1Sd-nCBbM3Zjs;u z3DPB)F2M^D^phZ0f-@yZkl+*v9wba(2#JxPT7q5@L`v|I1d}D0C&A+qTx+09OKgiP zyy+6?9EvyCFkVz*p{=U4x-zlWZf7!(^d>-(iRES{->yMn8DDPiHs|xd3Lk#<{`cSM zch%J8H};7A{F<20v#0&Oe9{@&7yXm_{mLVYN5@&uUH4r3Gs5=`9Z+^(K;_I!zdbeY z{;vw_-gvuBz^K0m-TKhi)4#qm`|yWnTyb#X-i%q#{=MMR>#utF?eW789QbWa-jVQY zLZ7za4&|6*pJ{`JG&T;8j8_p~;vb}o2lR>}kWr~P=kv-jJmI(G$mQczos+A38%`-FWx*`x_3`MqIyS$BXx$Uo`xzv>BfSFP?mT%A4P> zdvnRh_0F91Ay@Xe{=!Mm-*$U(ZO*6Njs>2)@B3acLCa!OM<4lR;Jm(fc1w9-$dFUQ zx{SW?;%~lRzGG|cU-!IxBxpo_yYR~&>hj{SqwzaK{@VH34Lz23xOme=n|>TQ^TC@p z&j@+2eDM8$-`4hqg>mohJ@wrMZ%z8jl=f0;lOm{r|Z;KIpHsLyPYo`}t$B zfBpT=_;YTY)?vg+0d2?ZO6&h+-m97OS3H|JV(oj8Ba`=>(>L+7@>hy~P1tz-pLKmc zN?1|-*v^_WFT7~X>W}u%d};A@-wvgRUJu@Q^+#{~_Qn_Omaoi6Sl@8y$p=rHzVGyk zi#8>$df=4J-DCR4+`e)2$H%^3ob+1i3tjI1uIJ)Ir(`}g!T#4D3-7)0y%(m}58U}z 
-#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
-
-########################################################################
-# Read file: call script for combining df for lig #
-######################################################################## - -getwd() - -source("combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#=========================== - -########################### -# Data for OR and stability plots -# you need merged_df3_comp -# since these are matched -# to allow pairwise corr -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df2 = merged_df3_comp -#my_df2 = merged_df3 -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df2) -str(my_df2) - -# sanity check -# Ensure correct data type in columns to plot: need to be factor -is.numeric(my_df2$OR) -#[1] TRUE - -# sanity check: should be <10 -if (max(my_df2$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} - -#<<<<<<<<<<<<<<<< -# REASSIGNMENT -# FOR Lig Plots -#<<<<<<<<<<<<<<<< - -Lig_df = my_df2 - -rm(my_df2) - -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1 - -############# -# Plots: Bubble plot -# x = Position, Y = stability -# size of dots = OR -# col: stability -############# - -#================= -# generate plot 1: DUET vs OR by position as geom_points -#================= - -my_ats = 20 # axis text size -my_als = 22 # axis label size - -# Spelling Correction: made redundant as already corrected at the source - -#PS_df$DUET_outcome[PS_df$DUET_outcome=='Stabilizing'] <- 'Stabilising' -#PS_df$DUET_outcome[PS_df$DUET_outcome=='Destabilizing'] <- 'Destabilising' - -table(PS_df$DUET_outcome) ; sum(table(PS_df$DUET_outcome)) - -g = ggplot(PS_df, aes(x = factor(Position) - , y = ratioDUET)) - -p1 = g + - geom_point(aes(col = DUET_outcome - , size = OR)) + - theme(axis.text.x = element_text(size = my_ats - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_ats - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_als) - , axis.title.y = element_text(size = my_als) - , legend.text = element_text(size = my_als) - , legend.title = element_text(size = my_als) ) + - #, legend.key.size = unit(1, "cm")) + - labs(title = "" - , x = "Position" - , y = "DUET(PS)" - , size = "Odds Ratio" - , colour = "DUET Outcome") + - guides(colour = guide_legend(override.aes = list(size=4))) - -p1 - -#================= -# generate plot 2: Lig vs OR by position as geom_points -#================= -my_ats = 20 # axis text size -my_als = 22 # axis label size - -# Spelling Correction: made redundant as already corrected at the source - -#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Stabilizing'] <- 'Stabilising' -#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Destabilizing'] <- 'Destabilising' - -table(Lig_df$Lig_outcome) - -g = ggplot(Lig_df, aes(x = factor(Position) - , y = ratioPredAff)) - -p2 = g + - geom_point(aes(col = Lig_outcome - , size = OR))+ - theme(axis.text.x = element_text(size = my_ats - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_ats - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_als) - , axis.title.y = 
element_text(size = my_als) - , legend.text = element_text(size = my_als) - , legend.title = element_text(size = my_als) ) + - #, legend.key.size = unit(1, "cm")) + - labs(title = "" - , x = "Position" - , y = "Ligand Affinity" - , size = "Odds Ratio" - , colour = "Ligand Outcome" - ) + - guides(colour = guide_legend(override.aes = list(size=4))) - -p2 - -#====================== -#combine using cowplot -#====================== -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -svg('PS_Lig_OR_combined.svg', width = 32, height = 12) #inches -#png('PS_Lig_OR_combined.png', width = 2800, height = 1080) #300dpi -theme_set(theme_gray()) # to preserve default theme - -printFile = cowplot::plot_grid(plot_grid(p1, p2 - , ncol = 1 - , align = 'v' - , labels = c("(a)", "(b)") - , label_size = my_als+5)) -print(printFile) -dev.off() - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R deleted file mode 100644 index 30b9981..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R +++ /dev/null @@ -1,154 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") - -######################################################################## -# Read file: call script for combining df for lig # -######################################################################## - -source("../combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#=========================== - -########################### -# Data for Lig plots -# you need merged_df3 -# or -# merged_df3_comp -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#%%%%%%%%%%%%%%%%%%%%%%%% - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -############################# -# Extra sanity check: -# for mcsm_lig ONLY -# Dis_lig_Ang should be <10 -############################# - -if (max(my_df$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#========================== -# Plot: Barplot with scores (unordered) -# corresponds to Lig_outcome -# Stacked Barplot with colours: Lig_outcome @ position coloured by -# Lig_outcome. This is a barplot where each bar corresponds -# to a SNP and is coloured by its corresponding Lig_outcome. 
-#============================ - -#=================== -# Data for plots -#=================== - -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -df = my_df -#%%%%%%%%%%%%%%%%%%%%%%%% - -rm(my_df) - -# sanity checks -upos = unique(my_df$Position) - -# should be a factor -is.factor(df$Lig_outcome) -#TRUE - -table(df$Lig_outcome) - -# should be -1 and 1: may not be in this case because you have filtered the data -# FIXME: normalisation before or after filtering? -min(df$ratioPredAff) # -max(df$ratioPredAff) # - -# sanity checks -tapply(df$ratioPredAff, df$Lig_outcome, min) -tapply(df$ratioPredAff, df$Lig_outcome, max) - -#****************** -# generate plot -#****************** - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -my_title = "Ligand affinity" - -# axis label size -my_xaxls = 13 -my_yaxls = 15 - -# axes text size -my_xaxts = 15 -my_yaxts = 15 - -# no ordering of x-axis -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - geom_bar(aes(fill = Lig_outcome), colour = "grey") + - theme( axis.text.x = element_text(size = my_xaxls - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_yaxls - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xaxts) - , axis.title.y = element_text(size = my_yaxts ) ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -# for sanity and good practice -rm(df) -#======================= end of plot -# axis colours labels -# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors -# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R deleted file mode 100644 index 169bdaf..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R +++ /dev/null @@ -1,149 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages and functions # -######################################################################## - -source("../Header_TT.R") - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#========================== - -########################### -# Data for DUET plots -# you need merged_df3 -# or -# merged_df3_comp -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct 
data type in columns to plot: need to be factor -# sanity check -is.factor(my_df$DUET_outcome) -my_df$DUET_outcome = as.factor(my_df$DUET_outcome) -is.factor(my_df$DUET_outcome) -#[1] TRUE - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#========================== -# Plot 2: Barplot with scores (unordered) -# corresponds to DUET_outcome -# Stacked Barplot with colours: DUET_outcome @ position coloured by -# DUET outcome. This is a barplot where each bar corresponds -# to a SNP and is coloured by its corresponding DUET_outcome -#============================ - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df = my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< - -rm(my_df) - -# sanity checks -upos = unique(df$Position) - -# should be a factor -is.factor(my_df$DUET_outcome) -#[1] TRUE - -table(my_df$DUET_outcome) - -# should be -1 and 1 -min(df$ratioDUET) -max(df$ratioDUET) - -tapply(df$ratioDUET, df$DUET_outcome, min) -tapply(df$ratioDUET, df$DUET_outcome, max) - -#****************** -# generate plot -#****************** - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -my_title = "Protein stability (DUET)" - -# axis label size -my_xaxls = 13 -my_yaxls = 15 - -# axes text size -my_xaxts = 15 -my_yaxts = 15 - -# no ordering of x-axis -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - geom_bar(aes(fill = DUET_outcome), colour = "grey") + - - theme( axis.text.x = element_text(size = my_xaxls - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_yaxls - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xaxts) - , axis.title.y = element_text(size = my_yaxts ) ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -# for sanity and good practice -rm(df) -#======================= end of plot -# axis colours labels -# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors -# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R deleted file mode 100644 index a5d9361..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R +++ /dev/null @@ -1,202 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages and functions # -######################################################################## - -source("../Header_TT.R") -source("../barplot_colour_function.R") - -######################################################################## -# Read file: call script for combining df for lig # -######################################################################## - -source("../combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#=========================== - 
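# Added note (not in the original script): a minimal sketch of a post-source check,
# assuming ../combining_two_df_lig.R really creates the four data frames listed above.
stopifnot(exists("merged_df2"), exists("merged_df3")
          , exists("merged_df2_comp"), exists("merged_df3_comp"))
# the *_comp versions are expected to be the NA-free subsets
stopifnot(nrow(merged_df2_comp) <= nrow(merged_df2)
          , nrow(merged_df3_comp) <= nrow(merged_df3))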
-########################### -# Data for Lig plots -# you need merged_df3 -# or -# merged_df3_comp -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -# sanity check -is.factor(my_df$Lig_outcome) -my_df$Lig_outcome = as.factor(my_df$Lig_outcome) -is.factor(my_df$Lig_outcome) -#[1] TRUE - -############################# -# Extra sanity check: -# for mcsm_lig ONLY -# Dis_lig_Ang should be <10 -############################# - -if (max(my_df$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#========================== -# Plot: Barplot with scores (unordered) -# corresponds to Lig_outcome -# Stacked Barplot with colours: Lig_outcome @ position coloured by -# stability scores. This is a barplot where each bar corresponds -# to a SNP and is coloured by its corresponding Lig stability value. -# Normalised values (range between -1 and 1 ) to aid visualisation -# NOTE: since barplot plots discrete values, colour = score, so number of -# colours will be equal to the no. of unique normalised scores -# rather than a continuous scale -# will require generating the colour scale separately. -#============================ - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df = my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< - -rm(my_df) - -# sanity checks -table(df$Lig_outcome) - -# should be -1 and 1: may not be in this case because you have filtered the data -# FIXME: normalisation before or after filtering? -min(df$ratioPredAff) # -max(df$ratioPredAff) # - -# sanity checks -# very important!!!!
-tapply(df$ratioPredAff, df$Lig_outcome, min) - -tapply(df$ratioPredAff, df$Lig_outcome, max) - - -#****************** -# generate plot -#****************** - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -# My colour FUNCTION: based on group and subgroup -# in my case; -# df = df -# group = Lig_outcome -# subgroup = normalised score i.e ratioPredAff - -# Prepare data: round off ratioLig scores -# round off to 3 significant digits: -# 165 if no rounding is performed: used to generate the originalgraph -# 156 if rounded to 3 places -# FIXME: check if reducing precision creates any ML prob - -# check unique values in normalised data -u = unique(df$ratioPredAff) - -# <<<<< ------------------------------------------- -# Run this section if rounding is to be used -# specify number for rounding -n = 3 -df$ratioLigR = round(df$ratioPredAff, n) -u = unique(df$ratioLigR) # 156 -# create an extra column called group which contains the "gp name and score" -# so colours can be generated for each unique values in this column -my_grp = df$ratioLigR -df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "") - -# else -# uncomment the below if rounding is not required - -#my_grp = df$ratioLig -#df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "") - -# <<<<< ----------------------------------------------- - -# Call the function to create the palette based on the group defined above -colours <- ColourPalleteMulti(df, "Lig_outcome", "my_grp") -my_title = "Ligand affinity" - -# axis label size -my_xaxls = 13 -my_yaxls = 15 - -# axes text size -my_xaxts = 15 -my_yaxts = 15 - -# no ordering of x-axis -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + - theme( axis.text.x = element_text(size = my_xaxls - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_yaxls - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xaxts) - , axis.title.y = element_text(size = my_yaxts ) ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -# for sanity and good practice -rm(df) -#======================= end of plot -# axis colours labels -# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors -# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R deleted file mode 100644 index 8828e90..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R +++ /dev/null @@ -1,192 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages and functions # -######################################################################## - -source("../Header_TT.R") -source("../barplot_colour_function.R") - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY 
ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#=========================== - -########################### -# Data for DUET plots -# you need merged_df3 -# or -# merged_df3_comp -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -# sanity check -is.factor(my_df$DUET_outcome) -my_df$DUET_outcome = as.factor(my_df$DUET_outcome) -is.factor(my_df$DUET_outcome) -#[1] TRUE - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#========================== -# Barplot with scores (unordered) -# corresponds to DUET_outcome -# Stacked Barplot with colours: DUET_outcome @ position coloured by -# stability scores. This is a barplot where each bar corresponds -# to a SNP and is coloured by its corresponding DUET stability value. -# Normalised values (range between -1 and 1 ) to aid visualisation -# NOTE: since barplot plots discrete values, colour = score, so number of -# colours will be equal to the no. of unique normalised scores -# rather than a continuous scale -# will require generating the colour scale separately. 
-#============================ - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df = my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< - -rm(my_df) - -# sanity checks -upos = unique(df$Position) - -# should be a factor -is.factor(my_df$DUET_outcome) -#[1] TRUE - -table(df$DUET_outcome) - -# should be -1 and 1 -min(df$ratioDUET) -max(df$ratioDUET) - -tapply(df$ratioDUET, df$DUET_outcome, min) -tapply(df$ratioDUET, df$DUET_outcome, max) - -#****************** -# generate plot -#****************** - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -# My colour FUNCTION: based on group and subgroup -# in my case; -# df = df -# group = DUET_outcome -# subgroup = normalised score i.e ratioDUET - -# Prepare data: round off ratioDUET scores -# round off to 3 significant digits: -# 323 if no rounding is performed: used to generate the original graph -# 287 if rounded to 3 places -# FIXME: check if reducing precicion creates any ML prob - -# check unique values in normalised data -u = unique(df$ratioDUET) - -# <<<<< ------------------------------------------- -# Run this section if rounding is to be used -# specify number for rounding -n = 3 -df$ratioDUETR = round(df$ratioDUET, n) -u = unique(df$ratioDUETR) -# create an extra column called group which contains the "gp name and score" -# so colours can be generated for each unique values in this column -my_grp = df$ratioDUETR -df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "") - -# else -# uncomment the below if rounding is not required - -#my_grp = df$ratioDUET -#df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "") - -# <<<<< ----------------------------------------------- - -# Call the function to create the palette based on the group defined above -colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp") -my_title = "Protein stability (DUET)" - -# axis label size -my_xaxls = 13 -my_yaxls = 15 - -# axes text size -my_xaxts = 15 -my_yaxts = 15 - -# no ordering of x-axis -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + - theme( axis.text.x = element_text(size = my_xaxls - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_yaxls - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xaxts) - , axis.title.y = element_text(size = my_yaxts ) ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -# for sanity and good practice -rm(df) -#======================= end of plot -# axis colours labels -# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors -# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_LIG.R deleted file mode 100644 index 432749e..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_LIG.R +++ /dev/null @@ -1,296 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -############################################################ -# 1: Installing and loading required packages and functions -############################################################ - -source("../Header_TT.R") -source("../barplot_colour_function.R") - 
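# Added sketch (not from the original repo): barplot_colour_function.R is sourced above but
# not shown in this hunk, so the following is only an assumed outline of what a
# group/subgroup palette helper such as ColourPalleteMulti() might do: one colour ramp per
# outcome group, with one shade per unique (rounded) score inside that group. The real
# helper's interface may differ (the calls further down pass the subgroup as "my_grp").
ColourPalleteMulti_sketch <- function(df, group, subgroup){
  # count the distinct subgroup values within each group
  categories <- aggregate(as.formula(paste(subgroup, group, sep = "~"))
                          , df
                          , function(x) length(unique(x)))
  # one light/dark anchor pair per group
  col_start <- scales::hue_pal(l = 100)(nrow(categories))
  col_end   <- scales::hue_pal(l = 40)(nrow(categories))
  # expand each pair into as many shades as there are subgroup values in that group
  unlist(lapply(seq_len(nrow(categories)), function(i){
    colorRampPalette(c(col_start[i], col_end[i]))(categories[i, 2])
  }))
}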
-############################################################ -# Output dir for plots -############################################################ -out_dir = "~/git/Data/pyrazinamide/output/plots" - -############################################################ -# 2: call script the prepares the data with columns containing -# colours for axis labels -############################################################ - -source("subcols_axis_LIG.R") - -# this should return -#mut_pos_cols: 52, 4 -#my_df: 169, 39 - -# clear excess variable -# "mut_pos_cols" is just for inspection in case you need to cross check -# position numbers and colours -# open file from deskptop ("sample_axis_cols") for cross checking - -table(mut_pos_cols$lab_bg) - -sum( table(mut_pos_cols$lab_bg) ) == nrow(mut_pos_cols) # should be True - -table(mut_pos_cols$lab_bg2) - -sum( table(mut_pos_cols$lab_bg2) ) == nrow(mut_pos_cols) # should be True - -table(mut_pos_cols$lab_fg) - -sum( table(mut_pos_cols$lab_fg) ) == nrow(mut_pos_cols) # should be True - -# very important!: should be the length of the unique positions -my_axis_colours = mut_pos_cols$lab_fg - -# now clear mut_pos_cols -rm(mut_pos_cols) - -########################### -# 2: Plot: Lig scores -########################### -#========================== -# Plot 2: Barplot with scores (unordered) -# corresponds to Lig_outcome -# Stacked Barplot with colours: Lig_outcome @ position coloured by -# stability scores. This is a barplot where each bar corresponds -# to a SNP and is coloured by its corresponding PredAff stability value. -# Normalised values (range between -1 and 1 ) to aid visualisation -# NOTE: since barplot plots discrete values, colour = score, so number of -# colours will be equal to the no. of unique normalised scores -# rather than a continuous scale -# will require generating the colour scale separately. -#============================ -# sanity checks -upos = unique(my_df$Position) - -str(my_df$Lig_outcome) - -colnames(my_df) - -#=========================== -# Data preparation for plots -#=========================== -#!!!!!!!!!!!!!!!!! -# REASSIGNMENT -df <- my_df -#!!!!!!!!!!!!!!!!! - -rm(my_df) - -# sanity checks -# should be a factor -is.factor(df$Lig_outcome); -#FALSE - -df$Lig_outcome = as.factor(df$Lig_outcome) -is.factor(df$Lig_outcome); -#TRUE - -table(df$Lig_outcome) - -# check the range -min(df$ratioPredAff) -max(df$ratioPredAff) - -# sanity checks -# very important!!!! 
-tapply(df$ratioPredAff, df$Lig_outcome, min) - -tapply(df$ratioPredAff, df$Lig_outcome, max) - -# My colour FUNCTION: based on group and subgroup -# in my case; -# df = df -# group = Lig_outcome -# subgroup = normalised score i.e ratioPredAff - -# Prepare data: round off ratioPredAff scores -# round off to 3 significant digits: -# 323 if no rounding is performed: used to generate the original graph -# 287 if rounded to 3 places -# FIXME: check if reducing precicion creates any ML prob - -# check unique values in normalised data -u = unique(df$ratioPredAff) - -# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# Run this section if rounding is to be used -# specify number for rounding -n = 3 -df$ratioPredAffR = round(df$ratioPredAff, n) -u = unique(df$ratioPredAffR) - -# create an extra column called group which contains the "gp name and score" -# so colours can be generated for each unique values in this column -my_grp = df$ratioPredAffR -df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "") - -# ELSE -# uncomment the below if rounding is not required - -#my_grp = df$ratioPredAff -#df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "") - -# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -#****************** -# generate plot -#****************** - -# Call the function to create the palette based on the group defined above -colours <- ColourPalleteMulti(df, "Lig_outcome", "my_grp") -my_title = "Ligand Affinity" -library(ggplot2) - -# axis label size -my_xaxls = 13 -my_yaxls = 15 - -# axes text size -my_xaxts = 15 -my_yaxts = 15 - -# no ordering of x-axis according to frequency -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + - theme( axis.text.x = element_text(size = my_xaxls - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_yaxls - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xaxts) - , axis.title.y = element_text(size = my_yaxts ) ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -#======================== -# plot with axis colours -#======================== -class(df$lab_bg) -# make this a named vector - -# define cartesian coord -my_xlim = length(unique(df$Position)); my_xlim - -# axis label size -my_xals = 15 -my_yals = 15 - -# axes text size -my_xats = 15 -my_yats = 18 - -# using geom_tile -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - coord_cartesian(xlim = c(1, my_xlim) - , ylim = c(0, 6) - , clip = "off") + - - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + - geom_tile(aes(,-0.8, width = 0.95, height = 0.85) - , fill = df$lab_bg) + - geom_tile(aes(,-1.2, width = 0.95, height = -0.2) - , fill = df$lab_bg2) + - - # Here it's important to specify that your axis goes from 1 to max number of levels - theme( axis.text.x = element_text(size = my_xats - , angle = 90 - , hjust = 1 - , vjust = 0.4 - , colour = my_axis_colours) - , axis.text.y = element_text(size = my_yats - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xals) - , axis.title.y = element_text(size = my_yals ) - , axis.ticks.x = element_blank() - ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -#======================== -# output plot as svg/png -#======================== -class(df$lab_bg) -# make this a named vector - -# define cartesian coord -my_xlim = length(unique(df$Position)); 
my_xlim - -# axis label size -my_xals = 18 -my_yals = 18 - -# axes text size -my_xats = 16 #14 in PS -my_yats = 18 - -# set output dir for plots -#getwd() -#setwd("~/git/Data/pyrazinamide/output/plots") -#getwd() - -plot_name = "barplot_LIG_acoloured.svg" -my_plot_name = paste0(out_dir, "/", plot_name); my_plot_name - -svg(my_plot_name, width = 26, height = 4) - -g = ggplot(df, aes(factor(Position, ordered = T))) - -outFile = g + - coord_cartesian(xlim = c(1, my_xlim) - , ylim = c(0, 6) - , clip = "off" - ) + - - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + -# geom_tile(aes(,-0.6, width = 0.9, height = 0.7) -# , fill = df$lab_bg) + -# geom_tile(aes(,-1, width = 0.9, height = 0.3) -# , fill = df$lab_bg2) + - geom_tile(aes(,-0.8, width = 0.95, height = 0.85) - , fill = df$lab_bg) + - geom_tile(aes(,-1.2, width = 0.95, height = -0.2) - , fill = df$lab_bg2) + - -# Here it's important to specify that your axis goes from 1 to max number of levels - theme( axis.text.x = element_text(size = my_xats - , angle = 90 - , hjust = 1 - , vjust = 0.4 - , colour = my_axis_colours) - , axis.text.y = element_text(size = my_yats - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xals) - , axis.title.y = element_text(size = my_yals ) - , axis.ticks.x = element_blank() - ) + - labs(title = "" - , x = "Position" - , y = "Frequency") - - -print(outFile) -dev.off() - -# for sanity and good practice -#rm(df) diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_PS.R deleted file mode 100644 index 78029be..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_PS.R +++ /dev/null @@ -1,292 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -############################################################ -# 1: Installing and loading required packages and functions -############################################################ - -source("../Header_TT.R") -source("../barplot_colour_function.R") - -############################################################ -# Output dir for plots -############################################################ -out_dir = "~/git/Data/pyrazinamide/output/plots" - -############################################################ -# 2: call script the prepares the data with columns containing -# colours for axis labels -############################################################ - -source("subcols_axis.R") - -# this should return -#mut_pos_cols: 130, 4 -#my_df: 335, 39 - -# clear excess variable -# "mut_pos_cols" is just for inspection in case you need to cross check -# position numbers and colours -# open file from deskptop ("sample_axis_cols") for cross checking - -table(mut_pos_cols$lab_bg) - -sum( table(mut_pos_cols$lab_bg) ) == nrow(mut_pos_cols) # should be True - -table(mut_pos_cols$lab_bg2) - -sum( table(mut_pos_cols$lab_bg2) ) == nrow(mut_pos_cols) # should be True - -table(mut_pos_cols$lab_fg) - -sum( table(mut_pos_cols$lab_fg) ) == nrow(mut_pos_cols) # should be True - -# very important! 
-my_axis_colours = mut_pos_cols$lab_fg - -# now clear mut_pos_cols -rm(mut_pos_cols) - -########################### -# 2: Plot: DUET scores -########################### -#========================== -# Plot 2: Barplot with scores (unordered) -# corresponds to DUET_outcome -# Stacked Barplot with colours: DUET_outcome @ position coloured by -# stability scores. This is a barplot where each bar corresponds -# to a SNP and is coloured by its corresponding DUET stability value. -# Normalised values (range between -1 and 1 ) to aid visualisation -# NOTE: since barplot plots discrete values, colour = score, so number of -# colours will be equal to the no. of unique normalised scores -# rather than a continuous scale -# will require generating the colour scale separately. -#============================ -# sanity checks -upos = unique(my_df$Position) - -str(my_df$DUET_outcome) - -colnames(my_df) - -#=========================== -# Data preparation for plots -#=========================== -#!!!!!!!!!!!!!!!!! -# REASSIGNMENT -df <- my_df -#!!!!!!!!!!!!!!!!! - -rm(my_df) - -# sanity checks -# should be a factor -is.factor(df$DUET_outcome) -#TRUE - -table(df$DUET_outcome) - -# should be -1 and 1 -min(df$ratioDUET) -max(df$ratioDUET) - -# sanity checks -# very important!!!! -tapply(df$ratioDUET, df$DUET_outcome, min) - -tapply(df$ratioDUET, df$DUET_outcome, max) - -# My colour FUNCTION: based on group and subgroup -# in my case; -# df = df -# group = DUET_outcome -# subgroup = normalised score i.e ratioDUET - -# Prepare data: round off ratioDUET scores -# round off to 3 significant digits: -# 323 if no rounding is performed: used to generate the original graph -# 287 if rounded to 3 places -# FIXME: check if reducing precicion creates any ML prob - -# check unique values in normalised data -u = unique(df$ratioDUET) - -# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# Run this section if rounding is to be used -# specify number for rounding -n = 3 -df$ratioDUETR = round(df$ratioDUET, n) -u = unique(df$ratioDUETR) - -# create an extra column called group which contains the "gp name and score" -# so colours can be generated for each unique values in this column -my_grp = df$ratioDUETR -df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "") - -# ELSE -# uncomment the below if rounding is not required - -#my_grp = df$ratioDUET -#df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "") - -# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -#****************** -# generate plot -#****************** - -# Call the function to create the palette based on the group defined above -colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp") -my_title = "Protein stability (DUET)" -library(ggplot2) - -# axis label size -my_xaxls = 13 -my_yaxls = 15 - -# axes text size -my_xaxts = 15 -my_yaxts = 15 - -# no ordering of x-axis according to frequency -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + - theme( axis.text.x = element_text(size = my_xaxls - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_yaxls - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xaxts) - , axis.title.y = element_text(size = my_yaxts ) ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -#======================== -# plot with axis colours -#======================== -class(df$lab_bg) -# make this a named vector - -# define cartesian coord -my_xlim 
= length(unique(df$Position)); my_xlim - -# axis label size -my_xals = 15 -my_yals = 15 - -# axes text size -my_xats = 15 -my_yats = 18 - -# using geom_tile -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - coord_cartesian(xlim = c(1, my_xlim) - , ylim = c(0, 6) - , clip = "off") + - - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + - geom_tile(aes(,-0.8, width = 0.95, height = 0.85) - , fill = df$lab_bg) + - geom_tile(aes(,-1.2, width = 0.95, height = -0.2) - , fill = df$lab_bg2) + - - # Here it's important to specify that your axis goes from 1 to max number of levels - theme( axis.text.x = element_text(size = my_xats - , angle = 90 - , hjust = 1 - , vjust = 0.4 - , colour = my_axis_colours) - , axis.text.y = element_text(size = my_yats - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xals) - , axis.title.y = element_text(size = my_yals ) - , axis.ticks.x = element_blank() - ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -#======================== -# output plot as svg/png -#======================== -class(df$lab_bg) -# make this a named vector - -# define cartesian coord -my_xlim = length(unique(df$Position)); my_xlim - -# axis label size -my_xals = 18 -my_yals = 18 - -# axes text size -my_xats = 14 -my_yats = 18 - -# set output dir for plots -#getwd() -#setwd("~/git/Data/pyrazinamide/output/plots") -#getwd() - -plot_name = "barplot_PS_acoloured.svg" -my_plot_name = paste0(out_dir, "/", plot_name); my_plot_name - -svg(my_plot_name, width = 26, height = 4) - -g = ggplot(df, aes(factor(Position, ordered = T))) - -outFile = g + - coord_cartesian(xlim = c(1, my_xlim) - , ylim = c(0, 6) - , clip = "off" - ) + - - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + -# geom_tile(aes(,-0.6, width = 0.9, height = 0.7) -# , fill = df$lab_bg) + -# geom_tile(aes(,-1, width = 0.9, height = 0.3) -# , fill = df$lab_bg2) + - geom_tile(aes(,-0.8, width = 0.95, height = 0.85) - , fill = df$lab_bg) + - geom_tile(aes(,-1.2, width = 0.95, height = -0.2) - , fill = df$lab_bg2) + - -# Here it's important to specify that your axis goes from 1 to max number of levels - theme( axis.text.x = element_text(size = my_xats - , angle = 90 - , hjust = 1 - , vjust = 0.4 - , colour = my_axis_colours) - , axis.text.y = element_text(size = my_yats - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xals) - , axis.title.y = element_text(size = my_yals ) - , axis.ticks.x = element_blank() - ) + - labs(title = "" - , x = "Position" - , y = "Frequency") - - -print(outFile) -dev.off() - -# for sanity and good practice -#rm(df) diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R deleted file mode 100644 index c4826d3..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R +++ /dev/null @@ -1,215 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") - -#require(data.table) -#require(dplyr) - -######################################################################## -# Read file: call script for combining df for lig # 
-######################################################################## - -source("../combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#=========================== - -########################### -# Data for Lig plots -# you need merged_df3 -# or -# merged_df3_comp -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -# sanity check -is.factor(my_df$Lig_outcome) -my_df$Lig_outcome = as.factor(my_df$Lig_outcome) -is.factor(my_df$Lig_outcome) -#[1] TRUE - -############################# -# Extra sanity check: -# for mcsm_lig ONLY -# Dis_lig_Ang should be <10 -############################# - -if (max(my_df$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#=========================== -# Plot: Basic barplots -#=========================== - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df = my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< -rm(my_df) - -# sanity checks -str(df) - -if (identical(df$Position, df$position)){ - print("Sanity check passed: Columns 'Position' and 'position' are identical") -} else{ - print("Error!: Check column names and info contained") -} - -#**************** -# generate plot: No of stabilising and destabilising muts -#**************** -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -svg('basic_barplots_LIG.svg') - -my_ats = 25 # axis text size -my_als = 22 # axis label size - -# uncomment as necessary for either directly outputting results or -# printing on the screen -g = ggplot(df, aes(x = Lig_outcome)) -prinfFile = g + geom_bar( -#g + geom_bar( - aes(fill = Lig_outcome) - , show.legend = TRUE -) + geom_label( - stat = "count" - , aes(label = ..count..)
- , color = "black" - , show.legend = FALSE - , size = 10) + theme( - axis.text.x = element_blank() - , axis.title.x = element_blank() - , axis.title.y = element_text(size=my_als) - , axis.text.y = element_text(size = my_ats) - , legend.position = c(0.73,0.8) - , legend.text = element_text(size=my_als-2) - , legend.title = element_text(size=my_als) - , plot.title = element_blank() - ) + labs( - title = "" - , y = "Number of SNPs" - #, fill='Ligand Outcome' - ) + scale_fill_discrete(name = "Ligand Outcome" - , labels = c("Destabilising", "Stabilising")) -print(prinfFile) -dev.off() - -#**************** -# generate plot: No of positions -#**************** -#get freq count of positions so you can subset freq<1 -#require(data.table) -setDT(df)[, pos_count := .N, by = .(Position)] #169, 36 - -head(df$pos_count) -table(df$pos_count) -# this is cummulative -#1 2 3 4 5 6 -#5 24 36 56 30 18 - -# use group by on this -snpsBYpos_df <- df %>% - group_by(Position) %>% - summarize(snpsBYpos = mean(pos_count)) - -table(snpsBYpos_df$snpsBYpos) -#1 2 3 4 5 6 -#5 12 12 14 6 3 -# this is what will get plotted - -svg('position_count_LIG.svg') - -my_ats = 25 # axis text size -my_als = 22 # axis label size - -g = ggplot(snpsBYpos_df, aes(x = snpsBYpos)) -prinfFile = g + geom_bar( - #g + geom_bar( - aes (alpha = 0.5) - , show.legend = FALSE -) + - geom_label( - stat = "count", aes(label = ..count..) - , color = "black" - , size = 10 - ) + - theme( - axis.text.x = element_text( - size = my_ats - , angle = 0 - ) - , axis.text.y = element_text( - size = my_ats - , angle = 0 - , hjust = 1 - ) - , axis.title.x = element_text(size = my_als) - , axis.title.y = element_text(size = my_als) - , plot.title = element_blank() - ) + - labs( - x = "Number of SNPs" - , y = "Number of Sites" - ) -print(prinfFile) -dev.off() -######################################################################## -# end of Lig barplots # -######################################################################## - - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R deleted file mode 100644 index 51a2812..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R +++ /dev/null @@ -1,211 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages and functions # -######################################################################## - -source("../Header_TT.R") - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#========================== - -########################### -# Data for DUET plots -# you need merged_df3 -# or -# merged_df3_comp -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment 
as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -# sanity check -is.factor(my_df$DUET_outcome) -my_df$DUET_outcome = as.factor(my_df$DUET_outcome) -is.factor(my_df$DUET_outcome) -#[1] TRUE - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#=========================== -# Plot: Basic barplots -#=========================== - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df = my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< - -rm(my_df) - -# sanity checks -str(df) - -if (identical(df$Position, df$position)){ - print("Sanity check passed: Columns 'Position' and 'position' are identical") -} else{ - print("Error!: Check column names and info contained") - } - -#**************** -# generate plot: No of stabilising and destabilsing muts -#**************** -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -svg('basic_barplots_DUET.svg') - -my_ats = 25 # axis text size -my_als = 22 # axis label size - -theme_set(theme_grey()) - -# uncomment as necessary for either directly outputting results or -# printing on the screen -g = ggplot(df, aes(x = DUET_outcome)) -prinfFile = g + geom_bar( -#g + geom_bar( - aes(fill = DUET_outcome) - , show.legend = TRUE - ) + geom_label( - stat = "count" - , aes(label = ..count..) - , color = "black" - , show.legend = FALSE - , size = 10) + theme( - axis.text.x = element_blank() - , axis.title.x = element_blank() - , axis.title.y = element_text(size=my_als) - , axis.text.y = element_text(size = my_ats) - , legend.position = c(0.73,0.8) - , legend.text = element_text(size=my_als-2) - , legend.title = element_text(size=my_als) - , plot.title = element_blank() - ) + labs( - title = "" - , y = "Number of SNPs" - #, fill='DUET Outcome' - ) + scale_fill_discrete(name = "DUET Outcome" - , labels = c("Destabilising", "Stabilising")) - -print(prinfFile) -dev.off() - -#**************** -# generate plot: No of positions -#**************** -#get freq count of positions so you can subset freq<1 -#setDT(df)[, .(Freq := .N), by = .(Position)] #189, 36 - -setDT(df)[, pos_count := .N, by = .(Position)] #335, 36 -table(df$pos_count) -# this is cummulative -#1 2 3 4 5 6 -#34 76 63 104 40 18 - -# use group by on this -snpsBYpos_df <- df %>% - group_by(Position) %>% - summarize(snpsBYpos = mean(pos_count)) - -table(snpsBYpos_df$snpsBYpos) -#1 2 3 4 5 6 -#34 38 21 26 8 3 - -foo = select(df, Mutationinformation - , WildPos - , wild_type - , mutant_type - , mutation_info - , position - , pos_count) #335, 5 - -getwd() -write.csv(foo, "../Data/pos_count_freq.csv") - -svg('position_count_DUET.svg') -my_ats = 25 # axis text size -my_als = 22 # axis label size - -g = ggplot(snpsBYpos_df, aes(x = snpsBYpos)) -prinfFile = g + geom_bar( -#g + geom_bar( - aes (alpha = 0.5) - , show.legend = FALSE - ) + - geom_label( - stat = "count", aes(label = ..count..) 
- , color = "black" - , size = 10 - ) + - theme( - axis.text.x = element_text( - size = my_ats - , angle = 0 - ) - , axis.text.y = element_text( - size = my_ats - , angle = 0 - , hjust = 1 - ) - , axis.title.x = element_text(size = my_als) - , axis.title.y = element_text(size = my_als) - , plot.title = element_blank() - ) + - labs( - x = "Number of SNPs" - , y = "Number of Sites" - ) -print(prinfFile) -dev.off() -######################################################################## -# end of DUET barplots # -######################################################################## - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R deleted file mode 100644 index 0059bca..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R +++ /dev/null @@ -1,175 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages and functions # -######################################################################## - -source("../Header_TT.R") - -#source("barplot_colour_function.R") - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#========================== - -########################### -# Data for PS Corr plots -# you need merged_df3_comp -# since these are matched -# to allow pairwise corr -########################### - -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#=========================== -# Plot: Correlation plots -#=========================== - -#=================== -# Data for plots -#=================== - -#!!!!!!!!!!!!!!!!!!!!!!!! -# REASSIGNMENT -df = my_df -#!!!!!!!!!!!!!!!!!!!!!!!! 
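# Added check (not in the original script): merged_df3_comp is used here precisely because it
# is the complete, matched dataset; a quick assumed sanity check that the columns used for the
# pairwise correlations below are NA-free.
sum(is.na(df[, c("ratioDUET", "logor", "neglog10pvalue", "AF")])) # expected: 0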
- -rm(my_df) - -# sanity checks -str(df) - -table(df$DUET_outcome) - -# unique positions -length(unique(df$Position)) #{RESULT: unique positions for comp data} - - -# subset data to generate pairwise correlations -corr_data = df[, c("ratioDUET" -# , "ratioPredAff" -# , "DUETStability_Kcalpermol" -# , "PredAffLog" -# , "OR" - , "logor" -# , "pvalue" - , "neglog10pvalue" - , "AF" - , "DUET_outcome" -# , "Lig_outcome" - , "pyrazinamide" - )] -dim(corr_data) -rm(df) - -# assign nice colnames (for display) -my_corr_colnames = c("DUET" -# , "Ligand Affinity" -# , "DUET_raw" -# , "Lig_raw" -# , "OR" - , "Log(Odds Ratio)" -# , "P-value" - , "-LogP" - , "Allele Frequency" - , "DUET_outcome" -# , "Lig_outcome" - , "pyrazinamide") - -# sanity check -if (length(my_corr_colnames) == length(corr_data)){ - print("Sanity check passed: corr_data and corr_names match in length") -}else{ - print("Error: length mismatch!") -} - -colnames(corr_data) -colnames(corr_data) <- my_corr_colnames -colnames(corr_data) - -############### -# PLOTS: corr -# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs -############### -#default pairs plot -start = 1 -end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column -offset = 1 - -my_corr = corr_data[start:(end-offset)] -head(my_corr) - -#my_cols = c("#f8766d", "#00bfc4") -# deep blue :#007d85 -# deep red: #ae301e - -#========== -# psych: informative since it draws the ellipsoid -# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html -# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs -#========== - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -svg('DUET_corr.svg', width = 15, height = 15) -printFile = pairs.panels(my_corr[1:4] - , method = "spearman" # correlation method - , hist.col = "grey" ##00AFBB - , density = TRUE # show density plots - , ellipses = F # show correlation ellipses - , stars = T - , rug = F - , breaks = "Sturges" - , show.points = T - , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$DUET_outcome))] - , pch = 21 - , jitter = T - #, alpha = .05 - #, points(pch = 19, col = c("#f8766d", "#00bfc4")) - , cex = 3 - , cex.axis = 2.5 - , cex.labels = 3 - , cex.cor = 1 - , smooth = F -) - -print(printFile) -dev.off() diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R deleted file mode 100644 index 4e05d41..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R +++ /dev/null @@ -1,187 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") - -#source("barplot_colour_function.R") - -######################################################################## -# Read file: call script for combining df for lig # -######################################################################## - -source("../combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp
-#=========================== - -########################### -# Data for Lig Corr plots -# you need merged_df3_comp -# since these are matched -# to allow pairwise corr -########################### - -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -############################# -# Extra sanity check: -# for mcsm_lig ONLY -# Dis_lig_Ang should be <10 -############################# - -if (max(my_df$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#=========================== -# Plot: Correlation plots -#=========================== - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df = my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< - -rm(my_df) - -# sanity checks -str(df) - -table(df$Lig_outcome) - -# unique positions -length(unique(df$Position)) #{RESULT: unique positions for comp data} - -# subset data to generate pairwise correlations -corr_data = df[, c(#"ratioDUET", - "ratioPredAff" -# , "DUETStability_Kcalpermol" -# , "PredAffLog" -# , "OR" - , "logor" -# , "pvalue" - , "neglog10pvalue" - , "AF" -# , "DUET_outcome" - , "Lig_outcome" - , "pyrazinamide" - )] -dim(corr_data) -rm(df) - -# assign nice colnames (for display) -my_corr_colnames = c(#"DUET", - "Ligand Affinity" -# ,"DUET_raw" -# , "Lig_raw" -# , "OR" - , "Log(Odds Ratio)" -# , "P-value" - , "-LogP" - , "Allele Frequency" -# , "DUET_outcome" - , "Lig_outcome" - , "pyrazinamide") - -# sanity check -if (length(my_corr_colnames) == length(corr_data)){ - print("Sanity check passed: corr_data and corr_names match in length") -}else{ - print("Error: length mismatch!") -} - -colnames(corr_data) -colnames(corr_data) <- my_corr_colnames -colnames(corr_data) - -############### -# PLOTS: corr -# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs -############### - -# default pairs plot -start = 1 -end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column -offset = 1 - -my_corr = corr_data[start:(end-offset)] -head(my_corr) - -#my_cols = c("#f8766d", "#00bfc4") -# deep blue :#007d85 -# deep red: #ae301e - -#========== -# psych: informative since it draws the ellipsoid -# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html -# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs -#========== - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -svg('Lig_corr.svg', width = 15, height = 15) -printFile = pairs.panels(my_corr[1:4] - , method = "spearman" # correlation method - , hist.col = "grey" ##00AFBB - , density = TRUE # show density plots - , ellipses = F # show correlation ellipses - , stars = T - , rug = F - , breaks = "Sturges" - , show.points = T - , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$Lig_outcome))] - , pch = 21 - , jitter = T -# , alpha = .05 -# , points(pch = 19, col = c("#f8766d", "#00bfc4")) - , cex = 3 - , cex.axis = 2.5 - , cex.labels = 3 - , cex.cor = 1 - , smooth = F -) -print(printFile) -dev.off() - -diff --git
a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R deleted file mode 100644 index 1f868e4..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R +++ /dev/null @@ -1,227 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") -#source("barplot_colour_function.R") - -require(data.table) - -######################################################################## -# Read file: call script for combining df # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#========================== - -########################### -# Data for plots -# you need merged_df2, comprehensive one -# since this has one-many relationship -# i.e the same SNP can belong to multiple lineages -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df2 -#my_df = merged_df2_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -is.factor(my_df$lineage) -my_df$lineage = as.factor(my_df$lineage) -is.factor(my_df$lineage) - -#========================== -# Plot: Lineage barplot -# x = lineage y = No. of samples -# col = Lineage -# fill = lineage -#============================ -table(my_df$lineage) - -# lineage1 lineage2 lineage3 lineage4 lineage5 lineage6 lineageBOV -#3 104 1293 264 1311 6 6 105 - -#=========================== -# Plot: Lineage Barplots -#=========================== - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df <- my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< -rm(my_df) - -# get freq count of positions so you can subset freq<1 -#setDT(df)[, lineage_count := .N, by = .(lineage)] - -#****************** -# generate plot: barplot of mutation by lineage -#****************** -sel_lineages = c("lineage1" - , "lineage2" - , "lineage3" - , "lineage4") - -df_lin = subset(df, subset = lineage %in% sel_lineages ) - -#FIXME; add sanity check for numbers. -# Done this manually - -############################################################ - -######### -# Data for barplot: Lineage barplot -# to show total samples and number of unique mutations -# within each linege -########## - -# Create df with lineage inform & no. 
of unique mutations -# per lineage and total samples within lineage -# this is essentially barplot with two y axis - -bar = bar = as.data.frame(sel_lineages) #4, 1 -total_snps_u = NULL -total_samples = NULL - -for (i in sel_lineages){ - #print(i) - curr_total = length(unique(df$id)[df$lineage==i]) - total_samples = c(total_samples, curr_total) - print(total_samples) - - foo = df[df$lineage==i,] - print(paste0(i, "=======")) - print(length(unique(foo$Mutationinformation))) - curr_count = length(unique(foo$Mutationinformation)) - - total_snps_u = c(total_snps_u, curr_count) -} - -print(total_snps_u) -bar$num_snps_u = total_snps_u -bar$total_samples = total_samples -bar - -#***************** -# generate plot: lineage barplot with two y-axis -#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2 -#***************** - -bar$num_snps_u = y1 -bar$total_samples = y2 -sel_lineages = x - -to_plot = data.frame(x = x - , y1 = y1 - , y2 = y2) -to_plot - -melted = melt(to_plot, id = "x") -melted - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -svg('lineage_basic_barplot.svg') - -my_ats = 20 # axis text size -my_als = 22 # axis label size - -g = ggplot(melted - , aes(x = x - , y = value - , fill = variable) - ) - - -printFile = g + geom_bar( - -#g + geom_bar( - stat = "identity" - , position = position_stack(reverse = TRUE) - , alpha=.75 - , colour='grey75' - ) + theme( - axis.text.x = element_text( - size = my_ats -# , angle= 30 - ) - , axis.text.y = element_text(size = my_ats - #, angle = 30 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text( - size = my_als - , colour = 'black' - ) - , axis.title.y = element_text( - size = my_als - , colour = 'black' - ) - , legend.position = "top" - , legend.text = element_text(size = my_als) - - #) + geom_text( - ) + geom_label( - aes(label = value) - , size = 5 - , hjust = 0.5 - , vjust = 0.5 - , colour = 'black' - , show.legend = FALSE - #, check_overlap = TRUE - , position = position_stack(reverse = T) - #, position = (' - - ) + labs( - title = '' - , x = '' - , y = "Number" - , fill = 'Variable' - , colour = 'black' - ) + scale_fill_manual( - values = c('grey50', 'gray75') - , name='' - , labels=c('Mutations', 'Total Samples') - ) + scale_x_discrete( - breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4') - , labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') - ) -print(printFile) -dev.off() diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R deleted file mode 100644 index e4e6972..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R +++ /dev/null @@ -1,253 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") -#source("barplot_colour_function.R") -#require(data.table) - -######################################################################## -# Read file: call script for combining df for Lig # -######################################################################## - -source("../combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - 
-#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#=========================== -########################### -# Data for plots -# you need merged_df2 or merged_df2_comp -# since this is one-many relationship -# i.e the same SNP can belong to multiple lineages -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df2 -#my_df = merged_df2_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -is.factor(my_df$lineage) -my_df$lineage = as.factor(my_df$lineage) -is.factor(my_df$lineage) - -table(my_df$mutation_info) - -############################# -# Extra sanity check: -# for mcsm_lig ONLY -# Dis_lig_Ang should be <10 -############################# - -if (max(my_df$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## -#========================== -# Data for plot: assign as -# necessary -#=========================== - -# uncomment as necessary -#!!!!!!!!!!!!!!!!!!!!!!! -# REASSIGNMENT - -#================== -# data for ALL muts -#================== -plot_df = my_df -my_plot_name = 'lineage_dist_PS.svg' -#my_plot_name = 'lineage_dist_PS_comp.svg' - -#======================= -# data for dr_muts ONLY -#======================= -#plot_df = my_df_dr -#my_plot_name = 'lineage_dist_dr_PS.svg' -#my_plot_name = 'lineage_dist_dr_PS_comp.svg' -#!!!!!!!!!!!!!!!!!!!!!!! - -#========================== -# Plot: Lineage Distribution -# x = mcsm_values, y = dist -# fill = stability -#============================ - -#=================== -# Data for plots -#=================== - -# subset only lineages1-4 -sel_lineages = c("lineage1" - , "lineage2" - , "lineage3" - , "lineage4") - -# uncomment as necessary -df_lin = subset(my_df, subset = lineage %in% sel_lineages ) #2037 35 - -# refactor -df_lin$lineage = factor(df_lin$lineage) - -table(df_lin$lineage) #{RESULT: No of samples within lineage} -#lineage1 lineage2 lineage3 lineage4 -#78 961 195 803 - -# when merged_df2_comp is used -#lineage1 lineage2 lineage3 lineage4 -#77 955 194 770 - -length(unique(df_lin$Mutationinformation)) -#{Result: No. of unique mutations the 4 lineages contribute to} - -# sanity checks -r1 = 2:5 # when merged_df2 used: because there is missing lineages -if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) { - print ("sanity check passed: numbers match") -} else{ - print("Error!: check your numbers") -} - -#!!!!!!!!!!!!!!!!!!!!!!!!! -# REASSIGNMENT -df <- df_lin -#!!!!!!!!!!!!!!!!!!!!!!!!! - -rm(df_lin) - -#****************** -# generate distribution plot of lineages -#****************** -# basic: could improve this! 
-library(plotly) -library(ggridges) - -my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') -names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4') - -g <- ggplot(df, aes(x = ratioPredAff)) + - geom_density(aes(fill = Lig_outcome) - , alpha = 0.5) + - facet_wrap( ~ lineage - , scales = "free" - , labeller = labeller(lineage = my_labels) ) + - coord_cartesian(xlim = c(-1, 1) -# , ylim = c(0, 6) -# , clip = "off" -) - ggtitle("Kernel Density estimates of Ligand affinity by lineage") - -ggplotly(g) - -# 2 : ggridges (good!) - -my_ats = 15 # axis text size -my_als = 20 # axis label size - -my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') -names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4') - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -# check plot name -my_plot_name - -svg(my_plot_name) - -printFile = ggplot( df, aes(x = ratioPredAff - , y = Lig_outcome) ) + - - geom_density_ridges_gradient( aes(fill = ..x..) - , scale = 3 - , size = 0.3 ) + - facet_wrap( ~lineage - , scales = "free" -# , switch = 'x' - , labeller = labeller(lineage = my_labels) ) + - coord_cartesian( xlim = c(-1, 1) -# , ylim = c(0, 6) -# , clip = "off" - ) + - - scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4") - , name = "Ligand Affinity" ) + - theme( axis.text.x = element_text( size = my_ats - , angle = 90 - , hjust = 1 - , vjust = 0.4) -# , axis.text.y = element_text( size = my_ats -# , angle = 0 -# , hjust = 1 -# , vjust = 0) - , axis.text.y = element_blank() - , axis.title.x = element_blank() - , axis.title.y = element_blank() - , axis.ticks.y = element_blank() - , plot.title = element_blank() - , strip.text = element_text(size = my_als) - , legend.text = element_text(size = 10) - , legend.title = element_text(size = my_als) -# , legend.position = c(0.3, 0.8) -# , legend.key.height = unit(1, 'mm') - ) - -print(printFile) -dev.off() -#=================================================== - -# COMPARING DISTRIBUTIONS -head(df$lineage) -df$lineage = as.character(df$lineage) - -lin1 = df[df$lineage == "lineage1",]$ratioPredAff -lin2 = df[df$lineage == "lineage2",]$ratioPredAff -lin3 = df[df$lineage == "lineage3",]$ratioPredAff -lin4 = df[df$lineage == "lineage4",]$ratioPredAff - -# ks test -ks.test(lin1,lin2) -ks.test(lin1,lin3) -ks.test(lin1,lin4) - -ks.test(lin2,lin3) -ks.test(lin2,lin4) - -ks.test(lin3,lin4) - - - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R deleted file mode 100644 index 703a206..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R +++ /dev/null @@ -1,229 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") -#source("../barplot_colour_function.R") -#require(data.table) - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - 
-#========================== -# This will return: - -# df with NA for pyrazinamide: -# merged_df2 -# merged_df3 - -# df without NA for pyrazinamide: -# merged_df2_comp -# merged_df3_comp -#=========================== - -########################### -# Data for plots -# you need merged_df2 or merged_df2_comp -# since this is one-many relationship -# i.e the same SNP can belong to multiple lineages -# using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available, hence use df with NA -########################### - -# uncomment as necessary -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df2 -#my_df = merged_df2_comp -#%%%%%%%%%%%%%%%%%%%%%%%% - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -is.factor(my_df$lineage) -my_df$lineage = as.factor(my_df$lineage) -is.factor(my_df$lineage) - -table(my_df$mutation_info); str(my_df$mutation_info) - -# subset df with dr muts only -my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") -table(my_df_dr$mutation_info) - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#========================== -# Run two times: -# uncomment as necessary -# 1) for all muts -# 2) for dr_muts -#=========================== - -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT - -#================ -# for ALL muts -#================ -plot_df = my_df -my_plot_name = 'lineage_dist_PS.svg' -#my_plot_name = 'lineage_dist_PS_comp.svg' - -#================ -# for dr muts ONLY -#================ -#plot_df = my_df_dr -#my_plot_name = 'lineage_dist_dr_PS.svg' -#my_plot_name = 'lineage_dist_dr_PS_comp.svg' - -#%%%%%%%%%%%%%%%%%%%%%%%% - -#========================== -# Plot: Lineage Distribution -# x = mcsm_values, y = dist -# fill = stability -#============================ - -#=================== -# Data for plots -#=================== -table(plot_df$lineage); str(plot_df$lineage) - -# subset only lineages1-4 -sel_lineages = c("lineage1" - , "lineage2" - , "lineage3" - , "lineage4") - -# uncomment as necessary -df_lin = subset(plot_df, subset = lineage %in% sel_lineages ) - -# refactor -df_lin$lineage = factor(df_lin$lineage) - -table(df_lin$lineage) #{RESULT: No of samples within lineage} -#lineage1 lineage2 lineage3 lineage4 - -length(unique(df_lin$Mutationinformation)) -#{Result: No. of unique mutations the 4 lineages contribute to} - -# sanity checks -r1 = 2:5 # when merged_df2 used: because there is missing lineages -if(sum(table(plot_df$lineage)[r1]) == nrow(df_lin)) { - print ("sanity check passed: numbers match") -} else{ - print("Error!: check your numbers") -} - -#%%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -df <- df_lin -#%%%%%%%%%%%%%%%%%%%%%%%%% - -rm(df_lin) - -#****************** -# generate distribution plot of lineages -#****************** -# basic: could improve this! -#library(plotly) -#library(ggridges) - -g <- ggplot(df, aes(x = ratioDUET)) + - geom_density(aes(fill = DUET_outcome) - , alpha = 0.5) + facet_wrap(~ lineage, - scales = "free") + - ggtitle("Kernel Density estimates of Protein stability by lineage") - -ggplotly(g) - -# 2 : ggridges (good!) 
-my_ats = 15 # axis text size -my_als = 20 # axis label size - -my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') -names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4') - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -# check plot name -my_plot_name - -# output svg -svg(my_plot_name) -printFile = ggplot(df, aes(x = ratioDUET - , y = DUET_outcome))+ - - #printFile=geom_density_ridges_gradient( - geom_density_ridges_gradient(aes(fill = ..x..) - , scale = 3 - , size = 0.3 ) + - facet_wrap( ~lineage - , scales = "free" -# , switch = 'x' - , labeller = labeller(lineage = my_labels) ) + - coord_cartesian( xlim = c(-1, 1) -# , ylim = c(0, 6) -# , clip = "off" -) + - scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4") - , name = "DUET" ) + - theme(axis.text.x = element_text(size = my_ats - , angle = 90 - , hjust = 1 - , vjust = 0.4) -# , axis.text.y = element_text(size = my_ats -# , angle = 0 -# , hjust = 1 -# , vjust = 0) - , axis.text.y = element_blank() - , axis.title.x = element_blank() - , axis.title.y = element_blank() - , axis.ticks.y = element_blank() - , plot.title = element_blank() - , strip.text = element_text(size = my_als) - , legend.text = element_text(size = 10) - , legend.title = element_text(size = my_als) -# , legend.position = c(0.3, 0.8) -# , legend.key.height = unit(1, 'mm') - ) - -print(printFile) -dev.off() - -#=!=!=!=!=!=!=! -# COMMENT: Not much differences in the distributions -# when using merged_df2 or merged_df2_comp. -# Also, the lineage differences disappear when looking at all muts -# The pattern we are interested in is possibly only for dr_mutations -#=!=!=!=!=!=!=! -#=================================================== - -# COMPARING DISTRIBUTIONS: KS test -# run: "../KS_test_PS.R" - - - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/logolas_logoplot.R b/mcsm_analysis/pyrazinamide/scripts/plotting/logolas_logoplot.R deleted file mode 100644 index f60fb0b..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/logolas_logoplot.R +++ /dev/null @@ -1,250 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting/") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") - -#source("barplot_colour_function.R") - -library(ggseqlogo) - -#======= -# input -#======= -############# -# msa file: output of generate_mut_sequences.py -############# -homedir = '~' -indir = 'git/Data/pyrazinamide/output' -in_filename = "gene_msa.txt" -infile = paste0(homedir, '/', indir,'/', in_filename) -print(infile) - -#======= -# input -#======= -############# -# combined dfs -############# -source("../combining_two_df.R") - -########################### -# Data for Logo plots -# you need big df i.e -# merged_df2 -# or -# merged_df2_comp -# since these have unique SNPs -# I prefer to use the merged_df2 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df2 -#my_df = merged_df2_comp -#%%%%%%%%%%%%%%%%%%%%%%%% - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# doesn't work if you use the big df as it has 
duplicate snps -#rownames(my_df) = my_df$Mutationinformation - -# sanity check: should be True -table(my_df$position == my_df$Position) - -c1 = unique(my_df$Position) # 130 -nrow(my_df) # 3092 - -#FIXME -#!!! RESOLVE !!! -# get freq count of positions and add to the df -setDT(my_df)[, occurrence_sample := .N, by = .(id)] -table(my_df$occurrence_sample) - - -my_df2 = my_df %>% - select(id, Mutationinformation, Wild_type, WildPos, position, Mutant_type, occurrence, occurrence_sample) - -write.csv(my_df2, "my_df2.csv") - -# extract freq_pos>1 since this will not add to much in the logo plot -# pos 5 has one mutation but coming from atleast 5 samples? -table(my_df$occurrence) -foo = my_df[my_df$occurrence ==1,] - -# uncomment as necessary -my_data_snp = my_df #3092 - -#!!! RESOLVE -# FIXME -my_data_snp = my_df[my_df$occurrence!=1,] #3072, 36...3019 - -u = unique(my_data_snp$Position) #96 - - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -######################################################### -# Task: To generate a logo plot or bar plot but coloured -# aa properties. -# step1: read mcsm file and OR file -# step2: plot wild type positions -# step3: plot mutants per position coloured by aa properties -# step4: make the size of the letters/bars prop to OR if you can! -######################################################### -##useful links -#https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2 -#https://omarwagih.github.io/ggseqlogo/ -#https://kkdey.github.io/Logolas-pages/workflow.html -#A new sequence logo plot to highlight enrichment and depletion. -# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6288878/ - -##very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/ - -#============== -# matrix for mutant type -# frequency of mutant type by position -#============== -table(my_data_snp$Mutant_type, my_data_snp$Position) -tab_mt = table(my_data_snp$Mutant_type, my_data_snp$Position) -class(tab_mt) -# unclass to convert to matrix -tab_mt = unclass(tab_mt) -tab_mt = as.matrix(tab_mt, rownames = T) - -# should be TRUE -is.matrix(tab_mt) - -rownames(tab_mt) #aa -colnames(tab_mt) #pos - -#********************** -# Plot 1: mutant logo -#********************** -my_ymax = max(my_data_snp$occurrence); my_ymax -my_ylim = c(0,my_ymax) # very important - -# axis sizes -# common: text and label -my_ats = 15 -my_als = 20 - -# individual: text and label -my_xats = 15 -my_yats = 20 -my_xals = 15 -my_yals = 20 - -# legend size: text and label -my_lts = 20 -#my_lls = 20 - -# Color scheme based on chemistry of amino acids -chemistry = data.frame( - letter = c('G', 'S', 'T', 'Y', 'C', 'N', 'Q', 'K', 'R', 'H', 'D', 'E', 'P', 'A', 'W', 'F', 'L', 'I', 'M', 'V'), - group = c(rep('Polar', 5), rep('Neutral', 2), rep('Basic', 3), rep('Acidic', 2), rep('Hydrophobic', 8)), - col = c(rep('#109648', 5), rep('#5E239D', 2), rep('#255C99', 3), rep('#D62839', 2), rep('#221E22', 8)), - stringsAsFactors = F -) - -# uncomment as necessary -my_type = "EDLogo" -#my_type = "Logo" - -logomaker(tab_mt - , type = my_type - , return_heights = T -# , color_type = "per_row" -# , colors = chemistry$col -# , method = 'custom' -# , seq_type = 'aa' -# , col_scheme = "taylor" -# , col_scheme = "chemistry2" -) + -theme(legend.position = "bottom" - , legend.title = element_blank() - , legend.text = element_text(size = my_lts ) - , axis.text.x = element_text(size = 
my_ats , angle = 90) - , axis.text.y = element_text(size = my_ats , angle = 90)) - -p0 = logomaker(tab_mt - , type = my_type - , return_heights = T - , color_type = "per_row" - , colors = chemistry$col -# , seq_type = 'aa' -# , col_scheme = "taylor" -# , col_scheme = "chemistry2" -) + - #ylab('my custom height') + - theme(axis.text.x = element_blank()) + -# theme_logo()+ - # scale_x_continuous(breaks=1:51, parse (text = colnames(tab)) ) - scale_x_continuous(breaks = 1:ncol(tab_mt) - , labels = colnames(tab_mt))+ - scale_y_continuous( breaks = 1:my_ymax - , limits = my_ylim) - -p0 - -# further customisation -p1 = p0 + theme(legend.position = "bottom" - , legend.title = element_blank() - , legend.text = element_text(size = my_lts) - , axis.text.x = element_text(size = my_ats , angle = 90) - , axis.text.y = element_text(size = my_ats , angle = 90)) -p1 - -#======= -# input -#======= -############# -# msa file: output of generate_mut_sequences.py -############# -homedir = '~' -indir = 'git/Data/pyrazinamide/output' -in_filename = "gene_msa.txt" -infile = paste0(homedir, '/', indir,'/', in_filename) -print(infile) - -############## -# ggseqlogo: custom matrix of my data -############## -snps = read.csv(infile - , stringsAsFactors = F - , header = F) #3072, - -class(snps); str(snps) # df and chr - -# turn to a character vector -snps2 = as.character(snps[1:nrow(snps),]) - -class(snps2); str(snps2) #character, chr - -# plot -logomaker(snps2, type = my_type - , color_type = "per_row") + - theme(axis.text.x = element_blank()) + - theme_logo()+ - # scale_x_continuous(breaks=1:51, parse (text = colnames(tab)) ) - scale_x_continuous(breaks = 1:ncol(tab_mt) - , labels = colnames(tab_mt))+ - scale_y_continuous( breaks = 0:5 - , limits = my_ylim) - - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/snp_logo_plot.R b/mcsm_analysis/pyrazinamide/scripts/plotting/snp_logo_plot.R deleted file mode 100644 index 80f1971..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/snp_logo_plot.R +++ /dev/null @@ -1,273 +0,0 @@ -getwd() -setwd("~/git//LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -# TASK: Multiple mutations per site -# as aa symbol coloured by aa property - -######################################################################## -# Installing and loading required packages # -######################################################################## - -#source("../Header_TT.R") - -#source("barplot_colour_function.R") - -library(ggseqlogo) - -######################################################################## -# Read file: call script for combining df for lig # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "/home/tanu/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -#merged_df2 # 3092, 35 -#merged_df2_comp #3012, 35 - -#merged_df3 #335, 35 -#merged_df3_comp #293, 35 -#========================== - -########################### -# Data for Logo plots -# you need small df i.e -# merged_df3 -# or -# merged_df3_comp? 
possibly -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df3 # to show multiple mutations per site -my_df = read.csv(file.choose()) -#%%%%%%%%%%%%%%%%%%%%%%%% - -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -colnames(my_df) -str(my_df) - -rownames(my_df) = my_df$Mutationinformation - -c1 = unique(my_df$Position) #130 -nrow(my_df) #335 - -table(my_df$occurrence) -#1 2 3 4 5 6 -#34 76 63 104 40 18 - -# get freq count of positions so you can subset freq<1 -#: already done in teh combining script -#require(data.table) -#setDT(my_df)[, occurrence := .N, by = .(Position)] #189, 36 - -table(my_df$Position); table(my_df$occurrence) - -# extract freq_pos>1 -my_data_snp = my_df[my_df$occurrence!=1,] #301, 36 -u_pos = unique(my_data_snp$Position) #96 - -# sanity check -exp_dim = nrow(my_df) - table(my_df$occurrence)[[1]]; exp_dim -if ( nrow(my_data_snp) == exp_dim ){ - print("Sanity check passed: Data filtered correctly, dim match") -} else { - print("Error: Please Debug") -} - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -######################################################### -# Task: To generate a logo plot or bar plot but coloured -# aa properties. -# step1: read data file -# step2: plot wild type positions -# step3: plot mutants per position coloured by aa properties -# step4: make the size of the letters/bars prop to OR if you can! -######################################################### -# useful links -# https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2 -# https://omarwagih.github.io/ggseqlogo/ -# https://kkdey.github.io/Logolas-pages/workflow.html -# A new sequence logo plot to highlight enrichment and depletion. 
-# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6288878/ -# very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/ - - -############# -# PLOTS: Bar plot with aa properties -# using gglogo -# useful links: https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2 -############# - -############## -# ggseqlogo: custom matrix of my data -############## - -#============== -# matrix for mutant type -# frequency of mutant type by position -#============== -table(my_data_snp$Mutant_type, my_data_snp$Position) -tab_mt = table(my_data_snp$Mutant_type, my_data_snp$Position) -class(tab_mt) - -# unclass to convert to matrix -tab_mt = unclass(tab_mt) -tab_mt = as.matrix(tab_mt, rownames = T) - -# should be TRUE -is.matrix(tab_mt) - -rownames(tab_mt) #aa -colnames(tab_mt) #pos - -#============== -# matrix for wild type -# frequency of wild type by position -#============== -# remove wt duplicates -wt = my_data_snp[, c("Position", "Wild_type")] #301, 2 -wt = wt[!duplicated(wt),]#96, 2 - -table(wt$Wild_type) # contains duplicates - -tab_wt = table(wt$Wild_type, wt$Position); tab_wt # should all be 1 - -tab_wt = unclass(tab_wt) #important -class(tab_wt); rownames(tab_wt) -#tab_wt = as.matrix(tab_wt, rownames = T) - -rownames(tab_wt) -rownames(tab_mt) - -# sanity check -if (ncol(tab_wt) == length(u_pos) ){ - print("Sanity check passed: wt data dim match") -} else { - print("Error: Please debug") -} - -#************** -# Plot 1: mutant logo -#************** -#install.packages("digest") -#library(digest) -# following example -require(ggplot2) -require(reshape2) -library(gglogo) -library(ggrepel) -library(ggseqlogo) - -# generate seq logo for mutant type -my_ymax = max(my_data_snp$occurrence); my_ymax -my_ylim = c(0, my_ymax) -#my_yrange = 1:my_ymax; my_yrange - -# axis sizes -# common: text and label -my_ats = 15 -my_als = 20 - -# individual: text and label -my_xats = 15 -my_yats = 20 -my_xals = 15 -my_yals = 20 - -# legend size: text and label -my_lts = 20 -#my_lls = 20 - -p0 = ggseqlogo(tab_mt - , method = 'custom' - , seq_type = 'aa' -# , col_scheme = "taylor" -# , col_scheme = "chemistry2" -) + -# ylab('my custom height') + - theme(axis.text.x = element_blank()) + - theme_logo()+ -# scale_x_continuous(breaks=1:51, parse (text = colnames(tab_mt)) ) - scale_x_continuous(breaks = 1:ncol(tab_mt) - , labels = colnames(tab_mt))+ - scale_y_continuous( breaks = 1:my_ymax - , limits = my_ylim) - -p0 - -# further customisation -p1 = p0 + theme(legend.position = "none" - , legend.title = element_blank() - , legend.text = element_text(size = my_lts) - , axis.text.x = element_text(size = my_xats, angle = 90) - , axis.text.y = element_text(size = my_yats, angle = 90)) -p1 - -#************** -# Plot 2: for wild_type -# with custom x axis to reflect my aa positions -#************** -# sanity check: MUST BE TRUE -# for the correctnes of the x axis -identical(colnames(tab_mt), colnames(tab_wt)) -identical(ncol(tab_mt), ncol(tab_wt)) - -p2 = ggseqlogo(tab_wt - , method = 'custom' - , seq_type = 'aa' -# , col_scheme = "taylor" -# , col_scheme = chemistry2 -) + -# ylab('my custom height') + - theme(axis.text.x = element_blank() - , axis.text.y = element_blank()) + - theme_logo() + - scale_x_continuous(breaks = 1:ncol(tab_wt) - , labels = colnames(tab_wt)) + - scale_y_continuous( breaks = 0:1 - , limits = my_ylim ) - -p2 - -# further customise - -p3 = p2 + - theme(legend.position = "bottom" - , legend.text = element_text(size = my_lts) - , axis.text.x = element_text(size = my_ats - , angle = 90) - , 
axis.text.y = element_blank()) - -p3 - - -# Now combine using cowplot, which ensures the plots are aligned -suppressMessages( require(cowplot) ) - -plot_grid(p1, p3, ncol = 1, align = 'v') #+ -# background_grid(minor = "xy" -# , size.minor = 1 -# , colour.minor = "grey86") - - -#colour scheme -#https://rdrr.io/cran/ggseqlogo/src/R/col_schemes.r - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_LIG.R deleted file mode 100644 index 2049c3e..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_LIG.R +++ /dev/null @@ -1,208 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -############################################################ -# 1: Installing and loading required packages and functions -############################################################ - -#source("../Header_TT.R") -#source("../barplot_colour_function.R") -#library(tidyverse) - -########################### -#2: Read file: normalised file, output of step 4 mcsm pipeline -########################### -#my_df <- read.csv("../../Data/mcsm_complex1_normalised.csv" -# , row.names = 1 -# , stringsAsFactors = F -# , header = T) - -# call script combining_df -source("../combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -# from Plotting to Scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA for pyrazinamide: -#merged_df2 -#merged_df2_comp - -# df without NA for pyrazinamide: -#merged_df3 -#merged_df3_comp -#========================== -########################### -# Data to choose: -# We will be using the small dfs -# to generate the coloured axis -########################### - -# uncomment as necessary -#!!!!!!!!!!!!!!!!!!!!!!! -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#!!!!!!!!!!!!!!!!!!!!!!! 
- -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -str(my_df) -my_df$Position -c1 = my_df[my_df$Mutationinformation == "A134V",] - -# order my_df by Position -my_df_o = my_df[order(my_df$Position),] -head(my_df_o$Position); tail(my_df_o$Position) - -c2 = my_df_o[my_df_o$Mutationinformation == "A134V",] - -# sanity check -if (sum(table(c1 == c2)) == ncol(my_df)){ - print ("Sanity check passsd") -}else{ - print ("Error!: Please debug your code") -} - -rm(my_df, c1, c2) - -# create a new df with unique position numbers and cols -Position = unique(my_df_o$Position) -Position_cols = as.data.frame(Position) - -head(Position_cols) ; tail(Position_cols) - -# specify active site residues and bg colour -Position = c(49, 51, 57, 71 - , 8, 96, 138 - , 13, 68 - , 103, 137 - , 133, 134) #13 - -lab_bg = rep(c("purple" - , "yellow" - , "cornflowerblue" - , "blue" - , "green"), times = c(4, 3, 2, 2, 2) -) - -# second bg colour for active site residues -#lab_bg2 = rep(c("white" -# , "green" , "white", "green" -# , "white" -# , "white" -# , "white"), times = c(4 -# , 1, 1, 1 -# , 2 -# , 2 -# , 2) -#) - -#%%%%%%%%% -# revised: leave the second box coloured as the first one incase there is no second colour -#%%%%%%%%% -lab_bg2 = rep(c("purple" - , "green", "yellow", "green" - , "cornflowerblue" - , "blue" - , "green"), times = c(4 - , 1, 1, 1 - , 2 - , 2 - , 2)) - -# fg colour for labels for active site residues -lab_fg = rep(c("white" - , "black" - , "black" - , "white" - , "black"), times = c(4, 3, 2, 2, 2)) - -#%%%%%%%%% -# revised: make the purple ones black -# fg colour for labels for active site residues -#%%%%%%%%% -#lab_fg = rep(c("black" -# , "black" -# , "black" -# , "white" -# , "black"), times = c(4, 3, 2, 2, 2)) - -# combined df with active sites, bg and fg colours -aa_cols_ref = data.frame(Position - , lab_bg - , lab_bg2 - , lab_fg - , stringsAsFactors = F) #13, 4 - -str(Position_cols); class(Position_cols) -str(aa_cols_ref); class(aa_cols_ref) - -# since Position is int and numeric in the two dfs resp, -# converting numeric to int for consistency -aa_cols_ref$Position = as.integer(aa_cols_ref$Position) -class(aa_cols_ref$Position) - -#=========== -# Merge 1: merging Positions df (Position_cols) and -# active site cols (aa_cols_ref) -# linking column: "Position" -# This is so you can have colours defined for all 130 positions -#=========== -head(Position_cols$Position); head(aa_cols_ref$Position) - -mut_pos_cols = merge(Position_cols, aa_cols_ref - , by = "Position" - , all.x = TRUE) - -head(mut_pos_cols) -# replace NA's -# :column "lab_bg" with "white" -# : column "lab_fg" with "black" -mut_pos_cols$lab_bg[is.na(mut_pos_cols$lab_bg)] <- "white" -mut_pos_cols$lab_bg2[is.na(mut_pos_cols$lab_bg2)] <- "white" -mut_pos_cols$lab_fg[is.na(mut_pos_cols$lab_fg)] <- "black" -head(mut_pos_cols) - -#=========== -# Merge 2: Merge mut_pos_cols with mcsm df -# Now combined the 130 positions with aa colours with -# the mcsm_data -#=========== -# dfs to merge -df0 = my_df_o -df1 = mut_pos_cols - -# check the column on which merge will be performed -head(df0$Position); tail(df0$Position) -head(df1$Position); tail(df1$Position) - -# should now have 3 extra columns -my_df = merge(df0, df1 - , by = "Position" - , all.x = TRUE) - -# sanity check -my_df[my_df$Position == "49",] -my_df[my_df$Position == "13",] - -my_df$Position - -# clear variables -rm(aa_cols_ref - , df0 - , df1 - , my_df_o - , Position_cols - , lab_bg - , lab_bg2 - , lab_fg - , Position - ) - diff 
--git a/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_PS.R deleted file mode 100644 index 37dfe32..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_PS.R +++ /dev/null @@ -1,208 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -############################################################ -# 1: Installing and loading required packages and functions -############################################################ - -#source("../Header_TT.R") -#source("../barplot_colour_function.R") -#library(tidyverse) - -########################### -#2: Read file: normalised file, output of step 4 mcsm pipeline -########################### -#my_df <- read.csv("../../Data/mcsm_complex1_normalised.csv" -# , row.names = 1 -# , stringsAsFactors = F -# , header = T) - -# call script combining_df -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -# from Plotting to Scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA for pyrazinamide: -#merged_df2 -#merged_df2_comp - -# df without NA for pyrazinamide: -#merged_df3 -#merged_df3_comp -#========================== -########################### -# Data to choose: -# We will be using the small dfs -# to generate the coloured axis -########################### - -# uncomment as necessary -#!!!!!!!!!!!!!!!!!!!!!!! -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#!!!!!!!!!!!!!!!!!!!!!!! - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -str(my_df) -my_df$Position -c1 = my_df[my_df$Mutationinformation == "L4S",] - -# order my_df by Position -my_df_o = my_df[order(my_df$Position),] -head(my_df_o$Position); tail(my_df_o$Position) - -c2 = my_df_o[my_df_o$Mutationinformation == "L4S",] - -# sanity check -if (sum(table(c1 == c2)) == ncol(my_df)){ - print ("Sanity check passsd") -}else{ - print ("Error!: Please debug your code") -} - -rm(my_df, c1, c2) - -# create a new df with unique position numbers and cols -Position = unique(my_df_o$Position) #130 -Position_cols = as.data.frame(Position) - -head(Position_cols) ; tail(Position_cols) - -# specify active site residues and bg colour -Position = c(49, 51, 57, 71 - , 8, 96, 138 - , 13, 68 - , 103, 137 - , 133, 134) #13 - -lab_bg = rep(c("purple" - , "yellow" - , "cornflowerblue" - , "blue" - , "green"), times = c(4, 3, 2, 2, 2) -) - -# second bg colour for active site residues -#lab_bg2 = rep(c("white" -# , "green" , "white", "green" -# , "white" -# , "white" -# , "white"), times = c(4 -# , 1, 1, 1 -# , 2 -# , 2 -# , 2) -#) - -#%%%%%%%%% -# revised: leave the second box coloured as the first one incase there is no second colour -#%%%%%%%%% -lab_bg2 = rep(c("purple" - , "green", "yellow", "green" - , "cornflowerblue" - , "blue" - , "green"), times = c(4 - , 1, 1, 1 - , 2 - , 2 - , 2)) - -# fg colour for labels for active site residues -lab_fg = rep(c("white" - , "black" - , "black" - , "white" - , "black"), times = c(4, 3, 2, 2, 2)) - -#%%%%%%%%% -# revised: make the purple ones black -# fg colour for labels for active site residues -#%%%%%%%%% -#lab_fg = rep(c("black" -# , "black" -# , "black" -# , "white" -# , "black"), times = c(4, 3, 2, 2, 2)) - -# combined df with active sites, bg and fg colours -aa_cols_ref = data.frame(Position - , lab_bg - , lab_bg2 - , lab_fg - , stringsAsFactors = F) #13, 4 - 
-str(Position_cols); class(Position_cols) -str(aa_cols_ref); class(aa_cols_ref) - -# since Position is int and numeric in the two dfs resp, -# converting numeric to int for consistency -aa_cols_ref$Position = as.integer(aa_cols_ref$Position) -class(aa_cols_ref$Position) - -#=========== -# Merge 1: merging Positions df (Position_cols) and -# active site cols (aa_cols_ref) -# linking column: "Position" -# This is so you can have colours defined for all 130 positions -#=========== -head(Position_cols$Position); head(aa_cols_ref$Position) - -mut_pos_cols = merge(Position_cols, aa_cols_ref - , by = "Position" - , all.x = TRUE) - -head(mut_pos_cols) -# replace NA's -# :column "lab_bg" with "white" -# : column "lab_fg" with "black" -mut_pos_cols$lab_bg[is.na(mut_pos_cols$lab_bg)] <- "white" -mut_pos_cols$lab_bg2[is.na(mut_pos_cols$lab_bg2)] <- "white" -mut_pos_cols$lab_fg[is.na(mut_pos_cols$lab_fg)] <- "black" -head(mut_pos_cols) - -#=========== -# Merge 2: Merge mut_pos_cols with mcsm df -# Now combined the 130 positions with aa colours with -# the mcsm_data -#=========== -# dfs to merge -df0 = my_df_o -df1 = mut_pos_cols - -# check the column on which merge will be performed -head(df0$Position); tail(df0$Position) -head(df1$Position); tail(df1$Position) - -# should now have 3 extra columns -my_df = merge(df0, df1 - , by = "Position" - , all.x = TRUE) - -# sanity check -my_df[my_df$Position == "49",] -my_df[my_df$Position == "13",] - -my_df$Position - -# clear variables -rm(aa_cols_ref - , df0 - , df1 - , my_df_o - , Position_cols - , lab_bg - , lab_bg2 - , lab_fg - , Position - ) - diff --git a/mcsm_analysis/pyrazinamide/scripts/read_pdb.R b/mcsm_analysis/pyrazinamide/scripts/read_pdb.R deleted file mode 100644 index 41ca884..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/read_pdb.R +++ /dev/null @@ -1,27 +0,0 @@ -######################### -#3: Read complex pdb file -########################## -source("Header_TT.R") -# This script only reads the pdb file of your complex - -# read in pdb file complex1 -inDir = "~/git/Data/pyrazinamide/input/structure/" -inFile = paste0(inDir, "complex1_no_water.pdb") -complex1 = inFile - -#inFile2 = paste0(inDir, "complex2_no_water.pdb") -#complex2 = inFile2 - -# list of 8 -my_pdb = read.pdb(complex1 - , maxlines = -1 - , multi = FALSE - , rm.insert = FALSE - , rm.alt = TRUE - , ATOM.only = FALSE - , hex = FALSE - , verbose = TRUE) - -rm(inDir, inFile, complex1) -#====== end of script - diff --git a/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R b/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R deleted file mode 100644 index 658eec4..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R +++ /dev/null @@ -1,386 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("Header_TT.R") - -######################################################### -# TASK: replace B-factors in the pdb file with normalised values -# use the complex file with no water as mCSM lig was -# performed on this file. You can check it in the script: read_pdb file. 
-######################################################### - -########################### -# 2: Read file: average stability values -# or mcsm_normalised file, output of step 4 mcsm pipeline -########################### - -inDir = "~/git/Data/pyrazinamide/input/processed/" -inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile - -my_df <- read.csv(inFile -# , row.names = 1 -# , stringsAsFactors = F - , header = T) -str(my_df) - -#========================================================= -# Processing P1: Replacing B factor with mean ratioDUET scores -#========================================================= - -######################### -# Read complex pdb file -# form the R script -########################## - -source("read_pdb.R") # list of 8 - -# extract atom list into a variable -# since in the list this corresponds to data frame, variable will be a df -d = my_pdb[[1]] - -# make a copy: required for downstream sanity checks -d2 = d - -# sanity checks: B factor -max(d$b); min(d$b) - -#******************************************* -# plot histograms for inspection -# 1: original B-factors -# 2: original DUET Scores -# 3: replaced B-factors with DUET Scores -#********************************************* -# Set the margin on all sides -par(oma = c(3,2,3,0) - , mar = c(1,3,5,2) - , mfrow = c(3,2)) -#par(mfrow = c(3,2)) - - #1: Original B-factor -hist(d$b - , xlab = "" - , main = "B-factor") - -plot(density(d$b) - , xlab = "" - , main = "B-factor") - -# 2: DUET scores -hist(my_df$average_DUETR - , xlab = "" - , main = "Norm_DUET") - -plot(density(my_df$average_DUETR) - , xlab = "" - , main = "Norm_DUET") - -# 3: After the following replacement -#******************************** - -#========= -# step 0_P1: DONT RUN once you have double checked the matched output -#========= -# sanity check: match and assign to a separate column to double check -# colnames(my_df) -# d$ratioDUET = my_df$averge_DUETR[match(d$resno, my_df$Position)] - -#========= -# step 1_P1 -#========= -# Be brave and replace in place now (don't run sanity check) -# this makes all the B-factor values in the non-matched positions as NA -d$b = my_df$average_DUETR[match(d$resno, my_df$Position)] - -#========= -# step 2_P1 -#========= -# count NA in Bfactor -b_na = sum(is.na(d$b)) ; b_na - -# count number of 0's in Bactor -sum(d$b == 0) -#table(d$b) - -# replace all NA in b factor with 0 -d$b[is.na(d$b)] = 0 - -# sanity check: should be 0 -sum(is.na(d$b)) - -# sanity check: should be True -if (sum(d$b == 0) == b_na){ - print ("Sanity check passed: NA's replaced with 0's successfully") -} else { - print("Error: NA replacement NOT successful, Debug code!") -} - -max(d$b); min(d$b) - -# sanity checks: should be True -if(max(d$b) == max(my_df$average_DUETR)){ - print("Sanity check passed: B-factors replaced correctly") -} else { - print ("Error: Debug code please") -} - -if (min(d$b) == min(my_df$average_DUETR)){ - print("Sanity check passed: B-factors replaced correctly") -} else { - print ("Error: Debug code please") -} - -#========= -# step 3_P1 -#========= -# sanity check: dim should be same before reassignment -# should be TRUE -dim(d) == dim(d2) - -#========= -# step 4_P1 -#========= -# assign it back to the pdb file -my_pdb[[1]] = d - -max(d$b); min(d$b) - -#========= -# step 5_P1 -#========= -# output dir -getwd() -outDir = "~/git/Data/pyrazinamide/input/structure/" - -outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile -write.pdb(my_pdb, outFile) - -#******************************** -# Add the 3rd histogram and 
density plots for comparisons -#******************************** -# Plots continued... -# 3: hist and density of replaced B-factors with DUET Scores -hist(d$b - , xlab = "" - , main = "repalced-B") - -plot(density(d$b) - , xlab = "" - , main = "replaced-B") - -# graph titles -mtext(text = "Frequency" - , side = 2 - , line = 0 - , outer = TRUE) - -mtext(text = "DUET_stability" - , side = 3 - , line = 0 - , outer = TRUE) -#******************************** - -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# NOTE: This replaced B-factor distribution has the same -# x-axis as the PredAff normalised values, but the distribution -# is affected since 0 is overinflated. This is because all the positions -# where there are no SNPs have been assigned 0. -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - - - - -####################################################################### -#====================== end of section 1 ============================== -####################################################################### - - - - - -#========================================================= -# Processing P2: Replacing B values with PredAff Scores -#========================================================= -# clear workspace -rm(list = ls()) - -########################### -# 2: Read file: average stability values -# or mcsm_normalised file, output of step 4 mcsm pipeline -########################### - -inDir = "~/git/Data/pyrazinamide/input/processed/" -inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile - -my_df <- read.csv(inFile -# , row.names = 1 -# , stringsAsFactors = F - , header = T) -str(my_df) -#rm(inDir, inFile) - -######################### -# 3: Read complex pdb file -# form the R script -########################## - -source("read_pdb.R") # list of 8 - -# extract atom list into a variable -# since in the list this corresponds to data frame, variable will be a df -d = my_pdb[[1]] - -# make a copy: required for downstream sanity checks -d2 = d - -# sanity checks: B factor -max(d$b); min(d$b) - -#******************************************* -# plot histograms for inspection -# 1: original B-factors -# 2: original Pred Aff Scores -# 3: replaced B-factors with PredAff Scores -#******************************************** -# Set the margin on all sides -par(oma = c(3,2,3,0) - , mar = c(1,3,5,2) - , mfrow = c(3,2)) -#par(mfrow = c(3,2)) - -# 1: Original B-factor -hist(d$b - , xlab = "" - , main = "B-factor") - -plot(density(d$b) - , xlab = "" - , main = "B-factor") - -# 2: Pred Aff scores -hist(my_df$average_PredAffR - , xlab = "" - , main = "Norm_lig_average") - -plot(density(my_df$average_PredAffR) - , xlab = "" - , main = "Norm_lig_average") - -# 3: After the following replacement -#******************************** - -#================================================= -# Processing P2: Replacing B values with ratioPredAff scores -#================================================= -# use match to perform this replacement linking with "position no" -# in the pdb file, this corresponds to column "resno" -# in my_df, this corresponds to column "Position" - -#========= -# step 0_P2: DONT RUN once you have double checked the matched output -#========= -# sanity check: match and assign to a separate column to double check -# colnames(my_df) -# d$ratioPredAff = my_df$average_PredAffR[match(d$resno, my_df$Position)] #1384, 17 - -#========= -# step 1_P2: BE BRAVE and replace in place now (don't run step 0) -#========= -# this makes all the B-factor 
values in the non-matched positions as NA -d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)] - -#========= -# step 2_P2 -#========= -# count NA in Bfactor -b_na = sum(is.na(d$b)) ; b_na - -# count number of 0's in Bactor -sum(d$b == 0) -#table(d$b) - -# replace all NA in b factor with 0 -d$b[is.na(d$b)] = 0 - -# sanity check: should be 0 -sum(is.na(d$b)) - -if (sum(d$b == 0) == b_na){ - print ("Sanity check passed: NA's replaced with 0's successfully") -} else { - print("Error: NA replacement NOT successful, Debug code!") -} - -max(d$b); min(d$b) - -# sanity checks: should be True -if (max(d$b) == max(my_df$average_PredAffR)){ - print("Sanity check passed: B-factors replaced correctly") -} else { - print ("Error: Debug code please") -} - -if (min(d$b) == min(my_df$average_PredAffR)){ - print("Sanity check passed: B-factors replaced correctly") -} else { - print ("Error: Debug code please") -} - -#========= -# step 3_P2 -#========= -# sanity check: dim should be same before reassignment -# should be TRUE -dim(d) == dim(d2) - -#========= -# step 4_P2 -#========= -# assign it back to the pdb file -my_pdb[[1]] = d - -max(d$b); min(d$b) - -#========= -# step 5_P2 -#========= - -# output dir -outDir = "~/git/Data/pyrazinamide/input/structure/" -outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile -write.pdb(my_pdb, outFile) - -#******************************** -# Add the 3rd histogram and density plots for comparisons -#******************************** -# Plots continued... -# 3: hist and density of replaced B-factors with PredAff Scores -hist(d$b - , xlab = "" - , main = "repalced-B") - -plot(density(d$b) - , xlab = "" - , main = "replaced-B") - -# graph titles -mtext(text = "Frequency" - , side = 2 - , line = 0 - , outer = TRUE) - -mtext(text = "Lig_stability" - , side = 3 - , line = 0 - , outer = TRUE) - -#******************************** - -########### -# end of output files with Bfactors -########## diff --git a/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R b/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R deleted file mode 100644 index 9f30f28..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R +++ /dev/null @@ -1,257 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts") -getwd() - -######################################################### -# 1: Installing and loading required packages # -######################################################### - -source("Header_TT.R") -#source("barplot_colour_function.R") - -########################################################## -# Checking: Entire data frame and for PS # -########################################################## - -########################### -#2) Read file: combined one from the script -########################### -source("combining_two_df.R") - -# df with NA: -# merged_df2 -# merged_df3: - -# df without NA: -# merged_df2_comp: -# merged_df3_comp: - -###################### -# You need to check it -# with the merged_df3 -######################## - -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df3 -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -#clear variables -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# should be true -identical(my_df$Position, my_df$position) - -################################# -# Read file: normalised file -# output of step 4 mcsm_pipeline -################################# - - -inDir = "~/git/Data/pyrazinamide/input/processed/" -inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile - 
-mcsm_data <- read.csv(inFile - , row.names = 1 - , stringsAsFactors = F - , header = T) -str(mcsm_data) -my_colnames = colnames(mcsm_data) - -#==================================== -# subset my_df to include only the columns in mcsm data -my_df2 = my_df[my_colnames] -#==================================== -# compare the two -head(mcsm_data$Mutationinformation) -head(mcsm_data$Position) - -head(my_df2$Mutationinformation) -head(my_df2$Position) - -# sort mcsm data by Mutationinformation -mcsm_data_s = mcsm_data[order(mcsm_data$Mutationinformation),] -head(mcsm_data_s$Mutationinformation) -head(mcsm_data_s$Position) - -# now compare: should be True, but is false.... -# possibly due to rownames!?! -identical(mcsm_data_s, my_df2) - -# from library dplyr -setdiff(mcsm_data_s, my_df2) - -#from lib compare -compare(mcsm_data_s, my_df2) # seems rownames are the problem - -# FIXME: automate this -# write files: checked using meld and files are indeed identical -#write.csv(mcsm_data_s, "mcsm_data_s.csv", row.names = F) -#write.csv(my_df2, "my_df2.csv", row.names = F) - - -#====================================================== end of section 1 - - - -########################################################## -# Checking: LIG(Filtered dataframe) # -########################################################## - -# clear workspace -rm(list = ls()) - -########################### -#3) Read file: combined_lig from the script -########################### -source("combining_two_df_lig.R") - -# df with NA: -# merged_df2 : -# merged_df3: - -# df without NA: -# merged_df2_comp: -# merged_df3_comp: - -###################### -# You need to check it -# with the merged_df3 -######################## - -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df3 -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -#clear variables -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# should be true -identical(my_df$Position, my_df$position) - -################################# -# Read file: normalised file -# output of step 4 mcsm_pipeline -################################# - -inDir = "~/git/Data/pyrazinamide/input/processed/" -inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile - -mcsm_data <- read.csv(inFile - , row.names = 1 - , stringsAsFactors = F - , header = T) -str(mcsm_data) - -########################### -# 4a: Filter/subset data: ONLY for LIGand analysis -# Lig plots < 10Ang -# Filter the lig plots for Dis_to_lig < 10Ang -########################### -# sanity checks -upos = unique(mcsm_data$Position) - -# check range of distances -max(mcsm_data$Dis_lig_Ang) -min(mcsm_data$Dis_lig_Ang) - -# Lig filtered: subset data to have only values less than 10 Ang -mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10) - -rm(mcsm_data) #to avoid confusion - -table(mcsm_data2$Dis_lig_Ang<10) -table(mcsm_data2$Dis_lig_Ang>10) - -max(mcsm_data2$Dis_lig_Ang) -min(mcsm_data2$Dis_lig_Ang) - -upos_f = unique(mcsm_data2$Position); upos_f - -# colnames of df that you will need to subset the bigger df from -my_colnames = colnames(mcsm_data2) -#==================================== -# subset bigger df i.e my_df to include only the columns in mcsm data2 -my_df2 = my_df[my_colnames] - -rm(my_df) #to avoid confusion -#==================================== -# compare the two -head(mcsm_data2$Mutationinformation) -head(mcsm_data2$Position) - -head(my_df2$Mutationinformation) -head(my_df2$Position) - -# sort mcsm data by Mutationinformation -mcsm_data2_s = mcsm_data2[order(mcsm_data2$Mutationinformation),] 
-head(mcsm_data2_s$Mutationinformation) -head(mcsm_data2_s$Position) - -# now compare: should be True, but is false.... -# possibly due to rownames!?! -identical(mcsm_data2_s, my_df2) - -# from library dplyr -setdiff(mcsm_data2_s, my_df2) - -# from library compare -compare(mcsm_data2_s, my_df2) # seems rownames are the problem - -#FIXME: automate this -# write files: checked using meld and files are indeed identical -#write.csv(mcsm_data2_s, "mcsm_data2_s.csv", row.names = F) -#write.csv(my_df2, "my_df2.csv", row.names = F) - - -########################################################## -# extract and write output file for SNP posn: all # -########################################################## - -head(merged_df3$Position) - -foo = merged_df3[order(merged_df3$Position),] -head(foo$Position) - -snp_pos_unique = unique(foo$Position); snp_pos_unique - -# sanity check: -table(snp_pos_unique == combined_df$Position) - -#===================== -# write_output files -#===================== -outDir = "~/Data/pyrazinamide/input/processed/" - - -outFile1 = paste0(outDir, "snp_pos_unique.txt"); outFile1 -print(paste0("Output file name and path will be:","", outFile1)) - -write.table(snp_pos_unique - , outFile1 - , row.names = F - , col.names = F) - -############################################################## -# extract and write output file for SNP posn: complete only # -############################################################## -head(merged_df3_comp$Position) - -foo = merged_df3_comp[order(merged_df3_comp$Position),] -head(foo$Position) - -snp_pos_unique = unique(foo$Position); snp_pos_unique - -# outDir = "~/Data/pyrazinamide/input/processed/" # already set - -outFile2 = paste0(outDir, "snp_pos_unique_comp.txt") -print(paste0("Output file name and path will be:", outFile2)) - -write.table(snp_pos_unique - , outFile2 - , row.names = F - , col.names = F) -#============================== end of script - - diff --git a/meta_data_analysis/dssp_df.py b/meta_data_analysis/dssp_df.py index 7b59cfa..5d3dc64 100755 --- a/meta_data_analysis/dssp_df.py +++ b/meta_data_analysis/dssp_df.py @@ -48,9 +48,10 @@ gene = 'pncA' datadir = homedir + '/' + 'git/Data' #======= -# input +# input from outdir #======= -indir = datadir + '/' + drug + '/' + 'output' +#indir = datadir + '/' + drug + '/' + 'output' +outdir = datadir + '/' + drug + '/' + 'output' #in_filename = 'pnca.dssp' in_filename = gene.lower() +'.dssp' infile = indir + '/' + in_filename diff --git a/mk_drug_dirs.sh b/mk_drug_dirs.sh index 6a6dd6d..a336ed3 100755 --- a/mk_drug_dirs.sh +++ b/mk_drug_dirs.sh @@ -4,9 +4,6 @@ ## Structure: # # $DATA_DIR/$DRUG/input -# |- original -# |- processed -# |- structure # # $DATA_DIR/$DRUG/output # |- plots @@ -15,18 +12,17 @@ DATA_DIR=~/git/Data if [[ $1 == '' ]]; then + echo "Error" echo "usage: mk-drug-dirs.sh "; exit; else DRUG=$1 - echo Creating structure for: $DRUG + echo Creating directory structure for: $DRUG if [ -d $DATA_DIR ] then echo Doing creation in $DATA_DIR - mkdir -p $DATA_DIR/$DRUG/input/original - mkdir -p $DATA_DIR/$DRUG/input/processed - mkdir -p $DATA_DIR/$DRUG/input/structure + mkdir -p $DATA_DIR/$DRUG/input mkdir -p $DATA_DIR/$DRUG/output/plots mkdir -p $DATA_DIR/$DRUG/output/structure diff --git a/meta_data_analysis/data_extraction.py b/scripts/data_extraction.py similarity index 57% rename from meta_data_analysis/data_extraction.py rename to scripts/data_extraction.py index 70f3008..451d6cf 100755 --- a/meta_data_analysis/data_extraction.py +++ b/scripts/data_extraction.py @@ -11,63 
+11,77 @@ Created on Tue Aug 6 12:56:03 2019 # FIXME: import dirs.py to get the basic dir paths available #======================================================================= -# TASK: extract ALL pncA_p. mutations from GWAS data +# TASK: extract ALL matched mutations from GWAS data # Input data file has the following format: each row = unique sample id -# id,country,lineage,sublineage,drtype,pyrazinamide,dr_mutations_pyrazinamide,other_mutations_pyrazinamide... -# 0,sampleID,USA,lineage2,lineage2.2.1,Drug-resistant,0.0,WT,pncA_p.POS; pncA_c.POS... -# where multiple mutations and multiple mutation types are separated by ';'. We are interested in the -# protein coding region i.e mutation with the 'p.' format. - -# the script splits the mutations on the ';' and extracts protein coding muts only +# id,country,lineage,sublineage,drtype,drug,dr_muts_col,other_muts_col... +# 0,sampleID,USA,lineage2,lineage2.2.1,Drug-resistant,0.0,WT,gene_matchPOS; pncA_c.POS... +# where multiple mutations and multiple mutation types are separated by ';'. +# We are interested in the protein coding region i.e mutation with the_'p.' format. +# This script splits the mutations on the ';' and extracts protein coding muts only # where each row is a separate mutation # sample ids AND mutations are NOT unique, but the COMBINATION (sample id + mutation) = unique -# output files: -# 0) pnca_common_ids.csv -# 1) pnca_ambiguous_muts.csv -# 2) pnca_mcsm_snps.csv -# 3) pnca_metadata.csv -# 4) pnca_all_muts_msa.csv -# 5) pnca_mutational_positons.csv +# output files: all lower case +# 0) _common_ids.csv +# 1) _ambiguous_muts.csv +# 2) _mcsm_snps.csv +# 3) _metadata.csv +# 4) _all_muts_msa.csv +# 5) _mutational_positons.csv #======================================================================= #%% load libraries import os, sys import pandas as pd #import numpy as np - -#from pandas.api.types import is_string_dtype -#from pandas.api.types import is_numeric_dtype - -#%% specify homedir as python doesn't recognise tilde +import argparse +#======================================================================= +#%% homdir and curr dir and local imports homedir = os.path.expanduser('~') - # set working dir os.getcwd() -os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') +os.chdir(homedir + '/git/LSHTM_analysis/scripts') os.getcwd() # import aa dict -from reference_dict import my_aa_dict #CHECK DIR STRUC THERE! +from reference_dict import my_aa_dict # CHECK DIR STRUC THERE! +#======================================================================= +#%% command line args +arg_parser = argparse.ArgumentParser() +arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide') +arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive +args = arg_parser.parse_args() #======================================================================= #%% variable assignment: input and output paths & filenames -drug = 'pyrazinamide' -gene = 'pncA' +#drug = 'pyrazinamide' +#gene = 'pncA' +drug = args.drug +gene = args.gene gene_match = gene + '_p.' 
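With the hard-coded drug/gene pair replaced by argparse arguments, the script can now be pointed at any drug-gene combination from the command line, falling back to the pyrazinamide/pncA defaults when no flags are given. A small self-contained sketch of how the parsed values feed the rest of the script; the isoniazid/katG pair is only an illustration, not something this patch sets up:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-d', '--drug', help='drug name', default='pyrazinamide')
parser.add_argument('-g', '--gene', help='gene name', default='pncA')  # case sensitive

# simulate: python data_extraction.py -d isoniazid -g katG
args = parser.parse_args(['-d', 'isoniazid', '-g', 'katG'])

gene_match = args.gene + '_p.'                      # 'katG_p.'
dr_muts_col = 'dr_mutations_' + args.drug           # 'dr_mutations_isoniazid'
other_muts_col = 'other_mutations_' + args.drug     # 'other_mutations_isoniazid'
print(gene_match, dr_muts_col, other_muts_col)
```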
+# building cols to extract +dr_muts_col = 'dr_mutations_' + drug +other_muts_col = 'other_mutations_' + drug +print('Extracting columns based on variables:\n' + , drug + , '\n' + , dr_muts_col + , '\n' + , other_muts_col + , '\n===============================================================') +#======================================================================= +#%% input and output dirs and files #======= # data dir #======= -#indir = 'git/Data/pyrazinamide/input/original' datadir = homedir + '/' + 'git/Data' #======= # input #======= -#indir = 'git/Data/pyrazinamide/input/original' in_filename = 'original_tanushree_data_v2.csv' infile = datadir + '/' + in_filename print('Input filename: ', in_filename - , '\nInput path: ', indir + , '\nInput path: ', datadir , '\n============================================================') #======= @@ -88,15 +102,15 @@ master_data = pd.read_csv(infile, sep = ',') # column names #list(master_data.columns) -# extract elevant columns to extract from meta data related to the pyrazinamide +# extract elevant columns to extract from meta data related to the drug meta_data = master_data[['id' ,'country' ,'lineage' ,'sublineage' ,'drtype' - , 'pyrazinamide' - , 'dr_mutations_pyrazinamide' - , 'other_mutations_pyrazinamide' + , drug + , dr_muts_col + , other_muts_col ]] del(master_data) @@ -115,13 +129,13 @@ print('No. of NAs/column:' + '\n', meta_data.isna().sum() meta_data.head() # equivalent of table in R -# pyrazinamide counts -meta_data.pyrazinamide.value_counts() -print('RESULT: Sus and Res samples:\n', meta_data.pyrazinamide.value_counts() +# drug counts +meta_data[drug].value_counts() +print('RESULT: Sus and Res samples:\n', meta_data[drug].value_counts() , '\n===========================================================') # clear variables -del(indir, in_filename,infile) +del(in_filename,infile) #del(outdir) #%% # !!!IMPORTANT!!! sanity check: @@ -129,18 +143,18 @@ del(indir, in_filename,infile) # can use it to check if your data extraction process for dr_muts # and other_muts has worked correctly AND also to check the dim of the # final formatted data. -# This will have: unique COMBINATION of sample id and pncA_p.mutations +# This will have: unique COMBINATION of sample id and mutations #======== -# First: counting pncA_p. mutations in dr_mutations_pyrazinamide column +# First: counting mutations in dr_muts_col column #======== -print('Now counting WT & pncA_p. 
muts within the column: dr_mutations_pyrazinamide') +print('Now counting WT &', gene_match, 'muts within the column:', dr_muts_col) # drop na and extract a clean df -clean_df = meta_data.dropna(subset=['dr_mutations_pyrazinamide']) +clean_df = meta_data.dropna(subset=[dr_muts_col]) # sanity check: count na -na_count = meta_data['dr_mutations_pyrazinamide'].isna().sum() +na_count = meta_data[dr_muts_col].isna().sum() if len(clean_df) == (total_samples - na_count): print('PASS: clean_df extracted: length is', len(clean_df) @@ -150,7 +164,7 @@ else: print('FAIL: dropping NA failed' , '\n==========================================================') -dr_pnca_count = 0 +dr_gene_count = 0 wt = 0 id_dr = [] id2_dr = [] @@ -158,45 +172,44 @@ id2_dr = [] for i, id in enumerate(clean_df.id): # print (i, id) # id_dr.append(id) -# count_pnca_dr = clean_df.dr_mutations_pyrazinamide.iloc[i].count('pncA_p.') #works 2201 - count_pnca_dr = clean_df.dr_mutations_pyrazinamide.iloc[i].count(gene_match) #works 2201 - if count_pnca_dr > 0: + count_gene_dr = clean_df[dr_muts_col].iloc[i].count(gene_match) + if count_gene_dr > 0: id_dr.append(id) - if count_pnca_dr > 1: + if count_gene_dr > 1: id2_dr.append(id) -# print(id, count_pnca_dr) - dr_pnca_count = dr_pnca_count + count_pnca_dr - count_wt = clean_df.dr_mutations_pyrazinamide.iloc[i].count('WT') +# print(id, count_gene_dr) + dr_gene_count = dr_gene_count + count_gene_dr + count_wt = clean_df[dr_muts_col].iloc[i].count('WT') wt = wt + count_wt print('RESULTS:') -print('Total WT in dr_mutations_pyrazinamide:', wt) -print('Total matches of', gene_match, 'in dr_mutations_pyrazinamide:', dr_pnca_count) -print('Total samples with > 1', gene_match, 'muts in dr_mutations_pyrazinamide:', len(id2_dr) ) +print('Total WT in dr_muts_col:', wt) +print('Total matches of', gene_match, 'in dr_muts_col:', dr_gene_count) +print('Total samples with > 1', gene_match, 'muts in dr_muts_col:', len(id2_dr) ) print('=================================================================') -del(i, id, wt, id2_dr, clean_df, na_count, count_pnca_dr, count_wt) +del(i, id, wt, id2_dr, clean_df, na_count, count_gene_dr, count_wt) #======== -# Second: counting pncA_p. mutations in dr_mutations_pyrazinamide column +# Second: counting mutations in dr_muts_col column #======== -print('Now counting WT & pncA_p. 
muts within the column: other_mutations_pyrazinamide') +print('Now counting WT &', gene_match, 'muts within the column:', other_muts_col) # drop na and extract a clean df -clean_df = meta_data.dropna(subset=['other_mutations_pyrazinamide']) +clean_df = meta_data.dropna(subset=[other_muts_col]) # sanity check: count na -na_count = meta_data['other_mutations_pyrazinamide'].isna().sum() +na_count = meta_data[other_muts_col].isna().sum() if len(clean_df) == (total_samples - na_count): print('PASS: clean_df extracted: length is', len(clean_df) - , '\nNo.of NA s=', na_count, '/', total_samples + , '\nNo.of NAs =', na_count, '/', total_samples , '\n=========================================================') else: print('FAIL: dropping NA failed' , '\n=========================================================') -other_pnca_count = 0 +other_gene_count = 0 wt_other = 0 id_other = [] id2_other = [] @@ -204,63 +217,63 @@ id2_other = [] for i, id in enumerate(clean_df.id): # print (i, id) # id_other.append(id) -# count_pnca_other = clean_df.other_mutations_pyrazinamide.iloc[i].count('pncA_p.') - count_pnca_other = clean_df.other_mutations_pyrazinamide.iloc[i].count(gene_match) - if count_pnca_other > 0: +# count_gene_other = clean_df[other_muts_col].iloc[i].count('gene_match') + count_gene_other = clean_df[other_muts_col].iloc[i].count(gene_match) + if count_gene_other > 0: id_other.append(id) - if count_pnca_other > 1: + if count_gene_other > 1: id2_other.append(id) -# print(id, count_pnca_other) - other_pnca_count = other_pnca_count + count_pnca_other - count_wt = clean_df.other_mutations_pyrazinamide.iloc[i].count('WT') +# print(id, count_gene_other) + other_gene_count = other_gene_count + count_gene_other + count_wt = clean_df[other_muts_col].iloc[i].count('WT') wt_other = wt_other + count_wt print('RESULTS:') -print('Total WT in other_mutations_pyrazinamide:', wt_other) -print('Total matches of', gene_match, 'in other_mutations_pyrazinamide:', other_pnca_count) -print('Total samples with > 1', gene_match, 'muts in other_mutations_pyrazinamide:', len(id2_other) ) +print('Total WT in other_muts_col:', wt_other) +print('Total matches of', gene_match, 'in', other_muts_col, ':', other_gene_count) +print('Total samples with > 1', gene_match, 'muts in other_muts_col:', len(id2_other) ) print('=================================================================') -print('Predicting total no. of rows in your curated df:', dr_pnca_count + other_pnca_count ) -expected_rows = dr_pnca_count + other_pnca_count +print('Predicting total no. of rows in the curated df:', dr_gene_count + other_gene_count + , '\n===================================================================') +expected_rows = dr_gene_count + other_gene_count -del(i, id, wt_other, clean_df, na_count, id2_other, count_pnca_other, count_wt) +del(i, id, wt_other, clean_df, na_count, id2_other, count_gene_other, count_wt) #%% ############ # extracting dr and other muts separately along with the common cols ############# -print('=================================================================') -print('Extracting dr_muts in a dr_mutations_pyrazinamide with other meta_data') +print('Extracting dr_muts from col:', dr_muts_col, 'with other meta_data') print('gene to extract:', gene_match ) #=============== -# dr mutations: extract pncA_p. 
entries with meta data and ONLY dr_muts col +# dr mutations: extract gene_match entries with meta data and ONLY dr_muts col #=============== -# FIXME: replace pyrazinamide with variable containing the drug name +# FIXME: replace drug with variable containing the drug name # !!! important !!! meta_data_dr = meta_data[['id' ,'country' ,'lineage' ,'sublineage' ,'drtype' - , 'pyrazinamide' - , 'dr_mutations_pyrazinamide' + , drug + , dr_muts_col ]] print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) ) print('actual dim:', meta_data_dr.shape , '\n===============================================================') # Extract within this the gene of interest using string match -#meta_pnca_dr = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)] -meta_pnca_dr = meta_data_dr.loc[meta_data_dr.dr_mutations_pyrazinamide.str.contains(gene_match, na = False)] +#meta_gene_dr = meta_data.loc[meta_data[dr_muts_col].str.contains('gene_match*', na = False)] +meta_gene_dr = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(gene_match, na = False)] -dr_id = meta_pnca_dr['id'].unique() +dr_id = meta_gene_dr['id'].unique() print('RESULT: No. of samples with dr muts in pncA:', len(dr_id)) print('checking RESULT:', '\nexpected len =', len(id_dr), - '\nactual len =', len(meta_pnca_dr) ) + '\nactual len =', len(meta_gene_dr) ) -if len(id_dr) == len(meta_pnca_dr): +if len(id_dr) == len(meta_gene_dr): print('PASS: lengths match' , '\n===============================================================') else: @@ -270,18 +283,18 @@ else: dr_id = pd.Series(dr_id) #================= -# other mutations: extract pncA_p. entries +# other mutations: extract gene_match entries #================== -print('Extracting dr_muts in a other_mutations_pyrazinamide with other meta_data') -# FIXME: replace pyrazinamide with variable containing the drug name +print('Extracting dr_muts from:', other_muts_col,'with other meta_data') +# FIXME: replace drug with variable containing the drug name # !!! important !!! meta_data_other = meta_data[['id' ,'country' ,'lineage' ,'sublineage' ,'drtype' - , 'pyrazinamide' - , 'other_mutations_pyrazinamide' + , drug + , other_muts_col ]] print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) ) @@ -289,15 +302,15 @@ print('actual dim:', meta_data_other.shape , '\n===============================================================') # Extract within this the gene of interest using string match -meta_pnca_other = meta_data_other.loc[meta_data_other.other_mutations_pyrazinamide.str.contains(gene_match, na = False)] +meta_gene_other = meta_data_other.loc[meta_data_other[other_muts_col].str.contains(gene_match, na = False)] -other_id = meta_pnca_other['id'].unique() +other_id = meta_gene_other['id'].unique() print('RESULT: No. of samples with other muts:', len(other_id)) print('checking RESULT:', '\nexpected len =', len(id_other), - '\nactual len =', len(meta_pnca_other) ) + '\nactual len =', len(meta_gene_other) ) -if len(id_other) == len(meta_pnca_other): +if len(id_other) == len(meta_gene_other): print('PASS: lengths match' , '\n==============================================================') else: @@ -308,7 +321,7 @@ other_id = pd.Series(other_id) #%% Find common IDs print('Now extracting common_ids...') common_mut_ids = dr_id.isin(other_id).sum() -print('RESULT: No. of common Ids:', common_mut_ids) +print('RESULT: No. 
of common ids:', common_mut_ids) # sanity checks # check if True: should be since these are common @@ -327,9 +340,9 @@ common_ids2.columns = ['index', 'id2'] # should be True print(common_ids['id'].equals(common_ids2['id2'])) -# good sanity check: use it later to check pnca_sample_counts -expected_pnca_samples = ( len(meta_pnca_dr) + len(meta_pnca_other) - common_mut_ids ) -print('expected no. of pnca samples:', expected_pnca_samples) +# good sanity check: use it later to check gene_sample_counts +expected_gene_samples = ( len(meta_gene_dr) + len(meta_gene_other) - common_mut_ids ) +print('expected no. of gene samples:', expected_gene_samples) print('=================================================================') #%% write file #print(outdir) @@ -348,47 +361,47 @@ del(out_filename0) # clear variables del(dr_id, other_id, meta_data_dr, meta_data_other, common_ids, common_mut_ids, common_ids2) +#%% Now extract 'all' pncA mutations: i.e 'gene_match*' +print('extracting from string match:', gene_match, 'mutations from cols:\n' + , dr_muts_col, 'and', other_muts_col, 'using string match:' + , '\n===================================================================') +#meta_gene_all = meta_data.loc[meta_data[dr_muts_col].str.contains(gene_match) | meta_data[other_muts_col].str.contains(gene_match) ] +meta_gene_all = meta_data.loc[meta_data[dr_muts_col].str.contains(gene_match, na = False) | meta_data[other_muts_col].str.contains(gene_match, na = False) ] -#%% Now extract 'all' pncA mutations: i.e 'pncA_p.*' -print('extracting all pncA mutations from dr_ and other_ cols using string match:', gene_match - , '\n===============================================================') -#meta_pnca_all = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains(gene_match) | meta_data.other_mutations_pyrazinamide.str.contains(gene_match) ] -meta_pnca_all = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains(gene_match, na = False) | meta_data.other_mutations_pyrazinamide.str.contains(gene_match, na = False) ] +extracted_gene_samples = meta_gene_all['id'].nunique() +print('RESULT: actual no. of gene samples extracted:', extracted_gene_samples + , '\n===================================================================') -extracted_pnca_samples = meta_pnca_all['id'].nunique() -print('RESULT: actual no. of pnca samples extracted:', extracted_pnca_samples) -print('======================================================================') - -# sanity check: length of pnca samples +# sanity check: length of gene samples print('Performing sanity check:') -if extracted_pnca_samples == expected_pnca_samples: - print('No. of pnca samples:', len(meta_pnca_all) - , '\nPASS: expected & actual no. of pnca samples match' +if extracted_gene_samples == expected_gene_samples: + print('No. of gene samples:', len(meta_gene_all) + , '\nPASS: expected & actual no. of gene samples match' , '\n=========================================================') else: print('FAIL: Debug please!' , '\n===============================================================') -# count NA in pyrazinamide column -pnca_na = meta_pnca_all['pyrazinamide'].isna().sum() -print('No. of pnca samples without pza testing i.e NA in pza column:', pnca_na) +# count NA in drug column +gene_na = meta_gene_all[drug].isna().sum() +print('No. 
of gene samples without pza testing i.e NA in pza column:', gene_na) # use it later to check number of complete samples from LF data -comp_pnca_samples = len(meta_pnca_all) - pnca_na -print('comp pnca samples tested for pza:', comp_pnca_samples) +comp_gene_samples = len(meta_gene_all) - gene_na +print('comp gene samples tested for pza:', comp_gene_samples) print('=================================================================') # Comment: This is still dirty data since these -# are samples that have pncA_p. muts, but can have others as well +# are samples that have gene_match muts, but can have others as well # since the format for mutations is mut1; mut2, etc. -print('This is still dirty data: samples have pncA_p. muts, but may have others as well' +print('This is still dirty data: samples have ', gene_match, 'muts but may have others as well' , '\nsince the format for mutations is mut1; mut2, etc.' , '\n=============================================================') #%% tidy_split():Function to split mutations on specified delimiter: ';' #https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas +print('Performing tidy_split(): to separate the mutations into indivdual rows') -print('Performing tidy_spllit(): to separate the mutations into indivdual rows') # define the split function def tidy_split(df, column, sep='|', keep=False): ''' @@ -428,38 +441,38 @@ def tidy_split(df, column, sep='|', keep=False): #%% end of tidy_split() #========= -# DF1: dr_mutations_pyrazinamide +# DF1: dr_muts_col #========= ######## -# tidy_split(): on 'dr_mutations_pyrazinamide' column and remove leading white spaces +# tidy_split(): on dr_muts_col column and remove leading white spaces ######## -col_to_split1 = 'dr_mutations_pyrazinamide' -print ('Firstly, applying tidy split on dr df:', meta_pnca_dr.shape - , '\ncolumn name:', col_to_split1 +col_to_split1 = dr_muts_col +print ('Firstly, applying tidy split on dr muts df', meta_gene_dr.shape + , '\ncolumn name to apply tidy_split():', col_to_split1 , '\n============================================================') # apply tidy_split() -dr_WF0 = tidy_split(meta_pnca_dr, col_to_split1, sep = ';') +dr_WF0 = tidy_split(meta_gene_dr, col_to_split1, sep = ';') # remove leading white space else these are counted as distinct mutations as well -dr_WF0['dr_mutations_pyrazinamide'] = dr_WF0['dr_mutations_pyrazinamide'].str.lstrip() +dr_WF0[dr_muts_col] = dr_WF0[dr_muts_col].str.lstrip() -# extract only the samples/rows with pncA_p. 
-dr_pnca_WF0 = dr_WF0.loc[dr_WF0.dr_mutations_pyrazinamide.str.contains(gene_match)] +# extract only the samples/rows with gene_match +dr_gene_WF0 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(gene_match)] print('lengths after tidy split and extracting', gene_match, 'muts:' - , '\nold length:' , len(meta_pnca_dr) + , '\nold length:' , len(meta_gene_dr) , '\nlen after split:', len(dr_WF0) - , '\ndr_pnca_WF0 length:', len(dr_pnca_WF0) - , '\nexpected len:', dr_pnca_count) + , '\ndr_gene_WF0 length:', len(dr_gene_WF0) + , '\nexpected len:', dr_gene_count) -if len(dr_pnca_WF0) == dr_pnca_count: - print('PASS: length of dr_pnca_WF0 match with expected length' +if len(dr_gene_WF0) == dr_gene_count: + print('PASS: length of dr_gene_WF0 match with expected length' , '\n===============================================================') else: print('FAIL: lengths mismatch' , '\n===============================================================') # count the freq of 'dr_muts' samples -dr_muts_df = dr_pnca_WF0 [['id', 'dr_mutations_pyrazinamide']] +dr_muts_df = dr_gene_WF0 [['id', dr_muts_col]] print('dim of dr_muts_df:', dr_muts_df.shape) # add freq column @@ -468,13 +481,13 @@ dr_muts_df['dr_sample_freq'] = dr_muts_df.groupby('id')['id'].transform('count') print('revised dim of dr_muts_df:', dr_muts_df.shape) c1 = dr_muts_df.dr_sample_freq.value_counts() -print('counting no. of sample frequency:\n', c1) -print('=================================================================') +print('counting no. of sample frequency:\n', c1 + , '\n===================================================================') -# sanity check: length of pnca samples -if len(dr_pnca_WF0) == c1.sum(): +# sanity check: length of gene samples +if len(dr_gene_WF0) == c1.sum(): print('PASS: WF data has expected length' - , '\nlength of dr_pnca WFO:', c1.sum() + , '\nlength of dr_gene WFO:', c1.sum() , '\n===============================================================') else: print('FAIL: Debug please!' @@ -483,7 +496,7 @@ else: #!!! Important !!! 
# Assign 'column name' on which split was performed as an extra column # This is so you can identify if mutations are dr_type or other in the final df -dr_df = dr_pnca_WF0.assign(mutation_info = 'dr_mutations_pyrazinamide') +dr_df = dr_gene_WF0.assign(mutation_info = dr_muts_col) print('dim of dr_df:', dr_df.shape , '\n==============================================================' , '\nEnd of tidy split() on dr_muts, and added an extra column relecting mut_category' @@ -493,35 +506,35 @@ print('dim of dr_df:', dr_df.shape # DF2: other_mutations_pyrazinamdie #========= ######## -# tidy_split(): on 'other_mutations_pyrazinamide' column and remove leading white spaces +# tidy_split(): on other_muts_col column and remove leading white spaces ######## -col_to_split2 = 'other_mutations_pyrazinamide' -print ('applying second tidy split separately on df:', meta_pnca_other.shape - , '\ncolumn name:', col_to_split2 +col_to_split2 = other_muts_col +print ('applying second tidy split() separately on other muts df', meta_gene_other.shape + , '\ncolumn name to apply tidy_split():', col_to_split2 , '\n============================================================') # apply tidy_split() -other_WF1 = tidy_split(meta_pnca_other, col_to_split2, sep = ';') +other_WF1 = tidy_split(meta_gene_other, col_to_split2, sep = ';') # remove the leading white spaces in the column -other_WF1['other_mutations_pyrazinamide'] = other_WF1['other_mutations_pyrazinamide'].str.strip() +other_WF1[other_muts_col] = other_WF1[other_muts_col].str.strip() -# extract only the samples/rows with pncA_p. -other_pnca_WF1 = other_WF1.loc[other_WF1.other_mutations_pyrazinamide.str.contains(gene_match)] +# extract only the samples/rows with gene_match +other_gene_WF1 = other_WF1.loc[other_WF1[other_muts_col].str.contains(gene_match)] print('lengths after tidy split and extracting', gene_match, 'muts:', - '\nold length:' , len(meta_pnca_other), + '\nold length:' , len(meta_gene_other), '\nlen after split:', len(other_WF1), - '\nother_pnca_WF1 length:', len(other_pnca_WF1), - '\nexpected len:', other_pnca_count) + '\nother_gene_WF1 length:', len(other_gene_WF1), + '\nexpected len:', other_gene_count) -if len(other_pnca_WF1) == other_pnca_count: - print('PASS: length of dr_pnca_WF0 match with expected length +if len(other_gene_WF1) == other_gene_count: + print('PASS: length of dr_gene_WF0 match with expected length' , '\n===============================================================') else: - print('FAIL: lengths mismatch + print('FAIL: lengths mismatch' , '\n===============================================================') # count the freq of 'other muts' samples -other_muts_df = other_pnca_WF1 [['id', 'other_mutations_pyrazinamide']] +other_muts_df = other_gene_WF1 [['id', other_muts_col]] print('dim of other_muts_df:', other_muts_df.shape) # add freq column @@ -531,10 +544,10 @@ print('revised dim of other_muts_df:', other_muts_df.shape) c2 = other_muts_df.other_sample_freq.value_counts() print('counting no. of sample frequency:\n', c2) print('=================================================================') -# sanity check: length of pnca samples -if len(other_pnca_WF1) == c2.sum(): +# sanity check: length of gene samples +if len(other_gene_WF1) == c2.sum(): print('PASS: WF data has expected length' - , '\nlength of other_pnca WFO:', c2.sum() + , '\nlength of other_gene WFO:', c2.sum() , '\n===============================================================') else: print('FAIL: Debug please!' @@ -543,7 +556,7 @@ else: #!!! Important !!! 
# Assign 'column name' on which split was performed as an extra column # This is so you can identify if mutations are dr_type or other in the final df -other_df = other_pnca_WF1.assign(mutation_info = 'other_mutations_pyrazinamide') +other_df = other_gene_WF1.assign(mutation_info = other_muts_col) print('dim of other_df:', other_df.shape , '\n===============================================================' , '\nEnd of tidy split() on other_muts, and added an extra column relecting mut_category' @@ -555,17 +568,19 @@ print('dim of other_df:', other_df.shape #!!! important !!! # change column names to allow concat: # dr_muts.. & other_muts : 'mutation' -print('Now concatenating the two dfs by row') +print('Now concatenating the two dfs by row' + , '\nfirst assigning a common colname: "mutation" to the col containing muts' + , '\nthis is done for both dfs' + , '\n===================================================================') dr_df.columns -dr_df.rename(columns = {'dr_mutations_pyrazinamide': 'mutation'}, inplace = True) +dr_df.rename(columns = {dr_muts_col: 'mutation'}, inplace = True) dr_df.columns other_df.columns -other_df.rename(columns = {'other_mutations_pyrazinamide': 'mutation'}, inplace = True) +other_df.rename(columns = {other_muts_col: 'mutation'}, inplace = True) other_df.columns -print('=================================================================') print('Now appending the two dfs:' , '\ndr_df dim:', dr_df.shape , '\nother_df dim:', other_df.shape @@ -582,18 +597,18 @@ else: print('FAIL: Debug please!') # concatenate (axis = 0): Rbind -pnca_LF0 = pd.concat([dr_df, other_df], ignore_index = True, axis = 0) +gene_LF0 = pd.concat([dr_df, other_df], ignore_index = True, axis = 0) # checking colnames and length after concat print('checking colnames AFTER concatenating the two dfs...') -if (set(dr_df.columns) == set(pnca_LF0.columns)): +if (set(dr_df.columns) == set(gene_LF0.columns)): print('PASS: column names match') else: print('FAIL: Debug please!') print('checking length AFTER concatenating the two dfs...') -if len(pnca_LF0) == len(dr_df) + len(other_df): +if len(gene_LF0) == len(dr_df) + len(other_df): print('PASS:length of df after concat match' , '\n===============================================================') else: @@ -603,61 +618,59 @@ else: ########### # This is hopefully clean data, but just double check # Filter LF data so that you only have -# mutations corresponding to pncA_p.* (string match pattern) +# mutations corresponding to gene_match* (string match pattern) # this will be your list you run OR calcs ########### -print('length of pnca_LF0:', len(pnca_LF0), +print('length of gene_LF0:', len(gene_LF0), '\nThis should be what you need. But just double check and extract', gene_match, - '\nfrom LF0 (concatenated data)') + '\nfrom LF0 (concatenated data) using string match:', gene_match) -print('using string match:', gene_match) +print('Double checking and creating df: gene_LF1') +gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(gene_match)] -print('Double checking and creating df: pnca_LF1') -pnca_LF1 = pnca_LF0[pnca_LF0['mutation'].str.contains(gene_match)] - -if len(pnca_LF0) == len(pnca_LF1): - print('PASS: length of pnca_LF0 and pnca_LF1 match', +if len(gene_LF0) == len(gene_LF1): + print('PASS: length of gene_LF0 and gene_LF1 match', '\nconfirming extraction and concatenating worked correctly') else: print('FAIL: BUT NOT FATAL!' 
- , '\npnca_LF0 and pnca_LF1 lengths differ' + , '\ngene_LF0 and gene_LF1 lengths differ' , '\nsuggesting error in extraction process' - , ' use pnca_LF1 for downstreama analysis' + , ' use gene_LF1 for downstreama analysis' , '\n=========================================================') print('length of dfs pre and post processing...' - , '\npnca WF data (unique samples in each row):', extracted_pnca_samples - , '\npnca LF data (unique mutation in each row):', len(pnca_LF1) + , '\ngene WF data (unique samples in each row):', extracted_gene_samples + , '\ngene LF data (unique mutation in each row):', len(gene_LF1) , '\n=============================================================') -#%% -# final sanity check +#%% sanity check for extraction print('Verifying whether extraction process worked correctly...') -if len(pnca_LF1) == expected_rows: +if len(gene_LF1) == expected_rows: print('PASS: extraction process performed correctly' , '\nexpected length:', expected_rows - , '\ngot:', len(pnca_LF1) - , '\nRESULT: Total no. of mutant sequences for logo plot:', len(pnca_LF1) + , '\ngot:', len(gene_LF1) + , '\nRESULT: Total no. of mutant sequences for logo plot:', len(gene_LF1) , '\n=========================================================') else: print('FAIL: extraction process has bugs' , '\nexpected length:', expected_rows - , '\ngot:', len(pnca_LF1) + , '\ngot:', len(gene_LF1) , ', \Debug please' , '\n=========================================================') #%% -print('Perfmorning some more sanity checks...') +print('Performing some more sanity checks...') + # From LF1: # no. of unique muts -distinct_muts = pnca_LF1.mutation.value_counts() -print('Distinct mutations:', len(distinct_muts)) +distinct_muts = gene_LF1.mutation.value_counts() +print('Distinct genomic mutations:', len(distinct_muts)) # no. of samples contributing the unique muta -pnca_LF1.id.nunique() -print('No.of samples contributing to distinct muts:', pnca_LF1.id.nunique() ) +gene_LF1.id.nunique() +print('No.of samples contributing to distinct genomic muts:', gene_LF1.id.nunique() ) # no. of dr and other muts -mut_grouped = pnca_LF1.groupby('mutation_info').mutation.nunique() -print('No.of unique dr and other muts:', pnca_LF1.groupby('mutation_info').mutation.nunique() ) +mut_grouped = gene_LF1.groupby('mutation_info').mutation.nunique() +print('No.of unique dr and other muts:\n', gene_LF1.groupby('mutation_info').mutation.nunique() ) # sanity check if len(distinct_muts) == mut_grouped.sum() : @@ -670,7 +683,7 @@ else: , '\nmuts should be distinct within dr* and other* type' , '\ninspecting ...' , '\n=========================================================') - muts_split = list(pnca_LF1.groupby('mutation_info')) + muts_split = list(gene_LF1.groupby('mutation_info')) dr_muts = muts_split[0][1].mutation other_muts = muts_split[1][1].mutation # print('splitting muts by mut_info:', muts_split) @@ -679,7 +692,7 @@ else: #%% # !!! IMPORTANT !!!! # sanity check: There should not be any common muts -# i.e the same mutation cannot be classed as a 'drug' AND 'others' +# i.e the same mutation cannot be classed as a drug AND 'others' if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: print('WARNING: Ambiguous muts detected in dr_ and other_ mutation category' , '\n===============================================================') @@ -695,8 +708,8 @@ if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: , '\nTotal no. 
of samples in dr_muts present in other_muts:', dr_muts.isin(other_muts).sum() , '\nThese are:\n', dr_muts[dr_muts.isin(other_muts)] , '\n=========================================================' - , '\nTotal no. of samples in other_muts present in dr_muts:', other_muts.isin(dr_muts).sum(), - , '\nThese are:\n', other_muts[other_muts.isin(dr_muts)], + , '\nTotal no. of samples in other_muts present in dr_muts:', other_muts.isin(dr_muts).sum() + , '\nThese are:\n', other_muts[other_muts.isin(dr_muts)] , '\n=========================================================') else: print('Error: ambiguous muts present, but extraction failed. Debug!' @@ -706,22 +719,22 @@ print('Counting no. of ambiguous muts...') if dr_muts[dr_muts.isin(other_muts)].nunique() == other_muts[other_muts.isin(dr_muts)].nunique(): common_muts = dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist() - print('Distinct no. of ambigiuous muts detected:'+ str(len(common_muts)), - 'list of ambiguous mutations (see below):', *common_muts, sep = '\n' - , '\n=========================================================') + print('Distinct no. of ambigiuous muts detected:'+ str(len(common_muts)) + , '\nlist of ambiguous mutations (see below):', *common_muts, sep = '\n') + print('\n===========================================================') else: print('Error: ambiguous muts detected, but extraction failed. Debug!' - , '\nNo. of ambiguous muts in dr:', len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()) - , '\nNo. of ambiguous muts in other:', len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist()) + , '\nNo. of ambiguous muts in dr:' + , len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()) + , '\nNo. of ambiguous muts in other:' + , len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist()) , '\n=========================================================') #%% clear variables -del(id_dr, id_other, meta_data, meta_pnca_dr, meta_pnca_other, mut_grouped, muts_split, other_WF1, other_df, other_muts_df, other_pnca_count, pnca_LF0, pnca_na) +del(id_dr, id_other, meta_data, meta_gene_dr, meta_gene_other, mut_grouped, muts_split, other_WF1, other_df, other_muts_df, other_gene_count, gene_LF0, gene_na) -del(c1, c2, col_to_split1, col_to_split2, comp_pnca_samples, dr_WF0, dr_df, dr_muts_df, dr_pnca_WF0, dr_pnca_count, expected_pnca_samples, other_pnca_WF1) +del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_muts_df, dr_gene_WF0, dr_gene_count, expected_gene_samples, other_gene_WF1) -#%% end of data extraction and some files writing. Below are some more files writing. - #%%: write file: ambiguous muts # uncomment as necessary #print(outdir) @@ -734,8 +747,8 @@ print('Writing file: ambiguous muts', '\nFilename:', out_filename1, '\nPath:', outdir) -#common_muts = ['pncA_p.Val180Phe','pncA_p.Gln10Pro'] # test -inspect = pnca_LF1[pnca_LF1['mutation'].isin(common_muts)] +#common_muts = ['gene_matchVal180Phe','gene_matchGln10Pro'] # test +inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)] inspect.to_csv(outfile1) print('Finished writing:', out_filename1 @@ -746,22 +759,33 @@ print('Finished writing:', out_filename1 , '\n=============================================================') del(out_filename1) - -#%% read aa dict and pull relevant info -print('Reading aa dict and fetching1 letter aa code' +#%% end of data extraction and some files writing. Below are some more files writing. 
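The long-format frame built above relies on the custom tidy_split() helper to give each ';'-separated mutation its own row. On pandas 0.25 or later the same reshaping can be done with Series.str.split() plus DataFrame.explode(); the sketch below is only an alternative under that assumption, with a made-up two-sample frame, and is not what the script itself uses:

```python
import pandas as pd

# made-up frame with the same layout: one sample per row, ';'-separated mutations
df = pd.DataFrame({'id': ['s01', 's02'],
                   'dr_mutations_pyrazinamide': ['pncA_p.Val180Phe; pncA_p.Gln10Pro', 'WT']})

col = 'dr_mutations_pyrazinamide'
long_df = (df.assign(**{col: df[col].str.split(';')})   # split each cell into a list
             .explode(col)                              # one list element per row
             .reset_index(drop=True))
long_df[col] = long_df[col].str.strip()                 # same leading-whitespace clean-up as above
print(long_df)   # sample ids repeat; each mutation now sits on its own row
```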
+#============================================================================= +#%% Formatting df: read aa dict and pull relevant info +print('Now some more formatting:' + , '\nread aa dict and pull relevant info' + , '\nformat mutations:' + , '\nsplit mutation into mCSM style muts: ' , '\nFormatting mutation in mCSM style format: {WT}{MUT}' - , '\nAdding aa properties' - , '\n============================================================') - + , '\nassign aa properties: adding 2 cols at a time for each prop' + , '\n===================================================================') + +# BEWARE hardcoding : only works as we are adding aa prop once for wt and once for mut +# in each lookup cycle +ncol_mutf_add = 3 # mut split into 3 cols +ncol_aa_add = 2 # 2 aa prop add (wt & mut) in each mapping + #=========== # Split 'mutation' column into three: wild_type, position and # mutant_type separately. Then map three letter code to one using -# reference_dict. -# First: Import reference dict -# Second: convert to mutation to lowercase for compatibility with dict +# reference_dict imported at the beginning. +# After importing, convert to mutation to lowercase for compatibility with dict #=========== -pnca_LF1['mutation'] = pnca_LF1.loc[:, 'mutation'].str.lower() +gene_LF1['mutation'] = gene_LF1.loc[:, 'mutation'].str.lower() +gene_regex = gene_match.lower()+'(\w{3})' +print('gene regex being used:', gene_regex) +mylen0 = len(gene_LF1.columns) #======= # Iterate through the dict, create a lookup dict i.e # lookup_dict = {three_letter_code: one_letter_code}. @@ -770,17 +794,47 @@ pnca_LF1['mutation'] = pnca_LF1.loc[:, 'mutation'].str.lower() # The three letter code is extracted using a string match match from the dataframe and then converted # to 'pandas series'since map only works in pandas series #======= +print('Adding', ncol_mutf_add, 'more cols:\n') +# initialise a sub dict that is lookup dict for three letter code to 1-letter code +# adding three more cols lookup_dict = dict() for k, v in my_aa_dict.items(): lookup_dict[k] = v['one_letter_code'] - wt = pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on - pnca_LF1['wild_type'] = wt.map(lookup_dict) - mut = pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze() - pnca_LF1['mutant_type'] = mut.map(lookup_dict) +# wt = gene_LF1['mutation'].str.extract('gene_p.(\w{3})').squeeze() # converts to a series that map works on + wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze() + gene_LF1['wild_type'] = wt.map(lookup_dict) + mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze() + gene_LF1['mutant_type'] = mut.map(lookup_dict) # extract position info from mutation column separetly using string match -pnca_LF1['position'] = pnca_LF1['mutation'].str.extract(r'(\d+)') +gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)') + +mylen1 = len(gene_LF1.columns) + +# sanity checks +print('checking if 3-letter wt&mut residue extraction worked correctly') +if wt.isna().sum() & mut.isna().sum() == 0: + print('PASS: 3-letter wt&mut residue extraction worked correctly:' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) +else: + print('FAIL: 3-letter wt&mut residue extraction failed' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) + +if mylen1 == mylen0 + ncol_mutf_add: + print('PASS: successfully added', ncol_mutf_add, 'cols' + , '\nold length:', mylen0 + , 
'\nnew len:', mylen1) +else: + print('FAIL: failed to add cols:' + , '\nold length:', mylen0 + , '\nnew len:', mylen1) # clear variables del(k, v, wt, mut, lookup_dict) @@ -790,18 +844,45 @@ del(k, v, wt, mut, lookup_dict) # lookup_dict = {three_letter_code: aa_prop_water} # Do this for both wild_type and mutant as above. #========= -# initialise a sub dict that is lookup dict for three letter code to aa prop -lookup_dict = dict() +print('Adding', ncol_aa_add, 'more cols:\n') +# initialise a sub dict that is lookup dict for three letter code to aa prop +# adding two more cols +lookup_dict = dict() for k, v in my_aa_dict.items(): lookup_dict[k] = v['aa_prop_water'] #print(lookup_dict) - wt = pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on - pnca_LF1['wt_prop_water'] = wt.map(lookup_dict) - mut = pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze() - pnca_LF1['mut_prop_water'] = mut.map(lookup_dict) - -# added two more cols +# wt = gene_LF1['mutation'].str.extract('gene_p.(\w{3})').squeeze() # converts to a series that map works on + wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze() + gene_LF1['wt_prop_water'] = wt.map(lookup_dict) + mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze() + gene_LF1['mut_prop_water'] = mut.map(lookup_dict) + +mylen2 = len(gene_LF1.columns) + +# sanity checks +print('checking if 3-letter wt&mut residue extraction worked correctly') +if wt.isna().sum() & mut.isna().sum() == 0: + print('PASS: 3-letter wt&mut residue extraction worked correctly:' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) +else: + print('FAIL: 3-letter wt&mut residue extraction failed' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) + +if mylen2 == mylen1 + ncol_aa_add: + print('PASS: successfully added', ncol_aa_add, 'cols' + , '\nold length:', mylen1 + , '\nnew len:', mylen2) +else: + print('FAIL: failed to add cols:' + , '\nold length:', mylen1 + , '\nnew len:', mylen2) # clear variables del(k, v, wt, mut, lookup_dict) @@ -811,19 +892,92 @@ del(k, v, wt, mut, lookup_dict) # lookup_dict = {three_letter_code: aa_prop_polarity} # Do this for both wild_type and mutant as above. 
#========= -# initialise a sub dict that is lookup dict for three letter code to aa prop -lookup_dict = dict() +print('Adding', ncol_aa_add, 'more cols:\n') +# initialise a sub dict that is lookup dict for three letter code to aa prop +# adding two more cols +lookup_dict = dict() for k, v in my_aa_dict.items(): lookup_dict[k] = v['aa_prop_polarity'] #print(lookup_dict) - wt = pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on - pnca_LF1['wt_prop_polarity'] = wt.map(lookup_dict) - mut = pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze() - pnca_LF1['mut_prop_polarity'] = mut.map(lookup_dict) - -# added two more cols +# wt = gene_LF1['mutation'].str.extract('gene_p.(\w{3})').squeeze() # converts to a series that map works on + wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze() + gene_LF1['wt_prop_polarity'] = wt.map(lookup_dict) + mut = gene_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze() + gene_LF1['mut_prop_polarity'] = mut.map(lookup_dict) +mylen3 = len(gene_LF1.columns) + +# sanity checks +print('checking if 3-letter wt&mut residue extraction worked correctly') +if wt.isna().sum() & mut.isna().sum() == 0: + print('PASS: 3-letter wt&mut residue extraction worked correctly:' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) +else: + print('FAIL: 3-letter wt&mut residue extraction failed' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) + +if mylen3 == mylen2 + ncol_aa_add: + print('PASS: successfully added', ncol_aa_add, 'cols' + , '\nold length:', mylen1 + , '\nnew len:', mylen2) +else: + print('FAIL: failed to add cols:' + , '\nold length:', mylen1 + , '\nnew len:', mylen2) + +# clear variables +del(k, v, wt, mut, lookup_dict) + +#======== +# iterate through the dict, create a lookup dict that i.e +# lookup_dict = {three_letter_code: aa_calcprop} +# Do this for both wild_type and mutant as above. +#========= +print('Adding', ncol_aa_add, 'more cols:\n') + +lookup_dict = dict() +for k, v in my_aa_dict.items(): + lookup_dict[k] = v['aa_calcprop'] + #print(lookup_dict) +# wt = gene_LF1['mutation'].str.extract('gene_p.(\w{3})').squeeze() # converts to a series that map works on + wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze() + gene_LF1['wt_calcprop'] = wt.map(lookup_dict) + mut = gene_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze() + gene_LF1['mut_calcprop'] = mut.map(lookup_dict) + +mylen4 = len(gene_LF1.columns) + +# sanity checks +print('checking if 3-letter wt&mut residue extraction worked correctly') +if wt.isna().sum() & mut.isna().sum() == 0: + print('PASS: 3-letter wt&mut residue extraction worked correctly:' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) +else: + print('FAIL: 3-letter wt&mut residue extraction failed' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) + +if mylen4 == mylen3 + ncol_aa_add: + print('PASS: successfully added', ncol_aa_add, 'cols' + , '\nold length:', mylen3 + , '\nnew len:', mylen4) +else: + print('FAIL: failed to add cols:' + , '\nold length:', mylen3 + , '\nnew len:', mylen4) + # clear variables del(k, v, wt, mut, lookup_dict) @@ -833,56 +987,62 @@ del(k, v, wt, mut, lookup_dict) # Do this for both wild_type and mutant as above. 
# caution: taylor mapping will create a list within a column #========= +#print('Adding', ncol_aa_add, 'more cols:\n') #lookup_dict = dict() - #for k, v in my_aa_dict.items(): # lookup_dict[k] = v['aa_taylor'] -# #print(lookup_dict) -# wt = pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on -# pnca_LF1['wt_taylor'] = wt.map(lookup_dict) -# mut = pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze() -# pnca_LF1['mut_taylor'] = mut.map(lookup_dict) + #print(lookup_dict) +# wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze() +# gene_LF1['wt_taylor'] = wt.map(lookup_dict) +# mut = gene_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze() +# gene_LF1['mut_taylor'] = mut.map(lookup_dict) -# added two more cols +#mylen5 = len(gene_LF1.columns) + +# sanity checks +#print('checking if 3-letter wt&mut residue extraction worked correctly') +#if wt.isna().sum() & mut.isna().sum() == 0: +# print('PASS: 3-letter wt&mut residue extraction worked correctly:' +# , '\nNo NAs detected:' +# , '\nwild-type\n', wt +# , '\nmutant-type\n', mut +# , '\ndim of df:', gene_LF1.shape) +#else: +# print('FAIL: 3-letter wt&mut residue extraction failed' +# , '\nNo NAs detected:' +# , '\nwild-type\n', wt +# , '\nmutant-type\n', mut +# , '\ndim of df:', gene_LF1.shape) + +#if mylen5 == mylen4 + ncol_aa_add: +# print('PASS: successfully added', ncol_aa_add, 'cols' +# , '\nold length:', mylen4 +# , '\nnew len:', mylen5) +#else: +# print('FAIL: failed to add cols:' +# , '\nold length:', mylen4 +# , '\nnew len:', mylen5) # clear variables #del(k, v, wt, mut, lookup_dict) -#======== -# iterate through the dict, create a lookup dict that i.e -# lookup_dict = {three_letter_code: aa_calcprop} -# Do this for both wild_type and mutant as above. 
-#========= -lookup_dict = dict() - -for k, v in my_aa_dict.items(): - lookup_dict[k] = v['aa_calcprop'] - #print(lookup_dict) - wt = pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on - pnca_LF1['wt_calcprop'] = wt.map(lookup_dict) - mut = pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze() - pnca_LF1['mut_calcprop'] = mut.map(lookup_dict) - -# added two more cols -# clear variables -del(k, v, wt, mut, lookup_dict) - ######## # combine the wild_type+poistion+mutant_type columns to generate # Mutationinformation (matches mCSM output field) # Remember to use .map(str) for int col types to allow string concatenation ######### -pnca_LF1['Mutationinformation'] = pnca_LF1['wild_type'] + pnca_LF1.position.map(str) + pnca_LF1['mutant_type'] +gene_LF1['Mutationinformation'] = gene_LF1['wild_type'] + gene_LF1.position.map(str) + gene_LF1['mutant_type'] print('Created column: Mutationinformation' - , '\n===============================================================') + , '\n=====================================================================' + , gene_LF1.Mutationinformation.head(10)) #%% Write file: mCSM muts -snps_only = pd.DataFrame(pnca_LF1['Mutationinformation'].unique()) +snps_only = pd.DataFrame(gene_LF1['Mutationinformation'].unique()) snps_only.head() # assign column name snps_only.columns = ['Mutationinformation'] # count how many positions this corresponds to -pos_only = pd.DataFrame(pnca_LF1['position'].unique()) +pos_only = pd.DataFrame(gene_LF1['position'].unique()) print('Checking NA in snps...')# should be 0 if snps_only.Mutationinformation.isna().sum() == 0: @@ -912,7 +1072,7 @@ print('Finished writing:', out_filename2 , '\n=============================================================') del(out_filename2) -#%% Write file: pnca_metadata (i.e pnca_LF1) +#%% Write file: gene_metadata (i.e gene_LF1) # where each row has UNIQUE mutations NOT unique sample ids out_filename3 = gene.lower() + '_' + 'metadata.csv' outfile3 = outdir + '/' + out_filename3 @@ -921,15 +1081,15 @@ print('Writing file: LF formatted data' , '\nPath:', outdir , '\n============================================================') -pnca_LF1.to_csv(outfile3, header = True, index = False) +gene_LF1.to_csv(outfile3, header = True, index = False) print('Finished writing:', out_filename3 - , '\nNo. of rows:', len(pnca_LF1) - , '\nNo. of cols:', len(pnca_LF1.columns) + , '\nNo. of rows:', len(gene_LF1) + , '\nNo. 
of cols:', len(gene_LF1.columns) , '\n=============================================================') del(out_filename3) #%% write file: mCSM style but with repitions for MSA and logo plots -all_muts_msa = pd.DataFrame(pnca_LF1['Mutationinformation']) +all_muts_msa = pd.DataFrame(gene_LF1['Mutationinformation']) all_muts_msa.head() # assign column name all_muts_msa.columns = ['Mutationinformation'] @@ -978,7 +1138,7 @@ del(out_filename4) #%% write file for mutational positions # count how many positions this corresponds to -pos_only = pd.DataFrame(pnca_LF1['position'].unique()) +pos_only = pd.DataFrame(gene_LF1['position'].unique()) # assign column name pos_only.columns = ['position'] # make sure dtype of column position is int or numeric and not string diff --git a/meta_data_analysis/reference_dict.py b/scripts/reference_dict.py similarity index 95% rename from meta_data_analysis/reference_dict.py rename to scripts/reference_dict.py index 0461523..8087009 100644 --- a/meta_data_analysis/reference_dict.py +++ b/scripts/reference_dict.py @@ -23,34 +23,31 @@ homedir = os.path.expanduser('~') # set working dir #os.getcwd() -#os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') +#os.chdir(homedir + '/git/LSHTM_analysis/scripts') #os.getcwd() #======================================================================= #%% variable assignment: input and output -drug = 'pyrazinamide' -gene = 'pncA' -gene_match = gene + '_p.' +#drug = 'pyrazinamide' +#gene = 'pncA' +#gene_match = gene + '_p.' #========== # data dir #========== -#indir = 'git/Data/pyrazinamide/input/original' datadir = homedir + '/' + 'git/Data' #======= # input #======= -indir = datadir + '/' + drug + 'input' in_filename = 'aa_codes.csv' -infile = indir + '/' + in_filename +infile = datadir + '/' + in_filename print('Input filename:', in_filename - , '\nInput path:', indir + , '\nInput path:', datadir , '\n============================================================') #======= # output: No output #======= - #outdir = datadir + '/' + drug + '/' + 'output' #out_filename = '' #outfile = outdir + '/' + out_filename @@ -76,6 +73,7 @@ my_aa = my_aa.set_index('three_letter_code_lower') #20, 5 # using 'index' creates a dict of dicts # using 'records' creates a list of dicts my_aa_dict = my_aa.to_dict('index') #20, with 5 subkeys +print('Printing my_aa_dict:', my_aa_dict.keys()) #================================================ # dict of aa with their corresponding properties
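For reference, the my_aa_dict produced at the end of reference_dict.py is a dict of dicts keyed by the lower-case three-letter residue code (20 keys with 5 property sub-keys each, per the comments above), and this is the structure the lookup_dict loops in data_extraction.py iterate over. A tiny illustration with two made-up rows; the real aa_codes.csv carries all residues and the full set of property columns:

```python
import pandas as pd

# two illustrative rows only, standing in for the aa_codes.csv table
my_aa = pd.DataFrame({'three_letter_code_lower': ['val', 'phe'],
                      'one_letter_code': ['v', 'f']}).set_index('three_letter_code_lower')

my_aa_dict = my_aa.to_dict('index')
# {'val': {'one_letter_code': 'v'}, 'phe': {'one_letter_code': 'f'}}

# the per-property lookup dicts built in data_extraction.py then reduce to:
lookup_dict = {k: v['one_letter_code'] for k, v in my_aa_dict.items()}
# {'val': 'v', 'phe': 'f'}
```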