diff --git a/README.md b/README.md index 628cafd..dd4cfb5 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,10 @@ mCSM Analysis ============= -This repo does mCSM analysis using bash, python and R. - +This repo contains scripts that do the following: + 1. mCSM analysis: using bash, python and R + 2. meta data analysis: using python and R + Requires an additional 'Data' directory. Batteries not included:-) ## Assumptions @@ -19,17 +21,14 @@ subdirs within this repo *.R *.py - mcsm\_analysis/ - / - scripts/ - *.R - *.py - mcsm/ - *.sh - *.py - *.R - plotting/ - *.R + mcsm_analysis +# / + + foldx_analysis + + plotting + *.R + ``` More docs here as I write them. diff --git a/mcsm_analysis/pyrazinamide/scripts/Header_TT.R b/mcsm_analysis/pyrazinamide/scripts/Header_TT.R deleted file mode 100644 index 9eae42a..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/Header_TT.R +++ /dev/null @@ -1,130 +0,0 @@ -######################################################### -### A) Installing and loading required packages -######################################################### -#lib_loc = "/usr/local/lib/R/site-library") - -#if (!require("gplots")) { -# install.packages("gplots", dependencies = TRUE) -# library(gplots) -#} - -#if (!require("tidyverse")) { -# install.packages("tidyverse", dependencies = TRUE) -# library(tidyverse) -#} - -if (!require("ggplot2")) { - install.packages("ggplot2", dependencies = TRUE) - library(ggplot2) -} - -if (!require("plotly")) { - install.packages("plotly", dependencies = TRUE) - library(plotly) -} - -if (!require("cowplot")) { - install.packages("copwplot", dependencies = TRUE) - library(cowplot) -} - -if (!require("ggcorrplot")) { - install.packages("ggcorrplot", dependencies = TRUE) - library(ggcorrplot) -} - -if (!require("ggpubr")) { - install.packages("ggpubr", dependencies = TRUE) - library(ggpubr) -} - -if (!require("RColorBrewer")) { - install.packages("RColorBrewer", dependencies = TRUE) - library(RColorBrewer) -} - -if (!require ("GOplot")) { - install.packages("GOplot") - library(GOplot) -} - -if(!require("VennDiagram")) { - install.packages("VennDiagram", dependencies = T) - library(VennDiagram) -} - -if(!require("scales")) { - install.packages("scales", dependencies = T) - library(scales) -} - -if(!require("plotrix")) { - install.packages("plotrix", dependencies = T) - library(plotrix) -} - -if(!require("stats")) { - install.packages("stats", dependencies = T) - library(stats) -} - -if(!require("stats4")) { - install.packages("stats4", dependencies = T) - library(stats4) -} - -if(!require("data.table")) { -install.packages("data.table") - library(data.table) -} - -if (!require("PerformanceAnalytics")){ - install.packages("PerformanceAnalytics", dependencies = T) - library(PerformaceAnalytics) -} - -if (!require ("GGally")){ - install.packages("GGally") - library(GGally) -} - -if (!require ("corrr")){ - install.packages("corrr") - library(corrr) -} - -if (!require ("psych")){ - install.packages("psych") - library(psych) -} - -if (!require ("dplyr")){ - install.packages("dplyr") - library(dplyr) -} - -if (!require ("compare")){ - install.packages("compare") - library(compare) -} - -if (!require ("arsenal")){ - install.packages("arsenal") - library(arsenal) -} - - -####TIDYVERSE -# Install -#if(!require(devtools)) install.packages("devtools") -#devtools::install_github("kassambara/ggcorrplot") - -library(ggcorrplot) - - -###for PDB files -#install.packages("bio3d") -if(!require(bio3d)){ - install.packages("bio3d") - library(bio3d) -} diff --git
a/mcsm_analysis/pyrazinamide/scripts/KS_test_PS.R b/mcsm_analysis/pyrazinamide/scripts/KS_test_PS.R deleted file mode 100644 index 5a827c8..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/KS_test_PS.R +++ /dev/null @@ -1,157 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") -#source("../barplot_colour_function.R") -#require(data.table) - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA for pyrazinamide: -# merged_df2 -# merged_df3 - -# df without NA for pyrazinamide: -# merged_df2_comp -# merged_df3_comp -#=========================== - -########################### -# Data for plots -# you need merged_df2 or merged_df2_comp -# since this is one-many relationship -# i.e the same SNP can belong to multiple lineages -# using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available, hence use df with NA -########################### - -# uncomment as necessary - -#%%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df2 -#my_df = merged_df2_comp -#%%%%%%%%%%%%%%%%%%%%%%%%% - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -is.factor(my_df$lineage) -my_df$lineage = as.factor(my_df$lineage) -is.factor(my_df$lineage) - -table(my_df$mutation_info); str(my_df$mutation_info) - -# subset df with dr muts only -my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") -table(my_df_dr$mutation_info) - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#========================== -# Run two times: -# uncomment as necessary -# 1) for all muts -# 2) for dr_muts -#=========================== -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT - -#================ -# for ALL muts -#================ -#plot_df = my_df - -#================ -# for dr muts ONLY -#================ -plot_df = my_df_dr - -#%%%%%%%%%%%%%%%%%%%%%%%% -#============================ -# Plot: Lineage Distribution -# x = mcsm_values, y = dist -# fill = stability -#============================ - -table(plot_df$lineage); str(plot_df$lineage) - -# subset only lineages1-4 -sel_lineages = c("lineage1" - , "lineage2" - , "lineage3" - , "lineage4") - -# uncomment as necessary -df_lin = subset(plot_df, subset = lineage %in% sel_lineages ) - -# refactor -df_lin$lineage = factor(df_lin$lineage) - -table(df_lin$lineage) #{RESULT: No of samples within lineage} -#lineage1 lineage2 lineage3 lineage4 - -length(unique(df_lin$Mutationinformation)) -#{Result: No. 
of unique mutations the 4 lineages contribute to} - -# sanity checks -r1 = 2:5 # when merged_df2 used: because there is missing lineages -if(sum(table(plot_df$lineage)[r1]) == nrow(df_lin)) { - print ("sanity check passed: numbers match") -} else{ - print("Error!: check your numbers") -} - -#%%%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -df <- df_lin -#%%%%%%%%%%%%%%%%%%%%%%%%%% - -rm(df_lin) - -# COMPARING DISTRIBUTIONS -head(df$lineage) -df$lineage = as.character(df$lineage) - -lin1 = df[df$lineage == "lineage1",]$ratioDUET -lin2 = df[df$lineage == "lineage2",]$ratioDUET -lin3 = df[df$lineage == "lineage3",]$ratioDUET -lin4 = df[df$lineage == "lineage4",]$ratioDUET - -# ks test -ks.test(lin1,lin2) -ks.test(lin1,lin3) -ks.test(lin1,lin4) - -ks.test(lin2,lin3) -ks.test(lin2,lin4) - -ks.test(lin3,lin4) - - - diff --git a/mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R b/mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R deleted file mode 100644 index a3cc403..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R +++ /dev/null @@ -1,27 +0,0 @@ -######################################################### -# 1b: Define function: coloured barplot by subgroup -# LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar -######################################################### - -ColourPalleteMulti <- function(df, group, subgroup){ - - # Find how many colour categories to create and the number of colours in each - categories <- aggregate(as.formula(paste(subgroup, group, sep="~" )) - , df - , function(x) length(unique(x))) - # return(categories) } - - category.start <- (scales::hue_pal(l = 100)(nrow(categories))) # Set the top of the colour pallete - - category.end <- (scales::hue_pal(l = 40)(nrow(categories))) # set the bottom - - #return(category.start); return(category.end)} - - # Build Colour pallette - colours <- unlist(lapply(1:nrow(categories), - function(i){ - colorRampPalette(colors = c(category.start[i] - , category.end[i]))(categories[i,2])})) - return(colours) -} -######################################################### \ No newline at end of file diff --git a/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R b/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R deleted file mode 100644 index 31a533b..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R +++ /dev/null @@ -1,417 +0,0 @@ -######################################################### -# TASK: To combine mcsm and meta data with af and or files -# Input csv files: -# 1) mcsm normalised and struct params -# 2) gene associated meta_data_with_AFandOR - -# Output: -# 1) muts with opposite effects on stability -# 2) large combined df including NAs for AF, OR,etc -# Dim: same no. of rows as gene associated meta_data_with_AFandOR -# 3) small combined df including NAs for AF, OR, etc. -# Dim: same as mcsm data -# 4) large combined df excluding NAs -# Dim: dim(#1) - no. of NAs(AF|OR) + 1 -# 5) small combined df excluding NAs -# Dim: dim(#2) - no. 
of unique NAs - 1 -# This script is sourced from other .R scripts for plotting -######################################################### -getwd() -setwd('~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/') -getwd() - -########################################################## -# Installing and loading required packages -########################################################## -source('Header_TT.R') -#require(data.table) -#require(arsenal) -#require(compare) -#library(tidyverse) - -################################# -# Read file: normalised file -# output of step 4 mcsm_pipeline -################################# -#%% variable assignment: input and output paths & filenames -drug = 'pyrazinamide' -gene = 'pncA' -gene_match = paste0(gene,'_p.') -cat(gene_match) - -#=========== -# data dir -#=========== -datadir = paste0('~/git/Data') - -#=========== -# input -#=========== -# infile1: mCSM data -#indir = '~/git/Data/pyrazinamide/input/processed/' -indir = paste0(datadir, '/', drug, '/', 'output') # revised {TODO: change in mcsm pipeline} -#in_filename = 'mcsm_complex1_normalised.csv' -in_filename = 'pnca_mcsm_struct_params.csv' -infile = paste0(indir, '/', in_filename) -cat(paste0('Reading infile1: mCSM output file', ' ', infile) ) - -# infile2: gene associated meta data combined with AF and OR -#indir: same as above -in_filename_comb = paste0(tolower(gene), '_meta_data_with_AFandOR.csv') -infile_comb = paste0(indir, '/', in_filename_comb) -cat(paste0('Reading infile2: gene associated combined metadata:', infile_comb)) - -#=========== -# output -#=========== -# Uncomment if and when required to output -outdir = paste0('~/git/Data', '/', drug, '/', 'output') #same as indir -cat('Output dir: ', outdir) -#out_filename = paste0(tolower(gene), 'XXX') -#outfile = paste0(outdir, '/', out_filename) -#cat(paste0('Output file with full path:', outfile)) -#%% end of variable assignment for input and output files - -################################# -# Read file: normalised file -# output of step 4 mcsm_pipeline -################################# -cat('Reading mcsm_data:' - , '\nindir: ', indir - , '\ninfile_comb: ', in_filename) - -mcsm_data = read.csv(infile - , row.names = 1 - , stringsAsFactors = F - , header = T) - -cat('Read mcsm_data file:' - , '\nNo.of rows: ', nrow(mcsm_data) - , '\nNo. of cols:', ncol(mcsm_data)) - -# clear variables -rm(in_filename, infile) - -str(mcsm_data) - -table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) - -# spelling Correction 1: DUET -mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising' -mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising' - -# checks: should be the same as above -table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) -head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome) - -# spelling Correction 2: Ligand -table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) - -mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising' -mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising' - -# checks: should be the same as above -table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) -head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome) - -# muts with opposing effects on protomer and ligand stability -table(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome) -changes = mcsm_data[which(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome),] - -# sanity check: redundant, but uber cautious! 
-dl_i = which(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome) -ld_i = which(mcsm_data$Lig_outcome != mcsm_data$DUET_outcome) - -cat('Identifying muts with opposite stability effects') -if(nrow(changes) == (table(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome)[[2]]) & identical(dl_i,ld_i)) { - cat('PASS: muts with opposite effects on stability and affinity correctly identified' - , '\nNo. of such muts: ', nrow(changes)) -}else { - cat('FAIL: unsuccessful in extracting muts with changed stability effects') -} - -#*************************** -# write file: changed muts -out_filename = 'muts_opp_effects.csv' -outfile = paste0(outdir, '/', out_filename) -cat('Writing file for muts with opp effects:' - , '\nFilename: ', outfile - , '\nPath: ', outdir) - -write.csv(changes, outfile) -#**************************** -# clear variables -rm(out_filename, outfile) -rm(changes, dl_i, ld_i) - -# count na in each column -na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count - -# sort by Mutationinformation -mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),] -head(mcsm_data$Mutationinformation) - -orig_col = ncol(mcsm_data) - -# get freq count of positions and add to the df -setDT(mcsm_data)[, occurrence := .N, by = .(Position)] - -cat('Added 1 col: position frequency to see which posn has how many muts' - , '\nNo. of cols now', ncol(mcsm_data) - , '\nNo. of cols before: ', orig_col) - -pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence) - -########################### -# 2: Read file: meta data with AFandOR -########################### -cat('Reading combined meta data and AFandOR file:' - , '\nindir: ', indir - , '\ninfile_comb: ', in_filename_comb) - -meta_with_afor <- read.csv(infile_comb - , stringsAsFactors = F - , header = T) - -cat('Read mcsm_data file:' - , '\nNo.of rows: ', nrow(meta_with_afor) - , '\nNo. of cols:', ncol(meta_with_afor)) - -# counting NAs in AF, OR cols -if (identical(sum(is.na(meta_with_afor$OR)) - , sum(is.na(meta_with_afor$pvalue)) - , sum(is.na(meta_with_afor$AF)))){ - cat('PASS: NA count match for OR, pvalue and AF\n') - na_count = sum(is.na(meta_with_afor$AF)) - cat('No. of NAs: ', sum(is.na(meta_with_afor$OR))) -} else{ - cat('FAIL: NA count mismatch' - , '\nNA in OR: ', sum(is.na(meta_with_afor$OR)) - , '\nNA in pvalue: ', sum(is.na(meta_with_afor$pvalue)) - , '\nNA in AF:', sum(is.na(meta_with_afor$AF))) -} - -# clear variables -rm(in_filename_comb, infile_comb) - -str(meta_with_afor) - -# sort by Mutationinformation -head(meta_with_afor$Mutationinformation) -meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),] -head(meta_with_afor$Mutationinformation) - -########################### -# 3: merging two dfs: with NA -########################### -# link col name = 'Mutationinforamtion' -head(mcsm_data$Mutationinformation) -head(meta_with_afor$Mutationinformation) - -cat('Merging dfs with NAs: big df (1-many relationship b/w id & mut)' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df2') - -######### -# merge 3a (merged_df2): meta data with mcsm -######### -merged_df2 = merge(x = meta_with_afor - ,y = mcsm_data - , by = 'Mutationinformation' - , all.y = T) - -cat('Dim of merged_df2: ' - , '\nNo. of rows: ', nrow(merged_df2) - , '\nNo. of cols: ', ncol(merged_df2)) -head(merged_df2$Position) - -# sanity check -cat('Checking nrows in merged_df2') -if(nrow(meta_with_afor) == nrow(merged_df2)){ - cat('nrow(merged_df2) = nrow (gene associated metadata)' - ,'\nExpected no. 
of rows: ',nrow(meta_with_afor) - ,'\nGot no. of rows: ', nrow(merged_df2)) -} else{ - cat('nrow(merged_df2)!= nrow(gene associated metadata)' - , '\nExpected no. of rows after merge: ', nrow(meta_with_afor) - , '\nGot no. of rows: ', nrow(merged_df2) - , '\nFinding discrepancy') - merged_muts_u = unique(merged_df2$Mutationinformation) - meta_muts_u = unique(meta_with_afor$Mutationinformation) - # find the index where it differs - unique(meta_muts_u[! meta_muts_u %in% merged_muts_u]) -} - -# sort by Position -head(merged_df2$Position) -merged_df2 = merged_df2[order(merged_df2$Position),] -head(merged_df2$Position) - -merged_df2v2 = merge(x = meta_with_afor - ,y = mcsm_data - , by = 'Mutationinformation' - , all.x = T) -#!=!=!=!=!=!=!=! -# COMMENT: used all.y since position 186 is not part of the struc, -# hence doesn't have a mcsm value -# but 186 is associated with mutation -#!=!=!=!=!=!=!=! - -# should be False -identical(merged_df2, merged_df2v2) -table(merged_df2$Position%in%merged_df2v2$Position) - -rm(merged_df2v2) - -######### -# merge 3b (merged_df3):remove duplicate mutation information -######### -cat('Merging dfs without NAs: small df (removing muts with no AF|OR associated)' - ,'\nCannot trust lineage info from this' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df3') - -#==#=#=#=#=#=# -# Cannot trust lineage, country from this df as the same mutation -# can have many different lineages -# but this should be good for the numerical corr plots -#=#=#=#=#=#=#= -merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),] -head(merged_df3$Position); tail(merged_df3$Position) # should be sorted - -# sanity check -cat('Checking nrows in merged_df3') -if(nrow(mcsm_data) == nrow(merged_df3)){ - cat('PASS: No. of rows match with mcsm_data' - ,'\nExpected no. of rows: ', nrow(mcsm_data) - ,'\nGot no. of rows: ', nrow(merged_df3)) -} else { - cat('FAIL: No. of rows mismatch' - , '\nNo. of rows mcsm_data: ', nrow(mcsm_data) - , '\nNo. of rows merged_df3: ', nrow(merged_df3)) -} - -# counting NAs in AF, OR cols in merged_df3 -# this is becuase mcsm has no AF, OR cols, -# so you cannot count NAs -if (identical(sum(is.na(merged_df3$OR)) - , sum(is.na(merged_df3$pvalue)) - , sum(is.na(merged_df3$AF)))){ - cat('PASS: NA count match for OR, pvalue and AF\n') - na_count_df3 = sum(is.na(merged_df3$AF)) - cat('No. of NAs: ', sum(is.na(merged_df3$OR))) -} else{ - cat('FAIL: NA count mismatch' - , '\nNA in OR: ', sum(is.na(merged_df3$OR)) - , '\nNA in pvalue: ', sum(is.na(merged_df3$pvalue)) - , '\nNA in AF:', sum(is.na(merged_df3$AF))) -} - -########################### -# 4: merging two dfs: without NA -########################### -######### -# merge 4a (merged_df2_comp): same as merge 1 but excluding NA -######### -cat('Merging dfs without any NAs: big df (1-many relationship b/w id & mut)' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df2_comp') - -merged_df2_comp = merged_df2[!is.na(merged_df2$AF),] -#merged_df2_comp = merged_df2[!duplicated(merged_df2$Mutationinformation),] - -# sanity check -cat('Checking nrows in merged_df2_comp') -if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count + 1)){ - cat('PASS: No. of rows match' - ,'\nDim of merged_df2_comp: ' - ,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1 - , '\nNo. of rows: ', nrow(merged_df2_comp) - , '\nNo. of cols: ', ncol(merged_df2_comp)) -}else{ - cat('FAIL: No. of rows mismatch' - ,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1 - ,'\nGot no. 
of rows: ', nrow(merged_df2_comp)) -} - -######### -# merge 4b (merged_df3_comp): remove duplicate mutation information -######### -merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),] - -cat('Dim of merged_df3_comp: ' - , '\nNo. of rows: ', nrow(merged_df3_comp) - , '\nNo. of cols: ', ncol(merged_df3_comp)) - -# alternate way of deriving merged_df3_comp -foo = merged_df3[!is.na(merged_df3$AF),] -# compare dfs: foo and merged_df3_com -all.equal(foo, merged_df3) - -summary(comparedf(foo, merged_df3)) - -# sanity check -cat('Checking nrows in merged_df3_comp') -if(nrow(merged_df3_comp) == nrow(merged_df3)){ - cat('NO NAs detected in merged_df3 in AF|OR cols' - ,'\nNo. of rows are identical: ', nrow(merged_df3)) -} else{ - if(nrow(merged_df3_comp) == nrow(merged_df3) - na_count_df3) { - cat('PASS: NAs detected in merged_df3 in AF|OR cols' - , '\nNo. of NAs: ', na_count_df3 - , '\nExpected no. of rows in merged_df3_comp: ', nrow(merged_df3) - na_count_df3 - , '\nGot no. of rows: ', nrow(merged_df3_comp)) - } -} - -#=============== end of combining df -#********************* -# writing 1 file in the style of a loop: merged_df3 -# print(output dir) -#i = 'merged_df3' -#out_filename = paste0(i, '.csv') -#outfile = paste0(outdir, '/', out_filename) - -#cat('Writing output file: ' -# ,'\nFilename: ', out_filename -# ,'\nPath: ', outdir) - -#template: write.csv(merged_df3, 'merged_df3.csv') -#write.csv(get(i), outfile, row.names = FALSE) -#cat('Finished writing: ', outfile -# , '\nNo. of rows: ', nrow(get(i)) -# , '\nNo. of cols: ', ncol(get(i))) - -#%% write_output files; all 4 files: -outvars = c('merged_df2' - , 'merged_df3' - , 'merged_df2_comp' - , 'merged_df3_comp') - -cat('Writing output files: ' - , '\nPath:', outdir) - -for (i in outvars){ -# cat(i, '\n') - out_filename = paste0(i, '.csv') -# cat(out_filename, '\n') -# cat('getting value of variable: ', get(i)) - outfile = paste0(outdir, '/', out_filename) -# cat('Full output path: ', outfile, '\n') - cat('Writing output file:' - ,'\nFilename: ', out_filename,'\n') - write.csv(get(i), outfile, row.names = FALSE) - cat('Finished writing: ', outfile - , '\nNo. of rows: ', nrow(get(i)) - , '\nNo. of cols: ', ncol(get(i)), '\n') -} - -# alternate way to replace with implicit loop -# FIXME -#sapply(outvars, function(x, y) write.csv(get(outvars), paste0(outdir, '/', outvars, '.csv'))) -#************************* -# clear variables -rm(mcsm_data, meta_with_afor, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, orig_col, outdir) -rm(pos_count_check) -#============================= end of script - diff --git a/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R b/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R deleted file mode 100644 index 361b6b6..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R +++ /dev/null @@ -1,330 +0,0 @@ -######################################################### -# TASK: To combine mcsm and meta data with af and or -# by filtering for distance to ligand (<10Ang). -# This script doesn't output anything. 
-# This script is sourced from other .R scripts for plotting ligand plots - -# Input csv files: -# 1) mcsm normalised and struct params -# 2) gene associated meta_data_with_AFandOR -######################################################### -getwd() -setwd('~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/') -getwd() - -########################################################## -# Installing and loading required packages -########################################################## - -source('Header_TT.R') -#require(data.table) -#require(arsenal) -#require(compare) -#library(tidyverse) - -################################# -# Read file: normalised file -# output of step 4 mcsm_pipeline -################################# - -#%% variable assignment: input and output paths & filenames -drug = 'pyrazinamide' -gene = 'pncA' -gene_match = paste0(gene,'_p.') -cat(gene_match) - -#=========== -# input -#=========== -# infile1: mCSM data -#indir = '~/git/Data/pyrazinamide/input/processed/' -indir = paste0('~/git/Data', '/', drug, '/', 'output') # revised {TODO: change in mcsm pipeline} -#in_filename = 'mcsm_complex1_normalised.csv' -in_filename = 'pnca_mcsm_struct_params.csv' -infile = paste0(indir, '/', in_filename) -cat(paste0('Reading infile1: mCSM output file', ' ', infile) ) - -# infile2: gene associated meta data combined with AF and OR -#indir: same as above -in_filename_comb = paste0(tolower(gene), '_meta_data_with_AFandOR.csv') -infile_comb = paste0(indir, '/', in_filename_comb) -cat(paste0('Reading infile2: gene associated combined metadata:', infile_comb)) - -#=========== -# output -#=========== -# Uncomment if and when required to output -outdir = paste0('~/git/Data', '/', drug, '/', 'output') #same as indir -cat('Output dir: ', outdir) -#out_filename = paste0(tolower(gene), 'XXX') -#outfile = paste0(outdir, '/', out_filename) -#cat(paste0('Output file with full path:', outfile)) -#%% end of variable assignment for input and output files - -################################# -# Read file: normalised file -# output of step 4 mcsm_pipeline -################################# -cat('Reading mcsm_data:' - , '\nindir: ', indir - , '\ninfile_comb: ', in_filename) - -mcsm_data = read.csv(infile - , row.names = 1 - , stringsAsFactors = F - , header = T) - -cat('Read mcsm_data file:' - , '\nNo.of rows: ', nrow(mcsm_data) - , '\nNo. of cols:', ncol(mcsm_data)) - -# clear variables -rm(in_filename, infile) - -str(mcsm_data) - -table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) - -# spelling Correction 1: DUET -mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising' -mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising' - -# checks: should be the same as above -table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) -head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome) - -# spelling Correction 2: Ligand -table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) - -mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising' -mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising' - -# checks: should be the same as above -table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) -head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome) - -# muts with opposing effects on protomer and ligand stability -# excluded from here as it is redundant. -# check 'combining_two_df.R' to refer if required. - -########################### !!! 
only for mcsm_lig -# 4: Filter/subset data -# Lig plots < 10Ang -# Filter the lig plots for Dis_to_lig < 10Ang -########################### - -# check range of distances -max(mcsm_data$Dis_lig_Ang) -min(mcsm_data$Dis_lig_Ang) - -# count -table(mcsm_data$Dis_lig_Ang<10) - -# subset data to have only values less than 10 Ang -mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10) - -# sanity checks -max(mcsm_data2$Dis_lig_Ang) -min(mcsm_data2$Dis_lig_Ang) - -# count no of unique positions -length(unique(mcsm_data2$Position)) - -# count no of unique mutations -length(unique(mcsm_data2$Mutationinformation)) - -# count Destabilisinga and stabilising -table(mcsm_data2$Lig_outcome) #{RESULT: no of mutations within 10Ang} - -############################# -# Extra sanity check: -# for mcsm_lig ONLY -# Dis_lig_Ang should be <10 -############################# - -if (max(mcsm_data2$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} - -#!!!!!!!!!!!!!!!!!!!!! -# REASSIGNMENT: so as not to alter the script -mcsm_data = mcsm_data2 -#!!!!!!!!!!!!!!!!!!!!! -# clear variables -rm(mcsm_data2) - -# count na in each column -na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count - -# sort by Mutationinformation -mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),] -head(mcsm_data$Mutationinformation) - -orig_col = ncol(mcsm_data) -# get freq count of positions and add to the df -setDT(mcsm_data)[, occurrence := .N, by = .(Position)] - -cat('Added 1 col: position frequency to see which posn has how many muts' - , '\nNo. of cols now', ncol(mcsm_data) - , '\nNo. of cols before: ', orig_col) - -pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence) - -########################### -# 2: Read file: meta data with AFandOR -########################### -cat('Reading combined meta data and AFandOR file:' - , '\nindir: ', indir - , '\ninfile_comb: ', in_filename_comb) - -meta_with_afor <- read.csv(infile_comb - , stringsAsFactors = F - , header = T) - -cat('Read mcsm_data file:' - , '\nNo.of rows: ', nrow(meta_with_afor) - , '\nNo. of cols:', ncol(meta_with_afor)) - -# clear variables -rm(in_filename_comb, infile_comb) - -str(meta_with_afor) - -# sort by Mutationinformation -head(meta_with_afor$Mutationinformation) -meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),] -head(meta_with_afor$Mutationinformation) - -########################### -# 3: merging two dfs: with NA -########################### -# link col name = 'Mutationinforamtion' -cat('Merging dfs with NAs: big df (1-many relationship b/w id & mut)' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df2') - -head(mcsm_data$Mutationinformation) -head(meta_with_afor$Mutationinformation) - -######### -# merge 3a: meta data with mcsm -######### -merged_df2 = merge(x = meta_with_afor - ,y = mcsm_data - , by = 'Mutationinformation' - , all.y = T) - -cat('Dim of merged_df2: ' - , '\nNo. of rows: ', nrow(merged_df2) - , '\nNo. of cols: ', ncol(merged_df2)) -head(merged_df2$Position) - -if(nrow(meta_with_afor) == nrow(merged_df2)){ - cat('nrow(merged_df2) = nrow (gene associated metadata)' - ,'\nExpected no. of rows: ',nrow(meta_with_afor) - ,'\nGot no. of rows: ', nrow(merged_df2)) -} else{ - cat('nrow(merged_df2)!= nrow(gene associated metadata)' - , '\nExpected no. of rows after merge: ', nrow(meta_with_afor) - , '\nGot no. 
of rows: ', nrow(merged_df2) - , '\nFinding discrepancy') - merged_muts_u = unique(merged_df2$Mutationinformation) - meta_muts_u = unique(meta_with_afor$Mutationinformation) - # find the index where it differs - unique(meta_muts_u[! meta_muts_u %in% merged_muts_u]) -} - -# sort by Position -head(merged_df2$Position) -merged_df2 = merged_df2[order(merged_df2$Position),] -head(merged_df2$Position) - -merged_df2v2 = merge(x = meta_with_afor - ,y = mcsm_data - , by = 'Mutationinformation' - , all.x = T) -#!=!=!=!=!=!=!=! -# COMMENT: used all.y since position 186 is not part of the struc, -# hence doesn't have a mcsm value -# but 186 is associated with mutation -#!=!=!=!=!=!=!=! - -# should be False -identical(merged_df2, merged_df2v2) -table(merged_df2$Position%in%merged_df2v2$Position) - -rm(merged_df2v2) - -######### -# merge 3b:remove duplicate mutation information -######### -cat('Merging dfs with NAs: small df (removing duplicate muts)' - ,'\nCannot trust lineage info from this' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df3') - -#==#=#=#=#=#=# -# Cannot trust lineage, country from this df as the same mutation -# can have many different lineages -# but this should be good for the numerical corr plots -#=#=#=#=#=#=#= -merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),] -head(merged_df3$Position); tail(merged_df3$Position) # should be sorted - -# sanity checks -# nrows of merged_df3 should be the same as the nrows of mcsm_data -if(nrow(mcsm_data) == nrow(merged_df3)){ - cat('PASS: No. of rows match with mcsm_data' - ,'\nExpected no. of rows: ', nrow(mcsm_data) - ,'\nGot no. of rows: ', nrow(merged_df3)) -} else { - cat('FAIL: No. of rows mismatch' - , '\nNo. of rows mcsm_data: ', nrow(mcsm_data) - , '\nNo. of rows merged_df3: ', nrow(merged_df3)) -} - -########################### -# 4: merging two dfs: without NA -########################### -cat('Merging dfs without any NAs: big df (1-many relationship b/w id & mut)' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df2_comp') - -######### -# merge 4a: same as merge 1 but excluding NA -######### -merged_df2_comp = merged_df2[!is.na(merged_df2$AF),] -#merged_df2_comp = merged_df2[!duplicated(merged_df2$Mutationinformation),] - -cat('Dim of merged_df2_comp: ' - , '\nNo. of rows: ', nrow(merged_df2_comp) - , '\nNo. of cols: ', ncol(merged_df2_comp)) - -######### -# merge 4b: remove duplicate mutation information -######### -merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),] - -cat('Dim of merged_df3_comp: ' - , '\nNo. of rows: ', nrow(merged_df3_comp) - , '\nNo. 
of cols: ', ncol(merged_df3_comp)) - -# alternate way of deriving merged_df3_comp -foo = merged_df3[!is.na(merged_df3$AF),] -# compare dfs: foo and merged_df3_com -all.equal(foo, merged_df3) - -summary(comparedf(foo, merged_df3)) - -#=============== end of combining df -#********************* -# write_output files -# Not required as this is a subset of the combining_two_df.R -#************************* -# clear variables -rm(mcsm_data, meta_with_afor, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, orig_col, outdir) -rm(pos_count_check) -#============================= end of script - diff --git a/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py b/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py deleted file mode 100755 index 5cc5f09..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py +++ /dev/null @@ -1,215 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Tue Jun 25 08:46:36 2019 - -@author: tanushree -""" -############################################ -# load libraries -import os -import pandas as pd -import numpy as np -from Bio import SeqIO -############################################ -#******************************************************************** -# TASK: Read in fasta files and create mutant sequences akin to a MSA, -# to allow generation of logo plots - -# Requirements: -# input: Fasta file of protein/target for which mut seqs will be created - # path: "Data//input/original/" -# output: MSA for mutant sequences - # path: "Data//input/processed/" -#*********************************************************************** -#%% -# specify input and output variables -homedir = os.path.expanduser('~') -#======= -# input -#======= -############# -# fasta file -############# -indir = 'git/Data/pyrazinamide/input/original' -in_filename_fasta = "3pl1.fasta.txt" -infile_fasta = homedir + '/' + indir + '/' + in_filename_fasta -print(infile_fasta) - -############# -# meta data -############# -# FIXME when you change the dir struc -inpath_p = "git/Data/pyrazinamide/input/processed" -in_filename_meta_data = "meta_data_with_AFandOR.csv" -infile_meta_data = homedir + '/' + inpath_p + '/' + in_filename_meta_data -print("Input file is:", infile_meta_data) - -#======= -# output -#======= -outdir = 'git/Data/pyrazinamide/output' -# filenames in respective sections - -################## end of variable assignment for input and output files -#%% -#========== -# read files -#========== - -############# -# fasta file -############# -my_fasta_o = str() -for seq_record in SeqIO.parse(infile_fasta, "fasta"): - my_seq = seq_record.seq - my_fasta_o = str(my_seq) #convert to a string - print(my_fasta_o) - print(len(my_fasta_o)) -# print( type(my_fasta) ) - -# remove non_struc positions from fasta -def remove_char(str, n): - first_part = str[:n] - last_part = str[n+1:] - return first_part + last_part -#print(remove_char('Python', 0)) - -ns_pos_o = 186 -offset = 1 # 0 based indexing -ns_pos = ns_pos_o - offset -my_fasta = remove_char(my_fasta_o, ns_pos) -print("orig length:", len(my_fasta_o)) -print("new length:", len(my_fasta)) - -############# -# SNP info and no of MSA to generate -############# -# read mutant_info file and extract cols with positions and mutant_info -# This should be all samples with pncA muts -#my_data = pd.read_csv('mcsm_complex1_normalised.csv') -my_data = pd.read_csv(infile_meta_data) -list(my_data.columns) -#my_data['OR'].value_counts() -#my_data['OR'].isna().sum() - -#FIXME: You need a 
better way to identify this -# ideally this file should not contain any non_struc pos -# remove positions not in the structure -my_data = my_data[my_data.position != ns_pos_o] - -# if multiple positions, then try the example below; -# https://stackoverflow.com/questions/29017525/deleting-rows-based-on-multiple-conditions-python-pandas -#df = df[(df.one > 0) | (df.two > 0) | (df.three > 0) & (df.four < 1)] - -# count mutations per sample -mut_info = my_data[['id', 'Mutationinformation', 'wild_type', 'position', 'mutant_type']] - -# test -foo = mut_info[mut_info.Mutationinformation.str.contains('C72Y')] - -foo = mut_info.pivot_table(values = ['Mutationinformation'] - , index = ['Mutationinformation', 'id'] -# , columns = - , aggfunc = 'count') - -# table -foo_tab = mut_info.pivot_table(values = ['Mutationinformation'] -# , index = ['Mutationinformation'] - , columns = ['id', 'Mutationinformation'] - , aggfunc = 'count' -# , margins = True) - ) -foo_tab.stack('id') - -mut_info.to_csv('mutinfo.csv') - -mut_info1 = my_data[['position', 'mutant_type']] -#%% -################ -# data cleaning -################ -# extract only those positions that have a frequency count of pos>1 -###mut_info['freq_pos'] = mut_info.groupby('Position').count()#### dodgy - -# add a column of frequency for each position -#mut_info1['freq_pos'] = mut_info1.groupby('position')['position'].transform('count') -mut_info1['freq_pos'] = mut_info1.position.map(mut_info1.position.value_counts()) - -# sort by position -mut_info2 = mut_info1.sort_values(by=['position']) - -# count how many pos have freq 1 as you will need to exclude those -mutfreq1_count = mut_info2[mut_info2.freq_pos == 1].sum().freq_pos - -# extract entries with freq_pos>1 -# should be 3093-211 = 3072 -mut_info3 = mut_info2.loc[mut_info2['freq_pos'] >1] #3072 -print("orig length:", len(mut_info1)) -print("No. of excluded values:", mutfreq1_count) -print("new length:", len(mut_info3)) -# sanity check -if ( (len(mut_info1) - mutfreq1_count) == len(mut_info3) ): - print("Sanity check passed: Filtered data correctly") -else: - print("Error: Debug you code") - -# reset index to allow iteration !!!!!!!!!! 
IMPORTANT -mut_info = mut_info3.reset_index(drop = True) - -##del(mut_info1, mut_info2, mut_info3, my_data) - -################### -# generate mut seqs -################### -mut_seqsL = [] * len(mut_info) - -# iterate -for i, pos in enumerate(mut_info['position']): - my_fastaL = list(my_fasta) - mut = mut_info['mutant_type'][i] - offset_pos = pos-1 - - print('1-index:', pos, '0-index cur:', offset_pos, my_fastaL[offset_pos], 'mut:', mut) - my_fastaL[offset_pos] = mut - print('1-index:', pos, '0-index new:', offset_pos, my_fastaL[offset_pos], 'mut:', mut) - - mut_seq = "".join(my_fastaL) -# print(mut_seq + '\n') - print('original:', my_fasta, ',', 'replaced:', my_fasta[offset_pos], 'at', pos, 'with', mut, mut_seq) - mut_seqsL.append(mut_seq) - - -############### -# sanity check -################ -len_orig = len(my_fasta) -# checking if all the mutant sequences have the same length as the original fasta file sequence -for seqs in mut_seqsL: -# print(seqs) -# print(len(seqs)) - if len(seqs) != len_orig: - print('sequence lengths mismatch' +'\n', 'mutant seq length:', len(seqs), 'vs original seq length:', len_orig) - else: - print('**Hooray** Length of mutant and original sequences match') - -del(i, len_orig, mut, mut_seq, my_fastaL, offset_pos, pos, seqs) - -############ -# write file -############ -#filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile' -#filepath = homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/gene_msa.txt' -print(outdir) -out_filename = "gene_msa.txt" -outfile_gene = homedir + '/' + outdir + '/' + out_filename -print(outfile_gene) - -with open(outfile_gene, 'w') as file_handler: - for item in mut_seqsL: - file_handler.write("{}\n".format(item)) - -#R = "\n".join(mut_seqsL) -#f = open('Columns.csv','w') -#f.write(R) -#f.close() diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/run.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/run.sh deleted file mode 100755 index 7e00fb1..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/run.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -# run all bash scripts for mcsm - -#./step0_check_duplicate_SNPs.sh -#./step1_lig_output_urls.sh -./step2_lig_results.sh -./step3a_results_format_interim.sh - diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh deleted file mode 100755 index 4c24392..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -#************************************* -# need to be in the correct directory -#************************************* -##: comments for code -#: commented out code - -#********************************************************************** -# TASK: Text file containing a list of SNPs; SNP in the format(C2E) -# per line. Sort by unique, which automatically removes duplicates. -# sace file in current directory -#********************************************************************** -infile="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2.csv" -outfile="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv" - -# sort unique entries and output to current directory -sort -u ${infile} > ${outfile} - -# count no. of unique snps mCSM will run on -count=$(wc -l < ${outfile}) - -# print to console no. 
of unique snps mCSM will run on -echo "${count} unique mutations for mCSM to run on" - diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_lig_output_urls.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_lig_output_urls.sh deleted file mode 100755 index 6361b62..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_lig_output_urls.sh +++ /dev/null @@ -1,104 +0,0 @@ -#!/bin/bash - -#********************************************************************** -# TASK: submit requests using curl: HANDLE redirects and refresh url. -# Iterate over mutation file and write/append result urls to a file -# Mutation file must have one mutation (format A1B) per line -# Requirements -# input: mutation list (format: A1B), complex struc: (pdb format) - # mutation: outFile from step0, one unique mutation/line, no chain ID - # path: "Data//input/processed/" - # structure: pdb file of drug-target complex - # path: "Data//input/structure/" -# output: should be n urls (n=no. of unique mutations in file) - # path: "Data//input/processed/" - -# NOTE: these are just result urls, not actual values for results -#********************************************************************** -############# specify variables for input and output paths and filenames -homedir="${HOME}" -#echo Home directory is ${homedir} -basedir="/git/Data/pyrazinamide/input" - -# input -inpath_mut="/processed" -in_filename_mut="/pnca_mis_SNPs_v2_unique.csv" -infile_mut="${homedir}${basedir}${inpath_mut}${in_filename_mut}" -echo Input Mut filename: ${infile_mut} - -inpath_struc="/structure" -in_filename_struc="/complex1_no_water.pdb" -infile_struc="${homedir}${basedir}${inpath_struc}${in_filename_struc}" -echo Input Struc filename: ${infile_struc} - -# output -outpath="/processed" -out_filename="/complex1_result_url.txt" -outfile="${homedir}${basedir}${outpath}${out_filename}" -#echo Output filename: ${outfile} -################## end of variable assignment for input and output files - -# iterate over mutation file (infile_mut); line by line and -# submit query using curl -# some useful messages -echo -n -e "Processing $(wc -l < ${infile_mut}) entries from ${infile_mut}\n" -COUNT=0 -while read -r line; do -((COUNT++)) -mutation="${line}" -# echo "${mutation}" -#pdb='../Data/complex1_no_water.pdb' -pdb="${infile_struc}" -mutation="${mutation}" -chain="A" -lig_id="PZA" -affin_wt="0.99" -host="http://biosig.unimelb.edu.au" -call_url="/mcsm_lig/prediction" - -#========================================= -##html field_names names required for curl -##complex_field:wild=@ -##mutation_field:mutation=@ -##chain_field:chain=@ -##ligand_field:lig_id@ -##energy_field:affin_wt -#========================================= -refresh_url=$(curl -L \ - -sS \ - -F "wild=@${pdb}" \ - -F "mutation=${mutation}" \ - -F "chain=${chain}" \ - -F "lig_id=${lig_id}" \ - -F "affin_wt=${affin_wt}" \ - ${host}${call_url} | grep "http-equiv") - -#echo Refresh URL: $refresh_url -#echo Host+Refresh: ${host}${refresh_url} - -# use regex to extract the relevant bit from the refresh url -# regex:sed -r 's/.*(\/mcsm.*)".*$/\1/g' - -# Now build: result url using host and refresh url and write the urls to a file -result_url=$(echo $refresh_url | sed -r 's/.*(\/mcsm.*)".*$/\1/g') -sleep 10 - -echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${infile_mut})..." - -# create output file with the added number of muts from file -# after much thought, bad idea as less generic! 
-#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_complex1_result_url.txt -echo -e "${host}${result_url}" >> ${outfile} -#echo -n '.' -done < "${infile_mut}" - -#FIXME: stop executing if error else these echo statements are misleading! -echo -echo Output filename: ${outfile} -echo -echo Number of urls saved: $(wc -l < ${infile_mut}) -echo -echo "Processing Complete" - -# end of submitting query, receiving result url and storing results url in a file - diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_lig_results.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_lig_results.sh deleted file mode 100755 index 51a7844..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_lig_results.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/bin/bash - -#******************************************************************** -# TASK: submit result urls and fetch actual results using curl -# Iterate over each result url from the output of step1 stored in processed/ -# Use curl to fetch results and extract relevant sections using hxtools -# and store these in another file in processed/ - -# Requirements: -# input: output of step1, file containing result urls - # path: "Data//input/processed/" -# output: name of the file where extracted results will be stored - # path: "Data//input/processed/" - -# Optional: can make these command line args you pass when calling script -# by uncommenting code as indicated -#********************************************************************* -############################# uncomment: to make it command line args -#if [ "$#" -ne 2 ]; then - #if [ -Z $1 ]; then -# echo " -# Please provide both Input and Output files. - -# Usage: batch_read_urls.sh INFILE OUTFILE -# " -# exit 1 -#fi - -# First argument: Input File -# Second argument: Output File -#infile=$1 -#outfile=$2 -############################ end of code block to make command line args - -############# specify variables for input and output paths and filenames -homedir="${HOME}" -#echo Home directory is ${homedir} -basedir="/git/Data/pyrazinamide/input" - -# input -inpath="/processed" -in_filename="/complex1_result_url.txt" -infile="${homedir}${basedir}${inpath}${in_filename}" -echo Input Mut filename: ${infile} - -# output -outpath="/processed" -out_filename="/complex1_output_MASTER.txt" -outfile="${homedir}${basedir}${outpath}${out_filename}" -echo Output filename: ${outfile} -################## end of variable assignment for input and output files - -# Iterate over each result url, and extract results using hxtools -# which nicely cleans and formats html -echo -n "Processing $(wc -l < ${infile}) entries from ${infile}" -echo -COUNT=0 -while read -r line; do -#COUNT=$(($COUNT+1)) -((COUNT++)) - curl --silent ${line} \ - | hxnormalize -x \ - | hxselect -c div.span4 \ - | hxselect -c div.well \ - | sed -r -e 's/<[^>]*>//g' \ - | sed -re 's/ +//g' \ - >> ${outfile} - #| tee -a ${outfile} -# echo -n '.' -echo -e "Processing entry ${COUNT}/$(wc -l < ${infile})..." 
- -done < "${infile}" - -echo -echo "Processing Complete" - diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_results_format_interim.sh b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_results_format_interim.sh deleted file mode 100755 index 0861996..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_results_format_interim.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash - -#******************************************************************** -# TASK: Intermediate results processing -# output file has a convenient delimiter of ":" that can be used to -# format the file into two columns (col1: field_desc and col2: values) -# However the section "PredictedAffinityChange:...." and -# "DUETstabilitychange:.." are split over multiple lines and -# prevent this from happening. Additionally there are other empty lines -# that need to be omiited. In order ensure these sections are not split -# over multiple lines, this script is written. - -# Requirements: -# input: output of step2, file containing mcsm results as described above - # path: "Data//input/processed/" -# output: replaces file in place. -# Therefore first create a copy of the input file -# but rename it to remove the word "MASTER" and add the word "processed" -# file format: .txt - -# NOTE: This replaces the file in place! -# the output is a txt file with no newlines and formatting -# to have the following format "<:> -#*********************************************************************** -############# specify variables for input and output paths and filenames -homedir="${HOME}" -basedir="/git/Data/pyrazinamide/input" - -inpath="/processed" - -# Create input file: copy and rename output file of step2 -oldfile="${homedir}${basedir}${inpath}/complex1_output_MASTER.txt" -newfile="${homedir}${basedir}${inpath}/complex1_output_processed.txt" -cp $oldfile $newfile - -echo Input filename is ${oldfile} -echo -echo Output i.e copied filename is ${newfile} - -# output: No output perse -# Replacement in place inside the copied file -################## end of variable assignment for input and output files - -#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${newfile} \ -# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${newfile} - -# Outputs records separated by a newline, that look something like this: -# PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing -# Mutationinformation: -# Wild-type:L -# Position:4 -# Mutant-type:W -# Chain:A -# LigandID:PZA -# Distancetoligand:15.911Å -# DUETstabilitychange:-2.169Kcal/mol -# -# PredictedAffinityChange:-1.538log(affinityfoldchange)-Destabilizing -# (...etc) - -# This script brings everything in a convenient format for further processing in python. 
-sed -i '/PredictedAffinityChange/ { -N -N -N -N -s/\n//g -} -/DUETstabilitychange:/ { -N -N -s/\n//g -} -/^$/d' ${newfile} diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py deleted file mode 100755 index 0e07c0d..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/python - -################### -# load libraries -import os, sys -import pandas as pd -from collections import defaultdict -#################### - -#******************************************************************** -# TASK: Formatting results with nice colnames -# step3a processed the mcsm results to remove all newlines and -# brought data in a format where the delimiter ":" splits -# data into a convenient format of "colname": "value". -# this script formats the data and outputs a df with each row -# as a mutation and its corresponding mcsm_values - -# Requirements: -# input: output of step3a, file containing "..._output_processed.txt" - # path: "Data//input/processed/" -# output: formatted .csv file - # path: "Data//input/processed/" -#*********************************************************************** -############# specify variables for input and output paths and filenames -homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde -basedir = "/git/Data/pyrazinamide/input" - -# input -inpath = "/processed" -in_filename = "/complex1_output_processed.txt" -infile = homedir + basedir + inpath + in_filename -print("Input file is:", infile) - -# output -outpath = "/processed" -out_filename = "/complex1_formatted_results.csv" -outfile = homedir + basedir + outpath + out_filename -print("Output file is:", outfile) -################## end of variable assignment for input and output files - -outCols=[ - 'PredictedAffinityChange', - 'Mutationinformation', - 'Wild-type', - 'Position', - 'Mutant-type', - 'Chain', - 'LigandID', - 'Distancetoligand', - 'DUETstabilitychange' - ] - -lines = [line.rstrip('\n') for line in open(infile)] - -outputs = defaultdict(list) - -for item in lines: - col, val = item.split(':') - outputs[col].append(val) - -dfOut=pd.DataFrame(outputs) - -pd.DataFrame.to_csv(dfOut, outfile, columns=outCols) diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_results_cleaning.R b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_results_cleaning.R deleted file mode 100644 index c58dc8b..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_results_cleaning.R +++ /dev/null @@ -1,230 +0,0 @@ -getwd() -#setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") -getwd() - -#======================================================= -# TASK: read formatted_results_df.csv to complete -# missing info, adding DUET categories, assigning -# meaningful colnames, etc. 
- -# Requirements: -# input: output of step3b, python processing, - # path: Data//input/processed/" -# output: NO output as the next scripts refers to this -# for yet more processing -#======================================================= - -# specify variables for input and output paths and filenames -homedir = "~" -basedir = "/git/Data/pyrazinamide/input" -inpath = "/processed" -in_filename = "/complex1_formatted_results.csv" -infile = paste0(homedir, basedir, inpath, in_filename) -print(paste0("Input file is:", infile)) - -#====================================================== -#TASK: To tidy the columns so you can generate figures -#======================================================= -#################### -#### read file #####: this will be the output from python script (csv file) -#################### -data = read.csv(infile - , header = T - , stringsAsFactors = FALSE) -dim(data) -str(data) - -# clear variables -rm(homedir, basedir, inpath, in_filename, infile) - -########################### -##### Data processing ##### -########################### - -# populate mutation information columns as currently it is empty -head(data$Mutationinformation) -tail(data$Mutationinformation) - -# should not be blank: create muation information -data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.type) - -head(data$Mutationinformation) -tail(data$Mutationinformation) -#write.csv(data, 'test.csv') - -########################################## -# Remove duplicate SNPs as a sanity check -########################################## -# very important -table(duplicated(data$Mutationinformation)) - -# extract duplicated entries -dups = data[duplicated(data$Mutationinformation),] #0 - -# No of dups should match with the no. of TRUE in the above table -#u_dups = unique(dups$Mutationinformation) #10 -sum( table(dups$Mutationinformation) ) - -#*************************************************************** -# select non-duplicated SNPs and create a new df -df = data[!duplicated(data$Mutationinformation),] -#*************************************************************** -# sanity check -u = unique(df$Mutationinformation) -u2 = unique(data$Mutationinformation) -table(u%in%u2) - -# should all be 1 -sum(table(df$Mutationinformation) == 1) - -# sort df by Position -# MANUAL CHECKPOINT: -#foo <- df[order(df$Position),] -#df <- df[order(df$Position),] - -# clear variables -rm(u, u2, dups) - -#################### -#### give meaningful colnames to reflect units to enable correct data type -#################### - -#======= -#STEP 1 -#======== -# make a copy of the PredictedAffinityColumn and call it Lig_outcome -df$Lig_outcome = df$PredictedAffinityChange - - #make Predicted...column numeric and outcome column categorical -head(df$PredictedAffinityChange) -df$PredictedAffinityChange = gsub("log.*" - , "" - , df$PredictedAffinityChange) - -# sanity checks -head(df$PredictedAffinityChange) - -# should be numeric, check and if not make it numeric -is.numeric( df$PredictedAffinityChange ) - -# change to numeric -df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange) - -# should be TRUE -is.numeric( df$PredictedAffinityChange ) - -# change the column name to indicate units -n = which(colnames(df) == "PredictedAffinityChange"); n -colnames(df)[n] = "PredAffLog" -colnames(df)[n] - -#======== -#STEP 2 -#======== -# make Lig_outcome column categorical showing effect of mutation -head(df$Lig_outcome) -df$Lig_outcome = gsub("^.*-" - , "", - df$Lig_outcome) -# sanity checks 
-head(df$Lig_outcome) - -# should be factor, check and if not change it to factor -is.factor(df$Lig_outcome) - -# change to factor -df$Lig_outcome = as.factor(df$Lig_outcome) - -# should be TRUE -is.factor(df$Lig_outcome) - -#======== -#STEP 3 -#======== -# gsub -head(df$Distancetoligand) -df$Distancetoligand = gsub("Å" - , "" - , df$Distancetoligand) -# sanity checks -head(df$Distancetoligand) - -# should be numeric, check if not change it to numeric -is.numeric(df$Distancetoligand) - -# change to numeric -df$Distancetoligand = as.numeric(df$Distancetoligand) - -# should be TRUE -is.numeric(df$Distancetoligand) - -# change the column name to indicate units -n = which(colnames(df) == "Distancetoligand") -colnames(df)[n] <- "Dis_lig_Ang" -colnames(df)[n] - -#======== -#STEP 4 -#======== -#gsub -head(df$DUETstabilitychange) -df$DUETstabilitychange = gsub("Kcal/mol" - , "" - , df$DUETstabilitychange) -# sanity checks -head(df$DUETstabilitychange) - -# should be numeric, check if not change it to numeric -is.numeric(df$DUETstabilitychange) - -# change to numeric -df$DUETstabilitychange = as.numeric(df$DUETstabilitychange) - -# should be TRUE -is.numeric(df$DUETstabilitychange) - -# change the column name to indicate units -n = which(colnames(df) == "DUETstabilitychange"); n -colnames(df)[n] = "DUETStability_Kcalpermol" -colnames(df)[n] - -#======== -#STEP 5 -#======== -# create yet another extra column: classification of DUET stability only -df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0 - , "Stabilizing" - , "Destabilizing") # spelling to be consistent with mcsm - -table(df$Lig_outcome) - -table(df$DUET_outcome) - -#============================== -#FIXME -#Insert a venn diagram -#================================ - -#======== -#STEP 6 -#======== -# assign wild and mutant colnames correctly - -wt = which(colnames(df) == "Wild.type"); wt -colnames(df)[wt] <- "Wild_type" -colnames(df[wt]) - -mut = which(colnames(df) == "Mutant.type"); mut -colnames(df)[mut] <- "Mutant_type" -colnames(df[mut]) - -#======== -#STEP 7 -#======== -# create an extra column: maybe useful for some plots -df$WildPos = paste0(df$Wild_type, df$Position) - -# clear variables -rm(n, wt, mut) - -################ end of data cleaning diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_results_normalise.R b/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_results_normalise.R deleted file mode 100644 index eb24cab..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_results_normalise.R +++ /dev/null @@ -1,275 +0,0 @@ -################## -# load libraries - library(compare) -################## - -getwd() - -#======================================================= -# TASK:read cleaned data and perform rescaling - # of DUET stability scores - # of Pred affinity -# compare scaling methods with plots - -# Requirements: -# input: R script, step3c_results_cleaning.R - # path: Data//input/processed/" -# output: NO output as the next scripts refers to this -# for yet more processing -# output normalised file -#======================================================= - -# specify variables for input and output paths and filenames -homedir = "~" -currdir = "/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/mcsm" -in_filename = "/step3c_results_cleaning.R" -infile = paste0(homedir, currdir, in_filename) -print(paste0("Input file is:", infile)) - -# output file -basedir = "/git/Data/pyrazinamide/input" -outpath = "/processed" -out_filename = "/mcsm_complex1_normalised.csv" -outfile = paste0(homedir, basedir, outpath, 
out_filename) -print(paste0("Output file is:", outfile)) - -#################### -#### read file #####: this will be the output of my R script that cleans the data columns -#################### -source(infile) - -#This will outut two dataframes: -# data: unclean data: 10 cols -# df : cleaned df: 13 cols -# you can remove data if you want as you will not need it -rm(data) - -colnames(df) - -#=================== -#3a: PredAffLog -#=================== -n = which(colnames(df) == "PredAffLog"); n -group = which(colnames(df) == "Lig_outcome"); group - -#=================================================== -# order according to PredAffLog values -#=================================================== -# This is because this makes it easier to see the results of rescaling for debugging -head(df$PredAffLog) - -# ORDER BY PredAff scrores: negative values at the top and positive at the bottoom -df = df[order(df$PredAffLog),] -head(df$PredAffLog) - -# sanity checks -head(df[,n]) # all negatives -tail(df[,n]) # all positives - -# sanity checks -mean(df[,n]) -#-0.9526746 - -tapply(df[,n], df[,group], mean) - -#=========================== -# Same as above: in 2 steps -#=========================== - -# find range of your data -my_min = min(df[,n]); my_min # -my_max = max(df[,n]); my_max # - -#=============================================== -# WITHIN GROUP rescaling 2: method "ratio" -# create column to store the rescaled values -# Rescaling separately (Less dangerous) -# =====> chosen one: preserves sign -#=============================================== -df$ratioPredAff = ifelse(df[,n] < 0 - , df[,n]/abs(my_min) - , df[,n]/my_max - )# 14 cols -# sanity checks -head(df$ratioPredAff) -tail(df$ratioPredAff) - -min(df$ratioPredAff); max(df$ratioPredAff) - -tapply(df$ratioPredAff, df$Lig_outcome, min) - -tapply(df$ratioPredAff, df$Lig_outcome, max) - -# should be the same as below -sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0) - -table(df$Lig_outcome) - -#=============================================== -# Hist and density plots to compare the rescaling -# methods: Base R -#=============================================== -# uncomment as necessary -my_title = "Ligand_stability" -# my_title = colnames(df[n]) - -# Set the margin on all sides -par(oma = c(3,2,3,0) - , mar = c(1,3,5,2) - , mfrow = c(2,2)) - -hist(df[,n] - , xlab = "" - , main = "Raw values" -) - -hist(df$ratioPredAff - , xlab = "" - , main = "ratio rescaling" -) - -# Plot density plots underneath -plot(density( df[,n] ) - , main = "Raw values" -) - -plot(density( df$ratioPredAff ) - , main = "ratio rescaling" -) - -# titles -mtext(text = "Frequency" - , side = 2 - , line = 0 - , outer = TRUE) - -mtext(text = my_title - , side = 3 - , line = 0 - , outer = TRUE) - - -#clear variables -rm(my_min, my_max, my_title, n, group) - -#=================== -# 3b: DUET stability -#=================== -dim(df) # 14 cols - -n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10 -group = which(colnames(df) == "DUET_outcome"); group #12 - -#=================================================== -# order according to DUET scores -#=================================================== -# This is because this makes it easier to see the results of rescaling for debugging -head(df$DUETStability_Kcalpermol) - -# ORDER BY DUET scores: negative values at the top and positive at the bottom -df = df[order(df$DUETStability_Kcalpermol),] - -# sanity checks -head(df[,n]) # negatives -tail(df[,n]) # positives - -# sanity checks -mean(df[,n]) - -tapply(df[,n], df[,group], 
mean) - -#=============================================== -# WITHIN GROUP rescaling 2: method "ratio" -# create column to store the rescaled values -# Rescaling separately (Less dangerous) -# =====> chosen one: preserves sign -#=============================================== -# find range of your data -my_min = min(df[,n]); my_min -my_max = max(df[,n]); my_max - -df$ratioDUET = ifelse(df[,n] < 0 - , df[,n]/abs(my_min) - , df[,n]/my_max - ) # 15 cols -# sanity check -head(df$ratioDUET) -tail(df$ratioDUET) - -min(df$ratioDUET); max(df$ratioDUET) - -# sanity checks -tapply(df$ratioDUET, df$DUET_outcome, min) - -tapply(df$ratioDUET, df$DUET_outcome, max) - -# should be the same as below (267 and 42) -sum(df$ratioDUET < 0); sum(df$ratioDUET > 0) - -table(df$DUET_outcome) - -#=============================================== -# Hist and density plots to compare the rescaling -# methods: Base R -#=============================================== -# uncomment as necessary -my_title = "DUET_stability" -#my_title = colnames(df[n]) - -# Set the margin on all sides -par(oma = c(3,2,3,0) - , mar = c(1,3,5,2) - , mfrow = c(2,2)) - -hist(df[,n] - , xlab = "" - , main = "Raw values" -) - -hist(df$ratioDUET - , xlab = "" - , main = "ratio rescaling" -) - -# Plot density plots underneath -plot(density( df[,n] ) - , main = "Raw values" -) - -plot(density( df$ratioDUET ) - , main = "ratio rescaling" -) - -# graph titles -mtext(text = "Frequency" - , side = 2 - , line = 0 - , outer = TRUE) - -mtext(text = my_title - , side = 3 - , line = 0 - , outer = TRUE) - -# reorder by column name -#data <- data[c("A", "B", "C")] -colnames(df) -df2 = df[c("X", "Mutationinformation", "WildPos", "Position" - , "Wild_type", "Mutant_type" - , "DUETStability_Kcalpermol", "DUET_outcome" - , "Dis_lig_Ang", "PredAffLog", "Lig_outcome" - , "ratioDUET", "ratioPredAff" - , "LigandID","Chain")] - -# sanity check -# should be True -#compare(df, df2, allowAll = T) -compare(df, df2, ignoreColOrder = T) -#TRUE -#reordered columns - -#=================== -# write output as csv file -#=================== -#write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE) -write.csv(df2, outfile, row.names = FALSE) diff --git a/mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R b/mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R deleted file mode 100644 index 877215a..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R +++ /dev/null @@ -1,131 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") -#source("barplot_colour_function.R") -require(data.table) -require(dplyr) - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -########################### -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -########################### - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -########################### -# you need merged_df3 -# or -# merged_df3_comp -# 
since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#%%%%%%%%%%%%%%%%%%%%%%%% - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -########################### -# Data for bfactor figure -# PS average -# Lig average -########################### - -head(my_df$Position) -head(my_df$ratioDUET) - -# order data frame -df = my_df[order(my_df$Position),] - -head(df$Position) -head(df$ratioDUET) - -#*********** -# PS: average by position -#*********** - -mean_DUET_by_position <- df %>% - group_by(Position) %>% - summarize(averaged.DUET = mean(ratioDUET)) - -#*********** -# Lig: average by position -#*********** -mean_Lig_by_position <- df %>% - group_by(Position) %>% - summarize(averaged.Lig = mean(ratioPredAff)) - - -#*********** -# cbind:mean_DUET_by_position and mean_Lig_by_position -#*********** - -combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position )) - -# sanity check -# mean_PS_Lig_Bfactor - -colnames(combined) - -colnames(combined) = c("Position" - , "average_DUETR" - , "Position2" - , "average_PredAffR") - -colnames(combined) - -identical(combined$Position, combined$Position2) - -n = which(colnames(combined) == "Position2"); n - -combined_df = combined[,-n] - -max(combined_df$average_DUETR) ; min(combined_df$average_DUETR) - -max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR) - -#============= -# output csv -#============ -outDir = "~/git/Data/pyrazinamide/input/processed/" -outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv") -print(paste0("Output file with path will be:","", outFile)) - -head(combined_df$Position); tail(combined_df$Position) - -write.csv(combined_df, outFile - , row.names = F) diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/.RData b/mcsm_analysis/pyrazinamide/scripts/plotting/.RData deleted file mode 100644 index 9ebc62b..0000000 Binary files a/mcsm_analysis/pyrazinamide/scripts/plotting/.RData and /dev/null differ diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/OR_PS_Ligand_combined_plot.R b/mcsm_analysis/pyrazinamide/scripts/plotting/OR_PS_Ligand_combined_plot.R deleted file mode 100644 index 26919ce..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/OR_PS_Ligand_combined_plot.R +++ /dev/null @@ -1,250 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") -#source("barplot_colour_function.R") -require(cowplot) - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# 
merged_df3_comp -#=========================== - -########################### -# Data for OR and stability plots -# you need merged_df3_comp -# since these are matched -# to allow pairwise corr -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3_comp -#my_df = merged_df3 -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# sanity check -# Ensure correct data type in columns to plot: need to be factor -is.numeric(my_df$OR) -#[1] TRUE - -#<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -# FOR PS Plots -#<<<<<<<<<<<<<<<<<<< - -PS_df = my_df - -rm(my_df) -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1 - -######################################################################## -# Read file: call script for combining df for lig # -######################################################################## - -getwd() - -source("combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#=========================== - -########################### -# Data for OR and stability plots -# you need merged_df3_comp -# since these are matched -# to allow pairwise corr -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df2 = merged_df3_comp -#my_df2 = merged_df3 -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df2) -str(my_df2) - -# sanity check -# Ensure correct data type in columns to plot: need to be factor -is.numeric(my_df2$OR) -#[1] TRUE - -# sanity check: should be <10 -if (max(my_df2$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} - -#<<<<<<<<<<<<<<<< -# REASSIGNMENT -# FOR Lig Plots -#<<<<<<<<<<<<<<<< - -Lig_df = my_df2 - -rm(my_df2) - -#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1 - -############# -# Plots: Bubble plot -# x = Position, Y = stability -# size of dots = OR -# col: stability -############# - -#================= -# generate plot 1: DUET vs OR by position as geom_points -#================= - -my_ats = 20 # axis text size -my_als = 22 # axis label size - -# Spelling Correction: made redundant as already corrected at the source - -#PS_df$DUET_outcome[PS_df$DUET_outcome=='Stabilizing'] <- 'Stabilising' -#PS_df$DUET_outcome[PS_df$DUET_outcome=='Destabilizing'] <- 'Destabilising' - -table(PS_df$DUET_outcome) ; sum(table(PS_df$DUET_outcome)) - -g = ggplot(PS_df, aes(x = factor(Position) - , y = ratioDUET)) - -p1 = g + - geom_point(aes(col = DUET_outcome - , size = OR)) + - theme(axis.text.x = element_text(size = my_ats - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_ats - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_als) - , axis.title.y = element_text(size = my_als) - , legend.text = element_text(size = my_als) - , legend.title = element_text(size = my_als) ) + - #, legend.key.size = unit(1, "cm")) + - labs(title = "" - , x = "Position" - , y = 
"DUET(PS)" - , size = "Odds Ratio" - , colour = "DUET Outcome") + - guides(colour = guide_legend(override.aes = list(size=4))) - -p1 - -#================= -# generate plot 2: Lig vs OR by position as geom_points -#================= -my_ats = 20 # axis text size -my_als = 22 # axis label size - -# Spelling Correction: made redundant as already corrected at the source - -#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Stabilizing'] <- 'Stabilising' -#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Destabilizing'] <- 'Destabilising' - -table(Lig_df$Lig_outcome) - -g = ggplot(Lig_df, aes(x = factor(Position) - , y = ratioPredAff)) - -p2 = g + - geom_point(aes(col = Lig_outcome - , size = OR))+ - theme(axis.text.x = element_text(size = my_ats - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_ats - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_als) - , axis.title.y = element_text(size = my_als) - , legend.text = element_text(size = my_als) - , legend.title = element_text(size = my_als) ) + - #, legend.key.size = unit(1, "cm")) + - labs(title = "" - , x = "Position" - , y = "Ligand Affinity" - , size = "Odds Ratio" - , colour = "Ligand Outcome" - ) + - guides(colour = guide_legend(override.aes = list(size=4))) - -p2 - -#====================== -#combine using cowplot -#====================== -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -svg('PS_Lig_OR_combined.svg', width = 32, height = 12) #inches -#png('PS_Lig_OR_combined.png', width = 2800, height = 1080) #300dpi -theme_set(theme_gray()) # to preserve default theme - -printFile = cowplot::plot_grid(plot_grid(p1, p2 - , ncol = 1 - , align = 'v' - , labels = c("(a)", "(b)") - , label_size = my_als+5)) -print(printFile) -dev.off() - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R deleted file mode 100644 index 30b9981..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R +++ /dev/null @@ -1,154 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") - -######################################################################## -# Read file: call script for combining df for lig # -######################################################################## - -source("../combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#=========================== - -########################### -# Data for Lig plots -# you need merged_df3 -# or -# merged_df3_comp -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#%%%%%%%%%%%%%%%%%%%%%%%% - -# delete variables not required 
-rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -############################# -# Extra sanity check: -# for mcsm_lig ONLY -# Dis_lig_Ang should be <10 -############################# - -if (max(my_df$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#========================== -# Plot: Barplot with scores (unordered) -# corresponds to Lig_outcome -# Stacked Barplot with colours: Lig_outcome @ position coloured by -# Lig_outcome. This is a barplot where each bar corresponds -# to a SNP and is coloured by its corresponding Lig_outcome. -#============================ - -#=================== -# Data for plots -#=================== - -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -df = my_df -#%%%%%%%%%%%%%%%%%%%%%%%% - -rm(my_df) - -# sanity checks -upos = unique(my_df$Position) - -# should be a factor -is.factor(df$Lig_outcome) -#TRUE - -table(df$Lig_outcome) - -# should be -1 and 1: may not be in this case because you have filtered the data -# FIXME: normalisation before or after filtering? -min(df$ratioPredAff) # -max(df$ratioPredAff) # - -# sanity checks -tapply(df$ratioPredAff, df$Lig_outcome, min) -tapply(df$ratioPredAff, df$Lig_outcome, max) - -#****************** -# generate plot -#****************** - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -my_title = "Ligand affinity" - -# axis label size -my_xaxls = 13 -my_yaxls = 15 - -# axes text size -my_xaxts = 15 -my_yaxts = 15 - -# no ordering of x-axis -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - geom_bar(aes(fill = Lig_outcome), colour = "grey") + - theme( axis.text.x = element_text(size = my_xaxls - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_yaxls - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xaxts) - , axis.title.y = element_text(size = my_yaxts ) ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -# for sanity and good practice -rm(df) -#======================= end of plot -# axis colours labels -# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors -# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R deleted file mode 100644 index 169bdaf..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R +++ /dev/null @@ -1,149 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages and functions # -######################################################################## - -source("../Header_TT.R") - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] 
"git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#========================== - -########################### -# Data for DUET plots -# you need merged_df3 -# or -# merged_df3_comp -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -# sanity check -is.factor(my_df$DUET_outcome) -my_df$DUET_outcome = as.factor(my_df$DUET_outcome) -is.factor(my_df$DUET_outcome) -#[1] TRUE - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#========================== -# Plot 2: Barplot with scores (unordered) -# corresponds to DUET_outcome -# Stacked Barplot with colours: DUET_outcome @ position coloured by -# DUET outcome. This is a barplot where each bar corresponds -# to a SNP and is coloured by its corresponding DUET_outcome -#============================ - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df = my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< - -rm(my_df) - -# sanity checks -upos = unique(df$Position) - -# should be a factor -is.factor(my_df$DUET_outcome) -#[1] TRUE - -table(my_df$DUET_outcome) - -# should be -1 and 1 -min(df$ratioDUET) -max(df$ratioDUET) - -tapply(df$ratioDUET, df$DUET_outcome, min) -tapply(df$ratioDUET, df$DUET_outcome, max) - -#****************** -# generate plot -#****************** - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -my_title = "Protein stability (DUET)" - -# axis label size -my_xaxls = 13 -my_yaxls = 15 - -# axes text size -my_xaxts = 15 -my_yaxts = 15 - -# no ordering of x-axis -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - geom_bar(aes(fill = DUET_outcome), colour = "grey") + - - theme( axis.text.x = element_text(size = my_xaxls - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_yaxls - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xaxts) - , axis.title.y = element_text(size = my_yaxts ) ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -# for sanity and good practice -rm(df) -#======================= end of plot -# axis colours labels -# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors -# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R deleted file mode 100644 index a5d9361..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R +++ /dev/null @@ -1,202 +0,0 @@ -getwd() 
-setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages and functions # -######################################################################## - -source("../Header_TT.R") -source("../barplot_colour_function.R") - -######################################################################## -# Read file: call script for combining df for lig # -######################################################################## - -source("../combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#=========================== - -########################### -# Data for Lig plots -# you need merged_df3 -# or -# merged_df3_comp -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -# sanity check -is.factor(my_df$Lig_outcome) -my_df$Lig_outcome = as.factor(my_df$Ligoutcome) -is.factor(my_df$Lig_outcome) -#[1] TRUE - -############################# -# Extra sanity check: -# for mcsm_lig ONLY -# Dis_lig_Ang should be <10 -############################# - -if (max(my_df$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#========================== -# Plot: Barplot with scores (unordered) -# corresponds to Lig_outcome -# Stacked Barplot with colours: Lig_outcome @ position coloured by -# stability scores. This is a barplot where each bar corresponds -# to a SNP and is coloured by its corresponding Lig stability value. -# Normalised values (range between -1 and 1 ) to aid visualisation -# NOTE: since barplot plots discrete values, colour = score, so number of -# colours will be equal to the no. of unique normalised scores -# rather than a continuous scale -# will require generating the colour scale separately. -#============================ - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df = my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< - -rm(my_df) - -# sanity checks -table(df$Lig_outcome) - -# should be -1 and 1: may not be in this case because you have filtered the data -# FIXME: normalisation before or after filtering? -min(df$ratioPredAff) # -max(df$ratioPredAff) # - -# sanity checks -# very important!!!! 
-tapply(df$ratioPredAff, df$Lig_outcome, min) - -tapply(df$ratioPredAff, df$Lig_outcome, max) - - -#****************** -# generate plot -#****************** - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -# My colour FUNCTION: based on group and subgroup -# in my case; -# df = df -# group = Lig_outcome -# subgroup = normalised score i.e ratioPredAff - -# Prepare data: round off ratioLig scores -# round off to 3 significant digits: -# 165 if no rounding is performed: used to generate the originalgraph -# 156 if rounded to 3 places -# FIXME: check if reducing precision creates any ML prob - -# check unique values in normalised data -u = unique(df$ratioPredAff) - -# <<<<< ------------------------------------------- -# Run this section if rounding is to be used -# specify number for rounding -n = 3 -df$ratioLigR = round(df$ratioPredAff, n) -u = unique(df$ratioLigR) # 156 -# create an extra column called group which contains the "gp name and score" -# so colours can be generated for each unique values in this column -my_grp = df$ratioLigR -df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "") - -# else -# uncomment the below if rounding is not required - -#my_grp = df$ratioLig -#df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "") - -# <<<<< ----------------------------------------------- - -# Call the function to create the palette based on the group defined above -colours <- ColourPalleteMulti(df, "Lig_outcome", "my_grp") -my_title = "Ligand affinity" - -# axis label size -my_xaxls = 13 -my_yaxls = 15 - -# axes text size -my_xaxts = 15 -my_yaxts = 15 - -# no ordering of x-axis -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + - theme( axis.text.x = element_text(size = my_xaxls - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_yaxls - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xaxts) - , axis.title.y = element_text(size = my_yaxts ) ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -# for sanity and good practice -rm(df) -#======================= end of plot -# axis colours labels -# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors -# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R deleted file mode 100644 index 8828e90..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R +++ /dev/null @@ -1,192 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages and functions # -######################################################################## - -source("../Header_TT.R") -source("../barplot_colour_function.R") - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY 
ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#=========================== - -########################### -# Data for DUET plots -# you need merged_df3 -# or -# merged_df3_comp -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -# sanity check -is.factor(my_df$DUET_outcome) -my_df$DUET_outcome = as.factor(my_df$DUET_outcome) -is.factor(my_df$DUET_outcome) -#[1] TRUE - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#========================== -# Barplot with scores (unordered) -# corresponds to DUET_outcome -# Stacked Barplot with colours: DUET_outcome @ position coloured by -# stability scores. This is a barplot where each bar corresponds -# to a SNP and is coloured by its corresponding DUET stability value. -# Normalised values (range between -1 and 1 ) to aid visualisation -# NOTE: since barplot plots discrete values, colour = score, so number of -# colours will be equal to the no. of unique normalised scores -# rather than a continuous scale -# will require generating the colour scale separately. 
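## ---------------------------------------------------------------------
## Hedged sketch (added for illustration only; NOT the repo's own code).
## The palette actually used below comes from ColourPalleteMulti() in
## barplot_colour_function.R, which is not part of this diff. The
## hypothetical helper here only demonstrates the idea described in the
## comment above: one colour family per DUET_outcome group, with one
## shade per unique (rounded) score within that group, giving a discrete
## colour vector suitable for scale_fill_manual(). The function name and
## toy data are invented for this sketch.
## ---------------------------------------------------------------------
example_palette_multi = function(dat, group, subgroup) {
  scores_by_group = split(dat[[subgroup]], dat[[group]])
  base_cols = RColorBrewer::brewer.pal(max(3, length(scores_by_group)), "Set1")
  unlist(lapply(seq_along(scores_by_group), function(i) {
    n_shades = length(unique(scores_by_group[[i]])) # one shade per unique score
    grDevices::colorRampPalette(c("grey90", base_cols[i]))(n_shades + 1)[-1]
  }))
}

## toy usage: 2 outcome groups x 3 unique rounded scores each = 6 colours
toy_df = data.frame(outcome = rep(c("Destabilizing", "Stabilizing"), each = 4)
                    , score = c(-0.9, -0.5, -0.5, -0.1, 0.2, 0.2, 0.6, 1.0))
example_palette_multi(toy_df, "outcome", "score")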
-#============================ - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df = my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< - -rm(my_df) - -# sanity checks -upos = unique(df$Position) - -# should be a factor -is.factor(my_df$DUET_outcome) -#[1] TRUE - -table(df$DUET_outcome) - -# should be -1 and 1 -min(df$ratioDUET) -max(df$ratioDUET) - -tapply(df$ratioDUET, df$DUET_outcome, min) -tapply(df$ratioDUET, df$DUET_outcome, max) - -#****************** -# generate plot -#****************** - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -# My colour FUNCTION: based on group and subgroup -# in my case; -# df = df -# group = DUET_outcome -# subgroup = normalised score i.e ratioDUET - -# Prepare data: round off ratioDUET scores -# round off to 3 significant digits: -# 323 if no rounding is performed: used to generate the original graph -# 287 if rounded to 3 places -# FIXME: check if reducing precicion creates any ML prob - -# check unique values in normalised data -u = unique(df$ratioDUET) - -# <<<<< ------------------------------------------- -# Run this section if rounding is to be used -# specify number for rounding -n = 3 -df$ratioDUETR = round(df$ratioDUET, n) -u = unique(df$ratioDUETR) -# create an extra column called group which contains the "gp name and score" -# so colours can be generated for each unique values in this column -my_grp = df$ratioDUETR -df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "") - -# else -# uncomment the below if rounding is not required - -#my_grp = df$ratioDUET -#df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "") - -# <<<<< ----------------------------------------------- - -# Call the function to create the palette based on the group defined above -colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp") -my_title = "Protein stability (DUET)" - -# axis label size -my_xaxls = 13 -my_yaxls = 15 - -# axes text size -my_xaxts = 15 -my_yaxts = 15 - -# no ordering of x-axis -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + - theme( axis.text.x = element_text(size = my_xaxls - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_yaxls - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xaxts) - , axis.title.y = element_text(size = my_yaxts ) ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -# for sanity and good practice -rm(df) -#======================= end of plot -# axis colours labels -# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors -# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_LIG.R deleted file mode 100644 index 432749e..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_LIG.R +++ /dev/null @@ -1,296 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -############################################################ -# 1: Installing and loading required packages and functions -############################################################ - -source("../Header_TT.R") -source("../barplot_colour_function.R") - 
-############################################################ -# Output dir for plots -############################################################ -out_dir = "~/git/Data/pyrazinamide/output/plots" - -############################################################ -# 2: call script the prepares the data with columns containing -# colours for axis labels -############################################################ - -source("subcols_axis_LIG.R") - -# this should return -#mut_pos_cols: 52, 4 -#my_df: 169, 39 - -# clear excess variable -# "mut_pos_cols" is just for inspection in case you need to cross check -# position numbers and colours -# open file from deskptop ("sample_axis_cols") for cross checking - -table(mut_pos_cols$lab_bg) - -sum( table(mut_pos_cols$lab_bg) ) == nrow(mut_pos_cols) # should be True - -table(mut_pos_cols$lab_bg2) - -sum( table(mut_pos_cols$lab_bg2) ) == nrow(mut_pos_cols) # should be True - -table(mut_pos_cols$lab_fg) - -sum( table(mut_pos_cols$lab_fg) ) == nrow(mut_pos_cols) # should be True - -# very important!: should be the length of the unique positions -my_axis_colours = mut_pos_cols$lab_fg - -# now clear mut_pos_cols -rm(mut_pos_cols) - -########################### -# 2: Plot: Lig scores -########################### -#========================== -# Plot 2: Barplot with scores (unordered) -# corresponds to Lig_outcome -# Stacked Barplot with colours: Lig_outcome @ position coloured by -# stability scores. This is a barplot where each bar corresponds -# to a SNP and is coloured by its corresponding PredAff stability value. -# Normalised values (range between -1 and 1 ) to aid visualisation -# NOTE: since barplot plots discrete values, colour = score, so number of -# colours will be equal to the no. of unique normalised scores -# rather than a continuous scale -# will require generating the colour scale separately. -#============================ -# sanity checks -upos = unique(my_df$Position) - -str(my_df$Lig_outcome) - -colnames(my_df) - -#=========================== -# Data preparation for plots -#=========================== -#!!!!!!!!!!!!!!!!! -# REASSIGNMENT -df <- my_df -#!!!!!!!!!!!!!!!!! - -rm(my_df) - -# sanity checks -# should be a factor -is.factor(df$Lig_outcome); -#FALSE - -df$Lig_outcome = as.factor(df$Lig_outcome) -is.factor(df$Lig_outcome); -#TRUE - -table(df$Lig_outcome) - -# check the range -min(df$ratioPredAff) -max(df$ratioPredAff) - -# sanity checks -# very important!!!! 
-tapply(df$ratioPredAff, df$Lig_outcome, min) - -tapply(df$ratioPredAff, df$Lig_outcome, max) - -# My colour FUNCTION: based on group and subgroup -# in my case; -# df = df -# group = Lig_outcome -# subgroup = normalised score i.e ratioPredAff - -# Prepare data: round off ratioPredAff scores -# round off to 3 significant digits: -# 323 if no rounding is performed: used to generate the original graph -# 287 if rounded to 3 places -# FIXME: check if reducing precicion creates any ML prob - -# check unique values in normalised data -u = unique(df$ratioPredAff) - -# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# Run this section if rounding is to be used -# specify number for rounding -n = 3 -df$ratioPredAffR = round(df$ratioPredAff, n) -u = unique(df$ratioPredAffR) - -# create an extra column called group which contains the "gp name and score" -# so colours can be generated for each unique values in this column -my_grp = df$ratioPredAffR -df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "") - -# ELSE -# uncomment the below if rounding is not required - -#my_grp = df$ratioPredAff -#df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "") - -# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -#****************** -# generate plot -#****************** - -# Call the function to create the palette based on the group defined above -colours <- ColourPalleteMulti(df, "Lig_outcome", "my_grp") -my_title = "Ligand Affinity" -library(ggplot2) - -# axis label size -my_xaxls = 13 -my_yaxls = 15 - -# axes text size -my_xaxts = 15 -my_yaxts = 15 - -# no ordering of x-axis according to frequency -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + - theme( axis.text.x = element_text(size = my_xaxls - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_yaxls - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xaxts) - , axis.title.y = element_text(size = my_yaxts ) ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -#======================== -# plot with axis colours -#======================== -class(df$lab_bg) -# make this a named vector - -# define cartesian coord -my_xlim = length(unique(df$Position)); my_xlim - -# axis label size -my_xals = 15 -my_yals = 15 - -# axes text size -my_xats = 15 -my_yats = 18 - -# using geom_tile -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - coord_cartesian(xlim = c(1, my_xlim) - , ylim = c(0, 6) - , clip = "off") + - - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + - geom_tile(aes(,-0.8, width = 0.95, height = 0.85) - , fill = df$lab_bg) + - geom_tile(aes(,-1.2, width = 0.95, height = -0.2) - , fill = df$lab_bg2) + - - # Here it's important to specify that your axis goes from 1 to max number of levels - theme( axis.text.x = element_text(size = my_xats - , angle = 90 - , hjust = 1 - , vjust = 0.4 - , colour = my_axis_colours) - , axis.text.y = element_text(size = my_yats - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xals) - , axis.title.y = element_text(size = my_yals ) - , axis.ticks.x = element_blank() - ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -#======================== -# output plot as svg/png -#======================== -class(df$lab_bg) -# make this a named vector - -# define cartesian coord -my_xlim = length(unique(df$Position)); 
my_xlim - -# axis label size -my_xals = 18 -my_yals = 18 - -# axes text size -my_xats = 16 #14 in PS -my_yats = 18 - -# set output dir for plots -#getwd() -#setwd("~/git/Data/pyrazinamide/output/plots") -#getwd() - -plot_name = "barplot_LIG_acoloured.svg" -my_plot_name = paste0(out_dir, "/", plot_name); my_plot_name - -svg(my_plot_name, width = 26, height = 4) - -g = ggplot(df, aes(factor(Position, ordered = T))) - -outFile = g + - coord_cartesian(xlim = c(1, my_xlim) - , ylim = c(0, 6) - , clip = "off" - ) + - - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + -# geom_tile(aes(,-0.6, width = 0.9, height = 0.7) -# , fill = df$lab_bg) + -# geom_tile(aes(,-1, width = 0.9, height = 0.3) -# , fill = df$lab_bg2) + - geom_tile(aes(,-0.8, width = 0.95, height = 0.85) - , fill = df$lab_bg) + - geom_tile(aes(,-1.2, width = 0.95, height = -0.2) - , fill = df$lab_bg2) + - -# Here it's important to specify that your axis goes from 1 to max number of levels - theme( axis.text.x = element_text(size = my_xats - , angle = 90 - , hjust = 1 - , vjust = 0.4 - , colour = my_axis_colours) - , axis.text.y = element_text(size = my_yats - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xals) - , axis.title.y = element_text(size = my_yals ) - , axis.ticks.x = element_blank() - ) + - labs(title = "" - , x = "Position" - , y = "Frequency") - - -print(outFile) -dev.off() - -# for sanity and good practice -#rm(df) diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_PS.R deleted file mode 100644 index 78029be..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_aa_PS.R +++ /dev/null @@ -1,292 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -############################################################ -# 1: Installing and loading required packages and functions -############################################################ - -source("../Header_TT.R") -source("../barplot_colour_function.R") - -############################################################ -# Output dir for plots -############################################################ -out_dir = "~/git/Data/pyrazinamide/output/plots" - -############################################################ -# 2: call script the prepares the data with columns containing -# colours for axis labels -############################################################ - -source("subcols_axis.R") - -# this should return -#mut_pos_cols: 130, 4 -#my_df: 335, 39 - -# clear excess variable -# "mut_pos_cols" is just for inspection in case you need to cross check -# position numbers and colours -# open file from deskptop ("sample_axis_cols") for cross checking - -table(mut_pos_cols$lab_bg) - -sum( table(mut_pos_cols$lab_bg) ) == nrow(mut_pos_cols) # should be True - -table(mut_pos_cols$lab_bg2) - -sum( table(mut_pos_cols$lab_bg2) ) == nrow(mut_pos_cols) # should be True - -table(mut_pos_cols$lab_fg) - -sum( table(mut_pos_cols$lab_fg) ) == nrow(mut_pos_cols) # should be True - -# very important! 
-my_axis_colours = mut_pos_cols$lab_fg - -# now clear mut_pos_cols -rm(mut_pos_cols) - -########################### -# 2: Plot: DUET scores -########################### -#========================== -# Plot 2: Barplot with scores (unordered) -# corresponds to DUET_outcome -# Stacked Barplot with colours: DUET_outcome @ position coloured by -# stability scores. This is a barplot where each bar corresponds -# to a SNP and is coloured by its corresponding DUET stability value. -# Normalised values (range between -1 and 1 ) to aid visualisation -# NOTE: since barplot plots discrete values, colour = score, so number of -# colours will be equal to the no. of unique normalised scores -# rather than a continuous scale -# will require generating the colour scale separately. -#============================ -# sanity checks -upos = unique(my_df$Position) - -str(my_df$DUET_outcome) - -colnames(my_df) - -#=========================== -# Data preparation for plots -#=========================== -#!!!!!!!!!!!!!!!!! -# REASSIGNMENT -df <- my_df -#!!!!!!!!!!!!!!!!! - -rm(my_df) - -# sanity checks -# should be a factor -is.factor(df$DUET_outcome) -#TRUE - -table(df$DUET_outcome) - -# should be -1 and 1 -min(df$ratioDUET) -max(df$ratioDUET) - -# sanity checks -# very important!!!! -tapply(df$ratioDUET, df$DUET_outcome, min) - -tapply(df$ratioDUET, df$DUET_outcome, max) - -# My colour FUNCTION: based on group and subgroup -# in my case; -# df = df -# group = DUET_outcome -# subgroup = normalised score i.e ratioDUET - -# Prepare data: round off ratioDUET scores -# round off to 3 significant digits: -# 323 if no rounding is performed: used to generate the original graph -# 287 if rounded to 3 places -# FIXME: check if reducing precicion creates any ML prob - -# check unique values in normalised data -u = unique(df$ratioDUET) - -# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# Run this section if rounding is to be used -# specify number for rounding -n = 3 -df$ratioDUETR = round(df$ratioDUET, n) -u = unique(df$ratioDUETR) - -# create an extra column called group which contains the "gp name and score" -# so colours can be generated for each unique values in this column -my_grp = df$ratioDUETR -df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "") - -# ELSE -# uncomment the below if rounding is not required - -#my_grp = df$ratioDUET -#df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "") - -# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -#****************** -# generate plot -#****************** - -# Call the function to create the palette based on the group defined above -colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp") -my_title = "Protein stability (DUET)" -library(ggplot2) - -# axis label size -my_xaxls = 13 -my_yaxls = 15 - -# axes text size -my_xaxts = 15 -my_yaxts = 15 - -# no ordering of x-axis according to frequency -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + - theme( axis.text.x = element_text(size = my_xaxls - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = my_yaxls - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xaxts) - , axis.title.y = element_text(size = my_yaxts ) ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -#======================== -# plot with axis colours -#======================== -class(df$lab_bg) -# make this a named vector - -# define cartesian coord -my_xlim 
= length(unique(df$Position)); my_xlim - -# axis label size -my_xals = 15 -my_yals = 15 - -# axes text size -my_xats = 15 -my_yats = 18 - -# using geom_tile -g = ggplot(df, aes(factor(Position, ordered = T))) -g + - coord_cartesian(xlim = c(1, my_xlim) - , ylim = c(0, 6) - , clip = "off") + - - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + - geom_tile(aes(,-0.8, width = 0.95, height = 0.85) - , fill = df$lab_bg) + - geom_tile(aes(,-1.2, width = 0.95, height = -0.2) - , fill = df$lab_bg2) + - - # Here it's important to specify that your axis goes from 1 to max number of levels - theme( axis.text.x = element_text(size = my_xats - , angle = 90 - , hjust = 1 - , vjust = 0.4 - , colour = my_axis_colours) - , axis.text.y = element_text(size = my_yats - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xals) - , axis.title.y = element_text(size = my_yals ) - , axis.ticks.x = element_blank() - ) + - labs(title = my_title - , x = "Position" - , y = "Frequency") - -#======================== -# output plot as svg/png -#======================== -class(df$lab_bg) -# make this a named vector - -# define cartesian coord -my_xlim = length(unique(df$Position)); my_xlim - -# axis label size -my_xals = 18 -my_yals = 18 - -# axes text size -my_xats = 14 -my_yats = 18 - -# set output dir for plots -#getwd() -#setwd("~/git/Data/pyrazinamide/output/plots") -#getwd() - -plot_name = "barplot_PS_acoloured.svg" -my_plot_name = paste0(out_dir, "/", plot_name); my_plot_name - -svg(my_plot_name, width = 26, height = 4) - -g = ggplot(df, aes(factor(Position, ordered = T))) - -outFile = g + - coord_cartesian(xlim = c(1, my_xlim) - , ylim = c(0, 6) - , clip = "off" - ) + - - geom_bar(aes(fill = group), colour = "grey") + - scale_fill_manual( values = colours - , guide = 'none') + -# geom_tile(aes(,-0.6, width = 0.9, height = 0.7) -# , fill = df$lab_bg) + -# geom_tile(aes(,-1, width = 0.9, height = 0.3) -# , fill = df$lab_bg2) + - geom_tile(aes(,-0.8, width = 0.95, height = 0.85) - , fill = df$lab_bg) + - geom_tile(aes(,-1.2, width = 0.95, height = -0.2) - , fill = df$lab_bg2) + - -# Here it's important to specify that your axis goes from 1 to max number of levels - theme( axis.text.x = element_text(size = my_xats - , angle = 90 - , hjust = 1 - , vjust = 0.4 - , colour = my_axis_colours) - , axis.text.y = element_text(size = my_yats - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = my_xals) - , axis.title.y = element_text(size = my_yals ) - , axis.ticks.x = element_blank() - ) + - labs(title = "" - , x = "Position" - , y = "Frequency") - - -print(outFile) -dev.off() - -# for sanity and good practice -#rm(df) diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R deleted file mode 100644 index c4826d3..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R +++ /dev/null @@ -1,215 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") - -#require(data.table) -#require(dplyr) - -######################################################################## -# Read file: call script for combining df for lig # 
-######################################################################## - -source("../combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#=========================== - -########################### -# Data for Lig plots -# you need merged_df3 -# or -# merged_df3_comp -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -# sanity check -is.factor(my_df$Lig_outcome) -my_df$Lig_outcome = as.factor(my_df$lig_outcome) -is.factor(my_df$Lig_outcome) -#[1] TRUE - -############################# -# Extra sanity check: -# for mcsm_lig ONLY -# Dis_lig_Ang should be <10 -############################# - -if (max(my_df$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#=========================== -# Plot: Basic barplots -#=========================== - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df = my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< -rm(my_df) - -# sanity checks -str(df) - -if (identical(df$Position, df$position)){ - print("Sanity check passed: Columns 'Position' and 'position' are identical") -} else{ - print("Error!: Check column names and info contained") -} - -#**************** -# generate plot: No of stabilising and destabilsing muts -#**************** -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -svg('basic_barplots_LIG.svg') - -my_ats = 25 # axis text size -my_als = 22 # axis label size - -# uncomment as necessary for either directly outputting results or -# printing on the screen -g = ggplot(df, aes(x = Lig_outcome)) -#prinfFile = g + geom_bar( - g + geom_bar( - aes(fill = Lig_outcome) - , show.legend = TRUE -) + geom_label( - stat = "count" - , aes(label = ..count..) 
- , color = "black" - , show.legend = FALSE - , size = 10) + theme( - axis.text.x = element_blank() - , axis.title.x = element_blank() - , axis.title.y = element_text(size=my_als) - , axis.text.y = element_text(size = my_ats) - , legend.position = c(0.73,0.8) - , legend.text = element_text(size=my_als-2) - , legend.title = element_text(size=my_als) - , plot.title = element_blank() - ) + labs( - title = "" - , y = "Number of SNPs" - #, fill='Ligand Outcome' - ) + scale_fill_discrete(name = "Ligand Outcome" - , labels = c("Destabilising", "Stabilising")) -print(prinfFile) -dev.off() - -#**************** -# generate plot: No of positions -#**************** -#get freq count of positions so you can subset freq<1 -#require(data.table) -setDT(df)[, pos_count := .N, by = .(Position)] #169, 36 - -head(df$pos_count) -table(df$pos_count) -# this is cummulative -#1 2 3 4 5 6 -#5 24 36 56 30 18 - -# use group by on this -snpsBYpos_df <- df %>% - group_by(Position) %>% - summarize(snpsBYpos = mean(pos_count)) - -table(snpsBYpos_df$snpsBYpos) -#1 2 3 4 5 6 -#5 12 12 14 6 3 -# this is what will get plotted - -svg('position_count_LIG.svg') - -my_ats = 25 # axis text size -my_als = 22 # axis label size - -g = ggplot(snpsBYpos_df, aes(x = snpsBYpos)) -prinfFile = g + geom_bar( - #g + geom_bar( - aes (alpha = 0.5) - , show.legend = FALSE -) + - geom_label( - stat = "count", aes(label = ..count..) - , color = "black" - , size = 10 - ) + - theme( - axis.text.x = element_text( - size = my_ats - , angle = 0 - ) - , axis.text.y = element_text( - size = my_ats - , angle = 0 - , hjust = 1 - ) - , axis.title.x = element_text(size = my_als) - , axis.title.y = element_text(size = my_als) - , plot.title = element_blank() - ) + - labs( - x = "Number of SNPs" - , y = "Number of Sites" - ) -print(prinfFile) -dev.off() -######################################################################## -# end of Lig barplots # -######################################################################## - - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R deleted file mode 100644 index 51a2812..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R +++ /dev/null @@ -1,211 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages and functions # -######################################################################## - -source("../Header_TT.R") - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#========================== - -########################### -# Data for DUET plots -# you need merged_df3 -# or -# merged_df3_comp -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment 
as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -# sanity check -is.factor(my_df$DUET_outcome) -my_df$DUET_outcome = as.factor(my_df$DUET_outcome) -is.factor(my_df$DUET_outcome) -#[1] TRUE - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#=========================== -# Plot: Basic barplots -#=========================== - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df = my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< - -rm(my_df) - -# sanity checks -str(df) - -if (identical(df$Position, df$position)){ - print("Sanity check passed: Columns 'Position' and 'position' are identical") -} else{ - print("Error!: Check column names and info contained") - } - -#**************** -# generate plot: No of stabilising and destabilsing muts -#**************** -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -svg('basic_barplots_DUET.svg') - -my_ats = 25 # axis text size -my_als = 22 # axis label size - -theme_set(theme_grey()) - -# uncomment as necessary for either directly outputting results or -# printing on the screen -g = ggplot(df, aes(x = DUET_outcome)) -prinfFile = g + geom_bar( -#g + geom_bar( - aes(fill = DUET_outcome) - , show.legend = TRUE - ) + geom_label( - stat = "count" - , aes(label = ..count..) - , color = "black" - , show.legend = FALSE - , size = 10) + theme( - axis.text.x = element_blank() - , axis.title.x = element_blank() - , axis.title.y = element_text(size=my_als) - , axis.text.y = element_text(size = my_ats) - , legend.position = c(0.73,0.8) - , legend.text = element_text(size=my_als-2) - , legend.title = element_text(size=my_als) - , plot.title = element_blank() - ) + labs( - title = "" - , y = "Number of SNPs" - #, fill='DUET Outcome' - ) + scale_fill_discrete(name = "DUET Outcome" - , labels = c("Destabilising", "Stabilising")) - -print(prinfFile) -dev.off() - -#**************** -# generate plot: No of positions -#**************** -#get freq count of positions so you can subset freq<1 -#setDT(df)[, .(Freq := .N), by = .(Position)] #189, 36 - -setDT(df)[, pos_count := .N, by = .(Position)] #335, 36 -table(df$pos_count) -# this is cummulative -#1 2 3 4 5 6 -#34 76 63 104 40 18 - -# use group by on this -snpsBYpos_df <- df %>% - group_by(Position) %>% - summarize(snpsBYpos = mean(pos_count)) - -table(snpsBYpos_df$snpsBYpos) -#1 2 3 4 5 6 -#34 38 21 26 8 3 - -foo = select(df, Mutationinformation - , WildPos - , wild_type - , mutant_type - , mutation_info - , position - , pos_count) #335, 5 - -getwd() -write.csv(foo, "../Data/pos_count_freq.csv") - -svg('position_count_DUET.svg') -my_ats = 25 # axis text size -my_als = 22 # axis label size - -g = ggplot(snpsBYpos_df, aes(x = snpsBYpos)) -prinfFile = g + geom_bar( -#g + geom_bar( - aes (alpha = 0.5) - , show.legend = FALSE - ) + - geom_label( - stat = "count", aes(label = ..count..) 
- , color = "black" - , size = 10 - ) + - theme( - axis.text.x = element_text( - size = my_ats - , angle = 0 - ) - , axis.text.y = element_text( - size = my_ats - , angle = 0 - , hjust = 1 - ) - , axis.title.x = element_text(size = my_als) - , axis.title.y = element_text(size = my_als) - , plot.title = element_blank() - ) + - labs( - x = "Number of SNPs" - , y = "Number of Sites" - ) -print(prinfFile) -dev.off() -######################################################################## -# end of DUET barplots # -######################################################################## - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R deleted file mode 100644 index 0059bca..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R +++ /dev/null @@ -1,175 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages and functions # -######################################################################## - -source("../Header_TT.R") - -#source("barplot_colour_function.R") - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#========================== - -########################### -# Data for PS Corr plots -# you need merged_df3_comp -# since these are matched -# to allow pairwise corr -########################### - -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#=========================== -# Plot: Correlation plots -#=========================== - -#=================== -# Data for plots -#=================== - -#!!!!!!!!!!!!!!!!!!!!!!!! -# REASSIGNMENT -df = my_df -#!!!!!!!!!!!!!!!!!!!!!!!! 
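# Illustrative sketch (not part of the original script): the Spearman
# correlations that pairs.panels() draws further down in corr_plots_v3_PS.R
# can be cross-checked numerically with cor(). Assumes 'df' (assigned from
# merged_df3_comp above) and the same numeric columns that the plotting
# subset below uses.
num_cols = c("ratioDUET", "logor", "neglog10pvalue", "AF")
round(cor(df[, num_cols]
          , method = "spearman"
          , use = "pairwise.complete.obs"), 2)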
- -rm(my_df) - -# sanity checks -str(df) - -table(df$DUET_outcome) - -# unique positions -length(unique(df$Position)) #{RESULT: unique positions for comp data} - - -# subset data to generate pairwise correlations -corr_data = df[, c("ratioDUET" -# , "ratioPredAff" -# , "DUETStability_Kcalpermol" -# , "PredAffLog" -# , "OR" - , "logor" -# , "pvalue" - , "neglog10pvalue" - , "AF" - , "DUET_outcome" -# , "Lig_outcome" - , "pyrazinamide" - )] -dim(corr_data) -rm(df) - -# assign nice colnames (for display) -my_corr_colnames = c("DUET" -# , "Ligand Affinity" -# , "DUET_raw" -# , "Lig_raw" -# , "OR" - , "Log(Odds Ratio)" -# , "P-value" - , "-LogP" - , "Allele Frequency" - , "DUET_outcome" -# , "Lig_outcome" - , "pyrazinamide") - -# sanity check -if (length(my_corr_colnames) == length(corr_data)){ - print("Sanity check passed: corr_data and corr_names match in length") -}else{ - print("Error: length mismatch!") -} - -colnames(corr_data) -colnames(corr_data) <- my_corr_colnames -colnames(corr_data) - -############### -# PLOTS: corr -# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs -############### -#default pairs plot -start = 1 -end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column -offset = 1 - -my_corr = corr_data[start:(end-offset)] -head(my_corr) - -#my_cols = c("#f8766d", "#00bfc4") -# deep blue :#007d85 -# deep red: #ae301e - -#========== -# psych: ionformative since it draws the ellipsoid -# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html -# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs -#========== - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots" -getwd() - -svg('DUET_corr.svg', width = 15, height = 15) -printFile = pairs.panels(my_corr[1:4] - , method = "spearman" # correlation method - , hist.col = "grey" ##00AFBB - , density = TRUE # show density plots - , ellipses = F # show correlation ellipses - , stars = T - , rug = F - , breaks = "Sturges" - , show.points = T - , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$DUET_outcome))] - , pch = 21 - , jitter = T - #, alpha = .05 - #, points(pch = 19, col = c("#f8766d", "#00bfc4")) - , cex = 3 - , cex.axis = 2.5 - , cex.labels = 3 - , cex.cor = 1 - , smooth = F -) - -print(printFile) -dev.off() diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R deleted file mode 100644 index 4e05d41..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R +++ /dev/null @@ -1,187 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") - -#source("barplot_colour_function.R") - -######################################################################## -# Read file: call script for combining df for lig # -######################################################################## - -source("../combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp 
-#=========================== - -########################### -# Data for Lig Corr plots -# you need merged_df3_comp -# since these are matched -# to allow pairwise corr -########################### - -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df3_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -############################# -# Extra sanity check: -# for mcsm_lig ONLY -# Dis_lig_Ang should be <10 -############################# - -if (max(my_df$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#=========================== -# Plot: Correlation plots -#=========================== - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df = my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< - -rm(my_df) - -# sanity checks -str(df) - -table(df$Lig_outcome) - -# unique positions -length(unique(df$Position)) #{RESULT: unique positions for comp data} - -# subset data to generate pairwise correlations -corr_data = df[, c(#"ratioDUET", - "ratioPredAff" -# , "DUETStability_Kcalpermol" -# , "PredAffLog" -# , "OR" - , "logor" -# , "pvalue" - , "neglog10pvalue" - , "AF" -# , "DUET_outcome" - , "Lig_outcome" - , "pyrazinamide" - )] -dim(corr_data) -rm(df) - -# assign nice colnames (for display) -my_corr_colnames = c(#"DUET", - "Ligand Affinity" -# ,"DUET_raw" -# , "Lig_raw" -# , "OR" - , "Log(Odds Ratio)" -# , "P-value" - , "-LogP" - , "Allele Frequency" -# , "DUET_outcome" - , "Lig_outcome" - , "pyrazinamide") - -# sanity check -if (length(my_corr_colnames) == length(corr_data)){ - print("Sanity check passed: corr_data and corr_names match in length") -}else{ - print("Error: length mismatch!") -} - -colnames(corr_data) -colnames(corr_data) <- my_corr_colnames -colnames(corr_data) - -############### -# PLOTS: corr -# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs -############### - -# default pairs plot -start = 1 -end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column -offset = 1 - -my_corr = corr_data[start:(end-offset)] -head(my_corr) - -#my_cols = c("#f8766d", "#00bfc4") -# deep blue :#007d85 -# deep red: #ae301e - -#========== -# psych: ionformative since it draws the ellipsoid -# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html -# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs -#========== - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots" -getwd() - -svg('Lig_corr.svg', width = 15, height = 15) -printFile = pairs.panels(my_corr[1:4] - , method = "spearman" # correlation method - , hist.col = "grey" ##00AFBB - , density = TRUE # show density plots - , ellipses = F # show correlation ellipses - , stars = T - , rug = F - , breaks = "Sturges" - , show.points = T - , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$Lig_outcome))] - , pch = 21 - , jitter = T -# , alpha = .05 -# , points(pch = 19, col = c("#f8766d", "#00bfc4")) - , cex = 3 - , cex.axis = 2.5 - , cex.labels = 3 - , cex.cor = 1 - , smooth = F -) -print(printFile) -dev.off() - diff --git 
a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R deleted file mode 100644 index 1f868e4..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R +++ /dev/null @@ -1,227 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") -#source("barplot_colour_function.R") - -require(data.table) - -######################################################################## -# Read file: call script for combining df # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#========================== - -########################### -# Data for plots -# you need merged_df2, comprehensive one -# since this has one-many relationship -# i.e the same SNP can belong to multiple lineages -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df2 -#my_df = merged_df2_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -is.factor(my_df$lineage) -my_df$lineage = as.factor(my_df$lineage) -is.factor(my_df$lineage) - -#========================== -# Plot: Lineage barplot -# x = lineage y = No. of samples -# col = Lineage -# fill = lineage -#============================ -table(my_df$lineage) - -# lineage1 lineage2 lineage3 lineage4 lineage5 lineage6 lineageBOV -#3 104 1293 264 1311 6 6 105 - -#=========================== -# Plot: Lineage Barplots -#=========================== - -#=================== -# Data for plots -#=================== - -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -df <- my_df -#<<<<<<<<<<<<<<<<<<<<<<<<< -rm(my_df) - -# get freq count of positions so you can subset freq<1 -#setDT(df)[, lineage_count := .N, by = .(lineage)] - -#****************** -# generate plot: barplot of mutation by lineage -#****************** -sel_lineages = c("lineage1" - , "lineage2" - , "lineage3" - , "lineage4") - -df_lin = subset(df, subset = lineage %in% sel_lineages ) - -#FIXME; add sanity check for numbers. -# Done this manually - -############################################################ - -######### -# Data for barplot: Lineage barplot -# to show total samples and number of unique mutations -# within each linege -########## - -# Create df with lineage inform & no. 
of unique mutations -# per lineage and total samples within lineage -# this is essentially barplot with two y axis - -bar = bar = as.data.frame(sel_lineages) #4, 1 -total_snps_u = NULL -total_samples = NULL - -for (i in sel_lineages){ - #print(i) - curr_total = length(unique(df$id)[df$lineage==i]) - total_samples = c(total_samples, curr_total) - print(total_samples) - - foo = df[df$lineage==i,] - print(paste0(i, "=======")) - print(length(unique(foo$Mutationinformation))) - curr_count = length(unique(foo$Mutationinformation)) - - total_snps_u = c(total_snps_u, curr_count) -} - -print(total_snps_u) -bar$num_snps_u = total_snps_u -bar$total_samples = total_samples -bar - -#***************** -# generate plot: lineage barplot with two y-axis -#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2 -#***************** - -bar$num_snps_u = y1 -bar$total_samples = y2 -sel_lineages = x - -to_plot = data.frame(x = x - , y1 = y1 - , y2 = y2) -to_plot - -melted = melt(to_plot, id = "x") -melted - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -svg('lineage_basic_barplot.svg') - -my_ats = 20 # axis text size -my_als = 22 # axis label size - -g = ggplot(melted - , aes(x = x - , y = value - , fill = variable) - ) - - -printFile = g + geom_bar( - -#g + geom_bar( - stat = "identity" - , position = position_stack(reverse = TRUE) - , alpha=.75 - , colour='grey75' - ) + theme( - axis.text.x = element_text( - size = my_ats -# , angle= 30 - ) - , axis.text.y = element_text(size = my_ats - #, angle = 30 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text( - size = my_als - , colour = 'black' - ) - , axis.title.y = element_text( - size = my_als - , colour = 'black' - ) - , legend.position = "top" - , legend.text = element_text(size = my_als) - - #) + geom_text( - ) + geom_label( - aes(label = value) - , size = 5 - , hjust = 0.5 - , vjust = 0.5 - , colour = 'black' - , show.legend = FALSE - #, check_overlap = TRUE - , position = position_stack(reverse = T) - #, position = (' - - ) + labs( - title = '' - , x = '' - , y = "Number" - , fill = 'Variable' - , colour = 'black' - ) + scale_fill_manual( - values = c('grey50', 'gray75') - , name='' - , labels=c('Mutations', 'Total Samples') - ) + scale_x_discrete( - breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4') - , labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') - ) -print(printFile) -dev.off() diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R deleted file mode 100644 index e4e6972..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R +++ /dev/null @@ -1,253 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") -#source("barplot_colour_function.R") -#require(data.table) - -######################################################################## -# Read file: call script for combining df for Lig # -######################################################################## - -source("../combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - 
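# Illustrative sketch (not part of the original script): in
# lineage_basic_barplot.R above, the lines feeding 'to_plot' use x, y1 and y2
# before they are defined (the assignments appear reversed). The intended data
# preparation for the two-series barplot is presumably along these lines,
# taking the values from 'bar' built in the preceding loop:
x  = as.character(bar$sel_lineages)  # lineage labels
y1 = bar$num_snps_u                  # unique mutations per lineage
y2 = bar$total_samples               # total samples per lineage
to_plot = data.frame(x = x, y1 = y1, y2 = y2)
melted  = melt(to_plot, id = "x")    # long format; melt() from data.table/reshape2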
-#========================== -# This will return: - -# df with NA: -# merged_df2 -# merged_df3 - -# df without NA: -# merged_df2_comp -# merged_df3_comp -#=========================== -########################### -# Data for plots -# you need merged_df2 or merged_df2_comp -# since this is one-many relationship -# i.e the same SNP can belong to multiple lineages -########################### - -# uncomment as necessary -#<<<<<<<<<<<<<<<<<<<<<<<<< -# REASSIGNMENT -my_df = merged_df2 -#my_df = merged_df2_comp -#<<<<<<<<<<<<<<<<<<<<<<<<< - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -is.factor(my_df$lineage) -my_df$lineage = as.factor(my_df$lineage) -is.factor(my_df$lineage) - -table(my_df$mutation_info) - -############################# -# Extra sanity check: -# for mcsm_lig ONLY -# Dis_lig_Ang should be <10 -############################# - -if (max(my_df$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## -#========================== -# Data for plot: assign as -# necessary -#=========================== - -# uncomment as necessary -#!!!!!!!!!!!!!!!!!!!!!!! -# REASSIGNMENT - -#================== -# data for ALL muts -#================== -plot_df = my_df -my_plot_name = 'lineage_dist_PS.svg' -#my_plot_name = 'lineage_dist_PS_comp.svg' - -#======================= -# data for dr_muts ONLY -#======================= -#plot_df = my_df_dr -#my_plot_name = 'lineage_dist_dr_PS.svg' -#my_plot_name = 'lineage_dist_dr_PS_comp.svg' -#!!!!!!!!!!!!!!!!!!!!!!! - -#========================== -# Plot: Lineage Distribution -# x = mcsm_values, y = dist -# fill = stability -#============================ - -#=================== -# Data for plots -#=================== - -# subset only lineages1-4 -sel_lineages = c("lineage1" - , "lineage2" - , "lineage3" - , "lineage4") - -# uncomment as necessary -df_lin = subset(my_df, subset = lineage %in% sel_lineages ) #2037 35 - -# refactor -df_lin$lineage = factor(df_lin$lineage) - -table(df_lin$lineage) #{RESULT: No of samples within lineage} -#lineage1 lineage2 lineage3 lineage4 -#78 961 195 803 - -# when merged_df2_comp is used -#lineage1 lineage2 lineage3 lineage4 -#77 955 194 770 - -length(unique(df_lin$Mutationinformation)) -#{Result: No. of unique mutations the 4 lineages contribute to} - -# sanity checks -r1 = 2:5 # when merged_df2 used: because there is missing lineages -if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) { - print ("sanity check passed: numbers match") -} else{ - print("Error!: check your numbers") -} - -#!!!!!!!!!!!!!!!!!!!!!!!!! -# REASSIGNMENT -df <- df_lin -#!!!!!!!!!!!!!!!!!!!!!!!!! - -rm(df_lin) - -#****************** -# generate distribution plot of lineages -#****************** -# basic: could improve this! 
-library(plotly) -library(ggridges) - -my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') -names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4') - -g <- ggplot(df, aes(x = ratioPredAff)) + - geom_density(aes(fill = Lig_outcome) - , alpha = 0.5) + - facet_wrap( ~ lineage - , scales = "free" - , labeller = labeller(lineage = my_labels) ) + - coord_cartesian(xlim = c(-1, 1) -# , ylim = c(0, 6) -# , clip = "off" -) - ggtitle("Kernel Density estimates of Ligand affinity by lineage") - -ggplotly(g) - -# 2 : ggridges (good!) - -my_ats = 15 # axis text size -my_als = 20 # axis label size - -my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') -names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4') - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -# check plot name -my_plot_name - -svg(my_plot_name) - -printFile = ggplot( df, aes(x = ratioPredAff - , y = Lig_outcome) ) + - - geom_density_ridges_gradient( aes(fill = ..x..) - , scale = 3 - , size = 0.3 ) + - facet_wrap( ~lineage - , scales = "free" -# , switch = 'x' - , labeller = labeller(lineage = my_labels) ) + - coord_cartesian( xlim = c(-1, 1) -# , ylim = c(0, 6) -# , clip = "off" - ) + - - scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4") - , name = "Ligand Affinity" ) + - theme( axis.text.x = element_text( size = my_ats - , angle = 90 - , hjust = 1 - , vjust = 0.4) -# , axis.text.y = element_text( size = my_ats -# , angle = 0 -# , hjust = 1 -# , vjust = 0) - , axis.text.y = element_blank() - , axis.title.x = element_blank() - , axis.title.y = element_blank() - , axis.ticks.y = element_blank() - , plot.title = element_blank() - , strip.text = element_text(size = my_als) - , legend.text = element_text(size = 10) - , legend.title = element_text(size = my_als) -# , legend.position = c(0.3, 0.8) -# , legend.key.height = unit(1, 'mm') - ) - -print(printFile) -dev.off() -#=================================================== - -# COMPARING DISTRIBUTIONS -head(df$lineage) -df$lineage = as.character(df$lineage) - -lin1 = df[df$lineage == "lineage1",]$ratioPredAff -lin2 = df[df$lineage == "lineage2",]$ratioPredAff -lin3 = df[df$lineage == "lineage3",]$ratioPredAff -lin4 = df[df$lineage == "lineage4",]$ratioPredAff - -# ks test -ks.test(lin1,lin2) -ks.test(lin1,lin3) -ks.test(lin1,lin4) - -ks.test(lin2,lin3) -ks.test(lin2,lin4) - -ks.test(lin3,lin4) - - - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R deleted file mode 100644 index 703a206..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R +++ /dev/null @@ -1,229 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") -#source("../barplot_colour_function.R") -#require(data.table) - -######################################################################## -# Read file: call script for combining df for PS # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts" -#---------------------- PAY ATTENTION - 
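# Illustrative sketch (not part of the original scripts): the six explicit
# ks.test() calls at the end of lineage_dist_LIG.R above can be generated
# programmatically, which avoids copy-paste errors if more lineages are added.
# Assumes 'df' with the 'lineage' and 'ratioPredAff' columns used in that script.
lineage_pairs = combn(c("lineage1", "lineage2", "lineage3", "lineage4"), 2)
apply(lineage_pairs, 2, function(p) {
  ks.test(df$ratioPredAff[df$lineage == p[1]]
          , df$ratioPredAff[df$lineage == p[2]])
})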
-#========================== -# This will return: - -# df with NA for pyrazinamide: -# merged_df2 -# merged_df3 - -# df without NA for pyrazinamide: -# merged_df2_comp -# merged_df3_comp -#=========================== - -########################### -# Data for plots -# you need merged_df2 or merged_df2_comp -# since this is one-many relationship -# i.e the same SNP can belong to multiple lineages -# using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available, hence use df with NA -########################### - -# uncomment as necessary -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df2 -#my_df = merged_df2_comp -#%%%%%%%%%%%%%%%%%%%%%%%% - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# Ensure correct data type in columns to plot: need to be factor -is.factor(my_df$lineage) -my_df$lineage = as.factor(my_df$lineage) -is.factor(my_df$lineage) - -table(my_df$mutation_info); str(my_df$mutation_info) - -# subset df with dr muts only -my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") -table(my_df_dr$mutation_info) - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#========================== -# Run two times: -# uncomment as necessary -# 1) for all muts -# 2) for dr_muts -#=========================== - -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT - -#================ -# for ALL muts -#================ -plot_df = my_df -my_plot_name = 'lineage_dist_PS.svg' -#my_plot_name = 'lineage_dist_PS_comp.svg' - -#================ -# for dr muts ONLY -#================ -#plot_df = my_df_dr -#my_plot_name = 'lineage_dist_dr_PS.svg' -#my_plot_name = 'lineage_dist_dr_PS_comp.svg' - -#%%%%%%%%%%%%%%%%%%%%%%%% - -#========================== -# Plot: Lineage Distribution -# x = mcsm_values, y = dist -# fill = stability -#============================ - -#=================== -# Data for plots -#=================== -table(plot_df$lineage); str(plot_df$lineage) - -# subset only lineages1-4 -sel_lineages = c("lineage1" - , "lineage2" - , "lineage3" - , "lineage4") - -# uncomment as necessary -df_lin = subset(plot_df, subset = lineage %in% sel_lineages ) - -# refactor -df_lin$lineage = factor(df_lin$lineage) - -table(df_lin$lineage) #{RESULT: No of samples within lineage} -#lineage1 lineage2 lineage3 lineage4 - -length(unique(df_lin$Mutationinformation)) -#{Result: No. of unique mutations the 4 lineages contribute to} - -# sanity checks -r1 = 2:5 # when merged_df2 used: because there is missing lineages -if(sum(table(plot_df$lineage)[r1]) == nrow(df_lin)) { - print ("sanity check passed: numbers match") -} else{ - print("Error!: check your numbers") -} - -#%%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -df <- df_lin -#%%%%%%%%%%%%%%%%%%%%%%%%% - -rm(df_lin) - -#****************** -# generate distribution plot of lineages -#****************** -# basic: could improve this! -#library(plotly) -#library(ggridges) - -g <- ggplot(df, aes(x = ratioDUET)) + - geom_density(aes(fill = DUET_outcome) - , alpha = 0.5) + facet_wrap(~ lineage, - scales = "free") + - ggtitle("Kernel Density estimates of Protein stability by lineage") - -ggplotly(g) - -# 2 : ggridges (good!) 
-my_ats = 15 # axis text size -my_als = 20 # axis label size - -my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4') -names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4') - -# set output dir for plots -getwd() -setwd("~/git/Data/pyrazinamide/output/plots") -getwd() - -# check plot name -my_plot_name - -# output svg -svg(my_plot_name) -printFile = ggplot(df, aes(x = ratioDUET - , y = DUET_outcome))+ - - #printFile=geom_density_ridges_gradient( - geom_density_ridges_gradient(aes(fill = ..x..) - , scale = 3 - , size = 0.3 ) + - facet_wrap( ~lineage - , scales = "free" -# , switch = 'x' - , labeller = labeller(lineage = my_labels) ) + - coord_cartesian( xlim = c(-1, 1) -# , ylim = c(0, 6) -# , clip = "off" -) + - scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4") - , name = "DUET" ) + - theme(axis.text.x = element_text(size = my_ats - , angle = 90 - , hjust = 1 - , vjust = 0.4) -# , axis.text.y = element_text(size = my_ats -# , angle = 0 -# , hjust = 1 -# , vjust = 0) - , axis.text.y = element_blank() - , axis.title.x = element_blank() - , axis.title.y = element_blank() - , axis.ticks.y = element_blank() - , plot.title = element_blank() - , strip.text = element_text(size = my_als) - , legend.text = element_text(size = 10) - , legend.title = element_text(size = my_als) -# , legend.position = c(0.3, 0.8) -# , legend.key.height = unit(1, 'mm') - ) - -print(printFile) -dev.off() - -#=!=!=!=!=!=!=! -# COMMENT: Not much differences in the distributions -# when using merged_df2 or merged_df2_comp. -# Also, the lineage differences disappear when looking at all muts -# The pattern we are interested in is possibly only for dr_mutations -#=!=!=!=!=!=!=! -#=================================================== - -# COMPARING DISTRIBUTIONS: KS test -# run: "../KS_test_PS.R" - - - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/logolas_logoplot.R b/mcsm_analysis/pyrazinamide/scripts/plotting/logolas_logoplot.R deleted file mode 100644 index f60fb0b..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/logolas_logoplot.R +++ /dev/null @@ -1,250 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting/") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("../Header_TT.R") - -#source("barplot_colour_function.R") - -library(ggseqlogo) - -#======= -# input -#======= -############# -# msa file: output of generate_mut_sequences.py -############# -homedir = '~' -indir = 'git/Data/pyrazinamide/output' -in_filename = "gene_msa.txt" -infile = paste0(homedir, '/', indir,'/', in_filename) -print(infile) - -#======= -# input -#======= -############# -# combined dfs -############# -source("../combining_two_df.R") - -########################### -# Data for Logo plots -# you need big df i.e -# merged_df2 -# or -# merged_df2_comp -# since these have unique SNPs -# I prefer to use the merged_df2 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df2 -#my_df = merged_df2_comp -#%%%%%%%%%%%%%%%%%%%%%%%% - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_df) -str(my_df) - -# doesn't work if you use the big df as it has 
duplicate snps -#rownames(my_df) = my_df$Mutationinformation - -# sanity check: should be True -table(my_df$position == my_df$Position) - -c1 = unique(my_df$Position) # 130 -nrow(my_df) # 3092 - -#FIXME -#!!! RESOLVE !!! -# get freq count of positions and add to the df -setDT(my_df)[, occurrence_sample := .N, by = .(id)] -table(my_df$occurrence_sample) - - -my_df2 = my_df %>% - select(id, Mutationinformation, Wild_type, WildPos, position, Mutant_type, occurrence, occurrence_sample) - -write.csv(my_df2, "my_df2.csv") - -# extract freq_pos>1 since this will not add to much in the logo plot -# pos 5 has one mutation but coming from atleast 5 samples? -table(my_df$occurrence) -foo = my_df[my_df$occurrence ==1,] - -# uncomment as necessary -my_data_snp = my_df #3092 - -#!!! RESOLVE -# FIXME -my_data_snp = my_df[my_df$occurrence!=1,] #3072, 36...3019 - -u = unique(my_data_snp$Position) #96 - - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -######################################################### -# Task: To generate a logo plot or bar plot but coloured -# aa properties. -# step1: read mcsm file and OR file -# step2: plot wild type positions -# step3: plot mutants per position coloured by aa properties -# step4: make the size of the letters/bars prop to OR if you can! -######################################################### -##useful links -#https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2 -#https://omarwagih.github.io/ggseqlogo/ -#https://kkdey.github.io/Logolas-pages/workflow.html -#A new sequence logo plot to highlight enrichment and depletion. -# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6288878/ - -##very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/ - -#============== -# matrix for mutant type -# frequency of mutant type by position -#============== -table(my_data_snp$Mutant_type, my_data_snp$Position) -tab_mt = table(my_data_snp$Mutant_type, my_data_snp$Position) -class(tab_mt) -# unclass to convert to matrix -tab_mt = unclass(tab_mt) -tab_mt = as.matrix(tab_mt, rownames = T) - -# should be TRUE -is.matrix(tab_mt) - -rownames(tab_mt) #aa -colnames(tab_mt) #pos - -#********************** -# Plot 1: mutant logo -#********************** -my_ymax = max(my_data_snp$occurrence); my_ymax -my_ylim = c(0,my_ymax) # very important - -# axis sizes -# common: text and label -my_ats = 15 -my_als = 20 - -# individual: text and label -my_xats = 15 -my_yats = 20 -my_xals = 15 -my_yals = 20 - -# legend size: text and label -my_lts = 20 -#my_lls = 20 - -# Color scheme based on chemistry of amino acids -chemistry = data.frame( - letter = c('G', 'S', 'T', 'Y', 'C', 'N', 'Q', 'K', 'R', 'H', 'D', 'E', 'P', 'A', 'W', 'F', 'L', 'I', 'M', 'V'), - group = c(rep('Polar', 5), rep('Neutral', 2), rep('Basic', 3), rep('Acidic', 2), rep('Hydrophobic', 8)), - col = c(rep('#109648', 5), rep('#5E239D', 2), rep('#255C99', 3), rep('#D62839', 2), rep('#221E22', 8)), - stringsAsFactors = F -) - -# uncomment as necessary -my_type = "EDLogo" -#my_type = "Logo" - -logomaker(tab_mt - , type = my_type - , return_heights = T -# , color_type = "per_row" -# , colors = chemistry$col -# , method = 'custom' -# , seq_type = 'aa' -# , col_scheme = "taylor" -# , col_scheme = "chemistry2" -) + -theme(legend.position = "bottom" - , legend.title = element_blank() - , legend.text = element_text(size = my_lts ) - , axis.text.x = element_text(size = 
my_ats , angle = 90) - , axis.text.y = element_text(size = my_ats , angle = 90)) - -p0 = logomaker(tab_mt - , type = my_type - , return_heights = T - , color_type = "per_row" - , colors = chemistry$col -# , seq_type = 'aa' -# , col_scheme = "taylor" -# , col_scheme = "chemistry2" -) + - #ylab('my custom height') + - theme(axis.text.x = element_blank()) + -# theme_logo()+ - # scale_x_continuous(breaks=1:51, parse (text = colnames(tab)) ) - scale_x_continuous(breaks = 1:ncol(tab_mt) - , labels = colnames(tab_mt))+ - scale_y_continuous( breaks = 1:my_ymax - , limits = my_ylim) - -p0 - -# further customisation -p1 = p0 + theme(legend.position = "bottom" - , legend.title = element_blank() - , legend.text = element_text(size = my_lts) - , axis.text.x = element_text(size = my_ats , angle = 90) - , axis.text.y = element_text(size = my_ats , angle = 90)) -p1 - -#======= -# input -#======= -############# -# msa file: output of generate_mut_sequences.py -############# -homedir = '~' -indir = 'git/Data/pyrazinamide/output' -in_filename = "gene_msa.txt" -infile = paste0(homedir, '/', indir,'/', in_filename) -print(infile) - -############## -# ggseqlogo: custom matrix of my data -############## -snps = read.csv(infile - , stringsAsFactors = F - , header = F) #3072, - -class(snps); str(snps) # df and chr - -# turn to a character vector -snps2 = as.character(snps[1:nrow(snps),]) - -class(snps2); str(snps2) #character, chr - -# plot -logomaker(snps2, type = my_type - , color_type = "per_row") + - theme(axis.text.x = element_blank()) + - theme_logo()+ - # scale_x_continuous(breaks=1:51, parse (text = colnames(tab)) ) - scale_x_continuous(breaks = 1:ncol(tab_mt) - , labels = colnames(tab_mt))+ - scale_y_continuous( breaks = 0:5 - , limits = my_ylim) - - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/snp_logo_plot.R b/mcsm_analysis/pyrazinamide/scripts/plotting/snp_logo_plot.R deleted file mode 100644 index 80f1971..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/snp_logo_plot.R +++ /dev/null @@ -1,273 +0,0 @@ -getwd() -setwd("~/git//LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -# TASK: Multiple mutations per site -# as aa symbol coloured by aa property - -######################################################################## -# Installing and loading required packages # -######################################################################## - -#source("../Header_TT.R") - -#source("barplot_colour_function.R") - -library(ggseqlogo) - -######################################################################## -# Read file: call script for combining df for lig # -######################################################################## - -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -#[1] "/home/tanu/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -#merged_df2 # 3092, 35 -#merged_df2_comp #3012, 35 - -#merged_df3 #335, 35 -#merged_df3_comp #293, 35 -#========================== - -########################### -# Data for Logo plots -# you need small df i.e -# merged_df3 -# or -# merged_df3_comp? 
possibly -# since these have unique SNPs -# I prefer to use the merged_df3 -# because using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available -########################### - -# uncomment as necessary -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df3 # to show multiple mutations per site -my_df = read.csv(file.choose()) -#%%%%%%%%%%%%%%%%%%%%%%%% - -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -colnames(my_df) -str(my_df) - -rownames(my_df) = my_df$Mutationinformation - -c1 = unique(my_df$Position) #130 -nrow(my_df) #335 - -table(my_df$occurrence) -#1 2 3 4 5 6 -#34 76 63 104 40 18 - -# get freq count of positions so you can subset freq<1 -#: already done in teh combining script -#require(data.table) -#setDT(my_df)[, occurrence := .N, by = .(Position)] #189, 36 - -table(my_df$Position); table(my_df$occurrence) - -# extract freq_pos>1 -my_data_snp = my_df[my_df$occurrence!=1,] #301, 36 -u_pos = unique(my_data_snp$Position) #96 - -# sanity check -exp_dim = nrow(my_df) - table(my_df$occurrence)[[1]]; exp_dim -if ( nrow(my_data_snp) == exp_dim ){ - print("Sanity check passed: Data filtered correctly, dim match") -} else { - print("Error: Please Debug") -} - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -######################################################### -# Task: To generate a logo plot or bar plot but coloured -# aa properties. -# step1: read data file -# step2: plot wild type positions -# step3: plot mutants per position coloured by aa properties -# step4: make the size of the letters/bars prop to OR if you can! -######################################################### -# useful links -# https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2 -# https://omarwagih.github.io/ggseqlogo/ -# https://kkdey.github.io/Logolas-pages/workflow.html -# A new sequence logo plot to highlight enrichment and depletion. 
-# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6288878/ -# very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/ - - -############# -# PLOTS: Bar plot with aa properties -# using gglogo -# useful links: https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2 -############# - -############## -# ggseqlogo: custom matrix of my data -############## - -#============== -# matrix for mutant type -# frequency of mutant type by position -#============== -table(my_data_snp$Mutant_type, my_data_snp$Position) -tab_mt = table(my_data_snp$Mutant_type, my_data_snp$Position) -class(tab_mt) - -# unclass to convert to matrix -tab_mt = unclass(tab_mt) -tab_mt = as.matrix(tab_mt, rownames = T) - -# should be TRUE -is.matrix(tab_mt) - -rownames(tab_mt) #aa -colnames(tab_mt) #pos - -#============== -# matrix for wild type -# frequency of wild type by position -#============== -# remove wt duplicates -wt = my_data_snp[, c("Position", "Wild_type")] #301, 2 -wt = wt[!duplicated(wt),]#96, 2 - -table(wt$Wild_type) # contains duplicates - -tab_wt = table(wt$Wild_type, wt$Position); tab_wt # should all be 1 - -tab_wt = unclass(tab_wt) #important -class(tab_wt); rownames(tab_wt) -#tab_wt = as.matrix(tab_wt, rownames = T) - -rownames(tab_wt) -rownames(tab_mt) - -# sanity check -if (ncol(tab_wt) == length(u_pos) ){ - print("Sanity check passed: wt data dim match") -} else { - print("Error: Please debug") -} - -#************** -# Plot 1: mutant logo -#************** -#install.packages("digest") -#library(digest) -# following example -require(ggplot2) -require(reshape2) -library(gglogo) -library(ggrepel) -library(ggseqlogo) - -# generate seq logo for mutant type -my_ymax = max(my_data_snp$occurrence); my_ymax -my_ylim = c(0, my_ymax) -#my_yrange = 1:my_ymax; my_yrange - -# axis sizes -# common: text and label -my_ats = 15 -my_als = 20 - -# individual: text and label -my_xats = 15 -my_yats = 20 -my_xals = 15 -my_yals = 20 - -# legend size: text and label -my_lts = 20 -#my_lls = 20 - -p0 = ggseqlogo(tab_mt - , method = 'custom' - , seq_type = 'aa' -# , col_scheme = "taylor" -# , col_scheme = "chemistry2" -) + -# ylab('my custom height') + - theme(axis.text.x = element_blank()) + - theme_logo()+ -# scale_x_continuous(breaks=1:51, parse (text = colnames(tab_mt)) ) - scale_x_continuous(breaks = 1:ncol(tab_mt) - , labels = colnames(tab_mt))+ - scale_y_continuous( breaks = 1:my_ymax - , limits = my_ylim) - -p0 - -# further customisation -p1 = p0 + theme(legend.position = "none" - , legend.title = element_blank() - , legend.text = element_text(size = my_lts) - , axis.text.x = element_text(size = my_xats, angle = 90) - , axis.text.y = element_text(size = my_yats, angle = 90)) -p1 - -#************** -# Plot 2: for wild_type -# with custom x axis to reflect my aa positions -#************** -# sanity check: MUST BE TRUE -# for the correctnes of the x axis -identical(colnames(tab_mt), colnames(tab_wt)) -identical(ncol(tab_mt), ncol(tab_wt)) - -p2 = ggseqlogo(tab_wt - , method = 'custom' - , seq_type = 'aa' -# , col_scheme = "taylor" -# , col_scheme = chemistry2 -) + -# ylab('my custom height') + - theme(axis.text.x = element_blank() - , axis.text.y = element_blank()) + - theme_logo() + - scale_x_continuous(breaks = 1:ncol(tab_wt) - , labels = colnames(tab_wt)) + - scale_y_continuous( breaks = 0:1 - , limits = my_ylim ) - -p2 - -# further customise - -p3 = p2 + - theme(legend.position = "bottom" - , legend.text = element_text(size = my_lts) - , axis.text.x = element_text(size = my_ats - , angle = 90) - , 
axis.text.y = element_blank()) - -p3 - - -# Now combine using cowplot, which ensures the plots are aligned -suppressMessages( require(cowplot) ) - -plot_grid(p1, p3, ncol = 1, align = 'v') #+ -# background_grid(minor = "xy" -# , size.minor = 1 -# , colour.minor = "grey86") - - -#colour scheme -#https://rdrr.io/cran/ggseqlogo/src/R/col_schemes.r - diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_LIG.R b/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_LIG.R deleted file mode 100644 index 2049c3e..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_LIG.R +++ /dev/null @@ -1,208 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -############################################################ -# 1: Installing and loading required packages and functions -############################################################ - -#source("../Header_TT.R") -#source("../barplot_colour_function.R") -#library(tidyverse) - -########################### -#2: Read file: normalised file, output of step 4 mcsm pipeline -########################### -#my_df <- read.csv("../../Data/mcsm_complex1_normalised.csv" -# , row.names = 1 -# , stringsAsFactors = F -# , header = T) - -# call script combining_df -source("../combining_two_df_lig.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -# from Plotting to Scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA for pyrazinamide: -#merged_df2 -#merged_df2_comp - -# df without NA for pyrazinamide: -#merged_df3 -#merged_df3_comp -#========================== -########################### -# Data to choose: -# We will be using the small dfs -# to generate the coloured axis -########################### - -# uncomment as necessary -#!!!!!!!!!!!!!!!!!!!!!!! -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#!!!!!!!!!!!!!!!!!!!!!!! 
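# Illustrative sketch (not part of the original script): in snp_logo_plot.R
# above, the wild-type and mutant logo panels line up only if tab_wt and
# tab_mt share the same column (position) order; the script checks this with
# identical() but does not repair a mismatch. A defensive reorder before
# plotting, assuming every mutated position also has a wild-type entry:
tab_wt = tab_wt[, colnames(tab_mt), drop = FALSE]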
- -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -str(my_df) -my_df$Position -c1 = my_df[my_df$Mutationinformation == "A134V",] - -# order my_df by Position -my_df_o = my_df[order(my_df$Position),] -head(my_df_o$Position); tail(my_df_o$Position) - -c2 = my_df_o[my_df_o$Mutationinformation == "A134V",] - -# sanity check -if (sum(table(c1 == c2)) == ncol(my_df)){ - print ("Sanity check passsd") -}else{ - print ("Error!: Please debug your code") -} - -rm(my_df, c1, c2) - -# create a new df with unique position numbers and cols -Position = unique(my_df_o$Position) -Position_cols = as.data.frame(Position) - -head(Position_cols) ; tail(Position_cols) - -# specify active site residues and bg colour -Position = c(49, 51, 57, 71 - , 8, 96, 138 - , 13, 68 - , 103, 137 - , 133, 134) #13 - -lab_bg = rep(c("purple" - , "yellow" - , "cornflowerblue" - , "blue" - , "green"), times = c(4, 3, 2, 2, 2) -) - -# second bg colour for active site residues -#lab_bg2 = rep(c("white" -# , "green" , "white", "green" -# , "white" -# , "white" -# , "white"), times = c(4 -# , 1, 1, 1 -# , 2 -# , 2 -# , 2) -#) - -#%%%%%%%%% -# revised: leave the second box coloured as the first one incase there is no second colour -#%%%%%%%%% -lab_bg2 = rep(c("purple" - , "green", "yellow", "green" - , "cornflowerblue" - , "blue" - , "green"), times = c(4 - , 1, 1, 1 - , 2 - , 2 - , 2)) - -# fg colour for labels for active site residues -lab_fg = rep(c("white" - , "black" - , "black" - , "white" - , "black"), times = c(4, 3, 2, 2, 2)) - -#%%%%%%%%% -# revised: make the purple ones black -# fg colour for labels for active site residues -#%%%%%%%%% -#lab_fg = rep(c("black" -# , "black" -# , "black" -# , "white" -# , "black"), times = c(4, 3, 2, 2, 2)) - -# combined df with active sites, bg and fg colours -aa_cols_ref = data.frame(Position - , lab_bg - , lab_bg2 - , lab_fg - , stringsAsFactors = F) #13, 4 - -str(Position_cols); class(Position_cols) -str(aa_cols_ref); class(aa_cols_ref) - -# since Position is int and numeric in the two dfs resp, -# converting numeric to int for consistency -aa_cols_ref$Position = as.integer(aa_cols_ref$Position) -class(aa_cols_ref$Position) - -#=========== -# Merge 1: merging Positions df (Position_cols) and -# active site cols (aa_cols_ref) -# linking column: "Position" -# This is so you can have colours defined for all 130 positions -#=========== -head(Position_cols$Position); head(aa_cols_ref$Position) - -mut_pos_cols = merge(Position_cols, aa_cols_ref - , by = "Position" - , all.x = TRUE) - -head(mut_pos_cols) -# replace NA's -# :column "lab_bg" with "white" -# : column "lab_fg" with "black" -mut_pos_cols$lab_bg[is.na(mut_pos_cols$lab_bg)] <- "white" -mut_pos_cols$lab_bg2[is.na(mut_pos_cols$lab_bg2)] <- "white" -mut_pos_cols$lab_fg[is.na(mut_pos_cols$lab_fg)] <- "black" -head(mut_pos_cols) - -#=========== -# Merge 2: Merge mut_pos_cols with mcsm df -# Now combined the 130 positions with aa colours with -# the mcsm_data -#=========== -# dfs to merge -df0 = my_df_o -df1 = mut_pos_cols - -# check the column on which merge will be performed -head(df0$Position); tail(df0$Position) -head(df1$Position); tail(df1$Position) - -# should now have 3 extra columns -my_df = merge(df0, df1 - , by = "Position" - , all.x = TRUE) - -# sanity check -my_df[my_df$Position == "49",] -my_df[my_df$Position == "13",] - -my_df$Position - -# clear variables -rm(aa_cols_ref - , df0 - , df1 - , my_df_o - , Position_cols - , lab_bg - , lab_bg2 - , lab_fg - , Position - ) - diff 
--git a/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_PS.R deleted file mode 100644 index 37dfe32..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/plotting/subcols_axis_PS.R +++ /dev/null @@ -1,208 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") -getwd() - -############################################################ -# 1: Installing and loading required packages and functions -############################################################ - -#source("../Header_TT.R") -#source("../barplot_colour_function.R") -#library(tidyverse) - -########################### -#2: Read file: normalised file, output of step 4 mcsm pipeline -########################### -#my_df <- read.csv("../../Data/mcsm_complex1_normalised.csv" -# , row.names = 1 -# , stringsAsFactors = F -# , header = T) - -# call script combining_df -source("../combining_two_df.R") - -#---------------------- PAY ATTENTION -# the above changes the working dir -# from Plotting to Scripts" -#---------------------- PAY ATTENTION - -#========================== -# This will return: - -# df with NA for pyrazinamide: -#merged_df2 -#merged_df2_comp - -# df without NA for pyrazinamide: -#merged_df3 -#merged_df3_comp -#========================== -########################### -# Data to choose: -# We will be using the small dfs -# to generate the coloured axis -########################### - -# uncomment as necessary -#!!!!!!!!!!!!!!!!!!!!!!! -# REASSIGNMENT -my_df = merged_df3 -#my_df = merged_df3_comp -#!!!!!!!!!!!!!!!!!!!!!!! - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -str(my_df) -my_df$Position -c1 = my_df[my_df$Mutationinformation == "L4S",] - -# order my_df by Position -my_df_o = my_df[order(my_df$Position),] -head(my_df_o$Position); tail(my_df_o$Position) - -c2 = my_df_o[my_df_o$Mutationinformation == "L4S",] - -# sanity check -if (sum(table(c1 == c2)) == ncol(my_df)){ - print ("Sanity check passsd") -}else{ - print ("Error!: Please debug your code") -} - -rm(my_df, c1, c2) - -# create a new df with unique position numbers and cols -Position = unique(my_df_o$Position) #130 -Position_cols = as.data.frame(Position) - -head(Position_cols) ; tail(Position_cols) - -# specify active site residues and bg colour -Position = c(49, 51, 57, 71 - , 8, 96, 138 - , 13, 68 - , 103, 137 - , 133, 134) #13 - -lab_bg = rep(c("purple" - , "yellow" - , "cornflowerblue" - , "blue" - , "green"), times = c(4, 3, 2, 2, 2) -) - -# second bg colour for active site residues -#lab_bg2 = rep(c("white" -# , "green" , "white", "green" -# , "white" -# , "white" -# , "white"), times = c(4 -# , 1, 1, 1 -# , 2 -# , 2 -# , 2) -#) - -#%%%%%%%%% -# revised: leave the second box coloured as the first one incase there is no second colour -#%%%%%%%%% -lab_bg2 = rep(c("purple" - , "green", "yellow", "green" - , "cornflowerblue" - , "blue" - , "green"), times = c(4 - , 1, 1, 1 - , 2 - , 2 - , 2)) - -# fg colour for labels for active site residues -lab_fg = rep(c("white" - , "black" - , "black" - , "white" - , "black"), times = c(4, 3, 2, 2, 2)) - -#%%%%%%%%% -# revised: make the purple ones black -# fg colour for labels for active site residues -#%%%%%%%%% -#lab_fg = rep(c("black" -# , "black" -# , "black" -# , "white" -# , "black"), times = c(4, 3, 2, 2, 2)) - -# combined df with active sites, bg and fg colours -aa_cols_ref = data.frame(Position - , lab_bg - , lab_bg2 - , lab_fg - , stringsAsFactors = F) #13, 4 - 
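The reference data frame built above holds colours only for the 13 active-site positions; the R lines that follow left-join it onto the full list of positions and fill the gaps with the defaults (white background, black label). Purely to illustrate that join-and-fill pattern, here is a small pandas sketch with made-up positions and colours:

```
import pandas as pd

# toy stand-ins: the real scripts use ~130 positions and 13 active-site residues
positions = pd.DataFrame({'Position': range(1, 11)})
active = pd.DataFrame({'Position': [3, 8],
                       'lab_bg': ['purple', 'green'],
                       'lab_fg': ['white', 'black']})

cols = positions.merge(active, on = 'Position', how = 'left')  # left join keeps every position
cols['lab_bg'] = cols['lab_bg'].fillna('white')                # default background colour
cols['lab_fg'] = cols['lab_fg'].fillna('black')                # default label colour
print(cols)
```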
-str(Position_cols); class(Position_cols) -str(aa_cols_ref); class(aa_cols_ref) - -# since Position is int and numeric in the two dfs resp, -# converting numeric to int for consistency -aa_cols_ref$Position = as.integer(aa_cols_ref$Position) -class(aa_cols_ref$Position) - -#=========== -# Merge 1: merging Positions df (Position_cols) and -# active site cols (aa_cols_ref) -# linking column: "Position" -# This is so you can have colours defined for all 130 positions -#=========== -head(Position_cols$Position); head(aa_cols_ref$Position) - -mut_pos_cols = merge(Position_cols, aa_cols_ref - , by = "Position" - , all.x = TRUE) - -head(mut_pos_cols) -# replace NA's -# :column "lab_bg" with "white" -# : column "lab_fg" with "black" -mut_pos_cols$lab_bg[is.na(mut_pos_cols$lab_bg)] <- "white" -mut_pos_cols$lab_bg2[is.na(mut_pos_cols$lab_bg2)] <- "white" -mut_pos_cols$lab_fg[is.na(mut_pos_cols$lab_fg)] <- "black" -head(mut_pos_cols) - -#=========== -# Merge 2: Merge mut_pos_cols with mcsm df -# Now combined the 130 positions with aa colours with -# the mcsm_data -#=========== -# dfs to merge -df0 = my_df_o -df1 = mut_pos_cols - -# check the column on which merge will be performed -head(df0$Position); tail(df0$Position) -head(df1$Position); tail(df1$Position) - -# should now have 3 extra columns -my_df = merge(df0, df1 - , by = "Position" - , all.x = TRUE) - -# sanity check -my_df[my_df$Position == "49",] -my_df[my_df$Position == "13",] - -my_df$Position - -# clear variables -rm(aa_cols_ref - , df0 - , df1 - , my_df_o - , Position_cols - , lab_bg - , lab_bg2 - , lab_fg - , Position - ) - diff --git a/mcsm_analysis/pyrazinamide/scripts/read_pdb.R b/mcsm_analysis/pyrazinamide/scripts/read_pdb.R deleted file mode 100644 index 41ca884..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/read_pdb.R +++ /dev/null @@ -1,27 +0,0 @@ -######################### -#3: Read complex pdb file -########################## -source("Header_TT.R") -# This script only reads the pdb file of your complex - -# read in pdb file complex1 -inDir = "~/git/Data/pyrazinamide/input/structure/" -inFile = paste0(inDir, "complex1_no_water.pdb") -complex1 = inFile - -#inFile2 = paste0(inDir, "complex2_no_water.pdb") -#complex2 = inFile2 - -# list of 8 -my_pdb = read.pdb(complex1 - , maxlines = -1 - , multi = FALSE - , rm.insert = FALSE - , rm.alt = TRUE - , ATOM.only = FALSE - , hex = FALSE - , verbose = TRUE) - -rm(inDir, inFile, complex1) -#====== end of script - diff --git a/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R b/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R deleted file mode 100644 index 658eec4..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R +++ /dev/null @@ -1,386 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts") -getwd() - -######################################################################## -# Installing and loading required packages # -######################################################################## - -source("Header_TT.R") - -######################################################### -# TASK: replace B-factors in the pdb file with normalised values -# use the complex file with no water as mCSM lig was -# performed on this file. You can check it in the script: read_pdb file. 
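Both processing passes below follow the same recipe: read the structure, look up each residue's normalised score (DUET first, then ligand affinity), write it into the atom B-factor column, set unmatched positions to 0, and save a new PDB for visualisation. The repo does this in R with bio3d (read_pdb.R plus the steps below); the Biopython sketch here is only an analogue of that idea, with a made-up scores dict and file names borrowed from the script:

```
from Bio.PDB import PDBParser, PDBIO

scores = {3: 0.42, 4: 0.77}  # hypothetical Position -> normalised score lookup

structure = PDBParser(QUIET = True).get_structure('complex1', 'complex1_no_water.pdb')
for atom in structure.get_atoms():
    resno = atom.get_parent().get_id()[1]       # residue number this atom belongs to
    atom.set_bfactor(scores.get(resno, 0.0))    # unmatched positions -> 0, as in the R steps

io = PDBIO()
io.set_structure(structure)
io.save('complex1_BwithNormDUET.pdb')
```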
-######################################################### - -########################### -# 2: Read file: average stability values -# or mcsm_normalised file, output of step 4 mcsm pipeline -########################### - -inDir = "~/git/Data/pyrazinamide/input/processed/" -inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile - -my_df <- read.csv(inFile -# , row.names = 1 -# , stringsAsFactors = F - , header = T) -str(my_df) - -#========================================================= -# Processing P1: Replacing B factor with mean ratioDUET scores -#========================================================= - -######################### -# Read complex pdb file -# form the R script -########################## - -source("read_pdb.R") # list of 8 - -# extract atom list into a variable -# since in the list this corresponds to data frame, variable will be a df -d = my_pdb[[1]] - -# make a copy: required for downstream sanity checks -d2 = d - -# sanity checks: B factor -max(d$b); min(d$b) - -#******************************************* -# plot histograms for inspection -# 1: original B-factors -# 2: original DUET Scores -# 3: replaced B-factors with DUET Scores -#********************************************* -# Set the margin on all sides -par(oma = c(3,2,3,0) - , mar = c(1,3,5,2) - , mfrow = c(3,2)) -#par(mfrow = c(3,2)) - - #1: Original B-factor -hist(d$b - , xlab = "" - , main = "B-factor") - -plot(density(d$b) - , xlab = "" - , main = "B-factor") - -# 2: DUET scores -hist(my_df$average_DUETR - , xlab = "" - , main = "Norm_DUET") - -plot(density(my_df$average_DUETR) - , xlab = "" - , main = "Norm_DUET") - -# 3: After the following replacement -#******************************** - -#========= -# step 0_P1: DONT RUN once you have double checked the matched output -#========= -# sanity check: match and assign to a separate column to double check -# colnames(my_df) -# d$ratioDUET = my_df$averge_DUETR[match(d$resno, my_df$Position)] - -#========= -# step 1_P1 -#========= -# Be brave and replace in place now (don't run sanity check) -# this makes all the B-factor values in the non-matched positions as NA -d$b = my_df$average_DUETR[match(d$resno, my_df$Position)] - -#========= -# step 2_P1 -#========= -# count NA in Bfactor -b_na = sum(is.na(d$b)) ; b_na - -# count number of 0's in Bactor -sum(d$b == 0) -#table(d$b) - -# replace all NA in b factor with 0 -d$b[is.na(d$b)] = 0 - -# sanity check: should be 0 -sum(is.na(d$b)) - -# sanity check: should be True -if (sum(d$b == 0) == b_na){ - print ("Sanity check passed: NA's replaced with 0's successfully") -} else { - print("Error: NA replacement NOT successful, Debug code!") -} - -max(d$b); min(d$b) - -# sanity checks: should be True -if(max(d$b) == max(my_df$average_DUETR)){ - print("Sanity check passed: B-factors replaced correctly") -} else { - print ("Error: Debug code please") -} - -if (min(d$b) == min(my_df$average_DUETR)){ - print("Sanity check passed: B-factors replaced correctly") -} else { - print ("Error: Debug code please") -} - -#========= -# step 3_P1 -#========= -# sanity check: dim should be same before reassignment -# should be TRUE -dim(d) == dim(d2) - -#========= -# step 4_P1 -#========= -# assign it back to the pdb file -my_pdb[[1]] = d - -max(d$b); min(d$b) - -#========= -# step 5_P1 -#========= -# output dir -getwd() -outDir = "~/git/Data/pyrazinamide/input/structure/" - -outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile -write.pdb(my_pdb, outFile) - -#******************************** -# Add the 3rd histogram and 
density plots for comparisons -#******************************** -# Plots continued... -# 3: hist and density of replaced B-factors with DUET Scores -hist(d$b - , xlab = "" - , main = "repalced-B") - -plot(density(d$b) - , xlab = "" - , main = "replaced-B") - -# graph titles -mtext(text = "Frequency" - , side = 2 - , line = 0 - , outer = TRUE) - -mtext(text = "DUET_stability" - , side = 3 - , line = 0 - , outer = TRUE) -#******************************** - -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# NOTE: This replaced B-factor distribution has the same -# x-axis as the PredAff normalised values, but the distribution -# is affected since 0 is overinflated. This is because all the positions -# where there are no SNPs have been assigned 0. -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - - - - -####################################################################### -#====================== end of section 1 ============================== -####################################################################### - - - - - -#========================================================= -# Processing P2: Replacing B values with PredAff Scores -#========================================================= -# clear workspace -rm(list = ls()) - -########################### -# 2: Read file: average stability values -# or mcsm_normalised file, output of step 4 mcsm pipeline -########################### - -inDir = "~/git/Data/pyrazinamide/input/processed/" -inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile - -my_df <- read.csv(inFile -# , row.names = 1 -# , stringsAsFactors = F - , header = T) -str(my_df) -#rm(inDir, inFile) - -######################### -# 3: Read complex pdb file -# form the R script -########################## - -source("read_pdb.R") # list of 8 - -# extract atom list into a variable -# since in the list this corresponds to data frame, variable will be a df -d = my_pdb[[1]] - -# make a copy: required for downstream sanity checks -d2 = d - -# sanity checks: B factor -max(d$b); min(d$b) - -#******************************************* -# plot histograms for inspection -# 1: original B-factors -# 2: original Pred Aff Scores -# 3: replaced B-factors with PredAff Scores -#******************************************** -# Set the margin on all sides -par(oma = c(3,2,3,0) - , mar = c(1,3,5,2) - , mfrow = c(3,2)) -#par(mfrow = c(3,2)) - -# 1: Original B-factor -hist(d$b - , xlab = "" - , main = "B-factor") - -plot(density(d$b) - , xlab = "" - , main = "B-factor") - -# 2: Pred Aff scores -hist(my_df$average_PredAffR - , xlab = "" - , main = "Norm_lig_average") - -plot(density(my_df$average_PredAffR) - , xlab = "" - , main = "Norm_lig_average") - -# 3: After the following replacement -#******************************** - -#================================================= -# Processing P2: Replacing B values with ratioPredAff scores -#================================================= -# use match to perform this replacement linking with "position no" -# in the pdb file, this corresponds to column "resno" -# in my_df, this corresponds to column "Position" - -#========= -# step 0_P2: DONT RUN once you have double checked the matched output -#========= -# sanity check: match and assign to a separate column to double check -# colnames(my_df) -# d$ratioPredAff = my_df$average_PredAffR[match(d$resno, my_df$Position)] #1384, 17 - -#========= -# step 1_P2: BE BRAVE and replace in place now (don't run step 0) -#========= -# this makes all the B-factor 
values in the non-matched positions as NA -d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)] - -#========= -# step 2_P2 -#========= -# count NA in Bfactor -b_na = sum(is.na(d$b)) ; b_na - -# count number of 0's in Bactor -sum(d$b == 0) -#table(d$b) - -# replace all NA in b factor with 0 -d$b[is.na(d$b)] = 0 - -# sanity check: should be 0 -sum(is.na(d$b)) - -if (sum(d$b == 0) == b_na){ - print ("Sanity check passed: NA's replaced with 0's successfully") -} else { - print("Error: NA replacement NOT successful, Debug code!") -} - -max(d$b); min(d$b) - -# sanity checks: should be True -if (max(d$b) == max(my_df$average_PredAffR)){ - print("Sanity check passed: B-factors replaced correctly") -} else { - print ("Error: Debug code please") -} - -if (min(d$b) == min(my_df$average_PredAffR)){ - print("Sanity check passed: B-factors replaced correctly") -} else { - print ("Error: Debug code please") -} - -#========= -# step 3_P2 -#========= -# sanity check: dim should be same before reassignment -# should be TRUE -dim(d) == dim(d2) - -#========= -# step 4_P2 -#========= -# assign it back to the pdb file -my_pdb[[1]] = d - -max(d$b); min(d$b) - -#========= -# step 5_P2 -#========= - -# output dir -outDir = "~/git/Data/pyrazinamide/input/structure/" -outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile -write.pdb(my_pdb, outFile) - -#******************************** -# Add the 3rd histogram and density plots for comparisons -#******************************** -# Plots continued... -# 3: hist and density of replaced B-factors with PredAff Scores -hist(d$b - , xlab = "" - , main = "repalced-B") - -plot(density(d$b) - , xlab = "" - , main = "replaced-B") - -# graph titles -mtext(text = "Frequency" - , side = 2 - , line = 0 - , outer = TRUE) - -mtext(text = "Lig_stability" - , side = 3 - , line = 0 - , outer = TRUE) - -#******************************** - -########### -# end of output files with Bfactors -########## diff --git a/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R b/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R deleted file mode 100644 index 9f30f28..0000000 --- a/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R +++ /dev/null @@ -1,257 +0,0 @@ -getwd() -setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts") -getwd() - -######################################################### -# 1: Installing and loading required packages # -######################################################### - -source("Header_TT.R") -#source("barplot_colour_function.R") - -########################################################## -# Checking: Entire data frame and for PS # -########################################################## - -########################### -#2) Read file: combined one from the script -########################### -source("combining_two_df.R") - -# df with NA: -# merged_df2 -# merged_df3: - -# df without NA: -# merged_df2_comp: -# merged_df3_comp: - -###################### -# You need to check it -# with the merged_df3 -######################## - -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df3 -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -#clear variables -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# should be true -identical(my_df$Position, my_df$position) - -################################# -# Read file: normalised file -# output of step 4 mcsm_pipeline -################################# - - -inDir = "~/git/Data/pyrazinamide/input/processed/" -inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile - 
-mcsm_data <- read.csv(inFile - , row.names = 1 - , stringsAsFactors = F - , header = T) -str(mcsm_data) -my_colnames = colnames(mcsm_data) - -#==================================== -# subset my_df to include only the columns in mcsm data -my_df2 = my_df[my_colnames] -#==================================== -# compare the two -head(mcsm_data$Mutationinformation) -head(mcsm_data$Position) - -head(my_df2$Mutationinformation) -head(my_df2$Position) - -# sort mcsm data by Mutationinformation -mcsm_data_s = mcsm_data[order(mcsm_data$Mutationinformation),] -head(mcsm_data_s$Mutationinformation) -head(mcsm_data_s$Position) - -# now compare: should be True, but is false.... -# possibly due to rownames!?! -identical(mcsm_data_s, my_df2) - -# from library dplyr -setdiff(mcsm_data_s, my_df2) - -#from lib compare -compare(mcsm_data_s, my_df2) # seems rownames are the problem - -# FIXME: automate this -# write files: checked using meld and files are indeed identical -#write.csv(mcsm_data_s, "mcsm_data_s.csv", row.names = F) -#write.csv(my_df2, "my_df2.csv", row.names = F) - - -#====================================================== end of section 1 - - - -########################################################## -# Checking: LIG(Filtered dataframe) # -########################################################## - -# clear workspace -rm(list = ls()) - -########################### -#3) Read file: combined_lig from the script -########################### -source("combining_two_df_lig.R") - -# df with NA: -# merged_df2 : -# merged_df3: - -# df without NA: -# merged_df2_comp: -# merged_df3_comp: - -###################### -# You need to check it -# with the merged_df3 -######################## - -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -my_df = merged_df3 -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -#clear variables -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# should be true -identical(my_df$Position, my_df$position) - -################################# -# Read file: normalised file -# output of step 4 mcsm_pipeline -################################# - -inDir = "~/git/Data/pyrazinamide/input/processed/" -inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile - -mcsm_data <- read.csv(inFile - , row.names = 1 - , stringsAsFactors = F - , header = T) -str(mcsm_data) - -########################### -# 4a: Filter/subset data: ONLY for LIGand analysis -# Lig plots < 10Ang -# Filter the lig plots for Dis_to_lig < 10Ang -########################### -# sanity checks -upos = unique(mcsm_data$Position) - -# check range of distances -max(mcsm_data$Dis_lig_Ang) -min(mcsm_data$Dis_lig_Ang) - -# Lig filtered: subset data to have only values less than 10 Ang -mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10) - -rm(mcsm_data) #to avoid confusion - -table(mcsm_data2$Dis_lig_Ang<10) -table(mcsm_data2$Dis_lig_Ang>10) - -max(mcsm_data2$Dis_lig_Ang) -min(mcsm_data2$Dis_lig_Ang) - -upos_f = unique(mcsm_data2$Position); upos_f - -# colnames of df that you will need to subset the bigger df from -my_colnames = colnames(mcsm_data2) -#==================================== -# subset bigger df i.e my_df to include only the columns in mcsm data2 -my_df2 = my_df[my_colnames] - -rm(my_df) #to avoid confusion -#==================================== -# compare the two -head(mcsm_data2$Mutationinformation) -head(mcsm_data2$Position) - -head(my_df2$Mutationinformation) -head(my_df2$Position) - -# sort mcsm data by Mutationinformation -mcsm_data2_s = mcsm_data2[order(mcsm_data2$Mutationinformation),] 
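The identical()/compare() checks above come back FALSE apparently only because of row names, even though the cell values match. The same pitfall exists in pandas, where DataFrame.equals() also compares the index; a toy illustration with made-up values:

```
import pandas as pd

a = pd.DataFrame({'x': [1, 2]}, index = [5, 6])
b = pd.DataFrame({'x': [1, 2]})

print(a.equals(b))                                                    # False: indexes differ
print(a.reset_index(drop = True).equals(b.reset_index(drop = True)))  # True: same content
```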
-head(mcsm_data2_s$Mutationinformation) -head(mcsm_data2_s$Position) - -# now compare: should be True, but is false.... -# possibly due to rownames!?! -identical(mcsm_data2_s, my_df2) - -# from library dplyr -setdiff(mcsm_data2_s, my_df2) - -# from library compare -compare(mcsm_data2_s, my_df2) # seems rownames are the problem - -#FIXME: automate this -# write files: checked using meld and files are indeed identical -#write.csv(mcsm_data2_s, "mcsm_data2_s.csv", row.names = F) -#write.csv(my_df2, "my_df2.csv", row.names = F) - - -########################################################## -# extract and write output file for SNP posn: all # -########################################################## - -head(merged_df3$Position) - -foo = merged_df3[order(merged_df3$Position),] -head(foo$Position) - -snp_pos_unique = unique(foo$Position); snp_pos_unique - -# sanity check: -table(snp_pos_unique == combined_df$Position) - -#===================== -# write_output files -#===================== -outDir = "~/Data/pyrazinamide/input/processed/" - - -outFile1 = paste0(outDir, "snp_pos_unique.txt"); outFile1 -print(paste0("Output file name and path will be:","", outFile1)) - -write.table(snp_pos_unique - , outFile1 - , row.names = F - , col.names = F) - -############################################################## -# extract and write output file for SNP posn: complete only # -############################################################## -head(merged_df3_comp$Position) - -foo = merged_df3_comp[order(merged_df3_comp$Position),] -head(foo$Position) - -snp_pos_unique = unique(foo$Position); snp_pos_unique - -# outDir = "~/Data/pyrazinamide/input/processed/" # already set - -outFile2 = paste0(outDir, "snp_pos_unique_comp.txt") -print(paste0("Output file name and path will be:", outFile2)) - -write.table(snp_pos_unique - , outFile2 - , row.names = F - , col.names = F) -#============================== end of script - - diff --git a/meta_data_analysis/dssp_df.py b/meta_data_analysis/dssp_df.py index 7b59cfa..5d3dc64 100755 --- a/meta_data_analysis/dssp_df.py +++ b/meta_data_analysis/dssp_df.py @@ -48,9 +48,10 @@ gene = 'pncA' datadir = homedir + '/' + 'git/Data' #======= -# input +# input from outdir #======= -indir = datadir + '/' + drug + '/' + 'output' +#indir = datadir + '/' + drug + '/' + 'output' +outdir = datadir + '/' + drug + '/' + 'output' #in_filename = 'pnca.dssp' in_filename = gene.lower() +'.dssp' infile = indir + '/' + in_filename diff --git a/mk_drug_dirs.sh b/mk_drug_dirs.sh index 6a6dd6d..a336ed3 100755 --- a/mk_drug_dirs.sh +++ b/mk_drug_dirs.sh @@ -4,9 +4,6 @@ ## Structure: # # $DATA_DIR/$DRUG/input -# |- original -# |- processed -# |- structure # # $DATA_DIR/$DRUG/output # |- plots @@ -15,18 +12,17 @@ DATA_DIR=~/git/Data if [[ $1 == '' ]]; then + echo "Error" echo "usage: mk-drug-dirs.sh "; exit; else DRUG=$1 - echo Creating structure for: $DRUG + echo Creating directory structure for: $DRUG if [ -d $DATA_DIR ] then echo Doing creation in $DATA_DIR - mkdir -p $DATA_DIR/$DRUG/input/original - mkdir -p $DATA_DIR/$DRUG/input/processed - mkdir -p $DATA_DIR/$DRUG/input/structure + mkdir -p $DATA_DIR/$DRUG/input mkdir -p $DATA_DIR/$DRUG/output/plots mkdir -p $DATA_DIR/$DRUG/output/structure diff --git a/meta_data_analysis/data_extraction.py b/scripts/data_extraction.py similarity index 57% rename from meta_data_analysis/data_extraction.py rename to scripts/data_extraction.py index 70f3008..451d6cf 100755 --- a/meta_data_analysis/data_extraction.py +++ b/scripts/data_extraction.py @@ -11,63 
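Collecting the additions in this hunk: the previously hardcoded drug and gene names now come from the command line, and everything else is derived from them. A condensed sketch of what the new lines amount to (the invocation in the comment is an assumed example, not taken from the repo's docs):

```
import argparse

# e.g. python data_extraction.py -d pyrazinamide -g pncA
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pyrazinamide')
arg_parser.add_argument('-g', '--gene', help = 'gene name', default = 'pncA')  # case sensitive
args = arg_parser.parse_args()

drug, gene = args.drug, args.gene
gene_match = gene + '_p.'                    # string match used to pull protein-coding muts
dr_muts_col = 'dr_mutations_' + drug         # column holding drug-resistance-associated muts
other_muts_col = 'other_mutations_' + drug   # column holding other muts
```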
+11,77 @@ Created on Tue Aug 6 12:56:03 2019 # FIXME: import dirs.py to get the basic dir paths available #======================================================================= -# TASK: extract ALL pncA_p. mutations from GWAS data +# TASK: extract ALL matched mutations from GWAS data # Input data file has the following format: each row = unique sample id -# id,country,lineage,sublineage,drtype,pyrazinamide,dr_mutations_pyrazinamide,other_mutations_pyrazinamide... -# 0,sampleID,USA,lineage2,lineage2.2.1,Drug-resistant,0.0,WT,pncA_p.POS; pncA_c.POS... -# where multiple mutations and multiple mutation types are separated by ';'. We are interested in the -# protein coding region i.e mutation with the 'p.' format. - -# the script splits the mutations on the ';' and extracts protein coding muts only +# id,country,lineage,sublineage,drtype,drug,dr_muts_col,other_muts_col... +# 0,sampleID,USA,lineage2,lineage2.2.1,Drug-resistant,0.0,WT,gene_matchPOS; pncA_c.POS... +# where multiple mutations and multiple mutation types are separated by ';'. +# We are interested in the protein coding region i.e mutation with the_'p.' format. +# This script splits the mutations on the ';' and extracts protein coding muts only # where each row is a separate mutation # sample ids AND mutations are NOT unique, but the COMBINATION (sample id + mutation) = unique -# output files: -# 0) pnca_common_ids.csv -# 1) pnca_ambiguous_muts.csv -# 2) pnca_mcsm_snps.csv -# 3) pnca_metadata.csv -# 4) pnca_all_muts_msa.csv -# 5) pnca_mutational_positons.csv +# output files: all lower case +# 0) _common_ids.csv +# 1) _ambiguous_muts.csv +# 2) _mcsm_snps.csv +# 3) _metadata.csv +# 4) _all_muts_msa.csv +# 5) _mutational_positons.csv #======================================================================= #%% load libraries import os, sys import pandas as pd #import numpy as np - -#from pandas.api.types import is_string_dtype -#from pandas.api.types import is_numeric_dtype - -#%% specify homedir as python doesn't recognise tilde +import argparse +#======================================================================= +#%% homdir and curr dir and local imports homedir = os.path.expanduser('~') - # set working dir os.getcwd() -os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') +os.chdir(homedir + '/git/LSHTM_analysis/scripts') os.getcwd() # import aa dict -from reference_dict import my_aa_dict #CHECK DIR STRUC THERE! +from reference_dict import my_aa_dict # CHECK DIR STRUC THERE! +#======================================================================= +#%% command line args +arg_parser = argparse.ArgumentParser() +arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide') +arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive +args = arg_parser.parse_args() #======================================================================= #%% variable assignment: input and output paths & filenames -drug = 'pyrazinamide' -gene = 'pncA' +#drug = 'pyrazinamide' +#gene = 'pncA' +drug = args.drug +gene = args.gene gene_match = gene + '_p.' 
+# building cols to extract +dr_muts_col = 'dr_mutations_' + drug +other_muts_col = 'other_mutations_' + drug +print('Extracting columns based on variables:\n' + , drug + , '\n' + , dr_muts_col + , '\n' + , other_muts_col + , '\n===============================================================') +#======================================================================= +#%% input and output dirs and files #======= # data dir #======= -#indir = 'git/Data/pyrazinamide/input/original' datadir = homedir + '/' + 'git/Data' #======= # input #======= -#indir = 'git/Data/pyrazinamide/input/original' in_filename = 'original_tanushree_data_v2.csv' infile = datadir + '/' + in_filename print('Input filename: ', in_filename - , '\nInput path: ', indir + , '\nInput path: ', datadir , '\n============================================================') #======= @@ -88,15 +102,15 @@ master_data = pd.read_csv(infile, sep = ',') # column names #list(master_data.columns) -# extract elevant columns to extract from meta data related to the pyrazinamide +# extract elevant columns to extract from meta data related to the drug meta_data = master_data[['id' ,'country' ,'lineage' ,'sublineage' ,'drtype' - , 'pyrazinamide' - , 'dr_mutations_pyrazinamide' - , 'other_mutations_pyrazinamide' + , drug + , dr_muts_col + , other_muts_col ]] del(master_data) @@ -115,13 +129,13 @@ print('No. of NAs/column:' + '\n', meta_data.isna().sum() meta_data.head() # equivalent of table in R -# pyrazinamide counts -meta_data.pyrazinamide.value_counts() -print('RESULT: Sus and Res samples:\n', meta_data.pyrazinamide.value_counts() +# drug counts +meta_data[drug].value_counts() +print('RESULT: Sus and Res samples:\n', meta_data[drug].value_counts() , '\n===========================================================') # clear variables -del(indir, in_filename,infile) +del(in_filename,infile) #del(outdir) #%% # !!!IMPORTANT!!! sanity check: @@ -129,18 +143,18 @@ del(indir, in_filename,infile) # can use it to check if your data extraction process for dr_muts # and other_muts has worked correctly AND also to check the dim of the # final formatted data. -# This will have: unique COMBINATION of sample id and pncA_p.mutations +# This will have: unique COMBINATION of sample id and mutations #======== -# First: counting pncA_p. mutations in dr_mutations_pyrazinamide column +# First: counting mutations in dr_muts_col column #======== -print('Now counting WT & pncA_p. 
muts within the column: dr_mutations_pyrazinamide') +print('Now counting WT &', gene_match, 'muts within the column:', dr_muts_col) # drop na and extract a clean df -clean_df = meta_data.dropna(subset=['dr_mutations_pyrazinamide']) +clean_df = meta_data.dropna(subset=[dr_muts_col]) # sanity check: count na -na_count = meta_data['dr_mutations_pyrazinamide'].isna().sum() +na_count = meta_data[dr_muts_col].isna().sum() if len(clean_df) == (total_samples - na_count): print('PASS: clean_df extracted: length is', len(clean_df) @@ -150,7 +164,7 @@ else: print('FAIL: dropping NA failed' , '\n==========================================================') -dr_pnca_count = 0 +dr_gene_count = 0 wt = 0 id_dr = [] id2_dr = [] @@ -158,45 +172,44 @@ id2_dr = [] for i, id in enumerate(clean_df.id): # print (i, id) # id_dr.append(id) -# count_pnca_dr = clean_df.dr_mutations_pyrazinamide.iloc[i].count('pncA_p.') #works 2201 - count_pnca_dr = clean_df.dr_mutations_pyrazinamide.iloc[i].count(gene_match) #works 2201 - if count_pnca_dr > 0: + count_gene_dr = clean_df[dr_muts_col].iloc[i].count(gene_match) + if count_gene_dr > 0: id_dr.append(id) - if count_pnca_dr > 1: + if count_gene_dr > 1: id2_dr.append(id) -# print(id, count_pnca_dr) - dr_pnca_count = dr_pnca_count + count_pnca_dr - count_wt = clean_df.dr_mutations_pyrazinamide.iloc[i].count('WT') +# print(id, count_gene_dr) + dr_gene_count = dr_gene_count + count_gene_dr + count_wt = clean_df[dr_muts_col].iloc[i].count('WT') wt = wt + count_wt print('RESULTS:') -print('Total WT in dr_mutations_pyrazinamide:', wt) -print('Total matches of', gene_match, 'in dr_mutations_pyrazinamide:', dr_pnca_count) -print('Total samples with > 1', gene_match, 'muts in dr_mutations_pyrazinamide:', len(id2_dr) ) +print('Total WT in dr_muts_col:', wt) +print('Total matches of', gene_match, 'in dr_muts_col:', dr_gene_count) +print('Total samples with > 1', gene_match, 'muts in dr_muts_col:', len(id2_dr) ) print('=================================================================') -del(i, id, wt, id2_dr, clean_df, na_count, count_pnca_dr, count_wt) +del(i, id, wt, id2_dr, clean_df, na_count, count_gene_dr, count_wt) #======== -# Second: counting pncA_p. mutations in dr_mutations_pyrazinamide column +# Second: counting mutations in dr_muts_col column #======== -print('Now counting WT & pncA_p. 
muts within the column: other_mutations_pyrazinamide') +print('Now counting WT &', gene_match, 'muts within the column:', other_muts_col) # drop na and extract a clean df -clean_df = meta_data.dropna(subset=['other_mutations_pyrazinamide']) +clean_df = meta_data.dropna(subset=[other_muts_col]) # sanity check: count na -na_count = meta_data['other_mutations_pyrazinamide'].isna().sum() +na_count = meta_data[other_muts_col].isna().sum() if len(clean_df) == (total_samples - na_count): print('PASS: clean_df extracted: length is', len(clean_df) - , '\nNo.of NA s=', na_count, '/', total_samples + , '\nNo.of NAs =', na_count, '/', total_samples , '\n=========================================================') else: print('FAIL: dropping NA failed' , '\n=========================================================') -other_pnca_count = 0 +other_gene_count = 0 wt_other = 0 id_other = [] id2_other = [] @@ -204,63 +217,63 @@ id2_other = [] for i, id in enumerate(clean_df.id): # print (i, id) # id_other.append(id) -# count_pnca_other = clean_df.other_mutations_pyrazinamide.iloc[i].count('pncA_p.') - count_pnca_other = clean_df.other_mutations_pyrazinamide.iloc[i].count(gene_match) - if count_pnca_other > 0: +# count_gene_other = clean_df[other_muts_col].iloc[i].count('gene_match') + count_gene_other = clean_df[other_muts_col].iloc[i].count(gene_match) + if count_gene_other > 0: id_other.append(id) - if count_pnca_other > 1: + if count_gene_other > 1: id2_other.append(id) -# print(id, count_pnca_other) - other_pnca_count = other_pnca_count + count_pnca_other - count_wt = clean_df.other_mutations_pyrazinamide.iloc[i].count('WT') +# print(id, count_gene_other) + other_gene_count = other_gene_count + count_gene_other + count_wt = clean_df[other_muts_col].iloc[i].count('WT') wt_other = wt_other + count_wt print('RESULTS:') -print('Total WT in other_mutations_pyrazinamide:', wt_other) -print('Total matches of', gene_match, 'in other_mutations_pyrazinamide:', other_pnca_count) -print('Total samples with > 1', gene_match, 'muts in other_mutations_pyrazinamide:', len(id2_other) ) +print('Total WT in other_muts_col:', wt_other) +print('Total matches of', gene_match, 'in', other_muts_col, ':', other_gene_count) +print('Total samples with > 1', gene_match, 'muts in other_muts_col:', len(id2_other) ) print('=================================================================') -print('Predicting total no. of rows in your curated df:', dr_pnca_count + other_pnca_count ) -expected_rows = dr_pnca_count + other_pnca_count +print('Predicting total no. of rows in the curated df:', dr_gene_count + other_gene_count + , '\n===================================================================') +expected_rows = dr_gene_count + other_gene_count -del(i, id, wt_other, clean_df, na_count, id2_other, count_pnca_other, count_wt) +del(i, id, wt_other, clean_df, na_count, id2_other, count_gene_other, count_wt) #%% ############ # extracting dr and other muts separately along with the common cols ############# -print('=================================================================') -print('Extracting dr_muts in a dr_mutations_pyrazinamide with other meta_data') +print('Extracting dr_muts from col:', dr_muts_col, 'with other meta_data') print('gene to extract:', gene_match ) #=============== -# dr mutations: extract pncA_p. 
entries with meta data and ONLY dr_muts col +# dr mutations: extract gene_match entries with meta data and ONLY dr_muts col #=============== -# FIXME: replace pyrazinamide with variable containing the drug name +# FIXME: replace drug with variable containing the drug name # !!! important !!! meta_data_dr = meta_data[['id' ,'country' ,'lineage' ,'sublineage' ,'drtype' - , 'pyrazinamide' - , 'dr_mutations_pyrazinamide' + , drug + , dr_muts_col ]] print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) ) print('actual dim:', meta_data_dr.shape , '\n===============================================================') # Extract within this the gene of interest using string match -#meta_pnca_dr = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)] -meta_pnca_dr = meta_data_dr.loc[meta_data_dr.dr_mutations_pyrazinamide.str.contains(gene_match, na = False)] +#meta_gene_dr = meta_data.loc[meta_data[dr_muts_col].str.contains('gene_match*', na = False)] +meta_gene_dr = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(gene_match, na = False)] -dr_id = meta_pnca_dr['id'].unique() +dr_id = meta_gene_dr['id'].unique() print('RESULT: No. of samples with dr muts in pncA:', len(dr_id)) print('checking RESULT:', '\nexpected len =', len(id_dr), - '\nactual len =', len(meta_pnca_dr) ) + '\nactual len =', len(meta_gene_dr) ) -if len(id_dr) == len(meta_pnca_dr): +if len(id_dr) == len(meta_gene_dr): print('PASS: lengths match' , '\n===============================================================') else: @@ -270,18 +283,18 @@ else: dr_id = pd.Series(dr_id) #================= -# other mutations: extract pncA_p. entries +# other mutations: extract gene_match entries #================== -print('Extracting dr_muts in a other_mutations_pyrazinamide with other meta_data') -# FIXME: replace pyrazinamide with variable containing the drug name +print('Extracting dr_muts from:', other_muts_col,'with other meta_data') +# FIXME: replace drug with variable containing the drug name # !!! important !!! meta_data_other = meta_data[['id' ,'country' ,'lineage' ,'sublineage' ,'drtype' - , 'pyrazinamide' - , 'other_mutations_pyrazinamide' + , drug + , other_muts_col ]] print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) ) @@ -289,15 +302,15 @@ print('actual dim:', meta_data_other.shape , '\n===============================================================') # Extract within this the gene of interest using string match -meta_pnca_other = meta_data_other.loc[meta_data_other.other_mutations_pyrazinamide.str.contains(gene_match, na = False)] +meta_gene_other = meta_data_other.loc[meta_data_other[other_muts_col].str.contains(gene_match, na = False)] -other_id = meta_pnca_other['id'].unique() +other_id = meta_gene_other['id'].unique() print('RESULT: No. of samples with other muts:', len(other_id)) print('checking RESULT:', '\nexpected len =', len(id_other), - '\nactual len =', len(meta_pnca_other) ) + '\nactual len =', len(meta_gene_other) ) -if len(id_other) == len(meta_pnca_other): +if len(id_other) == len(meta_gene_other): print('PASS: lengths match' , '\n==============================================================') else: @@ -308,7 +321,7 @@ other_id = pd.Series(other_id) #%% Find common IDs print('Now extracting common_ids...') common_mut_ids = dr_id.isin(other_id).sum() -print('RESULT: No. of common Ids:', common_mut_ids) +print('RESULT: No. 
of common ids:', common_mut_ids) # sanity checks # check if True: should be since these are common @@ -327,9 +340,9 @@ common_ids2.columns = ['index', 'id2'] # should be True print(common_ids['id'].equals(common_ids2['id2'])) -# good sanity check: use it later to check pnca_sample_counts -expected_pnca_samples = ( len(meta_pnca_dr) + len(meta_pnca_other) - common_mut_ids ) -print('expected no. of pnca samples:', expected_pnca_samples) +# good sanity check: use it later to check gene_sample_counts +expected_gene_samples = ( len(meta_gene_dr) + len(meta_gene_other) - common_mut_ids ) +print('expected no. of gene samples:', expected_gene_samples) print('=================================================================') #%% write file #print(outdir) @@ -348,47 +361,47 @@ del(out_filename0) # clear variables del(dr_id, other_id, meta_data_dr, meta_data_other, common_ids, common_mut_ids, common_ids2) +#%% Now extract 'all' pncA mutations: i.e 'gene_match*' +print('extracting from string match:', gene_match, 'mutations from cols:\n' + , dr_muts_col, 'and', other_muts_col, 'using string match:' + , '\n===================================================================') +#meta_gene_all = meta_data.loc[meta_data[dr_muts_col].str.contains(gene_match) | meta_data[other_muts_col].str.contains(gene_match) ] +meta_gene_all = meta_data.loc[meta_data[dr_muts_col].str.contains(gene_match, na = False) | meta_data[other_muts_col].str.contains(gene_match, na = False) ] -#%% Now extract 'all' pncA mutations: i.e 'pncA_p.*' -print('extracting all pncA mutations from dr_ and other_ cols using string match:', gene_match - , '\n===============================================================') -#meta_pnca_all = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains(gene_match) | meta_data.other_mutations_pyrazinamide.str.contains(gene_match) ] -meta_pnca_all = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains(gene_match, na = False) | meta_data.other_mutations_pyrazinamide.str.contains(gene_match, na = False) ] +extracted_gene_samples = meta_gene_all['id'].nunique() +print('RESULT: actual no. of gene samples extracted:', extracted_gene_samples + , '\n===================================================================') -extracted_pnca_samples = meta_pnca_all['id'].nunique() -print('RESULT: actual no. of pnca samples extracted:', extracted_pnca_samples) -print('======================================================================') - -# sanity check: length of pnca samples +# sanity check: length of gene samples print('Performing sanity check:') -if extracted_pnca_samples == expected_pnca_samples: - print('No. of pnca samples:', len(meta_pnca_all) - , '\nPASS: expected & actual no. of pnca samples match' +if extracted_gene_samples == expected_gene_samples: + print('No. of gene samples:', len(meta_gene_all) + , '\nPASS: expected & actual no. of gene samples match' , '\n=========================================================') else: print('FAIL: Debug please!' , '\n===============================================================') -# count NA in pyrazinamide column -pnca_na = meta_pnca_all['pyrazinamide'].isna().sum() -print('No. of pnca samples without pza testing i.e NA in pza column:', pnca_na) +# count NA in drug column +gene_na = meta_gene_all[drug].isna().sum() +print('No. 
of gene samples without pza testing i.e NA in pza column:', gene_na) # use it later to check number of complete samples from LF data -comp_pnca_samples = len(meta_pnca_all) - pnca_na -print('comp pnca samples tested for pza:', comp_pnca_samples) +comp_gene_samples = len(meta_gene_all) - gene_na +print('comp gene samples tested for pza:', comp_gene_samples) print('=================================================================') # Comment: This is still dirty data since these -# are samples that have pncA_p. muts, but can have others as well +# are samples that have gene_match muts, but can have others as well # since the format for mutations is mut1; mut2, etc. -print('This is still dirty data: samples have pncA_p. muts, but may have others as well' +print('This is still dirty data: samples have ', gene_match, 'muts but may have others as well' , '\nsince the format for mutations is mut1; mut2, etc.' , '\n=============================================================') #%% tidy_split():Function to split mutations on specified delimiter: ';' #https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas +print('Performing tidy_split(): to separate the mutations into indivdual rows') -print('Performing tidy_spllit(): to separate the mutations into indivdual rows') # define the split function def tidy_split(df, column, sep='|', keep=False): ''' @@ -428,38 +441,38 @@ def tidy_split(df, column, sep='|', keep=False): #%% end of tidy_split() #========= -# DF1: dr_mutations_pyrazinamide +# DF1: dr_muts_col #========= ######## -# tidy_split(): on 'dr_mutations_pyrazinamide' column and remove leading white spaces +# tidy_split(): on dr_muts_col column and remove leading white spaces ######## -col_to_split1 = 'dr_mutations_pyrazinamide' -print ('Firstly, applying tidy split on dr df:', meta_pnca_dr.shape - , '\ncolumn name:', col_to_split1 +col_to_split1 = dr_muts_col +print ('Firstly, applying tidy split on dr muts df', meta_gene_dr.shape + , '\ncolumn name to apply tidy_split():', col_to_split1 , '\n============================================================') # apply tidy_split() -dr_WF0 = tidy_split(meta_pnca_dr, col_to_split1, sep = ';') +dr_WF0 = tidy_split(meta_gene_dr, col_to_split1, sep = ';') # remove leading white space else these are counted as distinct mutations as well -dr_WF0['dr_mutations_pyrazinamide'] = dr_WF0['dr_mutations_pyrazinamide'].str.lstrip() +dr_WF0[dr_muts_col] = dr_WF0[dr_muts_col].str.lstrip() -# extract only the samples/rows with pncA_p. 
-dr_pnca_WF0 = dr_WF0.loc[dr_WF0.dr_mutations_pyrazinamide.str.contains(gene_match)] +# extract only the samples/rows with gene_match +dr_gene_WF0 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(gene_match)] print('lengths after tidy split and extracting', gene_match, 'muts:' - , '\nold length:' , len(meta_pnca_dr) + , '\nold length:' , len(meta_gene_dr) , '\nlen after split:', len(dr_WF0) - , '\ndr_pnca_WF0 length:', len(dr_pnca_WF0) - , '\nexpected len:', dr_pnca_count) + , '\ndr_gene_WF0 length:', len(dr_gene_WF0) + , '\nexpected len:', dr_gene_count) -if len(dr_pnca_WF0) == dr_pnca_count: - print('PASS: length of dr_pnca_WF0 match with expected length' +if len(dr_gene_WF0) == dr_gene_count: + print('PASS: length of dr_gene_WF0 match with expected length' , '\n===============================================================') else: print('FAIL: lengths mismatch' , '\n===============================================================') # count the freq of 'dr_muts' samples -dr_muts_df = dr_pnca_WF0 [['id', 'dr_mutations_pyrazinamide']] +dr_muts_df = dr_gene_WF0 [['id', dr_muts_col]] print('dim of dr_muts_df:', dr_muts_df.shape) # add freq column @@ -468,13 +481,13 @@ dr_muts_df['dr_sample_freq'] = dr_muts_df.groupby('id')['id'].transform('count') print('revised dim of dr_muts_df:', dr_muts_df.shape) c1 = dr_muts_df.dr_sample_freq.value_counts() -print('counting no. of sample frequency:\n', c1) -print('=================================================================') +print('counting no. of sample frequency:\n', c1 + , '\n===================================================================') -# sanity check: length of pnca samples -if len(dr_pnca_WF0) == c1.sum(): +# sanity check: length of gene samples +if len(dr_gene_WF0) == c1.sum(): print('PASS: WF data has expected length' - , '\nlength of dr_pnca WFO:', c1.sum() + , '\nlength of dr_gene WFO:', c1.sum() , '\n===============================================================') else: print('FAIL: Debug please!' @@ -483,7 +496,7 @@ else: #!!! Important !!! 
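tidy_split() is the step that turns the wide-format rows (one sample, several ';'-separated mutations) into long format with one mutation per row, after which leading whitespace is stripped and only the gene_match rows are kept. The repo has its own implementation (the function defined above); this is just a minimal pandas stand-in with toy values to show the reshaping it performs:

```
import pandas as pd

def tidy_split(df, column, sep = ';'):
    # one row per delimited value in `column`; the other columns are repeated
    s = df[column].str.split(sep).explode().str.strip()
    return df.drop(columns = [column]).join(s).reset_index(drop = True)

toy = pd.DataFrame({'id': ['s1', 's2'],
                    'dr_mutations_pyrazinamide': ['pncA_p.Val180Phe; pncA_c.POS', 'WT']})
print(tidy_split(toy, 'dr_mutations_pyrazinamide'))
# s1 now occupies two rows (one per mutation); s2 keeps its single WT row
```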
# Assign 'column name' on which split was performed as an extra column # This is so you can identify if mutations are dr_type or other in the final df -dr_df = dr_pnca_WF0.assign(mutation_info = 'dr_mutations_pyrazinamide') +dr_df = dr_gene_WF0.assign(mutation_info = dr_muts_col) print('dim of dr_df:', dr_df.shape , '\n==============================================================' , '\nEnd of tidy split() on dr_muts, and added an extra column relecting mut_category' @@ -493,35 +506,35 @@ print('dim of dr_df:', dr_df.shape # DF2: other_mutations_pyrazinamdie #========= ######## -# tidy_split(): on 'other_mutations_pyrazinamide' column and remove leading white spaces +# tidy_split(): on other_muts_col column and remove leading white spaces ######## -col_to_split2 = 'other_mutations_pyrazinamide' -print ('applying second tidy split separately on df:', meta_pnca_other.shape - , '\ncolumn name:', col_to_split2 +col_to_split2 = other_muts_col +print ('applying second tidy split() separately on other muts df', meta_gene_other.shape + , '\ncolumn name to apply tidy_split():', col_to_split2 , '\n============================================================') # apply tidy_split() -other_WF1 = tidy_split(meta_pnca_other, col_to_split2, sep = ';') +other_WF1 = tidy_split(meta_gene_other, col_to_split2, sep = ';') # remove the leading white spaces in the column -other_WF1['other_mutations_pyrazinamide'] = other_WF1['other_mutations_pyrazinamide'].str.strip() +other_WF1[other_muts_col] = other_WF1[other_muts_col].str.strip() -# extract only the samples/rows with pncA_p. -other_pnca_WF1 = other_WF1.loc[other_WF1.other_mutations_pyrazinamide.str.contains(gene_match)] +# extract only the samples/rows with gene_match +other_gene_WF1 = other_WF1.loc[other_WF1[other_muts_col].str.contains(gene_match)] print('lengths after tidy split and extracting', gene_match, 'muts:', - '\nold length:' , len(meta_pnca_other), + '\nold length:' , len(meta_gene_other), '\nlen after split:', len(other_WF1), - '\nother_pnca_WF1 length:', len(other_pnca_WF1), - '\nexpected len:', other_pnca_count) + '\nother_gene_WF1 length:', len(other_gene_WF1), + '\nexpected len:', other_gene_count) -if len(other_pnca_WF1) == other_pnca_count: - print('PASS: length of dr_pnca_WF0 match with expected length +if len(other_gene_WF1) == other_gene_count: + print('PASS: length of dr_gene_WF0 match with expected length' , '\n===============================================================') else: - print('FAIL: lengths mismatch + print('FAIL: lengths mismatch' , '\n===============================================================') # count the freq of 'other muts' samples -other_muts_df = other_pnca_WF1 [['id', 'other_mutations_pyrazinamide']] +other_muts_df = other_gene_WF1 [['id', other_muts_col]] print('dim of other_muts_df:', other_muts_df.shape) # add freq column @@ -531,10 +544,10 @@ print('revised dim of other_muts_df:', other_muts_df.shape) c2 = other_muts_df.other_sample_freq.value_counts() print('counting no. of sample frequency:\n', c2) print('=================================================================') -# sanity check: length of pnca samples -if len(other_pnca_WF1) == c2.sum(): +# sanity check: length of gene samples +if len(other_gene_WF1) == c2.sum(): print('PASS: WF data has expected length' - , '\nlength of other_pnca WFO:', c2.sum() + , '\nlength of other_gene WFO:', c2.sum() , '\n===============================================================') else: print('FAIL: Debug please!' @@ -543,7 +556,7 @@ else: #!!! Important !!! 
# Assign 'column name' on which split was performed as an extra column # This is so you can identify if mutations are dr_type or other in the final df -other_df = other_pnca_WF1.assign(mutation_info = 'other_mutations_pyrazinamide') +other_df = other_gene_WF1.assign(mutation_info = other_muts_col) print('dim of other_df:', other_df.shape , '\n===============================================================' , '\nEnd of tidy split() on other_muts, and added an extra column relecting mut_category' @@ -555,17 +568,19 @@ print('dim of other_df:', other_df.shape #!!! important !!! # change column names to allow concat: # dr_muts.. & other_muts : 'mutation' -print('Now concatenating the two dfs by row') +print('Now concatenating the two dfs by row' + , '\nfirst assigning a common colname: "mutation" to the col containing muts' + , '\nthis is done for both dfs' + , '\n===================================================================') dr_df.columns -dr_df.rename(columns = {'dr_mutations_pyrazinamide': 'mutation'}, inplace = True) +dr_df.rename(columns = {dr_muts_col: 'mutation'}, inplace = True) dr_df.columns other_df.columns -other_df.rename(columns = {'other_mutations_pyrazinamide': 'mutation'}, inplace = True) +other_df.rename(columns = {other_muts_col: 'mutation'}, inplace = True) other_df.columns -print('=================================================================') print('Now appending the two dfs:' , '\ndr_df dim:', dr_df.shape , '\nother_df dim:', other_df.shape @@ -582,18 +597,18 @@ else: print('FAIL: Debug please!') # concatenate (axis = 0): Rbind -pnca_LF0 = pd.concat([dr_df, other_df], ignore_index = True, axis = 0) +gene_LF0 = pd.concat([dr_df, other_df], ignore_index = True, axis = 0) # checking colnames and length after concat print('checking colnames AFTER concatenating the two dfs...') -if (set(dr_df.columns) == set(pnca_LF0.columns)): +if (set(dr_df.columns) == set(gene_LF0.columns)): print('PASS: column names match') else: print('FAIL: Debug please!') print('checking length AFTER concatenating the two dfs...') -if len(pnca_LF0) == len(dr_df) + len(other_df): +if len(gene_LF0) == len(dr_df) + len(other_df): print('PASS:length of df after concat match' , '\n===============================================================') else: @@ -603,61 +618,59 @@ else: ########### # This is hopefully clean data, but just double check # Filter LF data so that you only have -# mutations corresponding to pncA_p.* (string match pattern) +# mutations corresponding to gene_match* (string match pattern) # this will be your list you run OR calcs ########### -print('length of pnca_LF0:', len(pnca_LF0), +print('length of gene_LF0:', len(gene_LF0), '\nThis should be what you need. But just double check and extract', gene_match, - '\nfrom LF0 (concatenated data)') + '\nfrom LF0 (concatenated data) using string match:', gene_match) -print('using string match:', gene_match) +print('Double checking and creating df: gene_LF1') +gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(gene_match)] -print('Double checking and creating df: pnca_LF1') -pnca_LF1 = pnca_LF0[pnca_LF0['mutation'].str.contains(gene_match)] - -if len(pnca_LF0) == len(pnca_LF1): - print('PASS: length of pnca_LF0 and pnca_LF1 match', +if len(gene_LF0) == len(gene_LF1): + print('PASS: length of gene_LF0 and gene_LF1 match', '\nconfirming extraction and concatenating worked correctly') else: print('FAIL: BUT NOT FATAL!' 
- , '\npnca_LF0 and pnca_LF1 lengths differ' + , '\ngene_LF0 and gene_LF1 lengths differ' , '\nsuggesting error in extraction process' - , ' use pnca_LF1 for downstreama analysis' + , ' use gene_LF1 for downstreama analysis' , '\n=========================================================') print('length of dfs pre and post processing...' - , '\npnca WF data (unique samples in each row):', extracted_pnca_samples - , '\npnca LF data (unique mutation in each row):', len(pnca_LF1) + , '\ngene WF data (unique samples in each row):', extracted_gene_samples + , '\ngene LF data (unique mutation in each row):', len(gene_LF1) , '\n=============================================================') -#%% -# final sanity check +#%% sanity check for extraction print('Verifying whether extraction process worked correctly...') -if len(pnca_LF1) == expected_rows: +if len(gene_LF1) == expected_rows: print('PASS: extraction process performed correctly' , '\nexpected length:', expected_rows - , '\ngot:', len(pnca_LF1) - , '\nRESULT: Total no. of mutant sequences for logo plot:', len(pnca_LF1) + , '\ngot:', len(gene_LF1) + , '\nRESULT: Total no. of mutant sequences for logo plot:', len(gene_LF1) , '\n=========================================================') else: print('FAIL: extraction process has bugs' , '\nexpected length:', expected_rows - , '\ngot:', len(pnca_LF1) + , '\ngot:', len(gene_LF1) , ', \Debug please' , '\n=========================================================') #%% -print('Perfmorning some more sanity checks...') +print('Performing some more sanity checks...') + # From LF1: # no. of unique muts -distinct_muts = pnca_LF1.mutation.value_counts() -print('Distinct mutations:', len(distinct_muts)) +distinct_muts = gene_LF1.mutation.value_counts() +print('Distinct genomic mutations:', len(distinct_muts)) # no. of samples contributing the unique muta -pnca_LF1.id.nunique() -print('No.of samples contributing to distinct muts:', pnca_LF1.id.nunique() ) +gene_LF1.id.nunique() +print('No.of samples contributing to distinct genomic muts:', gene_LF1.id.nunique() ) # no. of dr and other muts -mut_grouped = pnca_LF1.groupby('mutation_info').mutation.nunique() -print('No.of unique dr and other muts:', pnca_LF1.groupby('mutation_info').mutation.nunique() ) +mut_grouped = gene_LF1.groupby('mutation_info').mutation.nunique() +print('No.of unique dr and other muts:\n', gene_LF1.groupby('mutation_info').mutation.nunique() ) # sanity check if len(distinct_muts) == mut_grouped.sum() : @@ -670,7 +683,7 @@ else: , '\nmuts should be distinct within dr* and other* type' , '\ninspecting ...' , '\n=========================================================') - muts_split = list(pnca_LF1.groupby('mutation_info')) + muts_split = list(gene_LF1.groupby('mutation_info')) dr_muts = muts_split[0][1].mutation other_muts = muts_split[1][1].mutation # print('splitting muts by mut_info:', muts_split) @@ -679,7 +692,7 @@ else: #%% # !!! IMPORTANT !!!! # sanity check: There should not be any common muts -# i.e the same mutation cannot be classed as a 'drug' AND 'others' +# i.e the same mutation cannot be classed as a drug AND 'others' if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: print('WARNING: Ambiguous muts detected in dr_ and other_ mutation category' , '\n===============================================================') @@ -695,8 +708,8 @@ if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: , '\nTotal no. 
@@ -679,7 +692,7 @@ else:
 #%%
 # !!! IMPORTANT !!!!
 # sanity check: There should not be any common muts
-# i.e the same mutation cannot be classed as a 'drug' AND 'others'
+# i.e the same mutation cannot be classed as a drug AND 'others'
 if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
     print('WARNING: Ambiguous muts detected in dr_ and other_ mutation category'
           , '\n===============================================================')
@@ -695,8 +708,8 @@ if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
           , '\nTotal no. of samples in dr_muts present in other_muts:', dr_muts.isin(other_muts).sum()
           , '\nThese are:\n', dr_muts[dr_muts.isin(other_muts)]
           , '\n========================================================='
-          , '\nTotal no. of samples in other_muts present in dr_muts:', other_muts.isin(dr_muts).sum(),
-          , '\nThese are:\n', other_muts[other_muts.isin(dr_muts)],
+          , '\nTotal no. of samples in other_muts present in dr_muts:', other_muts.isin(dr_muts).sum()
+          , '\nThese are:\n', other_muts[other_muts.isin(dr_muts)]
           , '\n=========================================================')
 else:
     print('Error: ambiguous muts present, but extraction failed. Debug!'
           , '\n=========================================================')
@@ -706,22 +719,22 @@ print('Counting no. of ambiguous muts...')
 if dr_muts[dr_muts.isin(other_muts)].nunique() == other_muts[other_muts.isin(dr_muts)].nunique():
     common_muts = dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()
-    print('Distinct no. of ambigiuous muts detected:'+ str(len(common_muts)),
-          'list of ambiguous mutations (see below):', *common_muts, sep = '\n'
-          , '\n=========================================================')
+    print('Distinct no. of ambiguous muts detected:'+ str(len(common_muts))
+          , '\nlist of ambiguous mutations (see below):', *common_muts, sep = '\n')
+    print('\n===========================================================')
 else:
     print('Error: ambiguous muts detected, but extraction failed. Debug!'
-          , '\nNo. of ambiguous muts in dr:', len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist())
-          , '\nNo. of ambiguous muts in other:', len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist())
+          , '\nNo. of ambiguous muts in dr:'
+          , len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist())
+          , '\nNo. of ambiguous muts in other:'
+          , len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist())
          , '\n=========================================================')
 #%% clear variables
-del(id_dr, id_other, meta_data, meta_pnca_dr, meta_pnca_other, mut_grouped, muts_split, other_WF1, other_df, other_muts_df, other_pnca_count, pnca_LF0, pnca_na)
+del(id_dr, id_other, meta_data, meta_gene_dr, meta_gene_other, mut_grouped, muts_split, other_WF1, other_df, other_muts_df, other_gene_count, gene_LF0, gene_na)
-del(c1, c2, col_to_split1, col_to_split2, comp_pnca_samples, dr_WF0, dr_df, dr_muts_df, dr_pnca_WF0, dr_pnca_count, expected_pnca_samples, other_pnca_WF1)
+del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_muts_df, dr_gene_WF0, dr_gene_count, expected_gene_samples, other_gene_WF1)
-#%% end of data extraction and some files writing. Below are some more files writing.
-
 #%%: write file: ambiguous muts
 # uncomment as necessary
 #print(outdir)
@@ -734,8 +747,8 @@ print('Writing file: ambiguous muts',
      '\nFilename:', out_filename1,
      '\nPath:', outdir)
-#common_muts = ['pncA_p.Val180Phe','pncA_p.Gln10Pro'] # test
-inspect = pnca_LF1[pnca_LF1['mutation'].isin(common_muts)]
+#common_muts = ['gene_matchVal180Phe','gene_matchGln10Pro'] # test
+inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
 inspect.to_csv(outfile1)
 print('Finished writing:', out_filename1
@@ -746,22 +759,33 @@ print('Finished writing:', out_filename1
      , '\n=============================================================')
 del(out_filename1)
-
-#%% read aa dict and pull relevant info
-print('Reading aa dict and fetching1 letter aa code'
+#%% end of data extraction and some files writing. Below are some more files writing.
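One note on the unchanged guard `if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:` — `&` binds tighter than `>`, so this evaluates `(count_a & count_b) > 0`, a bitwise AND of the two counts; counts of 2 and 1, say, give `2 & 1` equal to `0` and the warning branch is skipped even though both counts are non-zero. Writing `(count_a > 0) and (count_b > 0)` states the intent unambiguously. The two-way `isin()` idea itself, sketched on toy Series (mutation names borrowed from the commented-out test list above):

```
import pandas as pd

dr_muts = pd.Series(['pncA_p.Val180Phe', 'pncA_p.Gln10Pro', 'pncA_p.Val180Phe'])
other_muts = pd.Series(['pncA_p.Val180Phe', 'pncA_p.Gln10Pro'])

print('dr rows also present in other:', dr_muts.isin(other_muts).sum())    # 3
print('other rows also present in dr:', other_muts.isin(dr_muts).sum())    # 2

# distinct ambiguous muts, as in the script
common_muts = dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()
print('ambiguous muts:', common_muts)
```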
+#=============================================================================
+#%% Formatting df: read aa dict and pull relevant info
+print('Now some more formatting:'
+      , '\nread aa dict and pull relevant info'
+      , '\nformat mutations:'
+      , '\nsplit mutation into mCSM style muts: '
       , '\nFormatting mutation in mCSM style format: {WT}{MUT}'
-      , '\nAdding aa properties'
-      , '\n============================================================')
-
+      , '\nassign aa properties: adding 2 cols at a time for each prop'
+      , '\n===================================================================')
+
+# BEWARE hardcoding : only works as we are adding aa prop once for wt and once for mut
+# in each lookup cycle
+ncol_mutf_add = 3 # mut split into 3 cols
+ncol_aa_add = 2 # 2 aa prop add (wt & mut) in each mapping
+
 #===========
 # Split 'mutation' column into three: wild_type, position and
 # mutant_type separately. Then map three letter code to one using
-# reference_dict.
-# First: Import reference dict
-# Second: convert to mutation to lowercase for compatibility with dict
+# reference_dict imported at the beginning.
+# After importing, convert the mutation to lowercase for compatibility with dict
 #===========
-pnca_LF1['mutation'] = pnca_LF1.loc[:, 'mutation'].str.lower()
+gene_LF1['mutation'] = gene_LF1.loc[:, 'mutation'].str.lower()
+gene_regex = gene_match.lower()+'(\w{3})'
+print('gene regex being used:', gene_regex)
+mylen0 = len(gene_LF1.columns)
 #=======
 # Iterate through the dict, create a lookup dict i.e
 # lookup_dict = {three_letter_code: one_letter_code}.
@@ -770,17 +794,47 @@ pnca_LF1['mutation'] = pnca_LF1.loc[:, 'mutation'].str.lower()
 # The three letter code is extracted using a string match from the dataframe and then converted
 # to 'pandas series' since map only works in pandas series
 #=======
+print('Adding', ncol_mutf_add, 'more cols:\n')
+# initialise a sub dict that is lookup dict for three letter code to 1-letter code
+# adding three more cols
 lookup_dict = dict()
 for k, v in my_aa_dict.items():
     lookup_dict[k] = v['one_letter_code']
-    wt = pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
-    pnca_LF1['wild_type'] = wt.map(lookup_dict)
-    mut = pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
-    pnca_LF1['mutant_type'] = mut.map(lookup_dict)
+#   wt = gene_LF1['mutation'].str.extract('gene_p.(\w{3})').squeeze() # converts to a series that map works on
+    wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze()
+    gene_LF1['wild_type'] = wt.map(lookup_dict)
+    mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
+    gene_LF1['mutant_type'] = mut.map(lookup_dict)
 # extract position info from mutation column separately using string match
-pnca_LF1['position'] = pnca_LF1['mutation'].str.extract(r'(\d+)')
+gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)')
+
+mylen1 = len(gene_LF1.columns)
+
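The extract-and-map cycle above, shown on a toy two-row Series. The miniature `my_aa_dict` only assumes the shape produced by reference_dict.py (lowercase three-letter keys with a 'one_letter_code' sub-key); the one-letter values used here are illustrative, the real ones come from aa_codes.csv.

```
import pandas as pd

my_aa_dict = {'val': {'one_letter_code': 'V'}, 'phe': {'one_letter_code': 'F'},
              'gln': {'one_letter_code': 'Q'}, 'pro': {'one_letter_code': 'P'}}
lookup_dict = {k: v['one_letter_code'] for k, v in my_aa_dict.items()}

mutation = pd.Series(['pncA_p.Val180Phe', 'pncA_p.Gln10Pro']).str.lower()
gene_regex = 'pnca_p.' + r'(\w{3})'

wt = mutation.str.extract(gene_regex).squeeze()        # ['val', 'gln']
mut = mutation.str.extract(r'\d+(\w{3})$').squeeze()   # ['phe', 'pro']
position = mutation.str.extract(r'(\d+)')              # ['180', '10']

print(wt.map(lookup_dict).tolist(),                    # ['V', 'Q']
      mut.map(lookup_dict).tolist(),                   # ['F', 'P']
      position[0].tolist())                            # ['180', '10']
```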
+# sanity checks
+print('checking if 3-letter wt&mut residue extraction worked correctly')
+if wt.isna().sum() == 0 and mut.isna().sum() == 0:
+    print('PASS: 3-letter wt&mut residue extraction worked correctly:'
+          , '\nNo NAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+else:
+    print('FAIL: 3-letter wt&mut residue extraction failed'
+          , '\nNAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+
+if mylen1 == mylen0 + ncol_mutf_add:
+    print('PASS: successfully added', ncol_mutf_add, 'cols'
+          , '\nold length:', mylen0
+          , '\nnew len:', mylen1)
+else:
+    print('FAIL: failed to add cols:'
+          , '\nold length:', mylen0
+          , '\nnew len:', mylen1)
 # clear variables
 del(k, v, wt, mut, lookup_dict)
@@ -790,18 +844,45 @@ del(k, v, wt, mut, lookup_dict)
 # lookup_dict = {three_letter_code: aa_prop_water}
 # Do this for both wild_type and mutant as above.
 #=========
-# initialise a sub dict that is lookup dict for three letter code to aa prop
-lookup_dict = dict()
+print('Adding', ncol_aa_add, 'more cols:\n')
+# initialise a sub dict that is lookup dict for three letter code to aa prop
+# adding two more cols
+lookup_dict = dict()
 for k, v in my_aa_dict.items():
     lookup_dict[k] = v['aa_prop_water']
     #print(lookup_dict)
-    wt = pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
-    pnca_LF1['wt_prop_water'] = wt.map(lookup_dict)
-    mut = pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
-    pnca_LF1['mut_prop_water'] = mut.map(lookup_dict)
-
-# added two more cols
+#   wt = gene_LF1['mutation'].str.extract('gene_p.(\w{3})').squeeze() # converts to a series that map works on
+    wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze()
+    gene_LF1['wt_prop_water'] = wt.map(lookup_dict)
+    mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
+    gene_LF1['mut_prop_water'] = mut.map(lookup_dict)
+
+mylen2 = len(gene_LF1.columns)
+
+# sanity checks
+print('checking if 3-letter wt&mut residue extraction worked correctly')
+if wt.isna().sum() == 0 and mut.isna().sum() == 0:
+    print('PASS: 3-letter wt&mut residue extraction worked correctly:'
+          , '\nNo NAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+else:
+    print('FAIL: 3-letter wt&mut residue extraction failed'
+          , '\nNAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+
+if mylen2 == mylen1 + ncol_aa_add:
+    print('PASS: successfully added', ncol_aa_add, 'cols'
+          , '\nold length:', mylen1
+          , '\nnew len:', mylen2)
+else:
+    print('FAIL: failed to add cols:'
+          , '\nold length:', mylen1
+          , '\nnew len:', mylen2)
 # clear variables
 del(k, v, wt, mut, lookup_dict)
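The water-property cycle above is repeated below for polarity and calcprop with only the dict key and the two new column names changing. If this section ever gets refactored, a helper along these lines would keep each lookup cycle to one call — purely a sketch, not part of the script, and the function name and parameters are made up:

```
def add_aa_prop(df, my_aa_dict, prop, wt_col, mut_col, gene_regex):
    """One lookup cycle: map an aa property onto wild-type and mutant residues."""
    lookup_dict = {k: v[prop] for k, v in my_aa_dict.items()}
    wt = df['mutation'].str.extract(gene_regex).squeeze()
    mut = df['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
    df[wt_col] = wt.map(lookup_dict)
    df[mut_col] = mut.map(lookup_dict)
    return df

# e.g.
# gene_LF1 = add_aa_prop(gene_LF1, my_aa_dict, 'aa_prop_water', 'wt_prop_water', 'mut_prop_water', gene_regex)
# gene_LF1 = add_aa_prop(gene_LF1, my_aa_dict, 'aa_prop_polarity', 'wt_prop_polarity', 'mut_prop_polarity', gene_regex)
```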
@@ -811,19 +892,92 @@ del(k, v, wt, mut, lookup_dict)
 # lookup_dict = {three_letter_code: aa_prop_polarity}
 # Do this for both wild_type and mutant as above.
 #=========
-# initialise a sub dict that is lookup dict for three letter code to aa prop
-lookup_dict = dict()
+print('Adding', ncol_aa_add, 'more cols:\n')
+# initialise a sub dict that is lookup dict for three letter code to aa prop
+# adding two more cols
+lookup_dict = dict()
 for k, v in my_aa_dict.items():
     lookup_dict[k] = v['aa_prop_polarity']
     #print(lookup_dict)
-    wt = pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
-    pnca_LF1['wt_prop_polarity'] = wt.map(lookup_dict)
-    mut = pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
-    pnca_LF1['mut_prop_polarity'] = mut.map(lookup_dict)
-
-# added two more cols
+#   wt = gene_LF1['mutation'].str.extract('gene_p.(\w{3})').squeeze() # converts to a series that map works on
    wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze()
+    gene_LF1['wt_prop_polarity'] = wt.map(lookup_dict)
+    mut = gene_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
+    gene_LF1['mut_prop_polarity'] = mut.map(lookup_dict)
+mylen3 = len(gene_LF1.columns)
+
+# sanity checks
+print('checking if 3-letter wt&mut residue extraction worked correctly')
+if wt.isna().sum() == 0 and mut.isna().sum() == 0:
+    print('PASS: 3-letter wt&mut residue extraction worked correctly:'
+          , '\nNo NAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+else:
+    print('FAIL: 3-letter wt&mut residue extraction failed'
+          , '\nNAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+
+if mylen3 == mylen2 + ncol_aa_add:
+    print('PASS: successfully added', ncol_aa_add, 'cols'
+          , '\nold length:', mylen2
+          , '\nnew len:', mylen3)
+else:
+    print('FAIL: failed to add cols:'
+          , '\nold length:', mylen2
+          , '\nnew len:', mylen3)
+
+# clear variables
+del(k, v, wt, mut, lookup_dict)
+
+#========
+# iterate through the dict, create a lookup dict i.e
+# lookup_dict = {three_letter_code: aa_calcprop}
+# Do this for both wild_type and mutant as above.
+#=========
+print('Adding', ncol_aa_add, 'more cols:\n')
+
+lookup_dict = dict()
+for k, v in my_aa_dict.items():
+    lookup_dict[k] = v['aa_calcprop']
+    #print(lookup_dict)
+#   wt = gene_LF1['mutation'].str.extract('gene_p.(\w{3})').squeeze() # converts to a series that map works on
+    wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze()
+    gene_LF1['wt_calcprop'] = wt.map(lookup_dict)
+    mut = gene_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
+    gene_LF1['mut_calcprop'] = mut.map(lookup_dict)
+
+mylen4 = len(gene_LF1.columns)
+
+# sanity checks
+print('checking if 3-letter wt&mut residue extraction worked correctly')
+if wt.isna().sum() == 0 and mut.isna().sum() == 0:
+    print('PASS: 3-letter wt&mut residue extraction worked correctly:'
+          , '\nNo NAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+else:
+    print('FAIL: 3-letter wt&mut residue extraction failed'
+          , '\nNAs detected:'
+          , '\nwild-type\n', wt
+          , '\nmutant-type\n', mut
+          , '\ndim of df:', gene_LF1.shape)
+
+if mylen4 == mylen3 + ncol_aa_add:
+    print('PASS: successfully added', ncol_aa_add, 'cols'
+          , '\nold length:', mylen3
+          , '\nnew len:', mylen4)
+else:
+    print('FAIL: failed to add cols:'
+          , '\nold length:', mylen3
+          , '\nnew len:', mylen4)
+
 # clear variables
 del(k, v, wt, mut, lookup_dict)
@@ -833,56 +987,62 @@ del(k, v, wt, mut, lookup_dict)
 # Do this for both wild_type and mutant as above.
 # caution: taylor mapping will create a list within a column
 #=========
+#print('Adding', ncol_aa_add, 'more cols:\n')
 #lookup_dict = dict()
-
 #for k, v in my_aa_dict.items():
 #    lookup_dict[k] = v['aa_taylor']
-#    #print(lookup_dict)
-#    wt = pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
-#    pnca_LF1['wt_taylor'] = wt.map(lookup_dict)
-#    mut = pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
-#    pnca_LF1['mut_taylor'] = mut.map(lookup_dict)
+#    #print(lookup_dict)
+#    wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze()
+#    gene_LF1['wt_taylor'] = wt.map(lookup_dict)
+#    mut = gene_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
+#    gene_LF1['mut_taylor'] = mut.map(lookup_dict)
-# added two more cols
+#mylen5 = len(gene_LF1.columns)
+
+# sanity checks
+#print('checking if 3-letter wt&mut residue extraction worked correctly')
+#if wt.isna().sum() & mut.isna().sum() == 0:
+#    print('PASS: 3-letter wt&mut residue extraction worked correctly:'
+#          , '\nNo NAs detected:'
+#          , '\nwild-type\n', wt
+#          , '\nmutant-type\n', mut
+#          , '\ndim of df:', gene_LF1.shape)
+#else:
+#    print('FAIL: 3-letter wt&mut residue extraction failed'
+#          , '\nNo NAs detected:'
+#          , '\nwild-type\n', wt
+#          , '\nmutant-type\n', mut
+#          , '\ndim of df:', gene_LF1.shape)
+
+#if mylen5 == mylen4 + ncol_aa_add:
+#    print('PASS: successfully added', ncol_aa_add, 'cols'
+#          , '\nold length:', mylen4
+#          , '\nnew len:', mylen5)
+#else:
+#    print('FAIL: failed to add cols:'
+#          , '\nold length:', mylen4
+#          , '\nnew len:', mylen5)
 # clear variables
 #del(k, v, wt, mut, lookup_dict)
-#========
-# iterate through the dict, create a lookup dict that i.e
-# lookup_dict = {three_letter_code: aa_calcprop}
-# Do this for both wild_type and mutant as above.
-#=========
-lookup_dict = dict()
-
-for k, v in my_aa_dict.items():
-    lookup_dict[k] = v['aa_calcprop']
-    #print(lookup_dict)
-    wt = pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
-    pnca_LF1['wt_calcprop'] = wt.map(lookup_dict)
-    mut = pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
-    pnca_LF1['mut_calcprop'] = mut.map(lookup_dict)
-
-# added two more cols
-# clear variables
-del(k, v, wt, mut, lookup_dict)
-
 ########
 # combine the wild_type+position+mutant_type columns to generate
 # Mutationinformation (matches mCSM output field)
 # Remember to use .map(str) for int col types to allow string concatenation
 #########
-pnca_LF1['Mutationinformation'] = pnca_LF1['wild_type'] + pnca_LF1.position.map(str) + pnca_LF1['mutant_type']
+gene_LF1['Mutationinformation'] = gene_LF1['wild_type'] + gene_LF1.position.map(str) + gene_LF1['mutant_type']
 print('Created column: Mutationinformation'
-      , '\n===============================================================')
+      , '\n====================================================================='
+      , gene_LF1.Mutationinformation.head(10))
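Why `.map(str)` matters here: a string Series and an integer Series cannot be concatenated directly. A toy illustration (using an int position column purely for the sake of the example):

```
import pandas as pd

df = pd.DataFrame({'wild_type': ['V', 'Q'], 'position': [180, 10], 'mutant_type': ['F', 'P']})

# df['wild_type'] + df['position'] + df['mutant_type']   # TypeError: str + int
df['Mutationinformation'] = df['wild_type'] + df['position'].map(str) + df['mutant_type']
print(df['Mutationinformation'].tolist())                 # ['V180F', 'Q10P']
```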
 #%% Write file: mCSM muts
-snps_only = pd.DataFrame(pnca_LF1['Mutationinformation'].unique())
+snps_only = pd.DataFrame(gene_LF1['Mutationinformation'].unique())
 snps_only.head()
 # assign column name
 snps_only.columns = ['Mutationinformation']
 # count how many positions this corresponds to
-pos_only = pd.DataFrame(pnca_LF1['position'].unique())
+pos_only = pd.DataFrame(gene_LF1['position'].unique())
 print('Checking NA in snps...') # should be 0
 if snps_only.Mutationinformation.isna().sum() == 0:
@@ -912,7 +1072,7 @@ print('Finished writing:', out_filename2
      , '\n=============================================================')
 del(out_filename2)
-#%% Write file: pnca_metadata (i.e pnca_LF1)
+#%% Write file: gene_metadata (i.e gene_LF1)
 # where each row has UNIQUE mutations NOT unique sample ids
 out_filename3 = gene.lower() + '_' + 'metadata.csv'
 outfile3 = outdir + '/' + out_filename3
@@ -921,15 +1081,15 @@ print('Writing file: LF formatted data'
      , '\nPath:', outdir
      , '\n============================================================')
-pnca_LF1.to_csv(outfile3, header = True, index = False)
+gene_LF1.to_csv(outfile3, header = True, index = False)
 print('Finished writing:', out_filename3
-      , '\nNo. of rows:', len(pnca_LF1)
-      , '\nNo. of cols:', len(pnca_LF1.columns)
+      , '\nNo. of rows:', len(gene_LF1)
+      , '\nNo. of cols:', len(gene_LF1.columns)
      , '\n=============================================================')
 del(out_filename3)
 #%% write file: mCSM style but with repetitions for MSA and logo plots
-all_muts_msa = pd.DataFrame(pnca_LF1['Mutationinformation'])
+all_muts_msa = pd.DataFrame(gene_LF1['Mutationinformation'])
 all_muts_msa.head()
 # assign column name
 all_muts_msa.columns = ['Mutationinformation']
@@ -978,7 +1138,7 @@ del(out_filename4)
 #%% write file for mutational positions
 # count how many positions this corresponds to
-pos_only = pd.DataFrame(pnca_LF1['position'].unique())
+pos_only = pd.DataFrame(gene_LF1['position'].unique())
 # assign column name
 pos_only.columns = ['position']
 # make sure dtype of column position is int or numeric and not string
diff --git a/meta_data_analysis/reference_dict.py b/scripts/reference_dict.py
similarity index 95%
rename from meta_data_analysis/reference_dict.py
rename to scripts/reference_dict.py
index 0461523..8087009 100644
--- a/meta_data_analysis/reference_dict.py
+++ b/scripts/reference_dict.py
@@ -23,34 +23,31 @@ homedir = os.path.expanduser('~')
 # set working dir
 #os.getcwd()
-#os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
+#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
 #os.getcwd()
 #=======================================================================
 #%% variable assignment: input and output
-drug = 'pyrazinamide'
-gene = 'pncA'
-gene_match = gene + '_p.'
+#drug = 'pyrazinamide'
+#gene = 'pncA'
+#gene_match = gene + '_p.'
 #==========
 # data dir
 #==========
-#indir = 'git/Data/pyrazinamide/input/original'
 datadir = homedir + '/' + 'git/Data'
 #=======
 # input
 #=======
-indir = datadir + '/' + drug + 'input'
 in_filename = 'aa_codes.csv'
-infile = indir + '/' + in_filename
+infile = datadir + '/' + in_filename
 print('Input filename:', in_filename
-      , '\nInput path:', indir
+      , '\nInput path:', datadir
      , '\n============================================================')
 #=======
 # output: No output
 #=======
 #outdir = datadir + '/' + drug + '/' + 'output'
 #out_filename = ''
 #outfile = outdir + '/' + out_filename
@@ -76,6 +73,7 @@ my_aa = my_aa.set_index('three_letter_code_lower') #20, 5
 # using 'index' creates a dict of dicts
 # using 'records' creates a list of dicts
 my_aa_dict = my_aa.to_dict('index') #20, with 5 subkeys
+print('Printing my_aa_dict:', my_aa_dict.keys())
 #================================================
 # dict of aa with their corresponding properties
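On the `to_dict('index')` vs `to_dict('records')` comment in reference_dict.py, a two-row illustration of the difference (the real aa_codes.csv table has 20 rows and several property columns; the values here are placeholders):

```
import pandas as pd

my_aa = pd.DataFrame({'three_letter_code_lower': ['val', 'phe'],
                      'one_letter_code': ['V', 'F']}).set_index('three_letter_code_lower')

print(my_aa.to_dict('index'))
# {'val': {'one_letter_code': 'V'}, 'phe': {'one_letter_code': 'F'}}   <- dict of dicts, keyed by index
print(my_aa.to_dict('records'))
# [{'one_letter_code': 'V'}, {'one_letter_code': 'F'}]                 <- list of dicts, index dropped
```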