import commit

2020-01-08 16:15:33 +00:00 · 2020-01-08 16:15:33 +00:00 · bccfe68192
commit bccfe68192
39 changed files with 6837 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -0,0 +1,59 @@
 mCSM Analysis
 =============
 This repo does mCSM analysis using Python (Pandas, numpy), bash and R.
 Requires an additional 'Data' directory (Supplied separately). Batteries not included.
 # directory structure
 # Assumptions
 1. git repos are cloned to `~/git`
 2. The script
 ## LSHTM\_analysis: Repo
 subdirs
 needs to call `../Data/input/original/`
 meta\_data\_analysis/
 	*.R
 	*.py
 needs to call `../Data/input/processed/<drug>`
 needs to output `../Data/output/results/<drug>`
 mcsm\_analysis/
 	<drug>/ (generated by `meta_data_analysis/pnca_data_extraction.py`. To be replaced with command line args or config option "soon")
 		scripts/ (changed from `Scripts/`)
 		*.R
 		*.py
 			mcsm/
 			*.sh
 			*.py
 			*.R
 			plotting/
 			*.R
 Data: Repo:
 # subdirs
 	input/
 		original/
 		processed/
 	output/
 	    <drug>/ (generated by `meta_data_analysis/pnca_data_extraction.py`. To be replaced with command line args or config option "soon")
    		results/
        		*.csv
        		*.xlsx
        		*.doc
        		*.txt
                plots/
                structure/
 More docs here as I write them.
--- a/mcsm.conf
+++ b/mcsm.conf
@ -0,0 +1,12 @@
 # This is not yet used, but will be soon :)
 [DEFAULT]
 mcsm_home = /home/tanu/git/github/LSHTM_analysis
 [pyrazinamide]
  gene = 'pnca'
  server = 'http://mcsm.melb.ac.uk/'
 [monkeyamide]
  gene = 'abcs'
  server = 'http://myserver.local/'
--- a/mcsm_analysis/pyrazinamide/scripts/.Rhistory
+++ b/mcsm_analysis/pyrazinamide/scripts/.Rhistory
@ -0,0 +1,512 @@
 ###########################
 # you need merged_df3
 # or
 # merged_df3_comp
 # since these have unique SNPs
 # I prefer to use the merged_df3
 # because using the _comp dataset means
 # we lose some muts and at this level, we should use
 # as much info as available
 ###########################
 # uncomment as necessary
 #%%%%%%%%%%%%%%%%%%%%%%%%
 # REASSIGNMENT
 my_df  = merged_df3
 #my_df = merged_df3_comp
 #%%%%%%%%%%%%%%%%%%%%%%%%
 # delete variables not required
 rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
 # quick checks
 colnames(my_df)
 str(my_df)
 ###########################
 # Data for bfactor figure
 # PS average
 # Lig average
 ###########################
 head(my_df$Position)
 head(my_df$ratioDUET)
 # order data frame
 df = my_df[order(my_df$Position),]
 head(df$Position)
 head(df$ratioDUET)
 #***********
 # PS: average by position
 #***********
 mean_DUET_by_position <- df %>%
 group_by(Position) %>%
 summarize(averaged.DUET = mean(ratioDUET))
 #***********
 # Lig: average by position
 #***********
 mean_Lig_by_position <- df %>%
 group_by(Position) %>%
 summarize(averaged.Lig = mean(ratioPredAff))
 #***********
 # cbind:mean_DUET_by_position and mean_Lig_by_position
 #***********
 combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
 # sanity check
 # mean_PS_Lig_Bfactor
 colnames(combined)
 colnames(combined) = c("Position"
 , "average_DUETR"
 , "Position2"
 , "average_PredAffR")
 colnames(combined)
 identical(combined$Position, combined$Position2)
 n = which(colnames(combined) == "Position2"); n
 combined_df = combined[,-n]
 max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
 max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
 #=============
 # output csv
 #============
 outDir = "~/Data/pyrazinamide/input/processed/"
 outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
 print(paste0("Output file with path will be:","", outFile))
 head(combined_df$Position); tail(combined_df$Position)
 write.csv(combined_df, outFile
 , row.names = F)
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
 getwd()
 ########################################################################
 # 				Installing and loading required packages 			   #
 ########################################################################
 source("../Header_TT.R")
 #source("barplot_colour_function.R")
 require(data.table)
 require(dplyr)
 ########################################################################
 #		 Read file: call script for combining df for PS		   	   #
 ########################################################################
 source("../combining_two_df.R")
 ###########################
 # This will return:
 # df with NA:
 # merged_df2
 # merged_df3
 # df without NA:
 # merged_df2_comp
 # merged_df3_comp
 ###########################
 #---------------------- PAY ATTENTION
 # the above changes the working dir
 #[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
 #---------------------- PAY ATTENTION
 ###########################
 # you need merged_df3
 # or
 # merged_df3_comp
 # since these have unique SNPs
 # I prefer to use the merged_df3
 # because using the _comp dataset means
 # we lose some muts and at this level, we should use
 # as much info as available
 ###########################
 # uncomment as necessary
 #%%%%%%%%%%%%%%%%%%%%%%%%
 # REASSIGNMENT
 my_df  = merged_df3
 #my_df = merged_df3_comp
 #%%%%%%%%%%%%%%%%%%%%%%%%
 # delete variables not required
 rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
 # quick checks
 colnames(my_df)
 str(my_df)
 ###########################
 # Data for bfactor figure
 # PS average
 # Lig average
 ###########################
 head(my_df$Position)
 head(my_df$ratioDUET)
 # order data frame
 df = my_df[order(my_df$Position),]
 head(df$Position)
 head(df$ratioDUET)
 #***********
 # PS: average by position
 #***********
 mean_DUET_by_position <- df %>%
 group_by(Position) %>%
 summarize(averaged.DUET = mean(ratioDUET))
 #***********
 # Lig: average by position
 #***********
 mean_Lig_by_position <- df %>%
 group_by(Position) %>%
 summarize(averaged.Lig = mean(ratioPredAff))
 #***********
 # cbind:mean_DUET_by_position and mean_Lig_by_position
 #***********
 combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
 # sanity check
 # mean_PS_Lig_Bfactor
 colnames(combined)
 colnames(combined) = c("Position"
 , "average_DUETR"
 , "Position2"
 , "average_PredAffR")
 colnames(combined)
 identical(combined$Position, combined$Position2)
 n = which(colnames(combined) == "Position2"); n
 combined_df = combined[,-n]
 max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
 max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
 #=============
 # output csv
 #============
 outDir = "~/git/Data/pyrazinamide/input/processed/"
 outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
 print(paste0("Output file with path will be:","", outFile))
 head(combined_df$Position); tail(combined_df$Position)
 write.csv(combined_df, outFile
 , row.names = F)
 # read in pdb file complex1
 inDir = "~/git/Data/pyrazinamide/input/structure"
 inFile = paste0(inDir, "complex1_no_water.pdb")
 # read in pdb file complex1
 inDir = "~/git/Data/pyrazinamide/input/structure/"
 inFile = paste0(inDir, "complex1_no_water.pdb")
 complex1 = inFile
 my_pdb = read.pdb(complex1
 , maxlines = -1
 , multi = FALSE
 , rm.insert = FALSE
 , rm.alt = TRUE
 , ATOM.only = FALSE
 , hex = FALSE
 , verbose = TRUE)
 #########################
 #3: Read complex pdb file
 ##########################
 source("Header_TT.R")
 # list of 8
 my_pdb = read.pdb(complex1
 , maxlines = -1
 , multi = FALSE
 , rm.insert = FALSE
 , rm.alt = TRUE
 , ATOM.only = FALSE
 , hex = FALSE
 , verbose = TRUE)
 rm(inDir, inFile)
 #====== end of script
 inDir = "~/git/Data/pyrazinamide/input/structure/"
 inFile = paste0(inDir, "complex1_no_water.pdb")
 complex1 = inFile
 complex1 = inFile
 my_pdb = read.pdb(complex1
 , maxlines = -1
 , multi = FALSE
 , rm.insert = FALSE
 , rm.alt = TRUE
 , ATOM.only = FALSE
 , hex = FALSE
 , verbose = TRUE)
 inFile
 inDir = "~/git/Data/pyrazinamide/input/structure/"
 inFile = paste0(inDir, "complex1_no_water.pdb")
 complex1 = inFile
 #inFile2 = paste0(inDir, "complex2_no_water.pdb")
 #complex2 = inFile2
 # list of 8
 my_pdb = read.pdb(complex1
 , maxlines = -1
 , multi = FALSE
 , rm.insert = FALSE
 , rm.alt = TRUE
 , ATOM.only = FALSE
 , hex = FALSE
 , verbose = TRUE)
 rm(inDir, inFile, complex1)
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
 getwd()
 source("Header_TT.R")
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
 getwd()
 ########################################################################
 # 				Installing and loading required packages 			               #
 ########################################################################
 source("Header_TT.R")
 #########################################################
 # TASK: replace B-factors in the pdb file with normalised values
 # use the complex file with no water as mCSM lig was
 # performed on this file. You can check it in the script: read_pdb file.
 #########################################################
 ###########################
 # 2: Read file: average stability values
 # or mcsm_normalised file, output of step 4 mcsm pipeline
 ###########################
 inDir = "~/git/Data/pyrazinamide/input/processed/"
 inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
 my_df <- read.csv(inFile
 #                  , row.names = 1
 #                  , stringsAsFactors = F
 , header = T)
 str(my_df)
 source("read_pdb.R") # list of 8
 # extract atom list into a variable
 # since in the list this corresponds to data frame, variable will be a df
 d = my_pdb[[1]]
 # make a copy: required for downstream sanity checks
 d2 = d
 # sanity checks: B factor
 max(d$b); min(d$b)
 par(oma = c(3,2,3,0)
 , mar = c(1,3,5,2)
 , mfrow = c(3,2))
 #par(mfrow = c(3,2))
 #1: Original B-factor
 hist(d$b
 , xlab = ""
 , main = "B-factor")
 plot(density(d$b)
 , xlab = ""
 , main = "B-factor")
 # 2: DUET scores
 hist(my_df$average_DUETR
 , xlab = ""
 , main = "Norm_DUET")
 plot(density(my_df$average_DUETR)
 , xlab = ""
 , main = "Norm_DUET")
 # Set the margin on all sides
 par(oma = c(3,2,3,0)
 , mar = c(1,3,5,2)
 , mfrow = c(3,2))
 #par(mfrow = c(3,2))
 #1: Original B-factor
 hist(d$b
 , xlab = ""
 , main = "B-factor")
 plot(density(d$b)
 , xlab = ""
 , main = "B-factor")
 # 2: DUET scores
 hist(my_df$average_DUETR
 , xlab = ""
 , main = "Norm_DUET")
 plot(density(my_df$average_DUETR)
 , xlab = ""
 , main = "Norm_DUET")
 #=========
 # step 1_P1
 #=========
 # Be brave and replace in place now (don't run sanity check)
 # this makes all the B-factor values in the non-matched positions as NA
 d$b = my_df$average_DUETR[match(d$resno, my_df$Position)]
 #=========
 # step 2_P1
 #=========
 # count NA in Bfactor
 b_na = sum(is.na(d$b)) ; b_na
 # count number of 0's in Bactor
 sum(d$b == 0)
 # replace all NA in b factor with 0
 d$b[is.na(d$b)] = 0
 # sanity check: should be 0
 sum(is.na(d$b))
 # sanity check: should be True
 if (sum(d$b == 0) == b_na){
 print ("Sanity check passed: NA's replaced with 0's successfully")
 } else {
 print("Error: NA replacement NOT successful, Debug code!")
 }
 max(d$b); min(d$b)
 # sanity checks: should be True
 if(max(d$b) == max(my_df$average_DUETR)){
 print("Sanity check passed: B-factors replaced correctly")
 } else {
 print ("Error: Debug code please")
 }
 if (min(d$b) == min(my_df$average_DUETR)){
 print("Sanity check passed: B-factors replaced correctly")
 } else {
 print ("Error: Debug code please")
 }
 #=========
 # step 3_P1
 #=========
 # sanity check: dim should be same before reassignment
 # should be TRUE
 dim(d) == dim(d2)
 #=========
 # step 4_P1
 #=========
 # assign it back to the pdb file
 my_pdb[[1]] = d
 max(d$b); min(d$b)
 #=========
 # step 5_P1
 #=========
 # output dir
 getwd()
 outDir = "~/git/Data/pyrazinamide/output/"
 getwd()
 outFile = paste0(outDir, "complex1_BwithNormDUET.pdb")
 outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
 outDir = "~/git/Data/pyrazinamide/input/structure"
 outDir = "~/git/Data/pyrazinamide/input/structure/"
 outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
 write.pdb(my_pdb, outFile)
 hist(d$b
 , xlab = ""
 , main = "repalced-B")
 plot(density(d$b)
 , xlab = ""
 , main = "replaced-B")
 # graph titles
 mtext(text = "Frequency"
 , side = 2
 , line = 0
 , outer = TRUE)
 mtext(text = "DUET_stability"
 , side = 3
 , line = 0
 , outer = TRUE)
 #=========================================================
 # Processing P2: Replacing  B values with PredAff Scores
 #=========================================================
 # clear workspace
 rm(list = ls())
 #=========================================================
 # Processing P2: Replacing  B values with PredAff Scores
 #=========================================================
 # clear workspace
 rm(list = ls())
 ###########################
 # 2: Read file: average stability values
 # or mcsm_normalised file, output of step 4 mcsm pipeline
 ###########################
 inDir = "~/git/Data/pyrazinamide/input/processed/"
 inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
 my_df <- read.csv("../Data/mean_PS_Lig_Bfactor.csv"
 #                  , row.names = 1
 #                  , stringsAsFactors = F
 , header = T)
 str(my_df)
 #=========================================================
 # Processing P2: Replacing B factor with mean ratioLig scores
 #=========================================================
 #########################
 # 3: Read complex pdb file
 # form the R script
 ##########################
 source("read_pdb.R") # list of 8
 # extract atom list into a vari
 inDir = "~/git/Data/pyrazinamide/input/processed/"
 inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
 my_df <- read.csv(inFile
 #                  , row.names = 1
 #                  , stringsAsFactors = F
 , header = T)
 str(my_df)
 # extract atom list into a variable
 # since in the list this corresponds to data frame, variable will be a df
 d = my_pdb[[1]]
 # make a copy: required for downstream sanity checks
 d2 = d
 # sanity checks: B factor
 max(d$b); min(d$b)
 par(oma = c(3,2,3,0)
 , mar = c(1,3,5,2)
 , mfrow = c(3,2))
 #par(mfrow = c(3,2))
 # 1: Original B-factor
 hist(d$b
 , xlab = ""
 , main = "B-factor")
 plot(density(d$b)
 , xlab = ""
 , main = "B-factor")
 # 2: Pred Aff scores
 hist(my_df$average_PredAffR
 , xlab = ""
 , main = "Norm_lig_average")
 plot(density(my_df$average_PredAffR)
 , xlab = ""
 , main = "Norm_lig_average")
 # 3: After the following replacement
 #********************************
 par(oma = c(3,2,3,0)
 , mar = c(1,3,5,2)
 , mfrow = c(3,2))
 #par(mfrow = c(3,2))
 # 1: Original B-factor
 hist(d$b
 , xlab = ""
 , main = "B-factor")
 plot(density(d$b)
 , xlab = ""
 , main = "B-factor")
 # 2: Pred Aff scores
 hist(my_df$average_PredAffR
 , xlab = ""
 , main = "Norm_lig_average")
 plot(density(my_df$average_PredAffR)
 , xlab = ""
 , main = "Norm_lig_average")
 # 3: After the following replacement
 #********************************
 #=========
 # step 1_P2: BE BRAVE and replace in place now (don't run step 0)
 #=========
 # this makes all the B-factor values in the non-matched positions as NA
 d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)]
 #=========
 # step 2_P2
 #=========
 # count NA in Bfactor
 b_na = sum(is.na(d$b)) ; b_na
 # count number of 0's in Bactor
 sum(d$b == 0)
 # replace all NA in b factor with 0
 d$b[is.na(d$b)] = 0
 # sanity check: should be 0
 sum(is.na(d$b))
 if (sum(d$b == 0) == b_na){
 print ("Sanity check passed: NA's replaced with 0's successfully")
 } else {
 print("Error: NA replacement NOT successful, Debug code!")
 }
 max(d$b); min(d$b)
 # sanity checks: should be True
 if (max(d$b) == max(my_df$average_PredAffR)){
 print("Sanity check passed: B-factors replaced correctly")
 } else {
 print ("Error: Debug code please")
 }
 if (min(d$b) == min(my_df$average_PredAffR)){
 print("Sanity check passed: B-factors replaced correctly")
 } else {
 print ("Error: Debug code please")
 }
 #=========
 # step 3_P2
 #=========
 # sanity check: dim should be same before reassignment
 # should be TRUE
 dim(d) == dim(d2)
 #=========
 # step 4_P2
 #=========
 # assign it back to the pdb file
 my_pdb[[1]] = d
 max(d$b); min(d$b)
 #=========
 # step 5_P2
 #=========
 write.pdb(my_pdb, "Plotting/structure/complex1_BwithNormLIG.pdb")
 # output dir
 getwd()
 # output dir
 outDir = "~/git/Data/pyrazinamide/input/structure/"
 outFile = paste0(outDir, "complex1_BwithNormLIG.pdb")
 outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile
 write.pdb(my_pdb, outFile)
--- a/mcsm_analysis/pyrazinamide/scripts/Header_TT.R
+++ b/mcsm_analysis/pyrazinamide/scripts/Header_TT.R
@ -0,0 +1,129 @@
 #########################################################
 ### A) Installing and loading required packages
 #
 # Every block below follows the same pattern:
 #   1. try to attach the package with require(), which returns
 #      FALSE (instead of stopping) when the package is missing;
 #   2. if it is missing, install it and then attach it with
 #      library(), which stops with an error if that still fails.
 # The package named in require(), install.packages() and
 # library() must therefore be the same in each block.
 #########################################################
 #if (!require("gplots")) {
 #  install.packages("gplots", dependencies = TRUE)
 #  library(gplots)
 #}
 if (!require("tidyverse")) {
   install.packages("tidyverse", dependencies = TRUE)
   library(tidyverse)
 }
 if (!require("ggplot2")) {
   install.packages("ggplot2", dependencies = TRUE)
   library(ggplot2)
 }
 # was: installed the misspelt "copwplot" and attached ggplot2
 if (!require("cowplot")) {
   install.packages("cowplot", dependencies = TRUE)
   library(cowplot)
 }
 if (!require("ggcorrplot")) {
   install.packages("ggcorrplot", dependencies = TRUE)
   library(ggcorrplot)
 }
 if (!require("ggpubr")) {
   install.packages("ggpubr", dependencies = TRUE)
   library(ggpubr)
 }
 if (!require("RColorBrewer")) {
   install.packages("RColorBrewer", dependencies = TRUE)
   library(RColorBrewer)
 }
 if (!require("GOplot")) {
   install.packages("GOplot")
   library(GOplot)
 }
 if (!require("VennDiagram")) {
   install.packages("VennDiagram", dependencies = TRUE)
   library(VennDiagram)
 }
 if (!require("scales")) {
   install.packages("scales", dependencies = TRUE)
   library(scales)
 }
 if (!require("plotrix")) {
   install.packages("plotrix", dependencies = TRUE)
   library(plotrix)
 }
 # stats and stats4 are distributed with R itself, so the install
 # branch of these two blocks is not expected to run in practice.
 if (!require("stats")) {
   install.packages("stats", dependencies = TRUE)
   library(stats)
 }
 if (!require("stats4")) {
   install.packages("stats4", dependencies = TRUE)
   library(stats4)
 }
 # was: attached stats4 and never installed/attached data.table
 if (!require("data.table")) {
   install.packages("data.table", dependencies = TRUE)
   library(data.table)
 }
 # was: library(PerformaceAnalytics) (misspelt package name)
 if (!require("PerformanceAnalytics")) {
   install.packages("PerformanceAnalytics", dependencies = TRUE)
   library(PerformanceAnalytics)
 }
 if (!require("GGally")) {
   install.packages("GGally")
   library(GGally)
 }
 if (!require("corrr")) {
   install.packages("corrr")
   library(corrr)
 }
 if (!require("psych")) {
   install.packages("psych")
   library(psych)
 }
 # the next three blocks used to attach psych after installing
 # dplyr / compare / arsenal, leaving the new package unattached
 if (!require("dplyr")) {
   install.packages("dplyr")
   library(dplyr)
 }
 if (!require("compare")) {
   install.packages("compare")
   library(compare)
 }
 if (!require("arsenal")) {
   install.packages("arsenal")
   library(arsenal)
 }
 ####TIDYVERSE
 # Install
 #if(!require(devtools)) install.packages("devtools")
 #devtools::install_github("kassambara/ggcorrplot")
 # ggcorrplot is already handled above; this call is kept so the
 # script still stops here if the package could not be attached.
 library(ggcorrplot)
 ###for PDB files
 #install.packages("bio3d")
 if (!require("bio3d")) {
   install.packages("bio3d")
   library(bio3d)
 }
--- a/mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R
+++ b/mcsm_analysis/pyrazinamide/scripts/barplot_colour_function.R
@ -0,0 +1,27 @@
 #########################################################
 # 1b: Define function: coloured barplot by subgroup
 # LINK: https://stackoverflow.com/questions/49818271/stacked-barplot-with-colour-gradients-for-each-bar
 #########################################################
 # ColourPalleteMulti: build one colour gradient per group, with one
 # shade per distinct subgroup value inside that group.
 #
 # Arguments:
 #   df       - data frame containing both columns.
 #   group    - name of the grouping column, as a string.
 #   subgroup - name of the subgroup column, as a string.
 #
 # Returns:
 #   A character vector of colours. Groups appear in the order that
 #   aggregate() returns them; each group contributes as many colours
 #   as it has unique subgroup values, ramped from the light
 #   (l = 100) to the dark (l = 40) version of that group's hue.
 ColourPalleteMulti <- function(df, group, subgroup) {
   # Number of distinct subgroup values per group (formula: subgroup ~ group)
   categories <- aggregate(as.formula(paste(subgroup, group, sep = "~")),
                           df,
                           function(x) length(unique(x)))
   n_groups <- nrow(categories)
   # Same hues at two luminance levels: top and bottom of each gradient
   category.start <- scales::hue_pal(l = 100)(n_groups)
   category.end <- scales::hue_pal(l = 40)(n_groups)
   # seq_len() (not 1:n) so that zero groups yields zero iterations
   colours <- unlist(lapply(seq_len(n_groups), function(i) {
     ramp <- colorRampPalette(colors = c(category.start[i], category.end[i]))
     # column 2 of `categories` holds the unique-subgroup count
     ramp(categories[i, 2])
   }))
   colours
 }
 #########################################################
--- a/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R
+++ b/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R
@ -0,0 +1,299 @@
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/")
 getwd()
 #########################################################
 # TASK: To combine mcsm and meta data with af and or
 #########################################################
 ########################################################################
 # 				Installing and loading required packages 			   #
 ########################################################################
 source("Header_TT.R")
 #require(data.table)
 #require(arsenal)
 #require(compare)
 #library(tidyverse)
 #################################
 # Read file: normalised file
 # output of step 4 mcsm_pipeline
 #################################
 inDir = "~/git/Data/pyrazinamide/input/processed/"
 inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
 mcsm_data = read.csv(inFile
                     , row.names = 1
                     , stringsAsFactors = F
                     , header = T) 
 rm(inDir, inFile)
 str(mcsm_data)
 table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
 # spelling Correction 1: DUET
 mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising'
 mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising'
 # checks: should be the same as above
 table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
 head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)
 # spelling Correction 2: Ligand
 table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
 mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising'
 mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising'
 # checks: should be the same as above
 table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
 head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)
 # count na in each column
 na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count
 # sort by Mutationinformation
 mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),]
 head(mcsm_data$Mutationinformation)
 # get freq count of positions and add to the df
 setDT(mcsm_data)[, occurrence := .N, by = .(Position)] 
 pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence)
 ###########################
 # 2: Read file: meta data with AFandOR
 ###########################
 inDir = "~/git/Data/pyrazinamide/input/processed/"
 inFile2 = paste0(inDir, "meta_data_with_AFandOR.csv"); inFile2
 meta_with_afor <- read.csv(inFile2
                      , stringsAsFactors = F
                      , header = T)
 rm(inDir, inFile2)
 str(meta_with_afor)
 # sort by Mutationinformation
 head(meta_with_afor$Mutationinformation)
 meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),]
 head(meta_with_afor$Mutationinformation)
 # sanity check: should be True for all the mentioned columns
 #is.numeric(meta_with_afor$OR)
 na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue")
 c1 = NULL
 for (i in na_var){
  print(i)
  c0 = is.numeric(meta_with_afor[,i])
  c1 = c(c0, c1)
  if ( all(c1) ){
    print("Sanity check passed: These are all numeric cols")
  } else{
    print("Error: Please check your respective data types")
  }
 }
 # If OR, and P value are not numeric, then convert to numeric and then count
 # else they will say 0
 na_count = sapply(meta_with_afor, function(y) sum(length(which(is.na(y))))); na_count
 str(na_count)
 # compare if the No of "NA" are the same for all these cols
 na_len = NULL
 for (i in na_var){
  temp = na_count[[i]]
  na_len = c(na_len, temp)
 }
 # extract how many NAs: 
 # should be all TRUE 
 # should be a single number since 
 # all the cols should have "equal" and "same" no. of NAs
 my_nrows = NULL
 for ( i in 1: (length(na_len)-1) ){
  #print(compare(na_len[i]), na_len[i+1])
  c = compare(na_len[i], na_len[i+1])
  if ( c$result ) {
    my_nrows = na_len[i] }
  else { 
  print("Error: Please check your numbers") 
  }
 }
 my_nrows
 #=#=#=#=#=#=#=#=#
 # COMMENT: AF, OR, pvalue, logor and neglog10pvalue
 # these are the same 7 ones
 #=#=#=#=#=#=#=#=#
 # sanity check
 #which(is.na(meta_with_afor$OR)) 
 # initialise an empty df with nrows as extracted above
 na_count_df = data.frame(matrix(vector(mode = 'numeric'
 #                                       , length = length(na_var)
                                       )
                                , nrow = my_nrows
 #                                , ncol = length(na_var)
                              ))
 # populate the df with the indices of the cols that are NA
 for (i in na_var){
  print(i)
  na_i = which(is.na(meta_with_afor[i]))
  na_count_df = cbind(na_count_df, na_i)
  colnames(na_count_df)[which(na_var == i)] <- i
 }
 # Now compare these indices to ensure these are the same
 c2 = NULL
 for ( i in 1: ( length(na_count_df)-1 ) ) {
 #  print(na_count_df[i] == na_count_df[i+1])
  c1 = identical(na_count_df[[i]], na_count_df[[i+1]])
  c2 = c(c1, c2)
  if ( all(c2) ) {
    print("Sanity check passed: The indices for AF, OR, etc are all the same")
  } else {
    print ("Error: Please check indices which are NA")
  }
 }
 rm( c, c0, c1, c2, i, my_nrows
    , na_count, na_i, na_len
    , na_var, temp
    , na_count_df
    , pos_count_check )
 ###########################
 # 3:merging two dfs: with NA
 ###########################
 # link col name  = Mutationinformation
 head(mcsm_data$Mutationinformation)
 head(meta_with_afor$Mutationinformation)
 #########
 # merge 1a: meta data with mcsm
 #########
 merged_df2 = merge(x = meta_with_afor
                  ,y = mcsm_data
                  , by = "Mutationinformation"
                  , all.y = T)
 head(merged_df2$Position)
 # sort by Position
 head(merged_df2$Position)
 merged_df2 = merged_df2[order(merged_df2$Position),]
 head(merged_df2$Position)
 merged_df2v2 = merge(x = meta_with_afor
                   ,y = mcsm_data
                   , by = "Mutationinformation"
                   , all.x = T) 
 #!=!=!=!=!=!=!=!
 # COMMENT: used all.y since position 186 is not part of the struc,
 # hence doesn't have a mcsm value
 # but 186 is associated with a mutation
 #!=!=!=!=!=!=!=!
 # should  be False
 identical(merged_df2, merged_df2v2)
 table(merged_df2$Position%in%merged_df2v2$Position)
 rm(merged_df2v2)
 #########
 # merge 1b:remove duplicate mutation information
 #########
 #==#=#=#=#=#=#
 # Cannot trust lineage, country from this df as the same mutation
 # can have many different lineages
 # but this should be good for the numerical corr plots
 #=#=#=#=#=#=#=
 merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),] 
 head(merged_df3$Position); tail(merged_df3$Position) # should be sorted
 # sanity checks
 # nrows of merged_df3 should be the same as the nrows of mcsm_data
 if(nrow(mcsm_data) == nrow(merged_df3)){
  print("sanity check: Passed")
 } else {
  print("Error!: check data, nrows is not as expected")
 }
 #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 # uncomment as necessary
 # only need to run this if merged_df2v2 i.e non structural pos included
 #mcsm = mcsm_data$Mutationinformation
 #my_merged = merged_df3$Mutationinformation
 # find the index where it differs
 #diff_n = which(!my_merged%in%mcsm)
 #check if it is indeed pos 186
 #merged_df3[diff_n,]
 # remove this entry
 #merged_df3 = merged_df3[-diff_n,]]
 #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ###########################
 # 3b :merging two dfs: without NA
 ###########################
 #########
 # merge 2a:same as merge 1 but excluding NA
 #########
 merged_df2_comp = merged_df2[!is.na(merged_df2$AF),] 
 #########
 # merge 2b: remove duplicate mutation information
 #########
 merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),] 
 # alternate way of deriving merged_df3_comp:
 # drop the rows with missing AF from the already de-duplicated merged_df3
 foo = merged_df3[!is.na(merged_df3$AF),]
 # compare dfs: foo and merged_df3_comp
 # (the comparison target must be merged_df3_comp; foo is only a row
 # subset of merged_df3, so comparing against merged_df3 cannot confirm
 # that the two derivations agree)
 all.equal(foo, merged_df3_comp)
 summary(comparedf(foo, merged_df3_comp))
 #=============== end of combining df
 #clear variables
 rm(mcsm_data
   , meta_with_afor
   , foo)
 #rm(diff_n, my_merged, mcsm)
 #=====================
 # write_output files
 #=====================
 # output dir
 outDir = "~/git/Data/pyrazinamide/output/"
 getwd()
 outFile1 = paste0(outDir, "merged_df3.csv"); outFile1
 write.csv(merged_df3, outFile1)
 #outFile2 = paste0(outDir, "merged_df3_comp.csv"); outFile2
 #write.csv(merged_df3_comp, outFile2)
 rm(outDir
   , outFile1
 #   , outFile2
 )
 #============================= end of script
--- a/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R
+++ b/mcsm_analysis/pyrazinamide/scripts/combining_two_df_lig.R
@ -0,0 +1,348 @@
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/")
 getwd()
 #########################################################
 # TASK: combine the mCSM results with the meta data
 # (AF and OR), restricted to mutations whose distance
 # to the ligand is below 10 Ang
 #########################################################
 #########################################################
 # Installing and loading required packages
 # NOTE(review): every loader below is commented out, yet
 # setDT(), compare() and comparedf() are called further
 # down -- presumably the packages are attached by the
 # script that sources this file; confirm.
 #########################################################
 #source("Header_TT.R")
 #require(data.table)
 #require(arsenal)
 #require(compare)
 #library(tidyverse)
 #################################
 # Input: normalised mCSM file
 # (output of step 4 mcsm_pipeline)
 #################################
 inDir <- "~/git/Data/pyrazinamide/input/processed/"
 inFile <- paste0(inDir, "mcsm_complex1_normalised.csv")
 inFile
 # first column of the csv becomes the row names
 mcsm_data <- read.csv(file = inFile
                       , header = TRUE
                       , row.names = 1
                       , stringsAsFactors = FALSE)
 rm(inDir, inFile)
 str(mcsm_data)
 # DUET outcome: counts per category and their total, before the correction
 table(mcsm_data$DUET_outcome)
 sum(table(mcsm_data$DUET_outcome))
 # spelling correction 1 (DUET): American -> British spelling
 mcsm_data$DUET_outcome[which(mcsm_data$DUET_outcome == "Stabilizing")] <- "Stabilising"
 mcsm_data$DUET_outcome[which(mcsm_data$DUET_outcome == "Destabilizing")] <- "Destabilising"
 # checks: the totals must be unchanged, only the labels differ
 table(mcsm_data$DUET_outcome)
 sum(table(mcsm_data$DUET_outcome))
 head(mcsm_data$DUET_outcome)
 tail(mcsm_data$DUET_outcome)
 # Ligand outcome: counts per category and their total, before the correction
 table(mcsm_data$Lig_outcome)
 sum(table(mcsm_data$Lig_outcome))
 # spelling correction 2 (Ligand): American -> British spelling
 mcsm_data$Lig_outcome[which(mcsm_data$Lig_outcome == "Stabilizing")] <- "Stabilising"
 mcsm_data$Lig_outcome[which(mcsm_data$Lig_outcome == "Destabilizing")] <- "Destabilising"
 # checks: the totals must be the same as before the correction
 table(mcsm_data$Lig_outcome)
 sum(table(mcsm_data$Lig_outcome))
 head(mcsm_data$Lig_outcome)
 tail(mcsm_data$Lig_outcome)
 ########################### !!! only for mcsm_lig
 # 4: Filter/subset data
 # Ligand analysis only uses mutations with
 # Dis_lig_Ang < 10 (Ang)
 ###########################
 # range of the distances before filtering
 max(mcsm_data$Dis_lig_Ang)
 min(mcsm_data$Dis_lig_Ang)
 # how many rows are inside (TRUE) / outside (FALSE) the cut-off
 table(mcsm_data$Dis_lig_Ang<10)
 # keep only the rows below 10 Ang; which() drops rows whose distance is NA
 mcsm_data2 <- mcsm_data[which(mcsm_data$Dis_lig_Ang < 10), ]
 # sanity checks: range after filtering
 max(mcsm_data2$Dis_lig_Ang)
 min(mcsm_data2$Dis_lig_Ang)
 # no of unique positions left
 length(unique(mcsm_data2$Position))
 # no of unique mutations left
 length(unique(mcsm_data2$Mutationinformation))
 # Destabilising vs Stabilising counts
 table(mcsm_data2$Lig_outcome) #{RESULT: no of mutations within 10Ang}
 #<<<<<<<<<<<<<<<<<<<<<<<<<<<
 # REASSIGNMENT: the rest of the script keeps using the name mcsm_data
 mcsm_data <- mcsm_data2
 #<<<<<<<<<<<<<<<<<<<<<<<<<<<
 #############################
 # Extra sanity check:
 # for mcsm_lig ONLY
 # Dis_lig_Ang should be <10
 #############################
 if (max(mcsm_data$Dis_lig_Ang) >= 10) {
   print ("Error: data should be filtered to be within 10Ang")
 } else {
   print ("Sanity check passed: lig data is <10Ang")
 }
 # the temporary copy is no longer needed
 rm(mcsm_data2)
 # no of NA per column
 na_count <- sapply(mcsm_data, function(column) sum(length(which(is.na(column)))))
 na_count
 head(mcsm_data$Mutationinformation)
 mcsm_data[mcsm_data$Mutationinformation=="Q10P",]
 mcsm_data[mcsm_data$Mutationinformation=="L4S",]
 # order the rows by Mutationinformation
 mcsm_data <- mcsm_data[order(mcsm_data$Mutationinformation), ]
 head(mcsm_data$Mutationinformation)
 # check
 mcsm_data[grep("Q10P", mcsm_data$Mutationinformation),]
 mcsm_data[grep("A102T", mcsm_data$Mutationinformation),]
 # add a column 'occurrence': no of rows sharing the same Position
 # (setDT converts mcsm_data to a data.table by reference)
 setDT(mcsm_data)[, occurrence := .N, by = .(Position)]
 pos_count_check <- data.frame(mcsm_data$Position, mcsm_data$occurrence)
 ###########################
 # 2: Input: meta data with AF and OR
 ###########################
 inDir <- "~/git/Data/pyrazinamide/input/processed/"
 inFile2 <- paste0(inDir, "meta_data_with_AFandOR.csv")
 inFile2
 meta_with_afor <- read.csv(file = inFile2
                            , header = TRUE
                            , stringsAsFactors = FALSE)
 str(meta_with_afor)
 # order the rows by Mutationinformation
 # (head() before and after, to eyeball the effect of the sort)
 head(meta_with_afor$Mutationinformation)
 meta_with_afor <- meta_with_afor[order(meta_with_afor$Mutationinformation), ]
 head(meta_with_afor$Mutationinformation)
 # sanity check: AF, OR, pvalue, logor and neglog10pvalue must all be numeric.
 # If OR or the P value were read as non-numeric, counting NA with is.na()
 # would report 0 for them, so the type check has to come first.
 #is.numeric(meta_with_afor$OR)
 na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue")
 is_num = vapply(na_var
                 , function(v) is.numeric(meta_with_afor[[v]])
                 , logical(1))
 print(is_num)
 # report once, on the result for ALL columns
 # (not once per column on a partial result)
 if ( all(is_num) ){
   print("Sanity check passed: These are all numeric cols")
 } else{
   print("Error: Please check your respective data types")
 }
 # NOW count na in each column of the meta data
 na_count = vapply(meta_with_afor
                   , function(y) sum(is.na(y))
                   , integer(1)); na_count
 str(na_count)
 # row indices that are NA: one list element per column in na_var
 na_idx = lapply(na_var, function(v) which(is.na(meta_with_afor[[v]])))
 names(na_idx) = na_var
 # compare if the No of "NA" are the same for all these cols
 na_len = lengths(na_idx); na_len
 if ( length(unique(na_len)) == 1 ) {
   my_nrows = na_len[[1]]
 } else {
   # no common count exists: leave my_nrows empty rather than keep a partial match
   my_nrows = NULL
   print("Error: Please check your numbers")
 }
 my_nrows
 #=#=#=#=#=#=#=#=#
 # COMMENT: AF, OR, pvalue, logor and neglog10pvalue
 # all have 81 NA, with pyrazinamide with 960
 # and these are the same 7 ones
 #=#=#=#=#=#=#=#=#
 # sanity check
 #which(is.na(meta_with_afor$OR))
 # Now compare the NA indices themselves: every column must be NA in
 # exactly the same rows as the first column
 same_idx = all(vapply(na_idx, identical, logical(1), na_idx[[1]]))
 if ( same_idx ) {
   print("Sanity check passed: The indices for AF, OR, etc are all the same")
 } else {
   print ("Error: Please check indices which are NA")
 }
 # clear the helper variables of this section
 # (pos_count_check was created earlier in this script)
 rm( is_num, my_nrows
    , na_count, na_idx, na_len
    , na_var, same_idx
    , pos_count_check )
 ###########################
 # 3: merging two dfs: with NA
 ###########################
 # both dfs are linked through the column Mutationinformation
 head(mcsm_data$Mutationinformation)
 head(meta_with_afor$Mutationinformation)
 #########
 # merge 1a: meta data with mcsm
 # all.y = TRUE keeps every row of mcsm_data
 #########
 merged_df2 <- merge(x = meta_with_afor
                     , y = mcsm_data
                     , by = "Mutationinformation"
                     , all.y = TRUE)
 head(merged_df2$Position)
 # order the rows by Position
 head(merged_df2$Position)
 merged_df2 <- merged_df2[order(merged_df2$Position), ]
 head(merged_df2$Position)
 # same merge, but keeping every row of meta_with_afor instead (all.x = TRUE)
 merged_df2v2 <- merge(x = meta_with_afor
                       , y = mcsm_data
                       , by = "Mutationinformation"
                       , all.x = TRUE)
 #!=!=!=!=!=!=!=!
 # COMMENT: all.y is the one used, since position 186 is not part of
 # the structure and hence doesn't have a mcsm value,
 # although 186 is associated with a mutation
 #!=!=!=!=!=!=!=!
 # should be False
 identical(merged_df2, merged_df2v2)
 table(merged_df2$Position%in%merged_df2v2$Position)
 rm(merged_df2v2)
 #########
 # merge 1b: remove duplicate mutation information
 #########
 #==#=#=#=#=#=#
 # Lineage and country cannot be trusted in this df, because the same
 # mutation can have many different lineages;
 # it should still be good for the numerical corr plots
 #=#=#=#=#=#=#=
 # keep the first row of every Mutationinformation value
 merged_df3 <- merged_df2[!duplicated(merged_df2$Mutationinformation), ]
 head(merged_df3$Position) # should be sorted
 tail(merged_df3$Position) # should be sorted
 # sanity checks
 # merged_df3 must have as many rows as mcsm_data
 if (nrow(merged_df3) != nrow(mcsm_data)) {
   print("Error!: check data, nrows is not as expected")
 } else {
   print("sanity check: Passed")
 }
 #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 # uncomment as necessary
 # only need to run this if merged_df2v2 i.e non structural pos included
 #mcsm = mcsm_data$Mutationinformation
 #my_merged = merged_df3$Mutationinformation
 # find the index where it differs
 #diff_n = which(!my_merged%in%mcsm)
 #check if it is indeed pos 186
 #merged_df3[diff_n,]
 # remove this entry
 #merged_df3 = merged_df3[-diff_n,]
 #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ###########################
 # 3b: merging two dfs: without NA
 ###########################
 #########
 # merge 2a: same as merge 1 but without the rows where AF is NA
 #########
 merged_df2_comp <- merged_df2[!is.na(merged_df2$AF), ]
 #########
 # merge 2b: remove duplicate mutation information
 #########
 merged_df3_comp <- merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation), ]
 # Sanity check (previously only verified manually):
 # merged_df3_comp can also be derived by dropping the rows with missing AF
 # from the already de-duplicated merged_df3; both routes must agree.
 foo = merged_df3[!is.na(merged_df3$AF),]
 # compare dfs: foo and merged_df3_comp
 # (NOT merged_df3: that one still contains the rows where AF is NA)
 if (isTRUE(all.equal(foo, merged_df3_comp))) {
  print("sanity check: Passed")
 } else {
  print("Error!: merged_df3_comp differs from merged_df3 without NA")
 }
 summary(comparedf(foo, merged_df3_comp))
 #=============== end of combining df
 #clear variables
 rm(mcsm_data
   , meta_with_afor
   , foo)
 #rm(diff_n, my_merged, mcsm)
 #===============end of script
 #=====================
 # write_output files
 #=====================
 # Not required as this is a subset of the "combining_two_df.R" script
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step0_check_duplicate_SNPs.sh
@ -0,0 +1,25 @@
 #!/bin/bash
 #*************************************
 # need to be in the correct directory
 #*************************************
 ##: comments for code
 #: commented out code
 #**********************************************************************
 # TASK: Text file containing a list of SNPs; SNP in the format(C2E)
 # per line. Sort by unique, which automatically removes duplicates.
 # The de-duplicated list is saved next to the input file.
 #**********************************************************************
 infile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2.csv"
 outfile="${HOME}/git/Data/input/processed/pyrazinamide/pnca_mis_SNPs_v2_unique.csv"
 ## stop early with a clear message when the input is missing
 if [[ ! -r "${infile}" ]]; then
   echo "Error: cannot read input file: ${infile}" >&2
   exit 1
 fi
 ## sort unique entries; paths are quoted because they are built from ${HOME}
 sort -u -- "${infile}" > "${outfile}"
 ## count no. of unique snps mCSM will run on
 ## (arithmetic expansion strips the padding some wc implementations add)
 count=$(( $(wc -l < "${outfile}") ))
 ## print to console no. of unique snps mCSM will run on
 echo "${count} unique mutations for mCSM to run on"
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh
@ -0,0 +1,72 @@
 #!/bin/bash
 #*************************************
 #need to be in the correct directory
 #*************************************
 ##: comments for code
 #: commented out code
 #**********************************************************************
 # TASK: submit requests using curl: HANDLE redirects and refresh url.
 # Iterate over mutation file and write/append result urls to a file
 # result url file: stored in the /Results directory
 # mutation file: one mutation per line, no chain ID
 # output: in a file, should be n urls (n=no. of mutations in file)
 # NOTE: these are just result urls, not actual values for results
 #**********************************************************************
 ## mutation file: read line by line, one curl submission per line
 filename="../Data/pnca_mis_SNPs_v2_unique.csv"
 ## submission settings: the same for every mutation, so set once here
 ## instead of on every pass through the loop
 pdb='../Data/complex1_no_water.pdb'
 chain="A"
 lig_id="PZA"
 affin_wt="0.99"
 host="http://biosig.unimelb.edu.au"
 call_url="/mcsm_lig/prediction"
 ##=========================================
 ##html field_names names required for curl
 ##complex_field:wild=@
 ##mutation_field:mutation=@
 ##chain_field:chain=@
 ##ligand_field:lig_id@
 ##energy_field:affin_wt
 ##=========================================
 ## stop early when an input is missing (curl -F would fail on every entry)
 for required in "${filename}" "${pdb}"; do
   if [[ ! -r "${required}" ]]; then
     echo "Error: cannot read ${required}" >&2
     exit 1
   fi
 done
 ## no. of entries: counted once; it is also part of the output file name
 ## (arithmetic expansion strips the padding some wc implementations add)
 total=$(( $(wc -l < "${filename}") ))
 outfile="../Results/${total}_mCSM_lig_complex1_result_url.txt"
 ## some useful messages
 echo "Processing ${total} entries from ${filename}"
 COUNT=0
 while read -r mutation; do
   COUNT=$((COUNT + 1))
   ## submit the form; keep only the line holding the meta refresh tag
   refresh_url=$(curl -L \
     -sS \
     -F "wild=@${pdb}" \
     -F "mutation=${mutation}" \
     -F "chain=${chain}" \
     -F "lig_id=${lig_id}" \
     -F "affin_wt=${affin_wt}" \
     "${host}${call_url}" | grep "http-equiv")
   ## use regex to extract the relevant bit (/mcsm...) from the refresh url
   ## tr joins multiple matching lines into one, as the former unquoted echo did,
   ## without exposing the html to word splitting and globbing
   result_url=$(printf '%s\n' "${refresh_url}" \
     | tr '\n' ' ' \
     | sed -r 's/.*(\/mcsm.*)".*$/\1/g')
   ## a result url must start with /mcsm: anything else means the submission
   ## failed. The line is still written so that line k of the output keeps
   ## matching mutation k, but the failure is reported on stderr.
   if [[ "${result_url}" != /mcsm* ]]; then
     echo "Warning: no result url received for '${mutation}' (entry ${COUNT})" >&2
   fi
   sleep 10
   echo -e "${mutation} : processing entry ${COUNT}/${total}..."
   ## build the result url from host and refresh url and append it to the file in the Results dir
   echo -e "${host}${result_url}" >> "${outfile}"
 done < "${filename}"
 echo
 echo "Processing Complete"
 ##end of submitting query, receiving result url and storing results url in a file
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh
@ -0,0 +1,59 @@
 #!/bin/bash
 #*************************************
 #need to be in the correct directory
 #*************************************
 ##: comments for code
 #: commented out code
 #********************************************************************
 # TASK: submit result urls and fetch actual results using curl
 # iterate over each result url from the output of step1, stored
 # in a file in /Results.
 # Use curl to fetch results and extract relevant sections using hxtools
 # and store these in another file in /Results
 # This script takes two OPTIONAL arguments:
 # 	input file: file containing results url
 #				default: 336_mCSM_lig_complex1_result_url.txt
 # 	output file: name of the file where extracted results will be stored
 #				default: 336_mCSM_lig_complex1_output_MASTER.txt
 # Usage: step2_mCSM_LIG_batch_outputurls_results.sh [INFILE [OUTFILE]]
 #*********************************************************************
 if [[ "$#" -gt 2 ]]; then
   echo "Usage: $0 [INFILE [OUTFILE]]" >&2
   exit 1
 fi
 ## First argument: Input File, Second argument: Output File
 ## without arguments the paths used so far are kept
 infile="${1:-${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_result_url.txt}"
 outfile="${2:-${HOME}/git/LSHTM_analysis/mcsm_complex1/Results/336_mCSM_lig_complex1_output_MASTER.txt}"
 ## stop early with a clear message when the url file is missing
 if [[ ! -r "${infile}" ]]; then
   echo "Error: cannot read input file: ${infile}" >&2
   exit 1
 fi
 ## no. of urls: counted once instead of on every iteration
 ## (arithmetic expansion strips the padding some wc implementations add)
 total=$(( $(wc -l < "${infile}") ))
 echo "Processing ${total} entries from ${infile}"
 COUNT=0
 while read -r line; do
   COUNT=$((COUNT + 1))
   ## fetch the result page, keep the text of div.span4 > div.well,
   ## strip the remaining html tags and all spaces, append to the output file
   ## (the url is quoted: characters such as ? or & must reach curl untouched)
   curl --silent "${line}" \
     | hxnormalize -x \
     | hxselect -c div.span4 \
     | hxselect -c div.well \
     | sed -r -e 's/<[^>]*>//g' \
     | sed -re 's/ +//g' \
     >> "${outfile}"
   #| tee -a ${outfile}
   echo -e "Processing entry ${COUNT}/${total}..."
 done < "${infile}"
 echo
 echo "Processing Complete"
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh
@ -0,0 +1,52 @@
 #!/bin/bash
 #*************************************
 #need to be in the correct directory
 #*************************************
 ##: comments for code
 #: commented out code
 #********************************************************************
 # TASK: Intermediate results processing
 # The output file has a convenient delimiter of ":" that can be used to
 # format the file into two columns (col1: field_desc and col2: values).
 # However the sections "PredictedAffinityChange:...." and
 # "DUETstabilitychange:.." are split over multiple lines, which
 # prevents this from happening. Additionally there are other empty lines
 # that need to be omitted. This script joins those sections back onto
 # a single line each and deletes the empty lines.
 #*********************************************************************
 infile="../Results/336_mCSM_lig_complex1_output_processed.txt"
 ## sed -i edits the file in place: refuse to run when it is not there
 if [[ ! -f "${infile}" ]]; then
   echo "Error: input file not found: ${infile}" >&2
   exit 1
 fi
 #sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${infile} \
 # | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${infile}
 # Outputs records separated by a newline, that look something like this:
 # PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing
 # Mutationinformation:
 # Wild-type:L
 # Position:4
 # Mutant-type:W
 # Chain:A
 # LigandID:PZA
 # Distancetoligand:15.911&Aring;
 # DUETstabilitychange:-2.169Kcal/mol
 #
 # PredictedAffinityChange:-1.538log(affinityfoldchange)-Destabilizing
 # (...etc)
 # This script brings everything in a convenient format for further processing in python.
 # bear in mind, this replaces the file in place, so make sure you retain a copy for your records
 ## PredictedAffinityChange: append the next 4 lines, then drop the embedded newlines
 ## DUETstabilitychange: append the next 2 lines, then drop the embedded newlines
 ## finally delete every empty line
 sed -i '/PredictedAffinityChange/ {
 N
 N
 N
 N
 s/\n//g
 }
 /DUETstabilitychange:/ {
 N
 N
 s/\n//g
 }
 /^$/d' "${infile}"
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py
@ -0,0 +1,29 @@
 #!/usr/bin/python
 import pandas as pd
 from collections import defaultdict
 # Reshape the "field:value" records written by step3a into a table with
 # one column per field, and save it as csv.
 #file = r'../Results/322_mCSM_lig_complex1_output_processed.txt'
 inFile = '../Results/336_mCSM_lig_complex1_output_processed.txt'
 outFile = '../Results/336_complex1_formatted_results.csv'
 # fields written to the csv, in this column order
 outCols=[
        'PredictedAffinityChange',
        'Mutationinformation',
        'Wild-type',
        'Position',
        'Mutant-type',
        'Chain',
        'LigandID',
        'Distancetoligand',
        'DUETstabilitychange'
        ]
 # field name -> list of values, one value per record
 outputs = defaultdict(list)
 # 'with' closes the file handle, which the former bare open() never did
 with open(inFile) as infile:
     for raw in infile:
         item = raw.rstrip('\n')
         # a stray empty line carries no field and would break the split below
         if not item:
             continue
         # split on the FIRST ':' only, so a value that itself contains ':'
         # is kept whole instead of raising ValueError
         col, val = item.split(':', 1)
         outputs[col].append(val)
 dfOut=pd.DataFrame(outputs)
 dfOut.to_csv(outFile, columns=outCols)
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_data_cleaning.R
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_data_cleaning.R
@ -0,0 +1,207 @@
 getwd()
 #setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
 setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
 #setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
 getwd()
 #=======================================================
 #TASK: To tidy the columns so you can generate figures
 # Leaves two data frames behind:
 #   data: the formatted results as read from file
 #   df  : de-duplicated and cleaned copy, with numeric score columns,
 #         renamed columns and the extra columns Lig_outcome,
 #         DUET_outcome and WildPos
 #=======================================================
 ####################
 #### read file #####: this will be the output from python script (csv file)
 ####################
 data <- read.csv("336_complex1_formatted_results.csv"
                  , header = TRUE
                  , stringsAsFactors = FALSE)
 dim(data)
 str(data)
 ###########################
 ##### Data processing #####
 ###########################
 # Mutationinformation is empty in the file: build it as <wild-type><position><mutant-type>
 head(data$Mutationinformation)
 tail(data$Mutationinformation)
 data$Mutationinformation <- paste0(data$Wild.type, data$Position, data$Mutant.type)
 # should not be blank any more
 head(data$Mutationinformation)
 tail(data$Mutationinformation)
 #write.csv(data, 'test.csv')
 ##########################################
 # Remove duplicate SNPs as a sanity check
 ##########################################
 #very important
 table(duplicated(data$Mutationinformation))
 #extract duplicated entries
 dups <- data[duplicated(data$Mutationinformation), ]
 #No of dups should match with the no. of TRUE in the above table
 nrow(dups)
 #***************************************************************
 #select non-duplicated SNPs and create a new df
 df <- data[!duplicated(data$Mutationinformation), ]
 #***************************************************************
 #sanity check: every unique mutation of data must be present in df
 u <- unique(df$Mutationinformation)
 u2 <- unique(data$Mutationinformation)
 table(u%in%u2)
 #every mutation must occur exactly once in df, hence this must equal nrow(df)
 sum(table(df$Mutationinformation) == 1)
 nrow(df)
 #sort df by Position
 #MANUAL CHECKPOINT:
 #foo <- df[order(df$Position),]
 #df <- df[order(df$Position),]
 # dups is removed here only: removing it a second time raised
 # an "object 'dups' not found" warning
 rm(u, u2, dups)
 ####################
 #### give meaningful colnames to reflect units to enable correct data type
 ####################
 #=======
 #STEP 1: PredictedAffinityChange -> PredAffLog (numeric)
 #========
 #keep a copy of the raw text first: STEP 2 extracts the outcome from it
 df$Lig_outcome <- df$PredictedAffinityChange
 head(df$PredictedAffinityChange)
 #drop everything from "log" onwards, which leaves only the number
 df$PredictedAffinityChange <- gsub("log.*"
                                    , ""
                                    , df$PredictedAffinityChange)
 head(df$PredictedAffinityChange)
 df$PredictedAffinityChange <- as.numeric(df$PredictedAffinityChange)
 #should be TRUE
 is.numeric( df$PredictedAffinityChange )
 #should be 0: values that failed the conversion become NA
 sum(is.na(df$PredictedAffinityChange))
 #change the column name to indicate units
 colnames(df)[colnames(df) == "PredictedAffinityChange"] <- "PredAffLog"
 #========
 #STEP 2: Lig_outcome (categorical): effect of the mutation on the ligand
 #========
 head(df$Lig_outcome)
 #drop everything up to and including the last "-", which leaves only the label
 df$Lig_outcome <- gsub("^.*-"
                        , ""
                        , df$Lig_outcome)
 head(df$Lig_outcome)
 df$Lig_outcome <- as.factor(df$Lig_outcome)
 #should be TRUE
 is.factor(df$Lig_outcome)
 #========
 #STEP 3: Distancetoligand -> Dis_lig_Ang (numeric)
 #========
 head(df$Distancetoligand)
 #drop the html entity used as the unit
 df$Distancetoligand <- gsub("&Aring;"
                             , ""
                             , df$Distancetoligand)
 head(df$Distancetoligand)
 df$Distancetoligand <- as.numeric(df$Distancetoligand)
 #should be TRUE
 is.numeric(df$Distancetoligand)
 #should be 0: values that failed the conversion become NA
 sum(is.na(df$Distancetoligand))
 #change the column name to indicate units
 colnames(df)[colnames(df) == "Distancetoligand"] <- "Dis_lig_Ang"
 #========
 #STEP 4: DUETstabilitychange -> DUETStability_Kcalpermol (numeric)
 #========
 head(df$DUETstabilitychange)
 #drop the unit
 df$DUETstabilitychange <- gsub("Kcal/mol"
                                , ""
                                , df$DUETstabilitychange)
 head(df$DUETstabilitychange)
 df$DUETstabilitychange <- as.numeric(df$DUETstabilitychange)
 #should be TRUE
 is.numeric(df$DUETstabilitychange)
 #should be 0: values that failed the conversion become NA
 sum(is.na(df$DUETstabilitychange))
 #change the column name to indicate units
 colnames(df)[colnames(df) == "DUETstabilitychange"] <- "DUETStability_Kcalpermol"
 #========
 #STEP 5: DUET_outcome: classification of the DUET stability only
 #========
 #a score of exactly 0 is classified as Stabilizing
 df$DUET_outcome <- ifelse(df$DUETStability_Kcalpermol >= 0
                           , "Stabilizing"
                           , "Destabilizing")
 table(df$Lig_outcome)
 table(df$DUET_outcome)
 #==============================
 #FIXME
 #Insert a venn diagram
 #================================
 #========
 #STEP 6: assign wild and mutant colnames correctly
 #========
 colnames(df)[colnames(df) == "Wild.type"] <- "Wild_type"
 colnames(df)[colnames(df) == "Mutant.type"] <- "Mutant_type"
 colnames(df)
 #========
 #STEP 7: WildPos (<wild-type><position>): maybe useful for some plots
 #========
 df$WildPos <- paste0(df$Wild_type, df$Position)
 dim(df)
 ################ end of data cleaning
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_normalise.R
@ -0,0 +1,252 @@
 getwd()
 #setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
 setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
 #setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
 getwd()
 #=======================================================
 #TASK:read cleaned data and perform rescaling
  # of DUET stability scores
  # of Pred affinity
 #compare the raw and the rescaled values with plots
 #output normalised file
 #=======================================================
 ####################
 #### read file #####: this will be the output of my R script that cleans the data columns
 ####################
 source("../Scripts/step3c_data_cleaning.R")
 ##This will output two dataframes:
 ##data: unclean data
 ##df : cleaned df
 ## data is not needed any further
 rm(data)
 colnames(df)
 #===============================================
 # Helpers: the same rescaling and the same plots are applied
 # to both scores, so each is written once here
 #===============================================
 # "ratio" rescaling, done separately for each sign (Less dangerous)
 #       =====> chosen one:as Nick prefers
 # negative values are divided by abs(min(x)), all other values by max(x);
 # when x holds both signs the result lies in [-1, 1] and keeps the sign of x
 ratio_rescale <- function(x) {
   stopifnot(is.numeric(x))
   my_min <- min(x)
   my_max <- max(x)
   ifelse(x < 0
          , x / abs(my_min)
          , x / my_max)
 }
 # 2 x 2 panel: histograms on top, density plots underneath;
 # raw values on the left, rescaled values on the right
 plot_rescaling <- function(raw, scaled, my_title) {
   # Set the margin on all sides; restore the previous settings on exit
   old_par <- par(oma = c(3,2,3,0)
                  , mar = c(1,3,5,2)
                  , mfrow = c(2,2))
   on.exit(par(old_par), add = TRUE)
   hist(raw
        , xlab = ""
        , main = "Raw values")
   hist(scaled
        , xlab = ""
        , main = "ratio rescaling")
   plot(density(raw)
        , main = "Raw values")
   plot(density(scaled)
        , main = "ratio rescaling")
   # titles
   mtext(text = "Frequency"
         , side = 2
         , line = 0
         , outer = TRUE)
   mtext(text = my_title
         , side = 3
         , line = 0
         , outer = TRUE)
   invisible(NULL)
 }
 #===================
 #3a: PredAffLog
 #===================
 # order according to PredAffLog values: negative values at the top and
 # positive at the bottom. This makes it easier to see the results of
 # rescaling for debugging
 df <- df[order(df$PredAffLog), ]
 head(df$PredAffLog) #all negatives
 tail(df$PredAffLog) #all positives
 #sanity checks: overall mean and mean per outcome
 mean(df$PredAffLog)
 tapply(df$PredAffLog, df$Lig_outcome, mean)
 #range of the raw data
 min(df$PredAffLog); max(df$PredAffLog)
 # create column to store the rescaled values
 df$ratioPredAff <- ratio_rescale(df$PredAffLog)
 #sanity checks: the overall range should be -1 to 1
 head(df$ratioPredAff)
 tail(df$ratioPredAff)
 min(df$ratioPredAff); max(df$ratioPredAff)
 tapply(df$ratioPredAff, df$Lig_outcome, min)
 tapply(df$ratioPredAff, df$Lig_outcome, max)
 #no of negative and positive values: should match the counts of
 #Destabilizing and Stabilizing in the table below
 sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0)
 table(df$Lig_outcome)
 # Hist and density plots to compare raw and rescaled values: Base R
 plot_rescaling(df$PredAffLog, df$ratioPredAff, "Ligand_stability")
 #===================
 # 3b: DUET stability
 #===================
 dim(df)
 # order according to DUET scores: negative values at the top and
 # positive at the bottom. This makes it easier to see the results of
 # rescaling for debugging
 df <- df[order(df$DUETStability_Kcalpermol), ]
 head(df$DUETStability_Kcalpermol) #negatives
 tail(df$DUETStability_Kcalpermol) #positives
 #sanity checks: overall mean and mean per outcome
 mean(df$DUETStability_Kcalpermol)
 tapply(df$DUETStability_Kcalpermol, df$DUET_outcome, mean)
 #range of the raw data
 min(df$DUETStability_Kcalpermol); max(df$DUETStability_Kcalpermol)
 # create column to store the rescaled values
 df$ratioDUET <- ratio_rescale(df$DUETStability_Kcalpermol)
 #sanity checks: the overall range should be -1 to 1
 head(df$ratioDUET)
 tail(df$ratioDUET)
 min(df$ratioDUET); max(df$ratioDUET)
 tapply(df$ratioDUET, df$DUET_outcome, min)
 tapply(df$ratioDUET, df$DUET_outcome, max)
 #no of negative and positive values: should match the counts of
 #Destabilizing and Stabilizing in the table below
 #(a score of exactly 0 is classified as Stabilizing but counted in neither sum)
 sum(df$ratioDUET < 0); sum(df$ratioDUET > 0)
 table(df$DUET_outcome)
 # Hist and density plots to compare raw and rescaled values: Base R
 plot_rescaling(df$DUETStability_Kcalpermol, df$ratioDUET, "DUET_stability")
 #clear the helpers
 rm(ratio_rescale, plot_rescaling)
 #===================
 # write output as csv file
 # (rows are in the order of the DUET scores, see 3b)
 #===================
 dim(df)
 write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE)
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm_mean_stability.R
@ -0,0 +1,131 @@
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
 getwd()
 ########################################################################
 # 				Installing and loading required packages 			   #
 ########################################################################
 source("../Header_TT.R")
 #source("barplot_colour_function.R")
 require(data.table)
 require(dplyr)
 ########################################################################
 #		 Read file: call script for combining df for PS		   	   #
 ########################################################################
 source("../combining_two_df.R")
 ###########################
 # This will return:
 # df with NA:
 # merged_df2 
 # merged_df3
 # df without NA:
 # merged_df2_comp
 # merged_df3_comp
 ###########################
 #---------------------- PAY ATTENTION
 # the above changes the working dir
 #[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
 #---------------------- PAY ATTENTION
 ###########################
 # you need merged_df3 
 # or
 # merged_df3_comp
 # since these have unique SNPs
 # I prefer to use the merged_df3
 # because using the _comp dataset means
 # we lose some muts and at this level, we should use
 # as much info as available
 ###########################
 # uncomment as necessary
 #%%%%%%%%%%%%%%%%%%%%%%%%
 # REASSIGNMENT
 my_df  = merged_df3 
 #my_df = merged_df3_comp
 #%%%%%%%%%%%%%%%%%%%%%%%%
 # delete variables not required
 rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
 # quick checks
 colnames(my_df)
 str(my_df)
 ###########################
 # Data for bfactor figure
 # PS average 
 # Lig average
 ###########################
 head(my_df$Position)
 head(my_df$ratioDUET)
 # order data frame 
 df = my_df[order(my_df$Position),]
 head(df$Position)
 head(df$ratioDUET)
 #***********
 # PS: average by position
 #***********
 mean_DUET_by_position <- df %>%
  group_by(Position) %>%
  summarize(averaged.DUET = mean(ratioDUET))
 #***********
 # Lig: average by position
 #***********
 mean_Lig_by_position <- df %>%
  group_by(Position) %>%
  summarize(averaged.Lig = mean(ratioPredAff))
 #***********
 # cbind: mean_DUET_by_position and mean_Lig_by_position
 # Both tables are summaries of the same `df` grouped by Position, so they
 # are expected to line up row-for-row. This is verified below before the
 # duplicated Position column is dropped.
 #***********
 combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position))
 # sanity check
 # mean_PS_Lig_Bfactor
 colnames(combined)
 colnames(combined) = c("Position"
                       , "average_DUETR"
                       , "Position2"
                       , "average_PredAffR")
 colnames(combined)
 # abort instead of silently writing misaligned averages
 stopifnot(identical(combined$Position, combined$Position2))
 # drop the redundant second Position column
 n = which(colnames(combined) == "Position2"); n
 combined_df = combined[, -n]
 max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
 max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
 #=============
 # output csv
 # Writes the per-position averages (combined_df) to the processed-input
 # directory of the Data repo.
 #============
 outDir = "~/git/Data/pyrazinamide/input/processed/"
 outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
 # separator added so the path does not run into the message text
 print(paste0("Output file with path will be: ", outFile))
 head(combined_df$Position); tail(combined_df$Position)
 write.csv(combined_df, outFile
          , row.names = FALSE)
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/.RData
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/.RData
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/.Rhistory
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/.Rhistory
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/OR_PS_Ligand_combined_plot.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/OR_PS_Ligand_combined_plot.R
@ -0,0 +1,250 @@
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
 getwd()
 ########################################################################
 # 				Installing and loading required packages 			   #
 ########################################################################
 source("../Header_TT.R")
 #source("barplot_colour_function.R")
 require(cowplot)
 ########################################################################
 #		 Read file: call script for combining df for PS			   	   #
 ########################################################################
 source("../combining_two_df.R")
 #---------------------- PAY ATTENTION
 # the above changes the working dir
 #[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
 #---------------------- PAY ATTENTION
 #==========================
 # This will return:
 # df with NA:
 # merged_df2
 # merged_df3
 # df without NA:
 # merged_df2_comp
 # merged_df3_comp
 #===========================
 ###########################
 # Data for OR and stability plots
 # you need merged_df3_comp
 # since these are matched 
 # to allow pairwise corr
 ###########################
 # uncomment as necessary
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # REASSIGNMENT
 my_df  = merged_df3_comp
 #my_df = merged_df3
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # delete variables not required
 rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
 # quick checks
 colnames(my_df)
 str(my_df)
 # sanity check
 # Ensure correct data type in columns to plot: OR needs to be numeric
 is.numeric(my_df$OR)
 #[1] TRUE
 #<<<<<<<<<<<<<<<<<<<
 # REASSIGNMENT
 # FOR PS Plots
 #<<<<<<<<<<<<<<<<<<<
 PS_df  = my_df
 rm(my_df)
 #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
 ########################################################################
 #		 Read file: call script for combining df for lig		   	   #
 ########################################################################
 getwd()
 source("combining_two_df_lig.R")
 #---------------------- PAY ATTENTION
 # the above changes the working dir
 #[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
 #---------------------- PAY ATTENTION
 #==========================
 # This will return:
 # df with NA:
 # merged_df2
 # merged_df3
 # df without NA:
 # merged_df2_comp
 # merged_df3_comp
 #===========================
 ###########################
 # Data for OR and stability plots
 # you need merged_df3_comp
 # since these are matched 
 # to allow pairwise corr
 ###########################
 # uncomment as necessary
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # REASSIGNMENT
 my_df2  = merged_df3_comp
 #my_df2 = merged_df3
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # delete variables not required
 rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
 # quick checks
 colnames(my_df2)
 str(my_df2)
 # sanity check
 # Ensure correct data type in columns to plot: OR needs to be numeric
 is.numeric(my_df2$OR)
 #[1] TRUE
 # sanity check: should be <10
 if (max(my_df2$Dis_lig_Ang) < 10){
  print ("Sanity check passed: lig data is <10Ang")
 }else{
  print ("Error: data should be filtered to be within 10Ang")
 }
 #<<<<<<<<<<<<<<<<
 # REASSIGNMENT
 # FOR Lig Plots
 #<<<<<<<<<<<<<<<<
 Lig_df  = my_df2
 rm(my_df2)
 #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
 #############
 # Plots: Bubble plot
 # x = Position, Y = stability
 # size of dots = OR
 # col: stability
 #############
 #=================
 # generate plot 1: DUET vs OR by position as geom_points
 #=================  
 my_ats = 20 # axis text size
 my_als = 22 # axis label size
 # Spelling Correction: made redundant as already corrected at the source
 #PS_df$DUET_outcome[PS_df$DUET_outcome=='Stabilizing'] <- 'Stabilising'
 #PS_df$DUET_outcome[PS_df$DUET_outcome=='Destabilizing'] <- 'Destabilising'
 table(PS_df$DUET_outcome) ; sum(table(PS_df$DUET_outcome))
 g = ggplot(PS_df, aes(x = factor(Position)
                   , y = ratioDUET))
 p1 = g + 
  geom_point(aes(col = DUET_outcome
                 , size = OR)) +
  theme(axis.text.x = element_text(size = my_ats
                                   , angle = 90
                                   , hjust = 1
                                   , vjust = 0.4)
        , axis.text.y = element_text(size = my_ats
                                     , angle = 0
                                     , hjust = 1
                                     , vjust = 0)
        , axis.title.x = element_text(size = my_als)
        , axis.title.y = element_text(size = my_als) 
        , legend.text = element_text(size = my_als)
        , legend.title = element_text(size = my_als) ) +
  #, legend.key.size = unit(1, "cm")) +
  labs(title = ""
       , x = "Position"
       , y = "DUET(PS)"
       , size = "Odds Ratio"
       , colour = "DUET Outcome") +
  guides(colour = guide_legend(override.aes = list(size=4))) 
 p1 
 #=================
 # generate plot 2: Lig vs OR by position as geom_points
 #=================  
 my_ats = 20 # axis text size
 my_als = 22 # axis label size
 # Spelling Correction: made redundant as already corrected at the source
 #Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Stabilizing'] <- 'Stabilising'
 #Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Destabilizing'] <- 'Destabilising'
 table(Lig_df$Lig_outcome)
 g = ggplot(Lig_df, aes(x = factor(Position)
                   , y = ratioPredAff))
 p2 = g + 
  geom_point(aes(col = Lig_outcome
                   , size = OR))+
  theme(axis.text.x = element_text(size = my_ats
                                   , angle = 90
                                   , hjust = 1
                                   , vjust = 0.4)
        , axis.text.y = element_text(size = my_ats
                                     , angle = 0
                                     , hjust = 1
                                     , vjust = 0)
        , axis.title.x = element_text(size = my_als)
        , axis.title.y = element_text(size = my_als) 
        , legend.text = element_text(size = my_als)
        , legend.title = element_text(size = my_als) ) +
  #, legend.key.size = unit(1, "cm")) +
  labs(title = ""
       , x = "Position"
       , y = "Ligand Affinity"
       , size = "Odds Ratio"
       , colour = "Ligand Outcome"
       ) +
  guides(colour = guide_legend(override.aes = list(size=4))) 
 p2
 #======================
 # combine using cowplot
 #======================
 # Switch to the plot output directory so the svg below is written there.
 getwd()
 setwd("~/git/Data/pyrazinamide/output/plots")
 getwd()
 # open the svg device; dimensions are in inches
 svg('PS_Lig_OR_combined.svg', width = 32, height = 12) #inches
 #png('PS_Lig_OR_combined.png', width = 2800, height = 1080) #300dpi
 theme_set(theme_gray()) # to preserve default theme
 # stack p1 (DUET) above p2 (ligand affinity) in one column, labelled A and B
 printFile = cowplot::plot_grid(plot_grid(p1, p2
                             , ncol = 1
                             , align = 'v'
                             , labels = c("A", "B")
                             , label_size = my_als+5))
 print(printFile)
 # close the device so the file is finalised
 dev.off()
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_LIG.R
@ -0,0 +1,154 @@
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
 getwd()
 ########################################################################
 # 				Installing and loading required packages 			   #
 ########################################################################
 source("../Header_TT.R")
 ########################################################################
 #		 Read file: call script for combining df for lig		   	   #
 ########################################################################
 source("../combining_two_df_lig.R") 
 #---------------------- PAY ATTENTION
 # the above changes the working dir
 #[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
 #---------------------- PAY ATTENTION
 #==========================
 # This will return:
 # df with NA:
 # merged_df2
 # merged_df3
 # df without NA:
 # merged_df2_comp
 # merged_df3_comp
 #===========================
 ###########################
 # Data for Lig plots
 # you need merged_df3
 # or
 # merged_df3_comp
 # since these have unique SNPs
 # I prefer to use the merged_df3
 # because using the _comp dataset means
 # we lose some muts and at this level, we should use
 # as much info as available
 ###########################
 # uncomment as necessary
 #%%%%%%%%%%%%%%%%%%%%%%%%
 # REASSIGNMENT
 my_df  = merged_df3
 #my_df = merged_df3_comp
 #%%%%%%%%%%%%%%%%%%%%%%%%
 # delete variables not required
 rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
 # quick checks
 colnames(my_df)
 str(my_df)
 #############################
 # Extra sanity check:
 # for mcsm_lig ONLY
 # Dis_lig_Ang should be <10
 #############################
 if (max(my_df$Dis_lig_Ang) < 10){
  print ("Sanity check passed: lig data is <10Ang")
 }else{
  print ("Error: data should be filtered to be within 10Ang")
 }
 ########################################################################
 #               end of data extraction and cleaning for plots          #
 ########################################################################
 #==========================
 # Plot: Barplot with scores (unordered)
 # corresponds to Lig_outcome
 # Stacked Barplot with colours: Lig_outcome @ position coloured by 
 # Lig_outcome. This is a barplot where each bar corresponds 
 # to a SNP and is coloured by its corresponding Lig_outcome.
 #============================
 #===================
 # Data for plots
 #===================
 #%%%%%%%%%%%%%%%%%%%%%%%%
 # REASSIGNMENT
 df  = my_df
 #%%%%%%%%%%%%%%%%%%%%%%%%
 rm(my_df)
 # sanity checks
 # my_df has just been removed, so every check below must read from df
 upos = unique(df$Position)
 # should be a factor
 is.factor(df$Lig_outcome)
 #TRUE
 table(df$Lig_outcome)
 # should be -1 and 1: may not be in this case because you have filtered the data
 # FIXME: normalisation before or after filtering?
 min(df$ratioPredAff) #
 max(df$ratioPredAff) #
 # sanity checks: range of the normalised score within each outcome class
 tapply(df$ratioPredAff, df$Lig_outcome, min)
 tapply(df$ratioPredAff, df$Lig_outcome, max)
 #******************
 # generate plot
 #******************
 # set output dir for plots
 getwd()
 setwd("~/git/Data/pyrazinamide/output/plots")
 getwd()
 my_title = "Ligand affinity"
 # axis label size
 my_xaxls = 13
 my_yaxls = 15
 # axes text size
 my_xaxts = 15
 my_yaxts = 15
 # no ordering of x-axis
 g = ggplot(df, aes(factor(Position, ordered = T)))
 g + 
  geom_bar(aes(fill = Lig_outcome), colour = "grey") +
  theme( axis.text.x = element_text(size = my_xaxls
                                    , angle = 90
                                    , hjust = 1
                                    , vjust = 0.4)
         , axis.text.y = element_text(size = my_yaxls 
                                      , angle = 0
                                      , hjust = 1
                                      , vjust = 0)
         , axis.title.x = element_text(size = my_xaxts)
         , axis.title.y = element_text(size = my_yaxts ) ) +
  labs(title = my_title
       , x = "Position"
       , y = "Frequency")
 # for sanity and good practice
 rm(df)
 #======================= end of plot
 # axis colours labels
 # https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
 # https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_2colours_PS.R
@ -0,0 +1,149 @@
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
 getwd()
 ########################################################################
 # 		Installing and loading required packages and functions		   #	
 ########################################################################
 source("../Header_TT.R")
 ########################################################################
 #		 Read file: call script for combining df for PS			   	   #
 ########################################################################
 source("../combining_two_df.R")
 #---------------------- PAY ATTENTION
 # the above changes the working dir
 #[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
 #---------------------- PAY ATTENTION
 #==========================
 # This will return:
 # df with NA:
 # merged_df2
 # merged_df3
 # df without NA:
 # merged_df2_comp
 # merged_df3_comp
 #==========================
 ###########################
 # Data for DUET plots
 # you need merged_df3
 # or
 # merged_df3_comp
 # since these have unique SNPs
 # I prefer to use the merged_df3
 # because using the _comp dataset means
 # we lose some muts and at this level, we should use
 # as much info as available
 ###########################
 # uncomment as necessary
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # REASSIGNMENT
 my_df  = merged_df3 
 #my_df  = merged_df3_comp
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # delete variables not required
 rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
 # quick checks
 colnames(my_df)
 str(my_df)
 # Ensure correct data type in columns to plot: need to be factor
 # sanity check
 is.factor(my_df$DUET_outcome)
 my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
 is.factor(my_df$DUET_outcome)
 #[1] TRUE
 ########################################################################
 #               end of data extraction and cleaning for plots          #
 ########################################################################
 #==========================
 # Plot 2: Barplot with scores (unordered)
 # corresponds to DUET_outcome
 # Stacked Barplot with colours: DUET_outcome @ position coloured by 
 # DUET outcome. This is a barplot where each bar corresponds 
 # to a SNP and is coloured by its corresponding DUET_outcome
 #============================
 #===================
 # Data for plots
 #===================
 #<<<<<<<<<<<<<<<<<<<<<<<<
 # REASSIGNMENT
 df  = my_df
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 rm(my_df)
 # sanity checks
 # my_df has just been removed, so every check below must read from df
 upos = unique(df$Position)
 # should be a factor
 is.factor(df$DUET_outcome)
 #[1] TRUE
 table(df$DUET_outcome)
 # should be -1 and 1
 min(df$ratioDUET)
 max(df$ratioDUET)
 # range of the normalised score within each outcome class
 tapply(df$ratioDUET, df$DUET_outcome, min)
 tapply(df$ratioDUET, df$DUET_outcome, max)
 #******************
 # generate plot
 #******************
 # set output dir for plots
 getwd()
 setwd("~/git/Data/pyrazinamide/output/plots")
 getwd()
 my_title = "Protein stability (DUET)"
 # axis label size
 my_xaxls = 13
 my_yaxls = 15
 # axes text size
 my_xaxts = 15
 my_yaxts = 15
 # no ordering of x-axis
 g = ggplot(df, aes(factor(Position, ordered = T)))
 g + 
  geom_bar(aes(fill = DUET_outcome), colour = "grey") +
  theme( axis.text.x = element_text(size = my_xaxls
                                    , angle = 90
                                    , hjust = 1
                                    , vjust = 0.4)
         , axis.text.y = element_text(size = my_yaxls 
                                      , angle = 0
                                      , hjust = 1
                                      , vjust = 0)
         , axis.title.x = element_text(size = my_xaxts)
         , axis.title.y = element_text(size = my_yaxts ) ) +
  labs(title = my_title
       , x = "Position"
       , y = "Frequency")
 # for sanity and good practice
 rm(df)
 #======================= end of plot
 # axis colours labels
 # https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
 # https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_LIG.R
@ -0,0 +1,202 @@
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") 
 getwd()
 ########################################################################
 # 		Installing and loading required packages and functions		   #	
 ########################################################################
 source("../Header_TT.R")
 source("../barplot_colour_function.R")
 ########################################################################
 #		 Read file: call script for combining df for lig		   	   #
 ########################################################################
 source("../combining_two_df_lig.R") 
 #---------------------- PAY ATTENTION
 # the above changes the working dir
 #[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
 #---------------------- PAY ATTENTION
 #==========================
 # This will return:
 # df with NA:
 # merged_df2
 # merged_df3
 # df without NA:
 # merged_df2_comp
 # merged_df3_comp
 #===========================
 ###########################
 # Data for Lig plots
 # you need merged_df3
 # or
 # merged_df3_comp
 # since these have unique SNPs
 # I prefer to use the merged_df3
 # because using the _comp dataset means
 # we lose some muts and at this level, we should use
 # as much info as available
 ###########################
 # uncomment as necessary
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # REASSIGNMENT
 my_df  = merged_df3
 #my_df = merged_df3_comp
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # delete variables not required
 rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
 # quick checks
 colnames(my_df)
 str(my_df)
 # Ensure correct data type in columns to plot: need to be factor
 # sanity check
 is.factor(my_df$Lig_outcome)
 # coerce the existing Lig_outcome column in place
 my_df$Lig_outcome = as.factor(my_df$Lig_outcome)
 is.factor(my_df$Lig_outcome)
 #[1] TRUE
 #############################
 # Extra sanity check:
 # for mcsm_lig ONLY
 # Dis_lig_Ang should be <10
 #############################
 if (max(my_df$Dis_lig_Ang) < 10){
  print ("Sanity check passed: lig data is <10Ang")
 }else{
  print ("Error: data should be filtered to be within 10Ang")
 }
 ########################################################################
 #               end of data extraction and cleaning for plots          #
 ########################################################################
 #==========================
 # Plot: Barplot with scores (unordered)
 # corresponds to Lig_outcome
 # Stacked Barplot with colours: Lig_outcome @ position coloured by 
 # stability scores. This is a barplot where each bar corresponds 
 # to a SNP and is coloured by its corresponding Lig stability value.
 # Normalised values (range between -1 and 1 ) to aid visualisation
 # NOTE: since barplot plots discrete values, colour = score, so number of
 # colours will be equal to the no. of unique normalised scores 
 # rather than a continuous scale
 # will require generating the colour scale separately.
 #============================
 #===================
 # Data for plots
 #===================
 #<<<<<<<<<<<<<<<<<<<<<<<<
 # REASSIGNMENT
 df  = my_df 
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 rm(my_df)
 # sanity checks
 table(df$Lig_outcome)
 # should be -1 and 1: may not be in this case because you have filtered the data
 # FIXME: normalisation before or after filtering?
 min(df$ratioPredAff) #
 max(df$ratioPredAff) #
 # sanity checks
 # very important!!!!
 tapply(df$ratioPredAff, df$Lig_outcome, min)
 tapply(df$ratioPredAff, df$Lig_outcome, max)
 #******************
 # generate plot
 #******************
 # set output dir for plots
 getwd()
 setwd("~/git/Data/pyrazinamide/output/plots")
 getwd()
 # My colour FUNCTION: based on group and subgroup
 # in my case;
 # df = df
 # group = Lig_outcome
 # subgroup = normalised score i.e ratioPredAff
 # Prepare data: round off ratioPredAff scores
 # round off to 3 decimal places:
 # 165 if no rounding is performed: used to generate the original graph
 # 156 if rounded to 3 places
 # FIXME: check if reducing precision creates any ML prob
 # check unique values in normalised data
 u = unique(df$ratioPredAff)
 # <<<<< -------------------------------------------
 # Run this section if rounding is to be used
 # specify number of decimal places for rounding
 n = 3
 df$ratioLigR = round(df$ratioPredAff, n)
 u = unique(df$ratioLigR) # 156
 # create an extra column called group which contains the "gp name and score"
 # so colours can be generated for each unique values in this column
 my_grp = df$ratioLigR
 # paste0() has no separator argument, so none is passed
 df$group <- paste0(df$Lig_outcome, "_", my_grp)
 # else
 # uncomment the below if rounding is not required
 #my_grp = df$ratioPredAff
 #df$group <- paste0(df$Lig_outcome, "_", my_grp)
 # <<<<< -----------------------------------------------
 # Call the function to create the palette based on the group defined above
 # NOTE(review): "my_grp" is passed by name but is a free-standing vector,
 # not a column of df - confirm ColourPalleteMulti resolves it as intended
 colours <- ColourPalleteMulti(df, "Lig_outcome", "my_grp")
 my_title = "Ligand affinity"
 # axis label size
 my_xaxls = 13
 my_yaxls = 15
 # axes text size
 my_xaxts = 15
 my_yaxts = 15
 # no ordering of x-axis
 g = ggplot(df, aes(factor(Position, ordered = T)))
 g + 
  geom_bar(aes(fill = group), colour = "grey") +
  scale_fill_manual( values = colours
                     , guide = 'none') +
  theme( axis.text.x = element_text(size = my_xaxls
                                    , angle = 90
                                    , hjust = 1
                                    , vjust = 0.4)
         , axis.text.y = element_text(size = my_yaxls 
                                      , angle = 0
                                      , hjust = 1
                                      , vjust = 0)
         , axis.title.x = element_text(size = my_xaxts)
         , axis.title.y = element_text(size = my_yaxts ) ) +
  labs(title = my_title
       , x = "Position"
       , y = "Frequency")
 # for sanity and good practice
 rm(df)
 #======================= end of plot
 # axis colours labels
 # https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
 # https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/barplots_subcolours_PS.R
@ -0,0 +1,192 @@
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
 getwd()
 ########################################################################
 # 		Installing and loading required packages and functions		   #	
 ########################################################################
 source("../Header_TT.R")
 source("../barplot_colour_function.R")
 ########################################################################
 #		 Read file: call script for combining df for PS			   	   #
 ########################################################################
 source("../combining_two_df.R")
 #---------------------- PAY ATTENTION
 # the above changes the working dir
 #[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
 #---------------------- PAY ATTENTION
 #==========================
 # This will return:
 # df with NA:
 # merged_df2
 # merged_df3
 # df without NA:
 # merged_df2_comp
 # merged_df3_comp
 #===========================
 ###########################
 # Data for DUET plots
 # you need merged_df3
 # or
 # merged_df3_comp
 # since these have unique SNPs
 # I prefer to use the merged_df3
 # because using the _comp dataset means
 # we lose some muts and at this level, we should use
 # as much info as available
 ###########################
 # uncomment as necessary
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # REASSIGNMENT
 my_df  = merged_df3 
 #my_df  = merged_df3_comp
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # delete variables not required
 rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
 # quick checks
 colnames(my_df)
 str(my_df)
 # Ensure correct data type in columns to plot: need to be factor
 # sanity check
 is.factor(my_df$DUET_outcome)
 my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
 is.factor(my_df$DUET_outcome)
 #[1] TRUE
 ########################################################################
 #               end of data extraction and cleaning for plots          #
 ########################################################################
 #==========================
 # Barplot with scores (unordered)
 # corresponds to DUET_outcome
 # Stacked Barplot with colours: DUET_outcome @ position coloured by 
 # stability scores. This is a barplot where each bar corresponds 
 # to a SNP and is coloured by its corresponding DUET stability value.
 # Normalised values (range between -1 and 1 ) to aid visualisation
 # NOTE: since barplot plots discrete values, colour = score, so number of
 # colours will be equal to the no. of unique normalised scores 
 # rather than a continuous scale
 # will require generating the colour scale separately.
 #============================
 #===================
 # Data for plots
 #===================
 #<<<<<<<<<<<<<<<<<<<<<<<<
 # REASSIGNMENT
 df  = my_df
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 rm(my_df)
 # sanity checks
 # my_df has just been removed, so every check below must read from df
 upos = unique(df$Position)
 # should be a factor
 is.factor(df$DUET_outcome)
 #[1] TRUE
 table(df$DUET_outcome)
 # should be -1 and 1
 min(df$ratioDUET)
 max(df$ratioDUET)
 # range of the normalised score within each outcome class
 tapply(df$ratioDUET, df$DUET_outcome, min)
 tapply(df$ratioDUET, df$DUET_outcome, max)
 #******************
 # generate plot
 #******************
 # set output dir for plots
 getwd()
 setwd("~/git/Data/pyrazinamide/output/plots")
 getwd()
 # My colour FUNCTION: based on group and subgroup
 # in my case;
 # df = df
 # group = DUET_outcome
 # subgroup = normalised score i.e ratioDUET
 # Prepare data: round off ratioDUET scores
 # round off to 3 decimal places:
 # 323 if no rounding is performed: used to generate the original graph
 # 287 if rounded to 3 places
 # FIXME: check if reducing precision creates any ML prob
 # check unique values in normalised data
 u = unique(df$ratioDUET)
 # <<<<< -------------------------------------------
 # Run this section if rounding is to be used
 # specify number of decimal places for rounding
 n = 3
 df$ratioDUETR = round(df$ratioDUET, n)
 u = unique(df$ratioDUETR)
 # create an extra column called group which contains the "gp name and score"
 # so colours can be generated for each unique values in this column
 my_grp = df$ratioDUETR
 # paste0() has no separator argument, so none is passed
 df$group <- paste0(df$DUET_outcome, "_", my_grp)
 # else
 # uncomment the below if rounding is not required
 #my_grp = df$ratioDUET
 #df$group <- paste0(df$DUET_outcome, "_", my_grp)
 # <<<<< -----------------------------------------------
 # Call the function to create the palette based on the group defined above
 # NOTE(review): "my_grp" is passed by name but is a free-standing vector,
 # not a column of df - confirm ColourPalleteMulti resolves it as intended
 colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp")
 my_title = "Protein stability (DUET)"
 # axis label size
 my_xaxls = 13
 my_yaxls = 15
 # axes text size
 my_xaxts = 15
 my_yaxts = 15
 # no ordering of x-axis
 g = ggplot(df, aes(factor(Position, ordered = T)))
 g + 
  geom_bar(aes(fill = group), colour = "grey") +
  scale_fill_manual( values = colours
                     , guide = 'none') +
  theme( axis.text.x = element_text(size = my_xaxls
                                    , angle = 90
                                    , hjust = 1
                                    , vjust = 0.4)
         , axis.text.y = element_text(size = my_yaxls 
                                      , angle = 0
                                      , hjust = 1
                                      , vjust = 0)
         , axis.title.x = element_text(size = my_xaxts)
         , axis.title.y = element_text(size = my_yaxts ) ) +
  labs(title = my_title
       , x = "Position"
       , y = "Frequency")
 # for sanity and good practice
 rm(df)
 #======================= end of plot
 # axis colours labels
 # https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
 # https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_LIG.R
@ -0,0 +1,215 @@
 # Move to the plotting scripts directory so the relative source() paths
 # below resolve; the working directory is echoed before and after.
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
 getwd()
 ########################################################################
 #             Installing and loading required packages                 #
 ########################################################################
 source("../Header_TT.R")
 #require(data.table)
 #require(dplyr)
 ########################################################################
 #         Read file: call script for combining df for lig              #
 ########################################################################
 source("../combining_two_df_lig.R")
 #---------------------- PAY ATTENTION
 # the above changes the working dir
 #[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
 #---------------------- PAY ATTENTION
 #==========================
 # This will return:
 # df with NA:
 # merged_df2
 # merged_df3
 # df without NA:
 # merged_df2_comp
 # merged_df3_comp
 #===========================
 ###########################
 # Data for Lig plots
 # Use merged_df3 or merged_df3_comp, since these have unique SNPs.
 # merged_df3 is preferred: the _comp dataset drops some mutations, and
 # at this level as much information as possible should be kept.
 ###########################
 # uncomment as necessary
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # REASSIGNMENT
 my_df <- merged_df3
 #my_df <- merged_df3_comp
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # delete variables not required
 rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
 # quick checks
 colnames(my_df)
 str(my_df)
 # Ensure correct data type in columns to plot: need to be factor.
 # The column is converted in place. The previous code read the
 # lower-case name `lig_outcome`, while every other use in these scripts
 # is `Lig_outcome`.
 is.factor(my_df$Lig_outcome)
 my_df$Lig_outcome <- as.factor(my_df$Lig_outcome)
 is.factor(my_df$Lig_outcome)
 #[1] TRUE
 #############################
 # Extra sanity check:
 # for mcsm_lig ONLY
 # Dis_lig_Ang should be <10
 #############################
 if (max(my_df$Dis_lig_Ang) < 10) {
   print ("Sanity check passed: lig data is <10Ang")
 } else {
   print ("Error: data should be filtered to be within 10Ang")
 }
 ########################################################################
 #               end of data extraction and cleaning for plots          #
 ########################################################################
 #===========================
 # Plot: Basic barplots 
 #===========================
 #===================
 # Data for plots
 #===================
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # REASSIGNMENT 
 # REASSIGNMENT: the plotting code works on `df`; the old name is dropped.
 df <- my_df
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 rm(my_df)
 # sanity checks
 str(df)
 if (identical(df$Position, df$position)) {
   print("Sanity check passed: Columns 'Position' and 'position' are identical")
 } else {
   print("Error!: Check column names and info contained")
 }
 #****************
 # generate plot: No of stabilising and destabilising muts
 #****************
 # set output dir for plots
 getwd()
 setwd("~/git/Data/pyrazinamide/output/plots")
 getwd()
 svg('basic_barplots_LIG.svg')
 my_ats <- 25 # axis text size
 my_als <- 22 # axis label size
 g <- ggplot(df, aes(x = Lig_outcome))
 # The plot has to be assigned: print(prinfFile) below is what renders it
 # on the svg device. Previously the assignment was commented out, so
 # prinfFile was undefined when printed.
 prinfFile <- g +
   geom_bar(
     aes(fill = Lig_outcome),
     show.legend = TRUE
   ) +
   # count label on top of each bar
   geom_label(
     stat = "count",
     aes(label = ..count..),
     color = "black",
     show.legend = FALSE,
     size = 10
   ) +
   theme(
     axis.text.x = element_blank(),
     axis.title.x = element_blank(),
     axis.title.y = element_text(size = my_als),
     axis.text.y = element_text(size = my_ats),
     legend.position = c(0.73, 0.8),
     legend.text = element_text(size = my_als - 2),
     legend.title = element_text(size = my_als),
     plot.title = element_blank()
   ) +
   labs(
     title = "",
     y = "Number of SNPs"
     #, fill='Ligand Outcome'
   ) +
   scale_fill_discrete(
     name = "Ligand Outcome",
     labels = c("Destabilising", "Stabilising")
   )
 print(prinfFile)
 dev.off()
 #****************
 # generate plot: No of positions
 #****************
 #get freq count of positions so you can subset freq<1
 #require(data.table)
 # Add pos_count: the number of rows sharing each Position.
 setDT(df)[, pos_count := .N, by = .(Position)] #169, 36
 head(df$pos_count)
 table(df$pos_count)
 # this is cumulative
 #1  2  3  4  5  6 
 #5 24 36 56 30 18 
 # Reduce to one value per Position (mean of the per-row count).
 snpsBYpos_df <- summarize(group_by(df, Position), snpsBYpos = mean(pos_count))
 table(snpsBYpos_df$snpsBYpos)
 #1  2  3  4  5  6 
 #5 12 12 14  6  3
 # this is what will get plotted
 svg('position_count_LIG.svg')
 my_ats <- 25 # axis text size
 my_als <- 22 # axis label size
 g <- ggplot(snpsBYpos_df, aes(x = snpsBYpos))
 prinfFile <- g +
   geom_bar(aes (alpha = 0.5), show.legend = FALSE) +
   geom_label(stat = "count", aes(label = ..count..), color = "black", size = 10) +
   theme(
     axis.text.x = element_text(size = my_ats, angle = 0),
     axis.text.y = element_text(size = my_ats, angle = 0, hjust = 1),
     axis.title.x = element_text(size = my_als),
     axis.title.y = element_text(size = my_als),
     plot.title = element_blank()
   ) +
   labs(x = "Number of SNPs", y = "Number of Sites")
 # draw onto the svg device, then close it
 print(prinfFile)
 dev.off()
 ########################################################################
 #               			end of Lig barplots         			   #
 ########################################################################
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/basic_barplots_PS.R
@ -0,0 +1,211 @@
 # Move to the plotting scripts directory so the relative source() paths
 # below resolve; the working directory is echoed before and after.
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
 getwd()
 ########################################################################
 #       Installing and loading required packages and functions         #
 ########################################################################
 source("../Header_TT.R")
 ########################################################################
 #          Read file: call script for combining df for PS              #
 ########################################################################
 source("../combining_two_df.R")
 #---------------------- PAY ATTENTION
 # the above changes the working dir
 #[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
 #---------------------- PAY ATTENTION
 #==========================
 # This will return:
 # df with NA:
 # merged_df2
 # merged_df3
 # df without NA:
 # merged_df2_comp
 # merged_df3_comp
 #==========================
 ###########################
 # Data for DUET plots
 # Use merged_df3 or merged_df3_comp, since these have unique SNPs.
 # merged_df3 is preferred: the _comp dataset drops some mutations, and
 # at this level as much information as possible should be kept.
 ###########################
 # uncomment as necessary
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # REASSIGNMENT
 my_df <- merged_df3
 #my_df <- merged_df3_comp
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # the merged data frames are no longer needed once my_df is set
 rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
 # quick checks
 colnames(my_df)
 str(my_df)
 # The outcome column is used as the fill/x variable, so make it a factor
 # (checked before and after the conversion).
 is.factor(my_df$DUET_outcome)
 my_df$DUET_outcome <- as.factor(my_df$DUET_outcome)
 is.factor(my_df$DUET_outcome)
 #[1] TRUE
 ########################################################################
 #               end of data extraction and cleaning for plots          #
 ########################################################################
 #===========================
 # Plot: Basic barplots 
 #===========================
 #===================
 # Data for plots
 #===================
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # REASSIGNMENT 
 # REASSIGNMENT: the plotting code works on `df`; the old name is dropped.
 df <- my_df
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 rm(my_df)
 # sanity checks
 str(df)
 if (identical(df$Position, df$position)) {
   print("Sanity check passed: Columns 'Position' and 'position' are identical")
 } else {
   print("Error!: Check column names and info contained")
 }
 #****************
 # generate plot: No of stabilising and destabilising muts
 #****************
 # set output dir for plots
 getwd()
 setwd("~/git/Data/pyrazinamide/output/plots")
 getwd()
 svg('basic_barplots_DUET.svg')
 my_ats <- 25 # axis text size
 my_als <- 22 # axis label size
 theme_set(theme_grey())
 # The plot object is assigned and printed explicitly so that it is
 # written to the open svg device.
 g <- ggplot(df, aes(x = DUET_outcome))
 prinfFile <- g +
   geom_bar(aes(fill = DUET_outcome), show.legend = TRUE) +
   # count label on top of each bar
   geom_label(
     stat = "count",
     aes(label = ..count..),
     color = "black",
     show.legend = FALSE,
     size = 10
   ) +
   theme(
     axis.text.x = element_blank(),
     axis.title.x = element_blank(),
     axis.title.y = element_text(size = my_als),
     axis.text.y = element_text(size = my_ats),
     legend.position = c(0.73, 0.8),
     legend.text = element_text(size = my_als - 2),
     legend.title = element_text(size = my_als),
     plot.title = element_blank()
   ) +
   labs(
     title = "",
     y = "Number of SNPs"
     #, fill='DUET Outcome'
   ) +
   scale_fill_discrete(
     name = "DUET Outcome",
     labels = c("Destabilising", "Stabilising")
   )
 print(prinfFile)
 dev.off()
 #****************
 # generate plot: No of positions
 #****************
 #get freq count of positions so you can subset freq<1
 #setDT(df)[, .(Freq := .N), by = .(Position)] #189, 36
 # Add pos_count: the number of rows sharing each Position.
 setDT(df)[, pos_count := .N, by = .(Position)] #335, 36
 table(df$pos_count)
 # this is cumulative
 #1   2   3   4   5   6 
 #34  76  63 104  40  18 
 # Reduce to one value per Position (mean of the per-row count).
 snpsBYpos_df <- summarize(group_by(df, Position), snpsBYpos = mean(pos_count))
 table(snpsBYpos_df$snpsBYpos)
 #1  2  3  4  5  6 
 #34 38 21 26  8  3 
 # Per-mutation table with its position count, exported as csv.
 foo <- df %>%
   select(Mutationinformation, WildPos, wild_type, mutant_type,
          mutation_info, position, pos_count) #335, 5
 getwd()
 # NOTE(review): the path is relative to the working directory; with the
 # plots directory set earlier in this script, "../Data" points at
 # .../output/Data -- confirm this is the intended location.
 write.csv(foo, "../Data/pos_count_freq.csv")
 svg('position_count_DUET.svg')
 my_ats <- 25 # axis text size
 my_als <- 22 # axis label size
 g <- ggplot(snpsBYpos_df, aes(x = snpsBYpos))
 prinfFile <- g +
   geom_bar(aes (alpha = 0.5), show.legend = FALSE) +
   geom_label(stat = "count", aes(label = ..count..), color = "black", size = 10) +
   theme(
     axis.text.x = element_text(size = my_ats, angle = 0),
     axis.text.y = element_text(size = my_ats, angle = 0, hjust = 1),
     axis.title.x = element_text(size = my_als),
     axis.title.y = element_text(size = my_als),
     plot.title = element_blank()
   ) +
   labs(x = "Number of SNPs", y = "Number of Sites")
 # draw onto the svg device, then close it
 print(prinfFile)
 dev.off()
 ########################################################################
 #               			end of DUET barplots         			   #
 ########################################################################
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_PS.R
@ -0,0 +1,175 @@
 # Move to the plotting scripts directory so the relative source() paths
 # below resolve; the working directory is echoed before and after.
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
 getwd()
 ########################################################################
 #       Installing and loading required packages and functions         #
 ########################################################################
 source("../Header_TT.R")
 #source("barplot_colour_function.R")
 ########################################################################
 #          Read file: call script for combining df for PS              #
 ########################################################################
 source("../combining_two_df.R")
 #---------------------- PAY ATTENTION
 # the above changes the working dir
 #[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
 #---------------------- PAY ATTENTION
 #==========================
 # This will return:
 # df with NA:
 # merged_df2
 # merged_df3
 # df without NA:
 # merged_df2_comp
 # merged_df3_comp
 #==========================
 ###########################
 # Data for PS Corr plots
 # merged_df3_comp is required: its rows are matched, which allows
 # pairwise correlations.
 ###########################
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # REASSIGNMENT
 my_df <- merged_df3_comp
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # the merged data frames are no longer needed once my_df is set
 rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
 # quick checks
 colnames(my_df)
 str(my_df)
 ########################################################################
 #               end of data extraction and cleaning for plots          #
 ########################################################################
 #===========================
 # Plot: Correlation plots
 #===========================
 #===================
 # Data for plots
 #===================
 #<<<<<<<<<<<<<<<<<<<<<<<<
 # REASSIGNMENT
 # REASSIGNMENT: work on `df`; the old name is dropped.
 df <- my_df
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 rm(my_df)
 # sanity checks
 str(df)
 table(df$DUET_outcome)
 # unique positions
 length(unique(df$Position)) #{RESULT: unique positions for comp data}
 # Columns kept for the pairwise correlations. Candidates currently
 # switched off: "ratioPredAff", "DUETStability_Kcalpermol", "PredAffLog",
 # "OR", "pvalue", "Lig_outcome".
 corr_data <- df[, c("ratioDUET", "logor", "neglog10pvalue", "AF",
                     "DUET_outcome", "pyrazinamide")]
 dim(corr_data)
 rm(df)
 # Display names, in the same order as the columns selected above.
 # Switched-off counterparts: "Ligand Affinity", "DUET_raw", "Lig_raw",
 # "OR", "P-value", "Lig_outcome".
 my_corr_colnames <- c("DUET", "Log(Odds Ratio)", "-LogP",
                       "Allele Frequency", "DUET_outcome", "pyrazinamide")
 # sanity check: one display name per selected column
 if (length(my_corr_colnames) == length(corr_data)) {
   print("Sanity check passed: corr_data and corr_names match in length")
 } else {
   print("Error: length mismatch!")
 }
 # echo the column names before and after renaming
 colnames(corr_data)
 colnames(corr_data) <- my_corr_colnames
 colnames(corr_data)
 ###############
 # PLOTS: corr
 # http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
 ###############
 #default pairs plot
 # Keep every column up to, but excluding, "pyrazinamide".
 start <- 1
 end <- which(colnames(corr_data) == "pyrazinamide")
 end # should be the last column
 offset <- 1
 my_corr <- corr_data[start:(end - offset)]
 head(my_corr)
 #my_cols = c("#f8766d", "#00bfc4")
 # deep blue :#007d85
 # deep red: #ae301e
 #==========
 # psych: informative since it draws the ellipsoid
 # https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
 # http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
 #==========
 # set output dir for plots
 getwd()
 # The closing parenthesis was missing here, which made the whole script
 # fail to parse.
 setwd("~/git/Data/pyrazinamide/output/plots")
 getwd()
 svg('DUET_corr.svg', width = 15, height = 15)
 # Only the first four (numeric) columns are plotted; DUET_outcome is
 # used to pick the point fill colour.
 printFile <- pairs.panels(
   my_corr[1:4],
   method = "spearman", # correlation method
   hist.col = "grey", ##00AFBB
   density = TRUE, # show density plots
   ellipses = FALSE, # show correlation ellipses
   stars = TRUE,
   rug = FALSE,
   breaks = "Sturges",
   show.points = TRUE,
   bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$DUET_outcome))],
   pch = 21,
   jitter = TRUE,
   #alpha = .05,
   #points(pch = 19, col = c("#f8766d", "#00bfc4")),
   cex = 3,
   cex.axis = 2.5,
   cex.labels = 3,
   cex.cor = 1,
   smooth = FALSE
 )
 print(printFile)
 dev.off()
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/corr_plots_v3_lig.R
@ -0,0 +1,187 @@
 # Move to the plotting scripts directory so the relative source() paths
 # below resolve; the working directory is echoed before and after.
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
 getwd()
 ########################################################################
 #             Installing and loading required packages                 #
 ########################################################################
 source("../Header_TT.R")
 #source("barplot_colour_function.R")
 ########################################################################
 #         Read file: call script for combining df for lig              #
 ########################################################################
 source("../combining_two_df_lig.R")
 #---------------------- PAY ATTENTION
 # the above changes the working dir
 #[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
 #---------------------- PAY ATTENTION
 #==========================
 # This will return:
 # df with NA:
 # merged_df2
 # merged_df3
 # df without NA:
 # merged_df2_comp
 # merged_df3_comp
 #===========================
 ###########################
 # Data for Lig Corr plots
 # merged_df3_comp is required: its rows are matched, which allows
 # pairwise correlations.
 ###########################
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # REASSIGNMENT
 my_df <- merged_df3_comp
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # the merged data frames are no longer needed once my_df is set
 rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
 # quick checks
 colnames(my_df)
 str(my_df)
 #############################
 # Extra sanity check:
 # for mcsm_lig ONLY
 # Dis_lig_Ang should be <10
 #############################
 if (max(my_df$Dis_lig_Ang) < 10) {
   print ("Sanity check passed: lig data is <10Ang")
 } else {
   print ("Error: data should be filtered to be within 10Ang")
 }
 ########################################################################
 #               end of data extraction and cleaning for plots          #
 ########################################################################
 #===========================
 # Plot: Correlation plots
 #===========================
 #===================
 # Data for plots
 #===================
 #<<<<<<<<<<<<<<<<<<<<<<<<
 # REASSIGNMENT 
 # REASSIGNMENT: work on `df`; the old name is dropped.
 df <- my_df
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 rm(my_df)
 # sanity checks
 str(df)
 table(df$Lig_outcome)
 # unique positions
 length(unique(df$Position)) #{RESULT: unique positions for comp data}
 # Columns kept for the pairwise correlations. Candidates currently
 # switched off: "ratioDUET", "DUETStability_Kcalpermol", "PredAffLog",
 # "OR", "pvalue", "DUET_outcome".
 corr_data <- df[, c("ratioPredAff", "logor", "neglog10pvalue", "AF",
                     "Lig_outcome", "pyrazinamide")]
 dim(corr_data)
 rm(df)
 # Display names, in the same order as the columns selected above.
 # Switched-off counterparts: "DUET", "DUET_raw", "Lig_raw", "OR",
 # "P-value", "DUET_outcome".
 my_corr_colnames <- c("Ligand Affinity", "Log(Odds Ratio)", "-LogP",
                       "Allele Frequency", "Lig_outcome", "pyrazinamide")
 # sanity check: one display name per selected column
 if (length(my_corr_colnames) == length(corr_data)) {
   print("Sanity check passed: corr_data and corr_names match in length")
 } else {
   print("Error: length mismatch!")
 }
 # echo the column names before and after renaming
 colnames(corr_data)
 colnames(corr_data) <- my_corr_colnames
 colnames(corr_data)
 ###############
 # PLOTS: corr
 # http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
 ###############
 # default pairs plot
 # Keep every column up to, but excluding, "pyrazinamide".
 start <- 1
 end <- which(colnames(corr_data) == "pyrazinamide")
 end # should be the last column
 offset <- 1
 my_corr <- corr_data[start:(end - offset)]
 head(my_corr)
 #my_cols = c("#f8766d", "#00bfc4")
 # deep blue :#007d85
 # deep red: #ae301e
 #==========
 # psych: informative since it draws the ellipsoid
 # https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
 # http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
 #==========
 # set output dir for plots
 getwd()
 # The closing parenthesis was missing here, which made the whole script
 # fail to parse.
 setwd("~/git/Data/pyrazinamide/output/plots")
 getwd()
 svg('Lig_corr.svg', width = 15, height = 15)
 # Only the first four (numeric) columns are plotted; Lig_outcome is
 # used to pick the point fill colour.
 printFile <- pairs.panels(
   my_corr[1:4],
   method = "spearman", # correlation method
   hist.col = "grey", ##00AFBB
   density = TRUE, # show density plots
   ellipses = FALSE, # show correlation ellipses
   stars = TRUE,
   rug = FALSE,
   breaks = "Sturges",
   show.points = TRUE,
   bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$Lig_outcome))],
   pch = 21,
   jitter = TRUE,
   #alpha = .05,
   #points(pch = 19, col = c("#f8766d", "#00bfc4")),
   cex = 3,
   cex.axis = 2.5,
   cex.labels = 3,
   cex.cor = 1,
   smooth = FALSE
 )
 print(printFile)
 dev.off()
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_basic_barplot.R
@ -0,0 +1,227 @@
 # Move to the plotting scripts directory so the relative source() paths
 # below resolve; the working directory is echoed before and after.
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") 
 getwd()
 ########################################################################
 #             Installing and loading required packages                 #
 ########################################################################
 source("../Header_TT.R")
 #source("barplot_colour_function.R")
 require(data.table)
 ########################################################################
 #             Read file: call script for combining df                  #
 ########################################################################
 source("../combining_two_df.R")
 #---------------------- PAY ATTENTION
 # the above changes the working dir
 #[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
 #---------------------- PAY ATTENTION
 #==========================
 # This will return:
 # df with NA:
 # merged_df2
 # merged_df3
 # df without NA:
 # merged_df2_comp
 # merged_df3_comp
 #==========================
 ###########################
 # Data for plots
 # merged_df2 (the comprehensive one) is required because the relation
 # is one-to-many, i.e. the same SNP can belong to multiple lineages.
 ###########################
 # uncomment as necessary
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # REASSIGNMENT
 my_df <- merged_df2
 #my_df <- merged_df2_comp
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # the merged data frames are no longer needed once my_df is set
 rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
 # quick checks
 colnames(my_df)
 str(my_df)
 # lineage is converted to a factor (checked before and after)
 is.factor(my_df$lineage)
 my_df$lineage <- as.factor(my_df$lineage)
 is.factor(my_df$lineage)
 #==========================
 # Plot: Lineage barplot
 # x = lineage y = No. of samples
 # col = Lineage
 # fill = lineage
 #============================
 table(my_df$lineage)
 #        lineage1   lineage2   lineage3   lineage4   lineage5   lineage6 lineageBOV 
 #3        104       1293        264       1311          6          6        105 
 #===========================
 # Plot: Lineage Barplots
 #===========================
 #===================
 # Data for plots
 #===================
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # REASSIGNMENT
 df <- my_df
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 rm(my_df)
 # get freq count of positions so you can subset freq<1
 #setDT(df)[, lineage_count := .N, by = .(lineage)]
 #******************
 # generate plot: barplot of mutation by lineage
 #******************
 # Only lineages 1-4 are kept for this plot.
 sel_lineages <- paste0("lineage", 1:4)
 df_lin <- subset(df, subset = lineage %in% sel_lineages)
 #FIXME; add sanity check for numbers.
 # Done this manually
 ############################################################
 #########
 # Data for barplot: Lineage barplot
 # to show total samples and number of unique mutations 
 # within each lineage
 ##########
 # Create df with lineage inform & no. of unique mutations
 # per lineage and total samples within lineage
 # this is essentially barplot with two y axis
 # One row per selected lineage; the two count columns are filled in
 # after the loop.
 bar <- as.data.frame(sel_lineages) #4, 1
 total_snps_u <- NULL
 total_samples <- NULL
 for (i in sel_lineages) {
   #print(i)
   # Rows belonging to this lineage. %in% never yields NA, so rows with a
   # missing lineage are simply excluded.
   in_lineage <- df$lineage %in% i
   # Number of distinct sample ids within the lineage. The ids are
   # filtered first and de-duplicated afterwards; the previous code
   # indexed the already de-duplicated id vector with a row-length mask,
   # which does not count unique ids.
   curr_total <- length(unique(df$id[in_lineage]))
   total_samples <- c(total_samples, curr_total)
   print(total_samples)
   foo <- df[in_lineage, ]
   print(paste0(i, "======="))
   # Number of distinct mutations observed within the lineage.
   curr_count <- length(unique(foo$Mutationinformation))
   print(curr_count)
   total_snps_u <- c(total_snps_u, curr_count)
 }
 print(total_snps_u)
 bar$num_snps_u <- total_snps_u
 bar$total_samples <- total_samples
 bar
 #*****************
 # generate plot: lineage barplot with two y-axis
 #https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2
 #*****************
 # Pull the plotting vectors out of `bar`. These assignments were
 # previously written the wrong way round (bar$... = y1), which referenced
 # the undefined objects y1, y2 and x.
 x <- sel_lineages
 y1 <- bar$num_snps_u
 y2 <- bar$total_samples
 to_plot <- data.frame(x = x
                       , y1 = y1
                       , y2 = y2)
 to_plot
 # Long format: one row per (lineage, variable) pair for the stacked bars.
 # id.vars is spelled out rather than relying on partial matching of `id`.
 melted <- melt(to_plot, id.vars = "x")
 melted
 # set output dir for plots
 # Switch to the plot output directory (echoed before and after).
 getwd()
 setwd("~/git/Data/pyrazinamide/output/plots")
 getwd()
 svg('lineage_basic_barplot.svg')
 my_ats <- 20 # axis text size
 my_als <- 22 # axis label size
 g <- ggplot(melted, aes(x = x, y = value, fill = variable))
 # Stacked bars (y1 then y2) with the value printed on each segment.
 printFile <- g +
   geom_bar(
     stat = "identity",
     position = position_stack(reverse = TRUE),
     alpha = .75,
     colour = 'grey75'
   ) +
   theme(
     axis.text.x = element_text(size = my_ats), # angle = 30 switched off
     axis.text.y = element_text(size = my_ats, hjust = 1, vjust = 0),
     axis.title.x = element_text(size = my_als, colour = 'black'),
     axis.title.y = element_text(size = my_als, colour = 'black'),
     legend.position = "top",
     legend.text = element_text(size = my_als)
   ) +
   # geom_label is used here; geom_text (with check_overlap) is the
   # switched-off alternative
   geom_label(
     aes(label = value),
     size = 5,
     hjust = 0.5,
     vjust = 0.5,
     colour = 'black',
     show.legend = FALSE,
     position = position_stack(reverse = TRUE)
   ) +
   labs(
     title = '',
     x = '',
     y = "Number",
     fill = 'Variable',
     colour = 'black'
   ) +
   scale_fill_manual(
     values = c('grey50', 'gray75'),
     name = '',
     labels = c('Mutations', 'Total Samples')
   ) +
   scale_x_discrete(
     breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4'),
     labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
   )
 # draw onto the svg device, then close it
 print(printFile)
 dev.off()
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_LIG.R
@ -0,0 +1,233 @@
 # Move to the plotting scripts directory so the relative source() paths
 # below resolve; the working directory is echoed before and after.
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
 getwd()
 ########################################################################
 #             Installing and loading required packages                 #
 ########################################################################
 source("../Header_TT.R")
 #source("barplot_colour_function.R")
 #require(data.table)
 ########################################################################
 #         Read file: call script for combining df for Lig              #
 ########################################################################
 source("../combining_two_df_lig.R")
 #---------------------- PAY ATTENTION
 # the above changes the working dir
 #[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
 #---------------------- PAY ATTENTION
 #==========================
 # This will return:
 # df with NA:
 # merged_df2
 # merged_df3
 # df without NA:
 # merged_df2_comp
 # merged_df3_comp
 #===========================
 ###########################
 # Data for plots
 # merged_df2 or merged_df2_comp is required because the relation is
 # one-to-many, i.e. the same SNP can belong to multiple lineages.
 ###########################
 # uncomment as necessary
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # REASSIGNMENT
 my_df <- merged_df2
 #my_df <- merged_df2_comp
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # the merged data frames are no longer needed once my_df is set
 rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
 # quick checks
 colnames(my_df)
 str(my_df)
 # lineage is converted to a factor (checked before and after)
 is.factor(my_df$lineage)
 my_df$lineage <- as.factor(my_df$lineage)
 is.factor(my_df$lineage)
 table(my_df$mutation_info)
 #############################
 # Extra sanity check:
 # for mcsm_lig ONLY
 # Dis_lig_Ang should be <10
 #############################
 if (max(my_df$Dis_lig_Ang) < 10) {
   print ("Sanity check passed: lig data is <10Ang")
 } else {
   print ("Error: data should be filtered to be within 10Ang")
 }
 ########################################################################
 #               end of data extraction and cleaning for plots          #
 ########################################################################
 #==========================
 # Plot: Lineage Distribution
 # x = mcsm_values, y = dist
 # fill = stability
 #============================
 #===================
 # Data for plots
 #===================
 # subset only lineages1-4
 # Only lineages 1-4 are kept.
 sel_lineages <- paste0("lineage", 1:4)
 # uncomment as necessary
 df_lin <- subset(my_df, subset = lineage %in% sel_lineages) #2037 35
 # refactor: rebuild the factor so that only the levels present remain
 df_lin$lineage <- factor(df_lin$lineage)
 table(df_lin$lineage) #{RESULT: No of samples within lineage}
 #lineage1 lineage2 lineage3 lineage4 
 #78     961      195     803 
 # when merged_df2_comp is used
 #lineage1 lineage2 lineage3 lineage4 
 #77     955      194     770
 length(unique(df_lin$Mutationinformation))
 #{Result: No. of unique mutations the 4 lineages contribute to}
 # sanity check: the subset must have as many rows as entries 2 to 5 of
 # the lineage table of the full data
 r1 <- 2:5 # when merged_df2 used: because there is missing lineages 
 if (sum(table(my_df$lineage)[r1]) == nrow(df_lin)) {
   print ("sanity check passed: numbers match")
 } else {
   print("Error!: check your numbers")
 }
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # REASSIGNMENT
 df <- df_lin
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 rm(df_lin)
 #******************
 # generate distribution plot of lineages
 #******************
 # basic: could improve this!
 library(plotly)
 library(ggridges)
 # Facet strip labels, keyed by the lineage values in the data.
 fooNames <- c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
 names(fooNames) <- c('lineage1', 'lineage2', 'lineage3', 'lineage4')
 # Density of ratioPredAff per lineage, filled by ligand outcome.
 g <- ggplot(df, aes(x = ratioPredAff)) +
   geom_density(aes(fill = Lig_outcome)
                , alpha = 0.5) +
   facet_wrap( ~ lineage
              , scales = "free"
              , labeller = labeller(lineage = fooNames) ) +
   coord_cartesian(xlim = c(-1, 1)
 #                  , ylim = c(0, 6)
 #                  , clip = "off"
   ) +
   # The `+` above was missing, so the title was evaluated as a separate
   # expression and never added to the plot.
   ggtitle("Kernel Density estimates of Ligand affinity by lineage")
 ggplotly(g)
 # 2 : ggridges (good!)
 my_ats <- 15 # axis text size
 my_als <- 20 # axis label size
 # Facet strip labels, keyed by the lineage values in the data.
 fooNames <- c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
 names(fooNames) <- c('lineage1', 'lineage2', 'lineage3', 'lineage4')
 # Switch to the plot output directory (echoed before and after).
 getwd()
 setwd("~/git/Data/pyrazinamide/output/plots")
 getwd()
 svg('lineage_dist_LIG.svg')
 # Ridge plot of ratioPredAff by ligand outcome, one facet per lineage;
 # the fill gradient follows the x value.
 printFile <- ggplot(df, aes(x = ratioPredAff, y = Lig_outcome)) +
   geom_density_ridges_gradient(aes(fill = ..x..), scale = 3, size = 0.3) +
   facet_wrap(
     ~lineage,
     scales = "free",
     #switch = 'x',
     labeller = labeller(lineage = fooNames)
   ) +
   coord_cartesian(
     xlim = c(-1, 1)
     #, ylim = c(0, 6)
     #, clip = "off"
   ) +
   scale_fill_gradientn(
     colours = c("#f8766d", "white", "#00bfc4"),
     name = "Ligand Affinity"
   ) +
   theme(
     axis.text.x = element_text(size = my_ats, angle = 90, hjust = 1, vjust = 0.4),
     # y axis text, titles and ticks are hidden; the switched-off
     # alternative was element_text(size = my_ats, angle = 0, hjust = 1, vjust = 0)
     axis.text.y = element_blank(),
     axis.title.x = element_blank(),
     axis.title.y = element_blank(),
     axis.ticks.y = element_blank(),
     plot.title = element_blank(),
     strip.text = element_text(size = my_als),
     legend.text = element_text(size = 10),
     legend.title = element_text(size = my_als)
     #, legend.position = c(0.3, 0.8)
     #, legend.key.height = unit(1, 'mm')
   )
 # draw onto the svg device, then close it
 print(printFile)
 dev.off()
 #=!=!=!=!=!=!
 # COMMENT: When you look at all mutations, the lineage differences disappear...
 # The pattern we are interested in is possibly only for dr_mutations
 #=!=!=!=!=!=!
 #===================================================
 # COMPARING DISTRIBUTIONS
 head(df$lineage)
 # compare lineage as plain strings
 df$lineage <- as.character(df$lineage)
 # ratioPredAff values of each lineage
 lin1 <- df$ratioPredAff[df$lineage == "lineage1"]
 lin2 <- df$ratioPredAff[df$lineage == "lineage2"]
 lin3 <- df$ratioPredAff[df$lineage == "lineage3"]
 lin4 <- df$ratioPredAff[df$lineage == "lineage4"]
 # ks test: every pairwise combination of the four lineages
 ks.test(lin1, lin2)
 ks.test(lin1, lin3)
 ks.test(lin1, lin4)
 ks.test(lin2, lin3)
 ks.test(lin2, lin4)
 ks.test(lin3, lin4)
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
@ -0,0 +1,212 @@
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") # thinkpad
 getwd()
 ########################################################################
 # Load required packages (done by the shared header script)
 ########################################################################
 source("../Header_TT.R")
 #source("barplot_colour_function.R")
 #require(data.table)
 ########################################################################
 # Read data: run the script that combines the two dfs for PS
 ########################################################################
 source("../combining_two_df.R")
 #---------------------- PAY ATTENTION
 # sourcing the script above changes the working dir to
 #[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
 #---------------------- PAY ATTENTION
 #==========================
 # The sourced script provides:
 #   with NA:    merged_df2, merged_df3
 #   without NA: merged_df2_comp, merged_df3_comp
 #==========================
 ###########################
 # Data for plots
 # Use merged_df2 (or merged_df2_comp): this is the one-to-many
 # table, i.e. the same SNP can belong to multiple lineages.
 ###########################
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # REASSIGNMENT (switch the commented line to use the complete-case df)
 my_df <- merged_df2
 #my_df <- merged_df2_comp
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # drop the data frames that are no longer needed
 rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
 # quick look at what was loaded
 colnames(my_df)
 str(my_df)
 # lineage must be a factor for plotting: report, convert, report again
 is.factor(my_df$lineage)
 my_df$lineage <- as.factor(my_df$lineage)
 is.factor(my_df$lineage)
 table(my_df$mutation_info)
 ########################################################################
 #               end of data extraction and cleaning for plots          #
 ########################################################################
 #==========================
 # Plot: Lineage Distribution
 # x = mcsm_values, y = dist
 # fill = stability
 #============================
 #===================
 # Data for plots
 #===================
 # keep only lineages 1-4
 sel_lineages <- c("lineage1"
                 , "lineage2"
                 , "lineage3"
                 , "lineage4")
 df_lin <- subset(my_df, subset = lineage %in% sel_lineages)
 # refactor so that unused lineage levels are dropped
 df_lin$lineage <- factor(df_lin$lineage)
 table(df_lin$lineage) #{RESULT: No of samples within lineage}
 # recorded result with merged_df2:
 #lineage1 lineage2 lineage3 lineage4
 #104     1293      264     1311
 # recorded result with merged_df2_comp:
 #lineage1 lineage2 lineage3 lineage4
 #99     1275      263     1255
 length(unique(df_lin$Mutationinformation))
 #{Result: No. of unique mutations the 4 lineages contribute to}
 # sanity check: the rows kept must equal the sum of the per-lineage counts
 # in the full data. The counts are selected by lineage NAME rather than by
 # position in the table, so the check stays valid whichever input df is
 # used (positions shift depending on which other levels are present,
 # e.g. the missing-lineage level in merged_df2).
 lineage_counts <- table(my_df$lineage)
 n_expected <- sum(lineage_counts[names(lineage_counts) %in% sel_lineages])
 if (n_expected == nrow(df_lin)) {
   print("sanity check passed: numbers match")
 } else {
   print("Error!: check your numbers")
 }
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 # REASSIGNMENT: downstream plotting code works on `df`
 df <- df_lin
 #<<<<<<<<<<<<<<<<<<<<<<<<<
 rm(df_lin, lineage_counts, n_expected)
 #******************
 # Distribution plots of ratioDUET by lineage
 #******************
 library(plotly)
 library(ggridges)
 # 1: basic interactive version (could be improved): kernel density of
 # ratioDUET, filled by DUET_outcome, one free-scaled panel per lineage
 g <- ggplot(df, aes(x = ratioDUET)) +
   geom_density(aes(fill = DUET_outcome), alpha = 0.5) +
   facet_wrap(~ lineage, scales = "free") +
   ggtitle("Kernel Density estimates of Protein stability by lineage")
 ggplotly(g)
 # 2: ggridges version (good!), written to an svg file
 my_ats <- 15 # axis text size
 my_als <- 20 # axis label size
 # facet strip labels, keyed by the lineage values found in the data
 fooNames <- c(lineage1 = 'Lineage 1'
               , lineage2 = 'Lineage 2'
               , lineage3 = 'Lineage 3'
               , lineage4 = 'Lineage 4')
 # switch to the plot output dir: the svg is written relative to it
 getwd()
 setwd("~/git/Data/pyrazinamide/output/plots")
 getwd()
 svg('lineage_dist_PS.svg')
 # ridgeline density of ratioDUET per DUET_outcome, fill follows the x value
 printFile <- ggplot(df, aes(x = ratioDUET, y = DUET_outcome)) +
   geom_density_ridges_gradient(aes(fill = ..x..), scale = 3, size = 0.3) +
   facet_wrap(~lineage
              , scales = "free"
              , labeller = labeller(lineage = fooNames)) +
   # zoom the x axis to [-1, 1] (ylim / clip deliberately left at defaults)
   coord_cartesian(xlim = c(-1, 1)) +
   scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
                        , name = "DUET") +
   # y axis text/ticks and all titles are hidden; only x text, strips
   # and legend are sized
   theme(axis.text.x = element_text(size = my_ats
                                    , angle = 90
                                    , hjust = 1
                                    , vjust = 0.4)
         , axis.text.y = element_blank()
         , axis.title.x = element_blank()
         , axis.title.y = element_blank()
         , axis.ticks.y = element_blank()
         , plot.title = element_blank()
         , strip.text = element_text(size = my_als)
         , legend.text = element_text(size = 10)
         , legend.title = element_text(size = my_als))
 print(printFile)
 dev.off()
 #=!=!=!=!=!=!
 # COMMENT: When you look at all mutations, the lineage differences disappear...
 # The pattern we are interested in is possibly only for dr_mutations
 #=!=!=!=!=!=!
 #===================================================
 # COMPARING DISTRIBUTIONS
 # Pairwise two-sample Kolmogorov-Smirnov tests on ratioDUET
 # between lineages 1-4 (all six pairs).
 #===================================================
 head(df$lineage)
 # compare lineage labels as plain character strings
 df$lineage <- as.character(df$lineage)
 # one vector of ratioDUET values per lineage
 lin1 <- df$ratioDUET[df$lineage == "lineage1"]
 lin2 <- df$ratioDUET[df$lineage == "lineage2"]
 lin3 <- df$ratioDUET[df$lineage == "lineage3"]
 lin4 <- df$ratioDUET[df$lineage == "lineage4"]
 # ks tests: lineage 1 against the rest
 ks.test(lin1, lin2)
 ks.test(lin1, lin3)
 ks.test(lin1, lin4)
 # ks tests: lineage 2 against the rest
 ks.test(lin2, lin3)
 ks.test(lin2, lin4)
 # ks test: lineage 3 vs lineage 4
 ks.test(lin3, lin4)
--- a/mcsm_analysis/pyrazinamide/scripts/read_pdb.R
+++ b/mcsm_analysis/pyrazinamide/scripts/read_pdb.R
@ -0,0 +1,27 @@
 #########################
 # 3: Read complex pdb file
 #########################
 source("Header_TT.R")
 # This script only reads the pdb file of the complex and leaves the result
 # in `my_pdb` (a list of 8, per the original author's note).
 #
 # It is meant to be source()d from other scripts, so its temporary
 # variables use script-specific names: the previous generic names
 # (inDir / inFile) overwrote and then removed the identically named
 # variables of the sourcing script.
 pdb_in_dir <- "~/git/Data/pyrazinamide/input/structure/"
 pdb_in_file <- paste0(pdb_in_dir, "complex1_no_water.pdb")
 # second complex, currently unused:
 #pdb_in_file2 <- paste0(pdb_in_dir, "complex2_no_water.pdb")
 # read.pdb is not defined here; presumably provided by a package loaded in
 # Header_TT.R -- TODO confirm
 my_pdb <- read.pdb(pdb_in_file
                    , maxlines = -1
                    , multi = FALSE
                    , rm.insert = FALSE
                    , rm.alt = TRUE
                    , ATOM.only = FALSE
                    , hex = FALSE
                    , verbose = TRUE)
 # clean up the temporaries so only my_pdb is left behind
 rm(pdb_in_dir, pdb_in_file)
 #====== end of script
--- a/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R
+++ b/mcsm_analysis/pyrazinamide/scripts/replaceBfactor_pdb.R
@ -0,0 +1,386 @@
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
 getwd()
 ########################################################################
 # Installing and loading required packages
 ########################################################################
 source("Header_TT.R")
 #########################################################
 # TASK: replace B-factors in the pdb file with normalised values.
 # Uses the complex file with no water, as mCSM lig was performed on
 # that file (see read_pdb.R).
 #########################################################
 ###########################
 # Read file: average stability values
 # or mcsm_normalised file, output of step 4 mcsm pipeline
 ###########################
 inDir <- "~/git/Data/pyrazinamide/input/processed/"
 inFile <- paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
 my_df <- read.csv(inFile, header = TRUE)
 str(my_df)
 #=========================================================
 # Processing P1: Replacing B factor with mean ratioDUET scores
 #=========================================================
 # read the complex pdb file into `my_pdb` (list of 8)
 source("read_pdb.R")
 # first list element is the atom table (a data frame)
 d <- my_pdb[[1]]
 # untouched copy, used by the dimension check further down
 d2 <- d
 # range of the original B-factors
 max(d$b); min(d$b)
 #*******************************************
 # 3 x 2 grid of histogram/density pairs for inspection:
 # 1: original B-factors
 # 2: original DUET scores
 # 3: B-factors after replacement with DUET scores
 #*******************************************
 par(oma = c(3,2,3,0)
     , mar = c(1,3,5,2)
     , mfrow = c(3,2))
 # 1: original B-factor
 hist(d$b
      , xlab = ""
      , main = "B-factor")
 plot(density(d$b)
      , xlab = ""
      , main = "B-factor")
 # 2: DUET scores
 hist(my_df$average_DUETR
      , xlab = ""
      , main = "Norm_DUET")
 plot(density(my_df$average_DUETR)
      , xlab = ""
      , main = "Norm_DUET")
 # 3: drawn after the replacement below
 #=========
 # step 0_P1: optional manual check, DONT RUN once the match is verified
 #=========
 # match into a separate column first to eyeball the result
 # colnames(my_df)
 # d$ratioDUET = my_df$average_DUETR[match(d$resno, my_df$Position)]
 #=========
 # step 1_P1: replace in place
 #=========
 # link pdb `resno` to my_df `Position`; residues without a matching
 # position get NA
 d$b <- my_df$average_DUETR[match(d$resno, my_df$Position)]
 #=========
 # step 2_P1: turn the unmatched (NA) B-factors into 0
 #=========
 # number of unmatched entries
 b_na <- sum(is.na(d$b)); b_na
 # number of genuine 0 scores among the matched entries; na.rm is needed
 # because d$b still contains NA here (without it the count is NA)
 b_zero <- sum(d$b == 0, na.rm = TRUE); b_zero
 # matched scores only, kept for the range checks below
 matched_scores <- d$b[!is.na(d$b)]
 #table(d$b)
 d$b[is.na(d$b)] <- 0
 # sanity check: should be 0
 sum(is.na(d$b))
 # sanity check: zeros now = genuine zeros + replaced NAs
 if (sum(d$b == 0) == b_na + b_zero) {
   print("Sanity check passed: NA's replaced with 0's successfully")
 } else {
   print("Error: NA replacement NOT successful, Debug code!")
 }
 max(d$b); min(d$b)
 # sanity checks: the extremes of the matched scores should equal the
 # extremes of the input scores. The padded zeros are excluded, otherwise
 # the check fails whenever all scores share one sign.
 if (max(matched_scores) == max(my_df$average_DUETR)) {
   print("Sanity check passed: B-factors replaced correctly")
 } else {
   print("Error: Debug code please")
 }
 if (min(matched_scores) == min(my_df$average_DUETR)) {
   print("Sanity check passed: B-factors replaced correctly")
 } else {
   print("Error: Debug code please")
 }
 #=========
 # step 3_P1
 #=========
 # sanity check: dimensions must be unchanged, should be TRUE TRUE
 dim(d) == dim(d2)
 #=========
 # step 4_P1
 #=========
 # put the modified atom table back into the pdb object
 my_pdb[[1]] <- d
 max(d$b); min(d$b)
 #=========
 # step 5_P1: write the modified structure
 #=========
 getwd()
 outDir <- "~/git/Data/pyrazinamide/input/structure/"
 outFile <- paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
 write.pdb(my_pdb, outFile)
 #********************************
 # Plots continued...
 # 3: hist and density of the B-factors replaced with DUET scores
 #********************************
 hist(d$b
      , xlab = ""
      , main = "replaced-B")
 plot(density(d$b)
      , xlab = ""
      , main = "replaced-B")
 # outer titles for the whole grid
 mtext(text = "Frequency"
       , side = 2
       , line = 0
       , outer = TRUE)
 mtext(text = "DUET_stability"
       , side = 3
       , line = 0
       , outer = TRUE)
 #********************************
 #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 # NOTE: the replaced B-factor distribution has the same x-axis as the
 # normalised scores, but its shape is affected because 0 is
 # over-represented: every position without a SNP has been assigned 0.
 #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 #######################################################################
 #====================== end of section 1 ==============================
 #######################################################################
 #=========================================================
 # Processing P2: Replacing  B values with PredAff Scores
 #=========================================================
 # clear workspace so nothing from section 1 leaks into this section
 rm(list = ls())
 ###########################
 # Read file: average stability values
 # or mcsm_normalised file, output of step 4 mcsm pipeline
 ###########################
 inDir <- "~/git/Data/pyrazinamide/input/processed/"
 inFile <- paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
 my_df <- read.csv(inFile, header = TRUE)
 str(my_df)
 #rm(inDir, inFile)
 # read the complex pdb file into `my_pdb` (list of 8)
 source("read_pdb.R")
 # first list element is the atom table (a data frame)
 d <- my_pdb[[1]]
 # untouched copy, used by the dimension check further down
 d2 <- d
 # range of the original B-factors
 max(d$b); min(d$b)
 #*******************************************
 # 3 x 2 grid of histogram/density pairs for inspection:
 # 1: original B-factors
 # 2: original PredAff scores
 # 3: B-factors after replacement with PredAff scores
 #*******************************************
 par(oma = c(3,2,3,0)
     , mar = c(1,3,5,2)
     , mfrow = c(3,2))
 # 1: original B-factor
 hist(d$b
      , xlab = ""
      , main = "B-factor")
 plot(density(d$b)
      , xlab = ""
      , main = "B-factor")
 # 2: PredAff scores
 hist(my_df$average_PredAffR
      , xlab = ""
      , main = "Norm_lig_average")
 plot(density(my_df$average_PredAffR)
      , xlab = ""
      , main = "Norm_lig_average")
 # 3: drawn after the replacement below
 #=================================================
 # Replacement is done with match(), linking on the position number:
 # in the pdb atom table this is column "resno",
 # in my_df this is column "Position".
 #=================================================
 #=========
 # step 0_P2: optional manual check, DONT RUN once the match is verified
 #=========
 # match into a separate column first to eyeball the result
 # colnames(my_df)
 # d$ratioPredAff = my_df$average_PredAffR[match(d$resno, my_df$Position)] #1384, 17
 #=========
 # step 1_P2: replace in place (don't run step 0)
 #=========
 # residues without a matching position get NA
 d$b <- my_df$average_PredAffR[match(d$resno, my_df$Position)]
 #=========
 # step 2_P2: turn the unmatched (NA) B-factors into 0
 #=========
 # number of unmatched entries
 b_na <- sum(is.na(d$b)); b_na
 # number of genuine 0 scores among the matched entries; na.rm is needed
 # because d$b still contains NA here (without it the count is NA)
 b_zero <- sum(d$b == 0, na.rm = TRUE); b_zero
 # matched scores only, kept for the range checks below
 matched_scores <- d$b[!is.na(d$b)]
 #table(d$b)
 d$b[is.na(d$b)] <- 0
 # sanity check: should be 0
 sum(is.na(d$b))
 # sanity check: zeros now = genuine zeros + replaced NAs
 if (sum(d$b == 0) == b_na + b_zero) {
   print("Sanity check passed: NA's replaced with 0's successfully")
 } else {
   print("Error: NA replacement NOT successful, Debug code!")
 }
 max(d$b); min(d$b)
 # sanity checks: the extremes of the matched scores should equal the
 # extremes of the input scores. The padded zeros are excluded, otherwise
 # the check fails whenever all scores share one sign.
 if (max(matched_scores) == max(my_df$average_PredAffR)) {
   print("Sanity check passed: B-factors replaced correctly")
 } else {
   print("Error: Debug code please")
 }
 if (min(matched_scores) == min(my_df$average_PredAffR)) {
   print("Sanity check passed: B-factors replaced correctly")
 } else {
   print("Error: Debug code please")
 }
 #=========
 # step 3_P2
 #=========
 # sanity check: dimensions must be unchanged, should be TRUE TRUE
 dim(d) == dim(d2)
 #=========
 # step 4_P2
 #=========
 # put the modified atom table back into the pdb object
 my_pdb[[1]] <- d
 max(d$b); min(d$b)
 #=========
 # step 5_P2: write the modified structure
 #=========
 outDir <- "~/git/Data/pyrazinamide/input/structure/"
 outFile <- paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile
 write.pdb(my_pdb, outFile)
 #********************************
 # Plots continued...
 # 3: hist and density of the B-factors replaced with PredAff scores
 #********************************
 hist(d$b
      , xlab = ""
      , main = "replaced-B")
 plot(density(d$b)
      , xlab = ""
      , main = "replaced-B")
 # outer titles for the whole grid
 mtext(text = "Frequency"
       , side = 2
       , line = 0
       , outer = TRUE)
 mtext(text = "Lig_stability"
       , side = 3
       , line = 0
       , outer = TRUE)
 #********************************
 ###########
 # end of output files with Bfactors
 ##########
--- a/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R
+++ b/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R
@ -0,0 +1,257 @@
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
 getwd()
 #########################################################
 # 1: Installing and loading required packages           #
 #########################################################
 source("Header_TT.R")
 #source("barplot_colour_function.R")
 ##########################################################
 #           Checking: Entire data frame and for PS      #
 ##########################################################
 ###########################
 #2) Read file: combined one from the script
 ###########################
 source("combining_two_df.R")
 # The sourced script provides:
 #   with NA:    merged_df2, merged_df3
 #   without NA: merged_df2_comp, merged_df3_comp
 ######################
 # The check is done against merged_df3
 ######################
 #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # REASSIGNMENT
 my_df <- merged_df3
 #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # clear variables that are no longer needed
 rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
 # should be true
 identical(my_df$Position, my_df$position)
 #################################
 # Read file: normalised file
 # output of step 4 mcsm_pipeline
 #################################
 inDir <- "~/git/Data/pyrazinamide/input/processed/"
 inFile <- paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
 mcsm_data <- read.csv(inFile
                       , row.names = 1
                       , stringsAsFactors = FALSE
                       , header = TRUE)
 str(mcsm_data)
 my_colnames <- colnames(mcsm_data)
 #====================================
 # restrict my_df to the columns present in the mcsm data
 my_df2 <- my_df[my_colnames]
 #====================================
 # eyeball the two side by side
 head(mcsm_data$Mutationinformation)
 head(mcsm_data$Position)
 head(my_df2$Mutationinformation)
 head(my_df2$Position)
 # sort mcsm data by Mutationinformation
 mcsm_data_s <- mcsm_data[order(mcsm_data$Mutationinformation),]
 head(mcsm_data_s$Mutationinformation)
 head(mcsm_data_s$Position)
 # strict comparison: observed to be FALSE, apparently only because the
 # row names of the two data frames differ
 identical(mcsm_data_s, my_df2)
 # from library dplyr
 setdiff(mcsm_data_s, my_df2)
 # from lib compare
 compare(mcsm_data_s, my_df2) # seems rownames are the problem
 # automated content check: compare copies with the row names reset, so
 # only the column contents decide the outcome. Prints TRUE when the two
 # agree, otherwise a description of the differences.
 mcsm_data_cmp <- mcsm_data_s
 rownames(mcsm_data_cmp) <- NULL
 my_df2_cmp <- my_df2
 rownames(my_df2_cmp) <- NULL
 print(all.equal(mcsm_data_cmp, my_df2_cmp))
 rm(mcsm_data_cmp, my_df2_cmp)
 # manual alternative: write both out and diff them (was checked with meld,
 # files were identical)
 #write.csv(mcsm_data_s, "mcsm_data_s.csv", row.names = F)
 #write.csv(my_df2, "my_df2.csv", row.names = F)
 #====================================================== end of section 1
 ##########################################################
 #             Checking: LIG(Filtered dataframe)          #
 ##########################################################
 # clear workspace
 rm(list = ls())
 ###########################
 #3) Read file: combined_lig from the script
 ###########################
 source("combining_two_df_lig.R")
 # The sourced script provides:
 #   with NA:    merged_df2, merged_df3
 #   without NA: merged_df2_comp, merged_df3_comp
 ######################
 # The check is done against merged_df3
 ######################
 #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # REASSIGNMENT
 my_df <- merged_df3
 #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # clear variables that are no longer needed.
 # merged_df3 and merged_df3_comp are deliberately KEPT: the SNP position
 # output sections at the end of this script read them (removing them here
 # made those sections fail with "object not found").
 rm(merged_df2, merged_df2_comp)
 # should be true
 identical(my_df$Position, my_df$position)
 #################################
 # Read file: normalised file
 # output of step 4 mcsm_pipeline
 #################################
 inDir <- "~/git/Data/pyrazinamide/input/processed/"
 inFile <- paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
 mcsm_data <- read.csv(inFile
                       , row.names = 1
                       , stringsAsFactors = FALSE
                       , header = TRUE)
 str(mcsm_data)
 ###########################
 # 4a: Filter/subset data: ONLY for LIGand analysis
 # keep mutations with Dis_lig_Ang < 10 (Angstrom)
 ###########################
 # sanity checks
 upos <- unique(mcsm_data$Position)
 # check range of distances
 max(mcsm_data$Dis_lig_Ang)
 min(mcsm_data$Dis_lig_Ang)
 # Lig filtered: subset data to have only values less than 10 Ang
 mcsm_data2 <- subset(mcsm_data, Dis_lig_Ang < 10)
 rm(mcsm_data) # to avoid confusion
 # after filtering, the first table should be all TRUE, the second all FALSE
 table(mcsm_data2$Dis_lig_Ang < 10)
 table(mcsm_data2$Dis_lig_Ang > 10)
 max(mcsm_data2$Dis_lig_Ang)
 min(mcsm_data2$Dis_lig_Ang)
 upos_f <- unique(mcsm_data2$Position); upos_f
 # column names used to subset the bigger df
 my_colnames <- colnames(mcsm_data2)
 #====================================
 # restrict the bigger df (my_df) to the columns present in mcsm_data2
 my_df2 <- my_df[my_colnames]
 rm(my_df) # to avoid confusion
 #====================================
 # eyeball the two side by side
 head(mcsm_data2$Mutationinformation)
 head(mcsm_data2$Position)
 head(my_df2$Mutationinformation)
 head(my_df2$Position)
 # sort mcsm data by Mutationinformation
 mcsm_data2_s <- mcsm_data2[order(mcsm_data2$Mutationinformation),]
 head(mcsm_data2_s$Mutationinformation)
 head(mcsm_data2_s$Position)
 # strict comparison: observed to be FALSE, apparently only because the
 # row names of the two data frames differ
 identical(mcsm_data2_s, my_df2)
 # from library dplyr
 setdiff(mcsm_data2_s, my_df2)
 # from library compare
 compare(mcsm_data2_s, my_df2) # seems rownames are the problem
 # automated content check: compare copies with the row names reset, so
 # only the column contents decide the outcome. Prints TRUE when the two
 # agree, otherwise a description of the differences.
 mcsm_data2_cmp <- mcsm_data2_s
 rownames(mcsm_data2_cmp) <- NULL
 my_df2_cmp <- my_df2
 rownames(my_df2_cmp) <- NULL
 print(all.equal(mcsm_data2_cmp, my_df2_cmp))
 rm(mcsm_data2_cmp, my_df2_cmp)
 # manual alternative: write both out and diff them (was checked with meld,
 # files were identical)
 #write.csv(mcsm_data2_s, "mcsm_data2_s.csv", row.names = F)
 #write.csv(my_df2, "my_df2.csv", row.names = F)
 ##########################################################
 #  extract and write output file for SNP posn: all     #
 ##########################################################
 head(merged_df3$Position)
 foo <- merged_df3[order(merged_df3$Position),]
 head(foo$Position)
 snp_pos_unique <- unique(foo$Position); snp_pos_unique
 # sanity check:
 # NOTE(review): combined_df is not created in this script; presumably it is
 # left behind by combining_two_df_lig.R -- confirm, and confirm that it has
 # one row per unique position (otherwise the two vectors differ in length).
 table(snp_pos_unique == combined_df$Position)
 #=====================
 # write_output files
 #=====================
 # same processed-data dir as inDir above (the path previously lacked the
 # "git/" component used by every other data path in this script)
 outDir <- "~/git/Data/pyrazinamide/input/processed/"
 outFile1 <- paste0(outDir, "snp_pos_unique.txt"); outFile1
 print(paste0("Output file name and path will be:", outFile1))
 write.table(snp_pos_unique
             , outFile1
             , row.names = FALSE
             , col.names = FALSE)
 ##############################################################
 #  extract and write output file for SNP posn: complete only #
 ##############################################################
 head(merged_df3_comp$Position)
 foo <- merged_df3_comp[order(merged_df3_comp$Position),]
 head(foo$Position)
 snp_pos_unique <- unique(foo$Position); snp_pos_unique
 # outDir is already set above
 outFile2 <- paste0(outDir, "snp_pos_unique_comp.txt")
 print(paste0("Output file name and path will be:", outFile2))
 write.table(snp_pos_unique
             , outFile2
             , row.names = FALSE
             , col.names = FALSE)
 #============================== end of script
--- a/meta_data_analysis/.Rhistory
+++ b/meta_data_analysis/.Rhistory
@ -0,0 +1,512 @@
 , stringsAsFactors = F)
 x = as.numeric(grepl(i,raw_data$all_muts_pza))
 # DV: pyrazinamide 0 or 1
 y = as.numeric(raw_data$pyrazinamide)
 table(y,x)
 # run glm model
 model = glm(y ~ x, family = binomial)
 #model = glm(y ~ x, family = binomial(link = "logit"))
 summary(model)
 #**********
 # extract relevant model output
 #**********
 # extract log OR i.e the Beta estimate of the logistic model for a given snp
 my_logor = summary(model)$coefficients[2,1]
 print(paste0('Beta:', my_logor))
 # extract SE of the logistic model for a given snp
 my_se = summary(model)$coefficients[2,2]
 print(paste0('SE:', my_se))
 # extract Z of the logistic model for a given snp
 my_zval = summary(model)$coefficients[2,3]
 print(paste0('Z-value:', my_zval))
 # Dervive OR i.e exp(my_logor) from the logistic model for a given snp
 #my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
 my_or = exp(summary(model)$coefficients[2,1])
 print(paste0('OR:', my_or))
 # sanity check : should be True
 log(my_or) == my_logor
 # extract P-value of the logistic model for a given snp
 my_pval = summary(model)$coefficients[2,4]
 print(paste0('P-value:', my_pval))
 # extract confint interval of snp (2 steps, since the output is a named number)
 ci_mod = exp(confint(model))[2,]
 my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
 print(paste0('CI:', my_ci))
 #*************
 # Assign the regression output in the original df
 # you can use ('=' or '<-/->')
 #*************
 #pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i] = my_logor
 my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i]
 my_logor
 pnca_snps_or$Mutationinformation == i
 View(pnca_snps_or)
 #===============
 # Step 4: Calculate for one snp
 # using i, when you run the loop, it is easy
 #===============
 i = "pnca_p.trp68gly"
 pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
 , stringsAsFactors = F
 , header = T) #2133
 # uncomment as necessary
 pnca_snps_or = pnca_snps_or[1:5,]
 pnca_snps_or = pnca_snps_or[c(1:5),]
 #===============
 pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
 , stringsAsFactors = F
 , header = T) #2133
 pnca_snps_or = pnca_snps_or[1:5,]
 pnca_snps_or = pnca_snps_or[c(1:5),]
 pnca_snps_or = pnca_snps_or[1:5]
 pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
 , stringsAsFactors = F
 , header = T) #2133
 pnca_snps_or = pnca_snps_or[1:5]
 pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
 , stringsAsFactors = F
 , header = T) #2133
 foo = pnca_snps_or[c(1:5,)]
 foo = pnca_snps_or[c(1:5),]
 foo = as.data.frame(pnca_snps_or[c(1:5),])
 View(foo)
 # create an empty dataframe
 pnca_snps_or = as.data.frame(pnca_snps_or[c(1:5),])
 # IV: corresponds to each unique snp (extracted using grep)
 x = as.numeric(grepl(i,raw_data$all_muts_pza))
 # DV: pyrazinamide 0 or 1
 y = as.numeric(raw_data$pyrazinamide)
 table(y,x)
 # run glm model
 model = glm(y ~ x, family = binomial)
 #model = glm(y ~ x, family = binomial(link = "logit"))
 summary(model)
 my_logor = summary(model)$coefficients[2,1]
 print(paste0('Beta:', my_logor))
 # extract SE of the logistic model for a given snp
 my_se = summary(model)$coefficients[2,2]
 print(paste0('SE:', my_se))
 # extract Z of the logistic model for a given snp
 my_zval = summary(model)$coefficients[2,3]
 print(paste0('Z-value:', my_zval))
 # Dervive OR i.e exp(my_logor) from the logistic model for a given snp
 #my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
 my_or = exp(summary(model)$coefficients[2,1])
 print(paste0('OR:', my_or))
 # sanity check : should be True
 log(my_or) == my_logor
 # extract P-value of the logistic model for a given snp
 my_pval = summary(model)$coefficients[2,4]
 print(paste0('P-value:', my_pval))
 # extract confint interval of snp (2 steps, since the output is a named number)
 ci_mod = exp(confint(model))[2,]
 my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
 print(paste0('CI:', my_ci))
 #*************
 # Assign the regression output in the original df
 # you can use ('=' or '<-/->')
 #*************
 #pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
 my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
 my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
 my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
 my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
 my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
 my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
 #===============
 # Step 4: Iterate through this unique list
 # and calculate OR, but only for one snp
 # this is test before you apply it all others
 #===============
 pnca_snps_or$mutation == i
 View(pnca_snps_or)
 # create an empty dataframe
 pnca_snps_or = data.frame(mutation = i)
 my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
 my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
 my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
 my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
 my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
 my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
 View(pnca_snps_or_copy)
 #===============
 # Step 4: Iterate through this unique list
 # and calculate OR, but only for one snp
 # this is test before you apply it all others
 #===============
 #reset original df so you don't make a mistake
 pnca_snps_or = pnca_snps_or_copy
 for (i in pnca_snps_unique){
 print(i)
 }
 pnca_snps_or = pnca_snps_or_copy #2133, 1
 #........................................
 # create an empty dataframe : uncomment as necessary
 pnca_snps_or = data.frame(mutation = c(i, "blank_mut")
 #........................................
 # create an empty dataframe : uncomment as necessary
 pnca_snps_or = data.frame(mutation = c(i, "blank_mut"))
 #........................................
 # create an empty dataframe : uncomment as necessary
 pnca_snps_or = data.frame(mutation = c(i, "blank_mut"))
 View(pnca_snps_or)
 # IV: corresponds to each unique snp (extracted using grep)
 x = as.numeric(grepl(i,raw_data$all_muts_pza))
 # DV: pyrazinamide 0 or 1
 y = as.numeric(raw_data$pyrazinamide)
 table(y,x)
 # run glm model
 model = glm(y ~ x, family = binomial)
 #model = glm(y ~ x, family = binomial(link = "logit"))
 summary(model)
 #**********
 # extract relevant model output
 #**********
 # extract log OR i.e the Beta estimate of the logistic model for a given snp
 my_logor = summary(model)$coefficients[2,1]
 print(paste0('Beta:', my_logor))
 # extract SE of the logistic model for a given snp
 my_se = summary(model)$coefficients[2,2]
 print(paste0('SE:', my_se))
 # extract Z of the logistic model for a given snp
 my_zval = summary(model)$coefficients[2,3]
 print(paste0('Z-value:', my_zval))
 # Dervive OR i.e exp(my_logor) from the logistic model for a given snp
 #my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
 my_or = exp(summary(model)$coefficients[2,1])
 print(paste0('OR:', my_or))
 # sanity check : should be True
 log(my_or) == my_logor
 # extract P-value of the logistic model for a given snp
 my_pval = summary(model)$coefficients[2,4]
 print(paste0('P-value:', my_pval))
 # extract confint interval of snp (2 steps, since the output is a named number)
 ci_mod = exp(confint(model))[2,]
 my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
 print(paste0('CI:', my_ci))
 #*************
 # Assign the regression output in the original df
 # you can use ('=' or '<-/->')
 #*************
 #pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
 my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
 my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
 my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
 my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
 my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
 my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
 View(pnca_snps_or)
 pnca_snps_or = pnca_snps_or_copy #2133, 1
 for (i in pnca_snps_unique){
 print(i)
 #*************
 # start logistic regression model building
 #*************
 # set the IV and DV for the logistic regression model
 # IV: corresponds to each unique snp (extracted using grep)
 x = as.numeric(grepl(i,raw_data$all_muts_pza))
 # DV: pyrazinamide 0 or 1
 y = as.numeric(raw_data$pyrazinamide)
 table(y,x)
 # run glm model
 model = glm(y ~ x, family = binomial)
 #model = glm(y ~ x, family = binomial(link = "logit"))
 summary(model)
 #**********
 # extract relevant model output
 #**********
 # extract log OR i.e the Beta estimate of the logistic model for a given snp
 my_logor = summary(model)$coefficients[2,1]
 print(paste0('Beta:', my_logor))
 # extract SE of the logistic model for a given snp
 my_se = summary(model)$coefficients[2,2]
 print(paste0('SE:', my_se))
 # extract Z of the logistic model for a given snp
 my_zval = summary(model)$coefficients[2,3]
 print(paste0('Z-value:', my_zval))
 # Dervive OR i.e exp(my_logor) from the logistic model for a given snp
 #my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
 my_or = exp(summary(model)$coefficients[2,1])
 print(paste0('OR:', my_or))
 # sanity check : should be True
 log(my_or) == my_logor
 # extract P-value of the logistic model for a given snp
 my_pval = summary(model)$coefficients[2,4]
 print(paste0('P-value:', my_pval))
 # extract confint interval of snp (2 steps, since the output is a named number)
 ci_mod = exp(confint(model))[2,]
 my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
 print(paste0('CI:', my_ci))
 #*************
 # Assign the regression output in the original df
 # you can use ('=' or '<-/->')
 #*************
 #pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
 my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
 my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
 my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
 my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
 my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
 my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
 }
 warnings()
 View(pnca_snps_or)
 View(pnca_snps_or_copy)
 #sanity check
 pnca_snps_or$mutation == i1
 #sanity check
 pnca_snps_or[pnca_snps_or$mutation == i1]
 pnca_snps_or[pnca_snps_or$mutation == i2]
 pnca_snps_or[pnca_snps_or$mutation == i2,]
 pnca_snps_or1 = unique(pnca_snps_or)
 write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv")
 # you only need it for the unique mutations
 pnca_snps_or = unique(pnca_snps_or) #2133, 1
 for (i in pnca_snps_unique){
 print(i)
 #*************
 # start logistic regression model building
 #*************
 # set the IV and DV for the logistic regression model
 # IV: corresponds to each unique snp (extracted using grep)
 x = as.numeric(grepl(i,raw_data$all_muts_pza))
 # DV: pyrazinamide 0 or 1
 y = as.numeric(raw_data$pyrazinamide)
 table(y,x)
 # run glm model
 model = glm(y ~ x, family = binomial)
 #model = glm(y ~ x, family = binomial(link = "logit"))
 summary(model)
 #**********
 # extract relevant model output
 #**********
 # extract log OR i.e the Beta estimate of the logistic model for a given snp
 my_logor = summary(model)$coefficients[2,1]
 print(paste0('Beta:', my_logor))
 # extract SE of the logistic model for a given snp
 my_se = summary(model)$coefficients[2,2]
 print(paste0('SE:', my_se))
 # extract Z of the logistic model for a given snp
 my_zval = summary(model)$coefficients[2,3]
 print(paste0('Z-value:', my_zval))
 # Dervive OR i.e exp(my_logor) from the logistic model for a given snp
 #my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
 my_or = exp(summary(model)$coefficients[2,1])
 print(paste0('OR:', my_or))
 # sanity check : should be True
 log(my_or) == my_logor
 # extract P-value of the logistic model for a given snp
 my_pval = summary(model)$coefficients[2,4]
 print(paste0('P-value:', my_pval))
 # extract confint interval of snp (2 steps, since the output is a named number)
 ci_mod = exp(confint(model))[2,]
 my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
 print(paste0('CI:', my_ci))
 #*************
 # Assign the regression output in the original df
 # you can use ('=' or '<-/->')
 #*************
 #pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
 my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
 my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
 my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
 my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
 my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
 my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
 }
 View(pnca_snps_or)
 2.290256e+01
 1.561132e+06
 3.242285e-04
 #sanity check
 pnca_snps_or[pnca_snps_or$mutation == i1]
 pnca_snps_or[pnca_snps_or$mutation == i2,]
 write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv")
 my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
 , stringsAsFactors = FALSE) #11374, 19
 View(my_data)
 # remove the first column
 my_data = my_data[-1] #11374, 18
 # check if first col is 'id': should be TRUE
 colnames(my_data)[1] == 'id'
 # sanity check
 snps_all = unique(my_data$mutation)# 337
 pnca_snps_or = snps_all
 pnca_snps_or = as.data.frame(snps_all)
 View(pnca_snps_or)
 snps_all[-"true_wt"]
 pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
 View(pnca_snps_or)
 snps_all = as.data.frame(snps_all)
 View(snps_all)
 #remove true_wt entry
 w1 = which(rownames(snps_all) == "true_wt")
 View(snps_all)
 #remove true_wt entry
 w1 = which(snps_all$snps_all == "true_wt")
 rm(pnca_snps_or)
 pnca_snps_or = snps_all[-w1]
 pnca_snps_or = snps_all[,-w1]
 pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
 #remove true_wt entry
 w1 = which(snps_all) == "true_wt"
 pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
 my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
 , stringsAsFactors = FALSE) #11374, 19
 # remove the first column
 my_data = my_data[-1] #11374, 18
 # check if first col is 'id': should be TRUE
 colnames(my_data)[1] == 'id'
 # sanity check
 snps_all = unique(my_data$mutation)# 337
 snps_all = as.data.frame(snps_all)
 snps_all[-c(1,1)]
 pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
 pnca_snps_or = as.data.frame(snps_all[, -c(1,1)])
 #remove true_wt entry
 #w1 = which(snps_all) == "true_wt"
 pnca_snps_or = snps_all
 pnca_snps_or = pnca_snps_or_copy
 #remove true_wt entry
 #w1 = which(snps_all) == "true_wt"
 pnca_snps_or = snps_all
 pnca_snps_or -> pnca_snps_or_copy
 #===============
 # Step 4: Iterate through this unique list
 # and calculate OR for each snp
 # and assign to the pnca_snps_or df that has
 # each row as a unique snp
 #===============
 # reset original df so you don't make a mistake: IMPORTANT
 pnca_snps_or = pnca_snps_or_copy #2133, 1
 # you only need it for the unique mutations
 pnca_snps_or = unique(pnca_snps_or) #337, 1
 for (i in pnca_snps_unique){
 print(i)
 #*************
 # start logistic regression model building
 #*************
 # set the IV and DV for the logistic regression model
 # IV: corresponds to each unique snp (extracted using grep)
 x = as.numeric(grepl(i,raw_data$all_muts_pza))
 # DV: pyrazinamide 0 or 1
 y = as.numeric(raw_data$pyrazinamide)
 table(y,x)
 # run glm model
 model = glm(y ~ x, family = binomial)
 #model = glm(y ~ x, family = binomial(link = "logit"))
 summary(model)
 #**********
 # extract relevant model output
 #**********
 # extract log OR i.e the Beta estimate of the logistic model for a given snp
 my_logor = summary(model)$coefficients[2,1]
 print(paste0('Beta:', my_logor))
 # extract SE of the logistic model for a given snp
 my_se = summary(model)$coefficients[2,2]
 print(paste0('SE:', my_se))
 # extract Z of the logistic model for a given snp
 my_zval = summary(model)$coefficients[2,3]
 print(paste0('Z-value:', my_zval))
 # Dervive OR i.e exp(my_logor) from the logistic model for a given snp
 #my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
 my_or = exp(summary(model)$coefficients[2,1])
 print(paste0('OR:', my_or))
 # sanity check : should be True
 log(my_or) == my_logor
 # extract P-value of the logistic model for a given snp
 my_pval = summary(model)$coefficients[2,4]
 print(paste0('P-value:', my_pval))
 # extract confint interval of snp (2 steps, since the output is a named number)
 ci_mod = exp(confint(model))[2,]
 my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
 print(paste0('CI:', my_ci))
 #*************
 # Assign the regression output in the original df
 # you can use ('=' or '<-/->')
 #*************
 #pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
 my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
 my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
 my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
 my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
 my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
 my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
 }
 getwd()
 #setwd("~/Documents/git/LSHTM_Y1_PNCA/meta_data_analysis") # work
 setwd("~/git/LSHTM_Y1_PNCA/meta_data_analysis") # thinkpad
 #setwd("/Users/tanu/git/LSHTM_Y1_PNCA/meta_data_analysis") # mac
 getwd()
 #===============
 # Step 1: read raw data
 #===============
 raw_data<-read.csv("../Data_original/original_tanushree_data_v2.csv"
 ,stringsAsFactors = F)[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]#19265, 4
 raw_data<-raw_data[!is.na(raw_data$pyrazinamide),]#12511, 4
 # combine the two mutation columns
 raw_data$all_mutations_pyrazinamide<-paste(raw_data$dr_mutations_pyrazinamide, raw_data$other_mutations_pyrazinamide)#12511, 5
 head(raw_data$all_mutations_pyrazinamide)
 # create yet another column that contains all the mutations but in lower case
 raw_data$all_muts_pza = tolower(raw_data$all_mutations_pyrazinamide) #12511, 6
 table(grepl("pnca_p",raw_data$all_muts_pza))
 #FALSE  TRUE
 #10603  1908
 pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
 , stringsAsFactors = F
 , header = T) #2133
 # subset a snall section to test
 #pnca_snps_or_copy = pnca_snps_or
 #pnca_snps_or = pnca_snps_or_copy
 pnca_snps_unique = unique(pnca_snps_or$mutation) #293
 i2 = "pnca_p.trp68gly" # Should exist
 grep(i2, pnca_snps_unique)
 my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
 , stringsAsFactors = FALSE) #11374, 19
 # remove the first column
 my_data = my_data[-1] #11374, 18
 # check if first col is 'id': should be TRUE
 colnames(my_data)[1] == 'id'
 # sanity check
 head(my_data$mutation)
 my_data = unique(my_data$mutation)
 my_data[!duplicated(my_data$mutation)]
 my_data_unique = my_data[!duplicated(my_data$mutation),]
 my_data[!duplicated('mutation'),]
 my_data_unique = my_data[!duplicated(my_data[,'mutation']),]
 my_data_unique = my_data[!duplicated(my_data['mutation']),]
 getwd()
 setwd("/git/LSHTM_analysis/meta_data_analysis")
 getwd()
 getwd()
 setwd("/git/github/LSHTM_analysis/meta_data_analysis")
 getwd()
 #===============
 # Step 1: read GWAS raw data stored in Data_original/
 #===============
 infile = read.csv("../Data_original", file.choose(), stringsAsFactors = F))
 c = file.choose()
 c = file.choose(../Data_original)
 c = read.csv(file.choose(), stringsAsFactors = F)
 #===============
 # Step 1: read GWAS raw data stored in Data_original/
 #===============
 infile = read.csv(file.choose(), stringsAsFactors = F))
 c = read.csv(file.choose(), stringsAsFactors = F)
 #===============
 # Step 1: read GWAS raw data stored in Data_original/
 #===============
 infile = read.csv(file.choose(), stringsAsFactors = F)
 #===============
 # Step 1: read GWAS raw data stored in Data_original/
 #===============
 infile = read.csv(file.choose(), stringsAsFactors = F)
 raw_data = infile[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]
 outdir = paste0("../mcsm_analysis",drug,"/Data/")
 # define output variables
 drug  = 'pyrazinamide'
 outdir = paste0("../mcsm_analysis",drug,"/Data/")
 outdir = paste0("../mcsm_analysis/",drug,"/Data/")
 outFile = "meta_data_with_AFandOR.csv"
 output_filename = paste0(outdir, outFile)
 output_filename
--- a/meta_data_analysis/pycache/reference_dict.cpython-37.pyc
+++ b/meta_data_analysis/pycache/reference_dict.cpython-37.pyc
--- a/meta_data_analysis/init_data_dirs.py
+++ b/meta_data_analysis/init_data_dirs.py
@ -0,0 +1,7 @@
 #!/usr/bin/python3
 # Initialise a blank 'Data' directory and drug subdirs etc.
 # TODO:
 # - Read base dir from config file
 # - Create eg: '~/git/Data/{original,processed}
 # - Create eg: '~/git/Data/processed/' + drug (for each drug)
 # - Create eg: '~/git/Data/output/' + drug + '{plots, structure}'
--- a/meta_data_analysis/pnca_AF_and_OR_calcs.R
+++ b/meta_data_analysis/pnca_AF_and_OR_calcs.R
@ -0,0 +1,241 @@
 getwd()
 setwd("/git/github/git/LSHTM_analysis/meta_data_analysis")
 getwd()
 #===============
 # Step 1: read GWAS raw data stored in Data_original/
 #===============
 infile = read.csv(file.choose(), stringsAsFactors = F)
 raw_data = infile[,c("id"
                     , "pyrazinamide"
                     , "dr_mutations_pyrazinamide"
                     , "other_mutations_pyrazinamide")]
 #####
 # 1a: exclude na
 #####
 raw_data = raw_data[!is.na(raw_data$pyrazinamide),]
 total_samples = length(unique(raw_data$id))
 print(total_samples)
 # sanity check: should  be true
 is.numeric(total_samples) 
 #####
 # 1b: combine the two mutation columns
 #####
 raw_data$all_mutations_pyrazinamide = paste(raw_data$dr_mutations_pyrazinamide
                                            , raw_data$other_mutations_pyrazinamide)
 head(raw_data$all_mutations_pyrazinamide)
 #####
 # 1c: create yet another column that contains all the mutations but in lower case
 #####
 raw_data$all_muts_pnca = tolower(raw_data$all_mutations_pyrazinamide) 
 # sanity checks
 table(grepl("pnca_p",raw_data$all_muts_pnca))
 # sanity check: should be TRUE
 sum(table(grepl("pnca_p",raw_data$all_muts_pnca))) == total_samples
 # set up variables: can be used for logistic regression as well
 i  = "pnca_p.ala134gly" # has a NA, should NOT exist
 table(grepl(i,raw_data$all_muts_pnca))
 i = "pnca_p.trp68gly"
 table(grepl(i,raw_data$all_muts_pnca))
 mut = grepl(i,raw_data$all_muts_pnca)
 dst = raw_data$pyrazinamide
 table(mut, dst)
 #chisq.test(table(mut,dst))
 #fisher.test(table(mut, dst))
 #table(mut)
 ###### read list of muts to calculate OR for (fname3 from pnca_data_extraction.py)
 pnca_snps_or = read.csv(file.choose()
                        , stringsAsFactors = F
                        , header = T)
 # extract unique snps to iterate over for AF and OR calcs
 # total no of unique snps
 # AF and OR calculations
 pnca_snps_unique = unique(pnca_snps_or$mutation) 
 # Define OR function
 x = as.numeric(mut)
 y = dst
 # Odds ratio of a binary outcome y for a binary exposure x.
 #
 # x : logical or 0/1 vector (e.g. presence of a mutation per sample)
 # y : logical or 0/1 vector of the same length (e.g. drug resistance)
 #
 # Returns (a/b)/(c/d) from the 2x2 cross-tabulation of x and y, where
 #   a = tab[2,2], b = tab[2,1], c = tab[1,2], d = tab[1,1].
 # Any empty cell is replaced by 0.5 so the ratio stays finite.
 # Pairs with NA in either vector are dropped by table().
 or = function(x,y){
  tab = table(x,y)
  # When x or y has a single observed value (e.g. a SNP that matches no
  # sample) table() returns fewer than 2 rows/columns and tab[2,2] would be
  # out of bounds. Rebuild the table with both 0/1 levels fixed so the
  # missing cells appear as zero counts. This fallback assumes 0/1 or
  # logical coding of x and y.
  if (any(dim(tab) < 2L)){
   tab = table(factor(as.integer(x), levels = c(0L, 1L))
             , factor(as.integer(y), levels = c(0L, 1L)))
  }
  tab = as.matrix(tab)
  a = tab[2,2]
  b = tab[2,1]
  c = tab[1,2]
  d = tab[1,1]
  # replace empty cells by 0.5 to avoid division by zero / infinite OR
  if (a==0){ a = 0.5}
  if (b==0){ b = 0.5}
  if (c==0){ c = 0.5}
  if (d==0){ d = 0.5}
  (a/b)/(c/d)
  }
 dst = raw_data$pyrazinamide
 ors = sapply(pnca_snps_unique,function(m){
  mut = grepl(m,raw_data$all_muts_pnca)
  or(mut,dst)
 })
 ors
 pvals = sapply(pnca_snps_unique,function(m){
  mut = grepl(m,raw_data$all_muts_pnca)
  fisher.test(mut,dst)$p.value
 })
 pvals
 afs = sapply(pnca_snps_unique,function(m){
  mut = grepl(m,raw_data$all_muts_pnca)
  mean(mut)
 })
 afs
 # check ..hmmm
 afs['pnca_p.trp68gly']
 afs['pnca_p.gln10pro'] 
 afs['pnca_p.leu4ser'] 
 #plot(density(log(ors)))
 #plot(-log10(pvals))
 #hist(log(ors)
 #     ,breaks = 100
 #     )
 # subset df cols to add to the calc param df
 pnca_snps_cols = pnca_snps_or[c('mutation_info', 'mutation', 'Mutationinformation')] 
 pnca_snps_cols = pnca_snps_cols[!duplicated(pnca_snps_cols$mutation),]
 rownames(pnca_snps_cols) = pnca_snps_cols$mutation
 head(rownames(pnca_snps_cols))
 #snps_with_AF_and_OR
 # combine
 comb_AF_and_OR = data.frame(ors, pvals, afs)
 head(rownames(comb_AF_and_OR))
 # sanity checks: should be the same
 dim(comb_AF_and_OR); dim(pnca_snps_cols)
 table(rownames(comb_AF_and_OR)%in%rownames(pnca_snps_cols))
 table(rownames(pnca_snps_cols)%in%rownames(comb_AF_and_OR))
 # merge the above two df whose dim you checked
 snps_with_AF_and_OR = merge(comb_AF_and_OR, pnca_snps_cols
                            , by = "row.names"
 #                            , all.x = T
                            )
 #rm(pnca_snps_cols, pnca_snps_or, raw_data)
 #===============
 # Step 3: Read data file where you will add the calculated OR 
 # Note: this is the big file with one-many relationship between snps and lineages
 # i.e fname4 from 'pnca_extraction.py'
 #===============
 my_data = read.csv(file.choose()
                   , row.names = 1
                   , stringsAsFactors = FALSE)
 head(my_data)
 length(unique(my_data$id))
 # check if first col is 'id': should be TRUE
 colnames(my_data)[1] == 'id'
 # sanity check
 head(my_data$mutation)
 # FILES TO MERGE:
 # comb_AF_and_OR: file containing OR
 # my_data = big meta data file 
 # linking column: mutation
 head(my_data)
 merged_df = merge(my_data # big file
                  , snps_with_AF_and_OR # small (afor file)
                  , by = "mutation"
                  , all.x = T) # because you want all the entries of the meta data 
 # sanity checks: should be True 
 # FIXME: I have checked this manually, but make it so it is a pass or a fail!
 comb_AF_and_OR[rownames(comb_AF_and_OR) == "pnca_p.gln10pro",]$ors  
 merged_df[merged_df$Mutationinformation.x == "Q10P",]$ors
 merged_df[merged_df$Mutationinformation.x == "Q10P",]
 # sanity check: very important!
 colnames(merged_df)
 table(merged_df$mutation_info.x == merged_df$mutation_info.y)
 #FIXME: what happened to other 7 and FALSE
 table(merged_df$Mutationinformation.x == merged_df$Mutationinformation.y)
 # problem
 identical(merged_df$Mutationinformation.x, merged_df$Mutationinformation.y)
 #merged_df[merged_df$Mutationinformation.x != merged_df$Mutationinformation.y,]
 # throw away the .y copies because they come from the smaller df
 # d1/d2 are kept only as a record of where the columns were found
 d1 = which(colnames(merged_df) == "mutation_info.y") #21
 d2 = which(colnames(merged_df) == "Mutationinformation.y") #22
 # Drop by name with a logical mask rather than merged_df[-c(d1, d2)]:
 # if neither column is present, c(d1, d2) is integer(0) and negative
 # indexing with an empty vector would select ZERO columns instead of all.
 merged_df2 = merged_df[!(colnames(merged_df) %in% c("mutation_info.y"
                                                    , "Mutationinformation.y"))] #3093 20
 colnames(merged_df2)
 # rename cols 
 colnames(merged_df2)[colnames(merged_df2)== "mutation_info.x"] <- "mutation_info"
 colnames(merged_df2)[colnames(merged_df2)== "Mutationinformation.x"] <- "Mutationinformation"
 colnames(merged_df2)
 # should be 0
 sum(is.na(merged_df2$Mutationinformation))
 # count na in each column
 na_count = sapply(merged_df2, function(y) sum(length(which(is.na(y))))); na_count
 # only some or and Af should be NA
 #Row.names           ors               pvals               afs 
 #81                  81                81                  81 
 colnames(merged_df2)[colnames(merged_df2)== "ors"] <- "OR"
 colnames(merged_df2)[colnames(merged_df2)== "afs"] <- "AF"
 colnames(merged_df2)[colnames(merged_df2)== "pvals"] <- "pvalue"
 colnames(merged_df2)
 # add log OR and neglog pvalue
 merged_df2$logor = log(merged_df2$OR)
 is.numeric(merged_df2$logor)
 merged_df2$neglog10pvalue = -log10(merged_df2$pvalue) 
 is.numeric(merged_df2$neglog10pvalue)
 # write file out
 #write.csv(merged_df, "../Data/meta_data_with_AFandOR_JP_TT.csv")
 # define output variables
 drug  = 'pyrazinamide'
 out_dir = paste0("../mcsm_analysis/",drug,"/Data/")
 outFile = "meta_data_with_AFandOR.csv"
 # build the full path from out_dir (the variable defined above); the
 # previous code referenced 'outdir', which is never defined in this script
 output_filename = paste0(out_dir, outFile)
 # write the merged meta data (with AF, OR, p-value, logor, neglog10pvalue)
 # without the automatic row-name column
 write.csv(merged_df2, output_filename
          , row.names = FALSE)
--- a/meta_data_analysis/pnca_data_extraction.py
+++ b/meta_data_analysis/pnca_data_extraction.py
@ -0,0 +1,626 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 Created on Tue Aug  6 12:56:03 2019
@author: tanu
 """
 # FIXME: include error checking to ensure you only
 # concentrate on positions that have structural info?
 #%% load libraries
 ###################
 # load libraries
 import os, sys
 import pandas as pd
 #import numpy as np
 #from pandas.api.types import is_string_dtype
 #from pandas.api.types import is_numeric_dtype
 # to create dir
 #my_dir = os.path.expanduser('~/some_dir')
 #make sure mcsm_analysis/ exists
 #or specify the output directory
 #%%
 #%%
 #%%
 #========================================================
 # TASK: extract ALL pncA mutations from GWAS data
 #========================================================
 #%%
 ####################
 # my working dir
 os.getcwd()
 homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
 os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
 os.getcwd()
 #%%
 from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
 #%%
 #NOTE: out_dir MUST exist
 # User defined dir structure
 drug = 'pyrazinamide'
 gene = 'pnca'
 out_dir = homedir + '/git/LSHTM_analysis/mcsm_analysis/'
 # = out_dir + drug
 data_dir = homedir + '/git/Data'
 # Abort early when the input data directory is missing.
 if not os.path.exists(data_dir):
    print('Error!', data_dir, 'does not exist. Please ensure it exists and contains the appropriate raw data')
    # an empty directory is created for the user to populate before re-running
    os.makedirs(data_dir)
    # die() is not defined in Python (it raised NameError); exit explicitly
    # with a non-zero status so callers can detect the failure
    sys.exit(1)
 # Abort when the output directory is missing; otherwise carry on.
 if not os.path.exists(out_dir):
    print('Error!', out_dir, 'does not exist. Please create it')
    # sys.exit() instead of the interactive-only exit() helper, and with a
    # non-zero status because this is an error path
    sys.exit(1)
 #if not os.path.exists(work_dir):
 #    print('creating dir that does not exist', 'dir_name:', work_dir)
 #    os.makedirs(work_dir)
 else:
    print('Dir exists: Carrying on')
 # now create sub dir structure within work_dir
 # pyrazinamide/mcsm_analysis
 # we need three dir
 # Data
 # Scripts
    # Plotting
 # Results
    # Plots
 # create a list of dir names
 #dir_names = ['Data', 'Scripts', 'Results']
 #for i in dir_names:
 #    this_dir = (work_dir + '/' + i)
 #    if not os.path.exists(this_dir):
 #        print('creating dir that does not exist:', this_dir)
 #        os.makedirs(this_dir)
 #else:
 #    print('Dir exists: Carrying on')
 # Create sub dirs
 # 1)        
 # Scripts
    # Plotting
 #subdir_plotting = work_dir + '/Scripts/Plotting'
 #if not os.path.exists(subdir_plotting):
 #      print('creating dir that does not exist:', subdir_plotting)
 #      os.makedirs(subdir_plotting)
 #else:
 #    print('Dir exists: Carrying on')
 # 2)    
 # Results
    # Plots
 #subdir_plots = work_dir + '/Results/Plots'        
 #if not os.path.exists(subdir_plots):
 #        print('creating dir that does not exist:', subdir_plots)
 #        os.makedirs(subdir_plots)    
 #else:
 #    print('Dir exists: Carrying on')
 # clear varaibles
 #del(dir_names, drug, i, subdir_plots, subdir_plotting)
 #exit()
 #%%
 #==============================================================================
 ############
 # STEP 1: Read file original_tanushree_data_v2.csv
 ############
 data_file = data_dir + '/input/original/original_tanushree_data_v2.csv'
 meta_data = pd.read_csv(data_file, sep = ',') 
 # column names
 list(meta_data.columns)
 # extract the relevant columns from the meta data related to pyrazinamide
 meta_data = meta_data[['id'
       ,'country'
       ,'lineage'
       ,'sublineage'
       ,'drtype'
       , 'pyrazinamide'
       , 'dr_mutations_pyrazinamide'
       , 'other_mutations_pyrazinamide'
        ]] 
 # checks
 total_samples = meta_data['id'].nunique() # 19265
 # counts NA per column
 meta_data.isna().sum()
 # glance
 meta_data.head()
 # equivalent of table in R
 # pyrazinamide counts
 meta_data.pyrazinamide.value_counts() 
 #%%
 ############
 # STEP 2: extract entries containing selected genes: 
 # pyrazinamide: pnca_p.
 # in the dr_mutations and other mutations"
 # as we are interested in the mutations in the protein coding region only 
 # (corresponding to a structure)
 # and drop the entries with NA
 #############
 meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
 meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
 del(meta_pza)
 ##########################
 # pyrazinamide: pnca_p.
 ##########################
 meta_data_pnca = meta_data[['id'
       ,'country'
       ,'lineage'
       ,'sublineage'
       ,'drtype'
       , 'pyrazinamide'
       , 'dr_mutations_pyrazinamide'
       , 'other_mutations_pyrazinamide'
        ]] 
 del(meta_data)
 # sanity checks
 # dr_mutations only
 meta_pnca_dr = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
 meta_pnca_dr['id'].nunique() 
 del(meta_pnca_dr)
 # other mutations
 meta_pnca_other = meta_data_pnca.loc[meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
 meta_pnca_other['id'].nunique() 
 del(meta_pnca_other)
 # Now extract "all" mutations
 # na = False treats missing mutation strings as "no match" (consistent with
 # the dr/other sanity checks above) so the combined mask is strictly boolean
 meta_pnca_all = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False) | meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False) ]
 meta_pnca_all['id'].nunique() 
 pnca_samples = len(meta_pnca_all)
 pnca_na = meta_pnca_all['pyrazinamide'].isna().sum() 
 comp_pnca_samples = pnca_samples - pnca_na 
 #=#=#=#=#=#=#
 # COMMENT: use it later to check number of complete samples from LF data
 #=#=#=#=#=#=#
 # sanity checks
 meta_pnca_all.dr_mutations_pyrazinamide.value_counts()
 meta_pnca_all.other_mutations_pyrazinamide.value_counts()
 # more sanity checks 
 # !CAUTION!: muts will change depending on your gene
 # dr muts : insert
 meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Gln10Pro')] # 
 meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Phe106Leu')] # empty
 meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Val139Leu')]
 meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists #  rows
 m = meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists #  rows
 # other_muts
 meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains('pncA_p.Gln10Pro*')] # empty
 meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains('pncA_p.Phe106Leu')]
 #=#=#=#=#=#=#=#=#=#
 # FIXME
 # COMMENTS: both mutations columns are separated by ; 
 # CHECK if there are mutations that exist both in dr and other_muts!
 # doesn't make any sense for the same mut to exist in both, I would have thought!
 #=#=#=#=#=#=#=#=#=#
 # remove not required variables
 del(meta_data_pnca)
 ############
 # STEP 3: split the columns of 
 # a) dr_mutation_... (;) as 
 # the column has snps related to multiple genes.
 # useful links
 # https://stackoverflow.com/questions/17116814/pandas-how-do-i-split-text-in-a-column-into-multiple-rows
 # this one works beautifully
 # https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows
 ############
 # sanity check: counts NA per column afer subsetted df: i.e in meta_pza(with pncA_p. extracted mutations)
 meta_pnca_all.isna().sum()
 #=#=#=#=#=#=#=#=#=#
 # COMMENT: no NA's in dr_mutations/other_mutations_columns
 #=#=#=#=#=#=#=#=#=#
 # define the split function
 def tidy_split(df, column, sep='|', keep=False):
    """
    Expand a delimited column so that every split value gets its own row.

    Each cell of `column` is converted to a string and split on `sep`; the
    originating row is repeated once per resulting piece. Missing values are
    NOT dropped: they are stringified by `astype(str)` and kept as a single
    piece.

    Params
    ------
    df : pandas.DataFrame
        dataframe holding the column to split and expand
    column : str
        name of the column to split and expand
    sep : str
        delimiter used to split the column's values
    keep : bool
        when True, a cell that splits into more than one piece also
        contributes a row holding the unsplit value, placed before its pieces

    Returns
    -------
    pandas.DataFrame
        A copy with the same columns as `df`, one row per split value. Rows
        keep the index labels of the rows they were expanded from.
    """
    row_positions = []
    split_values = []
    # rows with a missing value could be removed up front with
    # df.dropna(subset=[column]) if a use case requires it
    for pos, cell in enumerate(df[column].astype(str)):
        pieces = cell.split(sep)
        if keep and len(pieces) > 1:
            # retain the unsplit value ahead of its individual pieces
            pieces = [cell] + pieces
        row_positions.extend([pos] * len(pieces))
        split_values.extend(pieces)
    expanded = df.iloc[row_positions, :].copy()
    expanded[column] = split_values
    return expanded
 ########
 # 3a: call tidy_split() on 'dr_mutations_pyrazinamide' column and remove leading white spaces
 #https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas
 ########    
 meta_pnca_WF0 = tidy_split(meta_pnca_all, 'dr_mutations_pyrazinamide', sep = ';') 
 # remove leading white space else these are counted as distinct mutations as well
 meta_pnca_WF0['dr_mutations_pyrazinamide'] = meta_pnca_WF0['dr_mutations_pyrazinamide'].str.lstrip() 
 ########
 # 3b: call function on 'other_mutations_pyrazinamide' column and remove leading white spaces
 ######## 
 meta_pnca_WF1 = tidy_split(meta_pnca_WF0, 'other_mutations_pyrazinamide', sep = ';') 
 # remove the leading white spaces in the column
 meta_pnca_WF1['other_mutations_pyrazinamide'] = meta_pnca_WF1['other_mutations_pyrazinamide'].str.strip() 
 ##########
 # Step 4: Reshape data so that all mutations are in one column and the 
 # annotations for the mutation reflect the column name
 # LINK: http://www.datasciencemadesimple.com/reshape-wide-long-pandas-python-melt-function/
 # data frame “df” is passed to melt() function
 # id_vars is the variable which need to be left unaltered
 # var_name are the column names so we named it as 'mutation_info'
 # value_name are its values so we named it as 'mutation'
 ##########
 meta_pnca_WF1.columns
 meta_pnca_LF0 = pd.melt(meta_pnca_WF1
                      , id_vars = ['id', 'country', 'lineage', 'sublineage', 'drtype', 'pyrazinamide']
                      , var_name = 'mutation_info'
                      , value_name = 'mutation') 
 # sanity check: should be true
 if len(meta_pnca_LF0) == len(meta_pnca_WF1)*2:
    print('sanity check passed: Long format df has the expected length')
 else:
    print("Sanity check failed: Debug please!")
 ###########
 # Step 5: This is still dirty data. Filter LF data so that you only have
 # mutations corresponding to pnca_p. 
 # this will be your list you run OR calcs 
 ###########
 meta_pnca_LF1 = meta_pnca_LF0[meta_pnca_LF0['mutation'].str.contains('pncA_p.*')] 
 # sanity checks
 # unique samples
 meta_pnca_LF1['id'].nunique()
 if len(meta_pnca_all) == meta_pnca_LF1['id'].nunique():
    print("Sanity check passed: No of samples with pncA mutations match")
 else:
    print("Sanity check failed: Debug please!")
 # count if all the mutations are indeed in the protein coding region 
 # i.e begin with pnca_p
 meta_pnca_LF1['mutation'].str.count('pncA_p.').sum() # 3093
 # should  be true.
 # and check against the length of the df, which should match
 if len(meta_pnca_LF1) == meta_pnca_LF1['mutation'].str.count('pncA_p.').sum():
    print("Sanity check passed: Long format data containing pnca mutations indeed correspond to pncA_p region")
 else:
    print("Sanity check failed: Debug please!")
 ###########
 # Step 6: Filter dataframe with "na" in the drug column
 # This is because for OR, you can't use the snps that have the
 # NA in the specified drug column
 # it creates problems when performing calcs in R inside the loop
 # so best to filter it here
 ###########
 # NOT NEEDED FOR all snps, only for extracting valid OR snps
 del (meta_pnca_WF0, meta_pnca_WF1, meta_pnca_LF0, meta_pnca_all)
 ###########
 # Step 7: count unique pncA_p mutations (all and comp cases)
 ###########
 meta_pnca_LF1['mutation'].nunique() 
 meta_pnca_LF1.groupby('mutation_info').nunique()
 meta_pnca_LF1['id'].nunique()  
 meta_pnca_LF1['mutation'].nunique() 
 meta_pnca_LF1.groupby('id').nunique()
 ###########
 # Step 8: convert all snps only (IN LOWERCASE)
 # because my mcsm file intergated has lowercase
 ###########
 # convert mutation to lower case as it needs to exactly match the dict key
 #meta_pnca_LF1['mutation'] = meta_pnca_LF1.mutation.str.lower() # WARNINGS: suggested to use .loc
 meta_pnca_LF1['mutation'] = meta_pnca_LF1.loc[:, 'mutation'].str.lower()
 ###########
 # Step 9 : Split 'mutation' column into three: wild_type, position and
 # mutant_type separately. Then map the three letter code to the one letter
 # code using the reference_dict imported already. The mutation is first
 # converted to lowercase so that it matches the entries in the dict
 ###########
 #=======
 # Step 9a: iterate through the dict and create a lookup dict, i.e.
 # lookup_dict = {three_letter_code: one_letter_code}.
 # The lookup dict key is the three-letter code and its value is the property
 # you want to create a column for.
 # Then use this to perform the mapping separately for the wild type and mutant cols.
 # The three-letter code is extracted from the dataframe using a regex match and
 # then converted to a 'pandas series', since map only works on a pandas series.
 #=======
 # Lookup dict {three_letter_code: one_letter_code} built from my_aa_dict.
 lookup_dict = {aa3: props['one_letter_code'] for aa3, props in my_aa_dict.items()}
 # Extract and map ONCE, after the lookup is complete (previously this was
 # repeated on every iteration of the dict loop).
 # expand=False makes str.extract return a Series for a single capture group,
 # which is what .map needs; .squeeze() would collapse a one-row frame to a
 # scalar and break the mapping.
 # wild type: the three characters after 'pnca_p' plus one more character
 wt = meta_pnca_LF1['mutation'].str.extract(r'pnca_p.(\w{3})', expand = False)
 meta_pnca_LF1['wild_type'] = wt.map(lookup_dict)
 # mutant: the three trailing characters that follow the digits
 mut = meta_pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$', expand = False)
 meta_pnca_LF1['mutant_type'] = mut.map(lookup_dict)
 # position: first run of digits in the mutation string (kept as a string)
 meta_pnca_LF1['position'] = meta_pnca_LF1['mutation'].str.extract(r'(\d+)', expand = False)
 # clear variables
 del(wt, mut, lookup_dict)
 #=========
 # Step 9b: iterate through the dict, create a lookup dict that i.e
 # lookup_dict =  {three_letter_code: aa_prop_water} 
 # Do this for both wild_type and mutant as above.
 #=========
 # Lookup dict {three_letter_code: aa_prop_water} built from my_aa_dict.
 lookup_dict = {aa3: props['aa_prop_water'] for aa3, props in my_aa_dict.items()}
 # Extract and map ONCE, after the lookup is complete (previously this was
 # repeated on every iteration of the dict loop).
 # expand=False makes str.extract return a Series for a single capture group,
 # which is what .map needs; .squeeze() would collapse a one-row frame to a
 # scalar and break the mapping.
 wt = meta_pnca_LF1['mutation'].str.extract(r'pnca_p.(\w{3})', expand = False)
 meta_pnca_LF1['wt_prop_water'] = wt.map(lookup_dict)
 mut = meta_pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$', expand = False)
 meta_pnca_LF1['mut_prop_water'] = mut.map(lookup_dict)
 # added two more cols: wt_prop_water and mut_prop_water
 # clear variables
 del(wt, mut, lookup_dict)
 #========
 # Step 9c: iterate through the dict, create a lookup dict that i.e
 # lookup_dict =  {three_letter_code: aa_prop_polarity} 
 # Do this for both wild_type and mutant as above.
 #=========
 # Lookup dict {three_letter_code: aa_prop_polarity} built from my_aa_dict.
 lookup_dict = {aa3: props['aa_prop_polarity'] for aa3, props in my_aa_dict.items()}
 # Extract and map ONCE, after the lookup is complete (previously this was
 # repeated on every iteration of the dict loop).
 # expand=False makes str.extract return a Series for a single capture group,
 # which is what .map needs; .squeeze() would collapse a one-row frame to a
 # scalar and break the mapping.
 wt = meta_pnca_LF1['mutation'].str.extract(r'pnca_p.(\w{3})', expand = False)
 meta_pnca_LF1['wt_prop_polarity'] = wt.map(lookup_dict)
 mut = meta_pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$', expand = False)
 meta_pnca_LF1['mut_prop_polarity'] = mut.map(lookup_dict)
 # added two more cols: wt_prop_polarity and mut_prop_polarity
 # clear variables
 del(wt, mut, lookup_dict)
 ########
 # Step 10: build 'Mutationinformation' (matches the mCSM output field) by
 # joining the wild_type + position + mutant_type columns.
 # position goes through .map(str) so it concatenates as a string
 #########
 meta_pnca_LF1['Mutationinformation'] = (
    meta_pnca_LF1['wild_type']
    + meta_pnca_LF1['position'].map(str)
    + meta_pnca_LF1['mutant_type']
 )
 #=#=#=#=#=#=#
 # Step 11:
 # COMMENT: there is more processing in the older version of this script
 # consult if necessary
 # possibly due to the presence of true_wt
 # since this file doesn't contain any true_wt, we won't need it(hopefully!)
 #=#=#=#=#=#=#
 #%%
 ###########
 # Step 12: Output files for only SNPs to run mCSM
 ###########
 #=========
 # Step 12a: all SNPs to run mCSM
 #=========
 # unique SNPs (one per row) and unique positions found in meta_pnca_LF1
 snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique())
 pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique())
 # assign meaningful colnames
 #snps_only.rename({0 : 'all_pnca_snps'}, axis = 1, inplace = True)
 #list(snps_only.columns)
 snps_only.isna().sum() # should be 0
 # output csv: all SNPS for mCSM analysis
 # specify variable name for output file
 gene = 'pnca'
 #drug = 'pyrazinamide'
 my_fname1 = '_snps_'
 nrows = len(snps_only)
 #output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
 #output_file_path = work_dir + '/Data/'
 output_file_path = data_dir + '/input/processed/' + drug + '/'
 # Create the output directory when it is missing and then CARRY ON: the
 # previous exit() right after makedirs stopped the script on its first run
 # before any file had been written.
 if not os.path.exists(output_file_path):
    print( output_file_path, 'does not exist. Creating')
    os.makedirs(output_file_path, exist_ok = True)
 # file name pattern: <gene>_snps_<number of unique snps>.csv
 output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
 print(output_filename) #<<<- check
 # write to csv: without column or row names
 # Bad practice: numbers at the start of a filename
 snps_only.to_csv(output_filename, header = False, index = False)
 #=========
 # Step 12b: all snps with annotation
 #=========
 # all snps, selected cols
 #pnca_snps_ALL = meta_pnca_LF1[['id','country','lineage', 'sublineage'
 #                               , 'drtype', 'pyrazinamide'
 #                               , 'mutation_info', 'mutation', 'Mutationinformation']] 
 #len(pnca_snps_ALL) 
 # sanity check
 #meta_pnca_LF1['mutation'].nunique() 
 # output csv: WITH column but WITHOUT row names(all snps with meta data)
 # specify variable name for output file
 #gene = 'pnca'
 #drug = 'pyrazinamide'
 #my_fname2 = '_snps_with_metadata_'
 #nrows = len(pnca_snps_ALL) 
 #output_file_path = work_dir + '/Data/'
 #output_filename = output_file_path + gene + my_fname2 + str(nrows) + '.csv'
 #print(output_filename)  #<<<- check
 # write out file
 #pnca_snps_ALL.to_csv(output_filename, header = True, index = False)
 #=========
 # Step 12c: comp snps for OR calcs with annotation
 #=========
 # Build the "complete" subset: rows of LF1 that have a pyrazinamide value.
 # NA count per column (inspection only)
 meta_pnca_LF1.isna().sum()
 # drop the rows whose pyrazinamide entry is NA
 meta_pnca_LF2 = meta_pnca_LF1.dropna(subset = ['pyrazinamide'])
 # sanity check, should be True: LF2 has exactly the non-NA rows of LF1
 n_missing_pza = meta_pnca_LF1['pyrazinamide'].isna().sum()
 len(meta_pnca_LF2) == len(meta_pnca_LF1) - n_missing_pza
 # unique counts
 meta_pnca_LF2.mutation.nunique()
 meta_pnca_LF2.groupby('mutation_info').nunique()
 # sanity check: distinct sample ids should equal comp_pnca_samples
 n_comp_ids = meta_pnca_LF2['id'].nunique()
 if n_comp_ids != comp_pnca_samples:
    print('Error: Please Debug!')
 else:
    print ('sanity check passed: complete numbers match')
 # value counts
 meta_pnca_LF2['mutation'].value_counts()
 #meta_pnca_LF2.groupby(['mutation_info', 'mutation']).size()
 # valid/comp snps: unique Mutationinformation values of the complete subset
 pnca_snps_COMP = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
 len(pnca_snps_COMP)
 # output csv: WITH column but WITHOUT row names (COMP snps with meta data)
 # file-name parts for that output; the write itself is disabled below,
 # uncomment as necessary
 gene = 'pnca'
 #drug = 'pyrazinamide'
 my_fname3 = '_comp_snps_with_metadata_'
 nrows = len(pnca_snps_COMP)
 #output_filename = output_file_path + gene + my_fname3 + str(nrows) + '.csv'
 #print(output_filename) #<<<-check
 # write out file
 #pnca_snps_COMP.to_csv(output_filename, header = True, index = False)
 #=========
 # Step 12d: comp snps only
 #=========
 # output csv: comp SNPS for info (i.e snps for which OR exist)
 # specify variable name for output file
 # unique SNPs of the complete subset (LF2), one per row
 snps_only = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
 nrows = len(snps_only)
 # file name pattern: <gene>_comp_snps_<number of unique snps>.csv
 gene = 'pnca'
 #drug = 'pyrazinamide'
 my_fname1 = '_comp_snps_'
 output_filename = ''.join([output_file_path, gene, my_fname1, str(nrows), '.csv'])
 print(output_filename) #<<<- check
 # csv is written without a header row and without the index column
 snps_only.to_csv(output_filename, index = False, header = False)
 #=#=#=#=#=#=#=#
 # COMMENT: LF1 is the file to extract all unique snps for mcsm 
 # but you have that already in file called pnca_snps...
 # LF2: is the file for extracting snps tested for DS and hence OR calcs
 #=#=#=#=#=#=#=#
 ###########
 # Step 13 : Output the whole df i.e 
 # file for meta_data which is now formatted with
 # each row as a unique snp rather than the original version where
 # each row is a unique id
 # ***** This is the file you will ADD the AF and OR calculations to *****
 ###########
 # output csv: the entire DF
 # specify variable name for output file
 # file name pattern: <gene>_metadata.csv (no row count in this name)
 gene = 'pnca'
 #drug = 'pyrazinamide'
 my_fname4 = '_metadata'
 #nrows = len(meta_pnca_LF1)
 output_filename = ''.join([output_file_path, gene, my_fname4, '.csv'])
 print(output_filename) #<<<-check
 # write the whole of meta_pnca_LF1 with to_csv defaults
 meta_pnca_LF1.to_csv(output_filename)
--- a/meta_data_analysis/reference_dict.py
+++ b/meta_data_analysis/reference_dict.py
@ -0,0 +1,121 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 Created on Tue Jun 18 11:32:28 2019
@author: tanushree
 """
 ############################################
 #load libraries
 import pandas as pd
 import os
 #############################################
 #!#########################!
 # REQUIREMENTS:
 # ~/git/Data/input/original/ must exist (the script changes into it)
 # containing GWAS data
 #!#########################!
 # report the starting directory, then switch to the input directory
 print(os.getcwd())
 homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
 input_dir = homedir + '/git/Data/input/original'
 os.chdir(input_dir)
 print(os.getcwd())
 #==========
 # read file
 #==========
 # aa_codes.csv is read relative to the directory entered above and is
 # indexed by the lower-case three-letter code, so that code becomes the
 # outer key of the dict built below
 # (author's shape notes: 20 x 6 as read, 20 x 5 after set_index)
 # alternative when the key is the first column:
 #my_aa = pd.read_csv('aa_codes.csv', index_col = 0)
 my_aa = pd.read_csv('aa_codes.csv').set_index('three_letter_code_lower')
 #=========================================================
 # convert file to dict of dicts
 #=========================================================
 # orient 'index' gives {index value: {column: value}}, i.e. a dict of dicts
 # keyed by the column chosen as index above
 # ('records' would give a list of dicts instead)
 my_aa_dict = my_aa.to_dict(orient = 'index') #20, with 5 subkeys
 #================================================
 #dict of aa with their corresponding properties
 #This is defined twice
 #================================================
 # Property tables: each key is a tuple of one-letter amino-acid codes and
 # each value is the label shared by every code in that tuple.
 #7 categories: no overlap (every code appears in exactly one tuple)
 qualities1 = { ('R', 'H', 'K'): 'Basic'
             , ('D', 'E'): 'Acidic'
             , ('N', 'Q'): 'Amidic'
             , ('G', 'A', 'V', 'L', 'I', 'P'): 'Hydrophobic'
             , ('S', 'T'): 'Hydroxylic'
             , ('F', 'W', 'Y'): 'Aromatic'
             , ('C', 'M'): 'Sulphur'
 }
 #9 categories: allowing for overlap (a code may appear in several tuples)
 # label spelling fixed: 'Acidc' -> 'Acidic', consistent with qualities1
 qualities2 = { ('R', 'H', 'K'): 'Basic'
             , ('D', 'E'): 'Acidic'
             , ('S', 'T', 'N', 'Q'): 'Polar'
             , ('V', 'I', 'L', 'M', 'F', 'Y', 'W'): 'Hydrophobic'
             , ('S', 'T', 'H', 'N', 'Q', 'E', 'D', 'K', 'R'): 'Hydrophilic'
             , ('S', 'G', 'A', 'P'): 'Small'
             , ('F', 'W', 'Y', 'H'): 'Aromatic'
             , ('V', 'I', 'L', 'M'): 'Aliphatic'
             , ('C', 'G', 'P'): 'Special'
 }
 # overlapping categories, 'Hydrophilic' entry disabled
 # label spelling fixed: 'Acidc' -> 'Acidic', consistent with qualities1
 qualities_taylor = { ('R', 'H', 'K'): 'Basic'
             , ('D', 'E'): 'Acidic'
             , ('S', 'T', 'N', 'Q', 'C', 'Y', 'W', 'H', 'K', 'R', 'D', 'E'): 'Polar'
             , ('V', 'I', 'L', 'M', 'F', 'Y', 'W', 'C', 'A', 'G', 'T', 'H'): 'Hydrophobic'
             #, ('S', 'T', 'H', 'N', 'Q', 'E', 'D', 'K', 'R'): 'Hydrophilic', #C, W, y MISSING FROM POLAR!
             , ('S', 'G', 'A', 'P', 'C', 'T', 'N', 'D', 'V'): 'Small' 
             , ('F', 'W', 'Y', 'H'): 'Aromatic'
             , ('V', 'I', 'L', 'M'): 'Aliphatic' #although M is not strictly in the circle!
             , ('C', 'G', 'P'): 'Special'
 }
 # two-way split: hydrophilic vs hydrophobic (the hydrophobic tuple also lists 'X')
 qualities_water = { ('D', 'E', 'N', 'P', 'Q', 'R', 'S'): 'hydrophilic'
                   , ('A', 'C', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'T', 'V', 'W', 'X', 'Y'): 'hydrophobic'
 }
 # four-way split by polarity
 qualities_polarity = { ('D', 'E'): 'acidic'
                      , ('H', 'K', 'R'): 'basic'
                      , ('C', 'G', 'N', 'Q', 'S', 'T', 'Y'): 'neutral'
                      , ('A', 'F', 'I', 'L', 'M', 'P', 'V', 'W'): 'non-polar'    
 }
 #==============================================================================                
 # Annotate every entry of my_aa_dict with its property labels.
 # For each table, every tuple key containing the entry's one-letter code
 # contributes its label.
 for k, v in my_aa_dict.items():
    # start every property empty: str for the concatenated labels,
    # list for the groupings that allow overlapping properties
    v['aa_prop1'] = ''
    v['aa_prop2'] = []
    v['aa_taylor'] = []
    v['aa_prop_water'] = ''
    v['aa_prop_polarity'] = ''
    one_letter = v['one_letter_code']
    # qualities1 -> string concatenation
    for group, label in qualities1.items():
        if one_letter in group:
            v['aa_prop1'] = v['aa_prop1'] + label
    # qualities2 -> one list item per matching group
    for group, label in qualities2.items():
        if one_letter in group:
            v['aa_prop2'] += [label]
    # qualities_taylor -> one list item per matching group
    for group, label in qualities_taylor.items():
        if one_letter in group:
            v['aa_taylor'] += [label]
    # qualities_water -> string concatenation
    for group, label in qualities_water.items():
        if one_letter in group:
            v['aa_prop_water'] = v['aa_prop_water'] + label
    # qualities_polarity -> string concatenation
    for group, label in qualities_polarity.items():
        if one_letter in group:
            v['aa_prop_polarity'] = v['aa_prop_polarity'] + label
 #COMMENT:VOILA!!! my_aa_dict is now a dict of dicts containing all associated properties for each aa
 #==============================================================================
--- a/mk-drug-dirs.sh
+++ b/mk-drug-dirs.sh
@ -0,0 +1,40 @@
 #!/bin/bash
 # Create a drug directory structure for processing
 #
 #
 # Structure:
 #
 # $DATA_DIR/$DRUG/input
 #                 |- original
 #                 |- processed
 #                 |- structure
 #                 
 # $DATA_DIR/$DRUG/output
 #                 |- plots
 #                 |- structure
 DATA_DIR=~/git/Data
 # A drug name is mandatory: show usage and fail (non-zero status) without it.
 if [[ -z "$1" ]]; then
    echo "usage: mk-drug-dirs.sh <drug name>" >&2
    exit 1
 fi
 DRUG="$1"
 echo "Creating structure for: $DRUG"
 # The data checkout must already exist; this script only adds to it.
 if [[ ! -d "$DATA_DIR" ]]; then
    echo "Error: $DATA_DIR does not exist. Did you check it out of git?" >&2
    exit 1
 fi
 echo "Doing creation in $DATA_DIR"
 # All expansions are quoted so drug names or paths with spaces stay one word;
 # stop at the first directory that cannot be created.
 for subdir in input/original input/processed input/structure output/plots output/structure; do
    mkdir -p "$DATA_DIR/$DRUG/$subdir" || exit 1
 done