diff --git a/mcsm_analysis/pyrazinamide/scripts/.Rhistory b/mcsm_analysis/pyrazinamide/scripts/.Rhistory
index 8055405..0112d24 100644
--- a/mcsm_analysis/pyrazinamide/scripts/.Rhistory
+++ b/mcsm_analysis/pyrazinamide/scripts/.Rhistory
@@ -1,512 +1,7 @@
-###########################
-# you need merged_df3
-# or
-# merged_df3_comp
-# since these have unique SNPs
-# I prefer to use the merged_df3
-# because using the _comp dataset means
-# we lose some muts and at this level, we should use
-# as much info as available
-###########################
-# uncomment as necessary
-#%%%%%%%%%%%%%%%%%%%%%%%%
-# REASSIGNMENT
-my_df  = merged_df3
-#my_df = merged_df3_comp
-#%%%%%%%%%%%%%%%%%%%%%%%%
-# delete variables not required
-rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
-# quick checks
-colnames(my_df)
-str(my_df)
-###########################
-# Data for bfactor figure
-# PS average
-# Lig average
-###########################
-head(my_df$Position)
-head(my_df$ratioDUET)
-# order data frame
-df = my_df[order(my_df$Position),]
-head(df$Position)
-head(df$ratioDUET)
-#***********
-# PS: average by position
-#***********
-mean_DUET_by_position <- df %>%
-group_by(Position) %>%
-summarize(averaged.DUET = mean(ratioDUET))
-#***********
-# Lig: average by position
-#***********
-mean_Lig_by_position <- df %>%
-group_by(Position) %>%
-summarize(averaged.Lig = mean(ratioPredAff))
-#***********
-# cbind:mean_DUET_by_position and mean_Lig_by_position
-#***********
-combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
-# sanity check
-# mean_PS_Lig_Bfactor
-colnames(combined)
-colnames(combined) = c("Position"
-, "average_DUETR"
-, "Position2"
-, "average_PredAffR")
-colnames(combined)
-identical(combined$Position, combined$Position2)
-n = which(colnames(combined) == "Position2"); n
-combined_df = combined[,-n]
-max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
-max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
-#=============
-# output csv
-#============
-outDir = "~/Data/pyrazinamide/input/processed/"
-outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
-print(paste0("Output file with path will be:","", outFile))
-head(combined_df$Position); tail(combined_df$Position)
-write.csv(combined_df, outFile
-, row.names = F)
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
 getwd()
-########################################################################
-# 				Installing and loading required packages 			   #
-########################################################################
 source("../Header_TT.R")
-#source("barplot_colour_function.R")
-require(data.table)
-require(dplyr)
-########################################################################
-#		 Read file: call script for combining df for PS		   	   #
-########################################################################
 source("../combining_two_df.R")
-###########################
-# This will return:
-# df with NA:
-# merged_df2
-# merged_df3
-# df without NA:
-# merged_df2_comp
-# merged_df3_comp
-###########################
-#---------------------- PAY ATTENTION
-# the above changes the working dir
-#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
-#---------------------- PAY ATTENTION
-###########################
-# you need merged_df3
-# or
-# merged_df3_comp
-# since these have unique SNPs
-# I prefer to use the merged_df3
-# because using the _comp dataset means
-# we lose some muts and at this level, we should use
-# as much info as available
-###########################
-# uncomment as necessary
-#%%%%%%%%%%%%%%%%%%%%%%%%
-# REASSIGNMENT
-my_df  = merged_df3
-#my_df = merged_df3_comp
-#%%%%%%%%%%%%%%%%%%%%%%%%
-# delete variables not required
-rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
-# quick checks
-colnames(my_df)
-str(my_df)
-###########################
-# Data for bfactor figure
-# PS average
-# Lig average
-###########################
-head(my_df$Position)
-head(my_df$ratioDUET)
-# order data frame
-df = my_df[order(my_df$Position),]
-head(df$Position)
-head(df$ratioDUET)
-#***********
-# PS: average by position
-#***********
-mean_DUET_by_position <- df %>%
-group_by(Position) %>%
-summarize(averaged.DUET = mean(ratioDUET))
-#***********
-# Lig: average by position
-#***********
-mean_Lig_by_position <- df %>%
-group_by(Position) %>%
-summarize(averaged.Lig = mean(ratioPredAff))
-#***********
-# cbind:mean_DUET_by_position and mean_Lig_by_position
-#***********
-combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
-# sanity check
-# mean_PS_Lig_Bfactor
-colnames(combined)
-colnames(combined) = c("Position"
-, "average_DUETR"
-, "Position2"
-, "average_PredAffR")
-colnames(combined)
-identical(combined$Position, combined$Position2)
-n = which(colnames(combined) == "Position2"); n
-combined_df = combined[,-n]
-max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
-max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
-#=============
-# output csv
-#============
-outDir = "~/git/Data/pyrazinamide/input/processed/"
-outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
-print(paste0("Output file with path will be:","", outFile))
-head(combined_df$Position); tail(combined_df$Position)
-write.csv(combined_df, outFile
-, row.names = F)
-# read in pdb file complex1
-inDir = "~/git/Data/pyrazinamide/input/structure"
-inFile = paste0(inDir, "complex1_no_water.pdb")
-# read in pdb file complex1
-inDir = "~/git/Data/pyrazinamide/input/structure/"
-inFile = paste0(inDir, "complex1_no_water.pdb")
-complex1 = inFile
-my_pdb = read.pdb(complex1
-, maxlines = -1
-, multi = FALSE
-, rm.insert = FALSE
-, rm.alt = TRUE
-, ATOM.only = FALSE
-, hex = FALSE
-, verbose = TRUE)
-#########################
-#3: Read complex pdb file
-##########################
-source("Header_TT.R")
-# list of 8
-my_pdb = read.pdb(complex1
-, maxlines = -1
-, multi = FALSE
-, rm.insert = FALSE
-, rm.alt = TRUE
-, ATOM.only = FALSE
-, hex = FALSE
-, verbose = TRUE)
-rm(inDir, inFile)
-#====== end of script
-inDir = "~/git/Data/pyrazinamide/input/structure/"
-inFile = paste0(inDir, "complex1_no_water.pdb")
-complex1 = inFile
-complex1 = inFile
-my_pdb = read.pdb(complex1
-, maxlines = -1
-, multi = FALSE
-, rm.insert = FALSE
-, rm.alt = TRUE
-, ATOM.only = FALSE
-, hex = FALSE
-, verbose = TRUE)
-inFile
-inDir = "~/git/Data/pyrazinamide/input/structure/"
-inFile = paste0(inDir, "complex1_no_water.pdb")
-complex1 = inFile
-#inFile2 = paste0(inDir, "complex2_no_water.pdb")
-#complex2 = inFile2
-# list of 8
-my_pdb = read.pdb(complex1
-, maxlines = -1
-, multi = FALSE
-, rm.insert = FALSE
-, rm.alt = TRUE
-, ATOM.only = FALSE
-, hex = FALSE
-, verbose = TRUE)
-rm(inDir, inFile, complex1)
-getwd()
-setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
-getwd()
-source("Header_TT.R")
-getwd()
-setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
-getwd()
-########################################################################
-# 				Installing and loading required packages 			               #
-########################################################################
-source("Header_TT.R")
-#########################################################
-# TASK: replace B-factors in the pdb file with normalised values
-# use the complex file with no water as mCSM lig was
-# performed on this file. You can check it in the script: read_pdb file.
-#########################################################
-###########################
-# 2: Read file: average stability values
-# or mcsm_normalised file, output of step 4 mcsm pipeline
-###########################
-inDir = "~/git/Data/pyrazinamide/input/processed/"
-inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
-my_df <- read.csv(inFile
-#                  , row.names = 1
-#                  , stringsAsFactors = F
-, header = T)
-str(my_df)
-source("read_pdb.R") # list of 8
-# extract atom list into a variable
-# since in the list this corresponds to data frame, variable will be a df
-d = my_pdb[[1]]
-# make a copy: required for downstream sanity checks
-d2 = d
-# sanity checks: B factor
-max(d$b); min(d$b)
-par(oma = c(3,2,3,0)
-, mar = c(1,3,5,2)
-, mfrow = c(3,2))
-#par(mfrow = c(3,2))
-#1: Original B-factor
-hist(d$b
-, xlab = ""
-, main = "B-factor")
-plot(density(d$b)
-, xlab = ""
-, main = "B-factor")
-# 2: DUET scores
-hist(my_df$average_DUETR
-, xlab = ""
-, main = "Norm_DUET")
-plot(density(my_df$average_DUETR)
-, xlab = ""
-, main = "Norm_DUET")
-# Set the margin on all sides
-par(oma = c(3,2,3,0)
-, mar = c(1,3,5,2)
-, mfrow = c(3,2))
-#par(mfrow = c(3,2))
-#1: Original B-factor
-hist(d$b
-, xlab = ""
-, main = "B-factor")
-plot(density(d$b)
-, xlab = ""
-, main = "B-factor")
-# 2: DUET scores
-hist(my_df$average_DUETR
-, xlab = ""
-, main = "Norm_DUET")
-plot(density(my_df$average_DUETR)
-, xlab = ""
-, main = "Norm_DUET")
-#=========
-# step 1_P1
-#=========
-# Be brave and replace in place now (don't run sanity check)
-# this makes all the B-factor values in the non-matched positions as NA
-d$b = my_df$average_DUETR[match(d$resno, my_df$Position)]
-#=========
-# step 2_P1
-#=========
-# count NA in Bfactor
-b_na = sum(is.na(d$b)) ; b_na
-# count number of 0's in Bactor
-sum(d$b == 0)
-# replace all NA in b factor with 0
-d$b[is.na(d$b)] = 0
-# sanity check: should be 0
-sum(is.na(d$b))
-# sanity check: should be True
-if (sum(d$b == 0) == b_na){
-print ("Sanity check passed: NA's replaced with 0's successfully")
-} else {
-print("Error: NA replacement NOT successful, Debug code!")
-}
-max(d$b); min(d$b)
-# sanity checks: should be True
-if(max(d$b) == max(my_df$average_DUETR)){
-print("Sanity check passed: B-factors replaced correctly")
-} else {
-print ("Error: Debug code please")
-}
-if (min(d$b) == min(my_df$average_DUETR)){
-print("Sanity check passed: B-factors replaced correctly")
-} else {
-print ("Error: Debug code please")
-}
-#=========
-# step 3_P1
-#=========
-# sanity check: dim should be same before reassignment
-# should be TRUE
-dim(d) == dim(d2)
-#=========
-# step 4_P1
-#=========
-# assign it back to the pdb file
-my_pdb[[1]] = d
-max(d$b); min(d$b)
-#=========
-# step 5_P1
-#=========
-# output dir
-getwd()
-outDir = "~/git/Data/pyrazinamide/output/"
-getwd()
-outFile = paste0(outDir, "complex1_BwithNormDUET.pdb")
-outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
-outDir = "~/git/Data/pyrazinamide/input/structure"
-outDir = "~/git/Data/pyrazinamide/input/structure/"
-outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
-write.pdb(my_pdb, outFile)
-hist(d$b
-, xlab = ""
-, main = "repalced-B")
-plot(density(d$b)
-, xlab = ""
-, main = "replaced-B")
-# graph titles
-mtext(text = "Frequency"
-, side = 2
-, line = 0
-, outer = TRUE)
-mtext(text = "DUET_stability"
-, side = 3
-, line = 0
-, outer = TRUE)
-#=========================================================
-# Processing P2: Replacing  B values with PredAff Scores
-#=========================================================
-# clear workspace
-rm(list = ls())
-#=========================================================
-# Processing P2: Replacing  B values with PredAff Scores
-#=========================================================
-# clear workspace
-rm(list = ls())
-###########################
-# 2: Read file: average stability values
-# or mcsm_normalised file, output of step 4 mcsm pipeline
-###########################
-inDir = "~/git/Data/pyrazinamide/input/processed/"
-inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
-my_df <- read.csv("../Data/mean_PS_Lig_Bfactor.csv"
-#                  , row.names = 1
-#                  , stringsAsFactors = F
-, header = T)
-str(my_df)
-#=========================================================
-# Processing P2: Replacing B factor with mean ratioLig scores
-#=========================================================
-#########################
-# 3: Read complex pdb file
-# form the R script
-##########################
-source("read_pdb.R") # list of 8
-# extract atom list into a vari
-inDir = "~/git/Data/pyrazinamide/input/processed/"
-inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
-my_df <- read.csv(inFile
-#                  , row.names = 1
-#                  , stringsAsFactors = F
-, header = T)
-str(my_df)
-# extract atom list into a variable
-# since in the list this corresponds to data frame, variable will be a df
-d = my_pdb[[1]]
-# make a copy: required for downstream sanity checks
-d2 = d
-# sanity checks: B factor
-max(d$b); min(d$b)
-par(oma = c(3,2,3,0)
-, mar = c(1,3,5,2)
-, mfrow = c(3,2))
-#par(mfrow = c(3,2))
-# 1: Original B-factor
-hist(d$b
-, xlab = ""
-, main = "B-factor")
-plot(density(d$b)
-, xlab = ""
-, main = "B-factor")
-# 2: Pred Aff scores
-hist(my_df$average_PredAffR
-, xlab = ""
-, main = "Norm_lig_average")
-plot(density(my_df$average_PredAffR)
-, xlab = ""
-, main = "Norm_lig_average")
-# 3: After the following replacement
-#********************************
-par(oma = c(3,2,3,0)
-, mar = c(1,3,5,2)
-, mfrow = c(3,2))
-#par(mfrow = c(3,2))
-# 1: Original B-factor
-hist(d$b
-, xlab = ""
-, main = "B-factor")
-plot(density(d$b)
-, xlab = ""
-, main = "B-factor")
-# 2: Pred Aff scores
-hist(my_df$average_PredAffR
-, xlab = ""
-, main = "Norm_lig_average")
-plot(density(my_df$average_PredAffR)
-, xlab = ""
-, main = "Norm_lig_average")
-# 3: After the following replacement
-#********************************
-#=========
-# step 1_P2: BE BRAVE and replace in place now (don't run step 0)
-#=========
-# this makes all the B-factor values in the non-matched positions as NA
-d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)]
-#=========
-# step 2_P2
-#=========
-# count NA in Bfactor
-b_na = sum(is.na(d$b)) ; b_na
-# count number of 0's in Bactor
-sum(d$b == 0)
-# replace all NA in b factor with 0
-d$b[is.na(d$b)] = 0
-# sanity check: should be 0
-sum(is.na(d$b))
-if (sum(d$b == 0) == b_na){
-print ("Sanity check passed: NA's replaced with 0's successfully")
-} else {
-print("Error: NA replacement NOT successful, Debug code!")
-}
-max(d$b); min(d$b)
-# sanity checks: should be True
-if (max(d$b) == max(my_df$average_PredAffR)){
-print("Sanity check passed: B-factors replaced correctly")
-} else {
-print ("Error: Debug code please")
-}
-if (min(d$b) == min(my_df$average_PredAffR)){
-print("Sanity check passed: B-factors replaced correctly")
-} else {
-print ("Error: Debug code please")
-}
-#=========
-# step 3_P2
-#=========
-# sanity check: dim should be same before reassignment
-# should be TRUE
-dim(d) == dim(d2)
-#=========
-# step 4_P2
-#=========
-# assign it back to the pdb file
-my_pdb[[1]] = d
-max(d$b); min(d$b)
-#=========
-# step 5_P2
-#=========
-write.pdb(my_pdb, "Plotting/structure/complex1_BwithNormLIG.pdb")
-# output dir
-getwd()
-# output dir
-outDir = "~/git/Data/pyrazinamide/input/structure/"
-outFile = paste0(outDir, "complex1_BwithNormLIG.pdb")
-outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile
-write.pdb(my_pdb, outFile)
+source("../combining_two_df.R")
+source("../combining_two_df.R")
diff --git a/mcsm_analysis/pyrazinamide/scripts/Header_TT.R b/mcsm_analysis/pyrazinamide/scripts/Header_TT.R
index 2069fa7..9eae42a 100644
--- a/mcsm_analysis/pyrazinamide/scripts/Header_TT.R
+++ b/mcsm_analysis/pyrazinamide/scripts/Header_TT.R
@@ -1,25 +1,31 @@
 #########################################################
 ### A) Installing and loading required packages
 #########################################################
+#lib_loc = "/usr/local/lib/R/site-library")
 
 #if (!require("gplots")) {
 #  install.packages("gplots", dependencies = TRUE)
 #  library(gplots)
 #}
 
-if (!require("tidyverse")) {
-  install.packages("tidyverse", dependencies = TRUE)
-  library(tidyverse)
-}
+#if (!require("tidyverse")) {
+#  install.packages("tidyverse", dependencies = TRUE)
+#  library(tidyverse)
+#}
 
 if (!require("ggplot2")) {
   install.packages("ggplot2", dependencies = TRUE)
   library(ggplot2)
 }
 
+if (!require("plotly")) {
+  install.packages("plotly", dependencies = TRUE)
+  library(plotly)
+}
+
 if (!require("cowplot")) {
   install.packages("copwplot", dependencies = TRUE)
-  library(ggplot2)
+  library(cowplot)
 }
 
 if (!require("ggcorrplot")) {
@@ -43,37 +49,33 @@ if (!require ("GOplot")) {
 }
 
 if(!require("VennDiagram")) {
-  
   install.packages("VennDiagram", dependencies = T)
   library(VennDiagram)
 }
 
 if(!require("scales")) {
-  
   install.packages("scales", dependencies = T)
   library(scales)
 }
 
 if(!require("plotrix")) {
-  
   install.packages("plotrix", dependencies = T)
   library(plotrix)
 }
 
 if(!require("stats")) {
-  
   install.packages("stats", dependencies = T)
   library(stats)
 }
 
 if(!require("stats4")) {
-  
   install.packages("stats4", dependencies = T)
   library(stats4)
 }
 
 if(!require("data.table")) {
-  library(stats4)
+install.packages("data.table")
+  library(data.table)
 }
 
 if (!require("PerformanceAnalytics")){
@@ -98,18 +100,17 @@ if (!require ("psych")){
 
 if (!require ("dplyr")){
   install.packages("dplyr")
-  library(psych)
+  library(dplyr)
 }
 
-
 if (!require ("compare")){
   install.packages("compare")
-  library(psych)
+  library(compare)
 }
 
 if (!require ("arsenal")){
   install.packages("arsenal")
-  library(psych)
+  library(arsenal)
 }
 
 
diff --git a/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R b/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R
index 57e476a..d0e13cd 100644
--- a/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R
+++ b/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R
@@ -11,7 +11,7 @@ getwd()
 # 				Installing and loading required packages 			   #
 ########################################################################
 
-source("Header_TT.R")
+#source("Header_TT.R")
 #require(data.table)
 #require(arsenal)
 #require(compare)
@@ -286,7 +286,7 @@ outDir = "~/git/Data/pyrazinamide/output/"
 getwd()
 
 outFile1 = paste0(outDir, "merged_df3.csv"); outFile1
-write.csv(merged_df3, outFile1)
+#write.csv(merged_df3, outFile1)
 
 #outFile2 = paste0(outDir, "merged_df3_comp.csv"); outFile2
 #write.csv(merged_df3_comp, outFile2)
diff --git a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
index cc7e15a..4c54681 100644
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
@@ -1,5 +1,5 @@
 getwd()
-setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") # thinkpad
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
 getwd()
 
 ########################################################################
@@ -24,11 +24,11 @@ source("../combining_two_df.R")
 #==========================
 # This will return:
 
-# df with NA:
+# df with NA for pyrazinamide:
 # merged_df2
 # merged_df3
 
-# df without NA:
+# df without NA for pyrazinamide:
 # merged_df2_comp
 # merged_df3_comp
 #===========================
@@ -38,14 +38,17 @@ source("../combining_two_df.R")
 # you need merged_df2 or merged_df2_comp
 # since this is one-many relationship 
 # i.e the same SNP can belong to multiple lineages
+# using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available, hence use df with NA
 ###########################
 
 # uncomment as necessary
-#<<<<<<<<<<<<<<<<<<<<<<<<<
+#!!!!!!!!!!!!!!!!!!!!!!!
 # REASSIGNMENT
 my_df  = merged_df2
 #my_df  = merged_df2_comp
-#<<<<<<<<<<<<<<<<<<<<<<<<<
+#!!!!!!!!!!!!!!!!!!!!!!!
 
 # delete variables not required
 rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
@@ -59,12 +62,39 @@ is.factor(my_df$lineage)
 my_df$lineage = as.factor(my_df$lineage)
 is.factor(my_df$lineage)
 
-table(my_df$mutation_info)
+table(my_df$mutation_info); str(my_df$mutation_info)
+
+# subset df with dr muts only
+my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") 
 
 ########################################################################
 #               end of data extraction and cleaning for plots          #
 ########################################################################
 
+#==========================
+# Data for plot: assign as 
+# necessary
+#===========================
+
+# uncomment as necessary
+#!!!!!!!!!!!!!!!!!!!!!!!
+# REASSIGNMENT
+
+#==================
+# data for ALL muts
+#==================
+plot_df = my_df  
+my_plot_name = 'lineage_dist_PS.svg'
+#my_plot_name = 'lineage_dist_PS_comp.svg'
+
+#=======================
+# data for dr_muts ONLY
+#=======================
+#plot_df = my_df_dr 
+#my_plot_name = 'lineage_dist_dr_PS.svg'
+#my_plot_name = 'lineage_dist_dr_PS_comp.svg'
+#!!!!!!!!!!!!!!!!!!!!!!!
+
 #==========================
 # Plot: Lineage Distribution
 # x = mcsm_values, y = dist
@@ -74,6 +104,7 @@ table(my_df$mutation_info)
 #===================
 # Data for plots
 #===================
+table(plot_df$lineage); str(plot_df$lineage)
 
 # subset only lineages1-4
 sel_lineages = c("lineage1"
@@ -82,34 +113,29 @@ sel_lineages = c("lineage1"
                  , "lineage4")
 
 # uncomment as necessary
-df_lin = subset(my_df, subset = lineage %in% sel_lineages )
+df_lin = subset(plot_df, subset = lineage %in% sel_lineages )
 
 # refactor
 df_lin$lineage = factor(df_lin$lineage)
 
 table(df_lin$lineage) #{RESULT: No of samples within lineage}
 #lineage1 lineage2 lineage3 lineage4 
-#104     1293      264     1311 
-
-# when merged_df2_comp is used
-#lineage1 lineage2 lineage3 lineage4 
-#99     1275      263     1255
 
 length(unique(df_lin$Mutationinformation))
 #{Result: No. of unique mutations the 4 lineages contribute to}
 
 # sanity checks
 r1 = 2:5 # when merged_df2 used: because there is missing lineages 
-if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) {
+if(sum(table(plot_df$lineage)[r1]) == nrow(df_lin)) {
   print ("sanity check passed: numbers match")
 } else{
   print("Error!: check your numbers")
 } 
 
-#<<<<<<<<<<<<<<<<<<<<<<<<<
+#!!!!!!!!!!!!!!!!!!!!!!!!! 
 # REASSIGNMENT
 df <- df_lin
-#<<<<<<<<<<<<<<<<<<<<<<<<<
+#!!!!!!!!!!!!!!!!!!!!!!!!!
 
 rm(df_lin)
 
@@ -117,8 +143,8 @@ rm(df_lin)
 # generate distribution plot of lineages
 #******************
 # basic: could improve this!
-library(plotly)
-library(ggridges)
+#library(plotly)
+#library(ggridges)
 
 g <- ggplot(df, aes(x = ratioDUET)) + 
   geom_density(aes(fill = DUET_outcome)
@@ -129,64 +155,68 @@ g <- ggplot(df, aes(x = ratioDUET)) +
 ggplotly(g)
 
 # 2 : ggridges (good!)
-
 my_ats = 15 # axis text size
 my_als = 20 # axis label size
 
-fooNames=c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
-names(fooNames)=c('lineage1', 'lineage2', 'lineage3', 'lineage4')
+my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
+names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
 
 # set output dir for plots
 getwd()
 setwd("~/git/Data/pyrazinamide/output/plots")
 getwd()
 
-svg('lineage_dist_PS.svg')
+# check plot name
+my_plot_name
 
-printFile = ggplot( df, aes(x = ratioDUET
-                            , y = DUET_outcome) )+
+# output svg
+svg(my_plot_name)
+printFile = ggplot(df, aes(x = ratioDUET
+                            , y = DUET_outcome))+
   
   #printFile=geom_density_ridges_gradient(
-  geom_density_ridges_gradient( aes(fill = ..x..)
+  geom_density_ridges_gradient(aes(fill = ..x..)
                                 , scale = 3
                                 , size = 0.3 ) +
   facet_wrap( ~lineage
               , scales = "free"
 #             , switch = 'x'
-              , labeller = labeller(lineage = fooNames) ) +
+              , labeller = labeller(lineage = my_labels) ) +
   coord_cartesian( xlim = c(-1, 1)
-#                  , ylim = c(0, 6)
-#                  , clip = "off" 
-                ) +
-  scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4")
+#                   , ylim = c(0, 6)
+#                   , clip = "off" 
+) +
+  scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
                         , name = "DUET" ) + 
-  theme( axis.text.x = element_text( size = my_ats
+  theme(axis.text.x = element_text(size = my_ats
                                      , angle = 90
                                      , hjust = 1
                                      , vjust = 0.4)
-#         , axis.text.y = element_text( size = my_ats
-#                                       , angle = 0
-#                                       , hjust = 1
-#                                       , vjust = 0)
+#                  , axis.text.y = element_text(size = my_ats
+#                                                , angle = 0
+#                                                , hjust = 1
+#                                                , vjust = 0)
          , axis.text.y = element_blank()
          , axis.title.x = element_blank()
          , axis.title.y = element_blank()
          , axis.ticks.y = element_blank()
          , plot.title = element_blank()
-         , strip.text = element_text(size=my_als)
-         , legend.text = element_text(size=10)
-         , legend.title = element_text(size=my_als)
-#         , legend.position = c(0.3, 0.8)
-#         , legend.key.height = unit(1, 'mm')
-        ) 
+         , strip.text = element_text(size = my_als)
+         , legend.text = element_text(size = 10)
+         , legend.title = element_text(size = my_als)
+#                  , legend.position = c(0.3, 0.8)
+#                  , legend.key.height = unit(1, 'mm')
+  ) 
 
 print(printFile)
 dev.off()
 
-#=!=!=!=!=!=!
-# COMMENT: When you look at all mutations, the lineage differences disappear...
+#=!=!=!=!=!=!=!
+# COMMENT: Not much differences in the distributions
+# when using merged_df2 or merged_df2_comp.
+# Also, the lineage differences disappear when looking at all muts
 # The pattern we are interested in is possibly only for dr_mutations
-#=!=!=!=!=!=!
+#=!=!=!=!=!=!=!
 #===================================================
 
 # COMPARING DISTRIBUTIONS