graphs for PS lineage dist for all and dr muts

2020-01-22 10:12:09 +00:00 · 2020-01-22 10:12:09 +00:00 · 4f06e42ee4
commit 4f06e42ee4
parent ec37e3c1f6
4 changed files with 93 additions and 567 deletions
--- a/mcsm_analysis/pyrazinamide/scripts/.Rhistory
+++ b/mcsm_analysis/pyrazinamide/scripts/.Rhistory
@ -1,512 +1,7 @@
 ###########################
 # you need merged_df3
 # or
 # merged_df3_comp
 # since these have unique SNPs
 # I prefer to use the merged_df3
 # because using the _comp dataset means
 # we lose some muts and at this level, we should use
 # as much info as available
 ###########################
 # uncomment as necessary
 #%%%%%%%%%%%%%%%%%%%%%%%%
 # REASSIGNMENT
 my_df  = merged_df3
 #my_df = merged_df3_comp
 #%%%%%%%%%%%%%%%%%%%%%%%%
 # delete variables not required
 rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
 # quick checks
 colnames(my_df)
 str(my_df)
 ###########################
 # Data for bfactor figure
 # PS average
 # Lig average
 ###########################
 head(my_df$Position)
 head(my_df$ratioDUET)
 # order data frame
 df = my_df[order(my_df$Position),]
 head(df$Position)
 head(df$ratioDUET)
 #***********
 # PS: average by position
 #***********
 mean_DUET_by_position <- df %>%
 group_by(Position) %>%
 summarize(averaged.DUET = mean(ratioDUET))
 #***********
 # Lig: average by position
 #***********
 mean_Lig_by_position <- df %>%
 group_by(Position) %>%
 summarize(averaged.Lig = mean(ratioPredAff))
 #***********
 # cbind:mean_DUET_by_position and mean_Lig_by_position
 #***********
 combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
 # sanity check
 # mean_PS_Lig_Bfactor
 colnames(combined)
 colnames(combined) = c("Position"
 , "average_DUETR"
 , "Position2"
 , "average_PredAffR")
 colnames(combined)
 identical(combined$Position, combined$Position2)
 n = which(colnames(combined) == "Position2"); n
 combined_df = combined[,-n]
 max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
 max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
 #=============
 # output csv
 #============
 outDir = "~/Data/pyrazinamide/input/processed/"
 outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
 print(paste0("Output file with path will be:","", outFile))
 head(combined_df$Position); tail(combined_df$Position)
 write.csv(combined_df, outFile
 , row.names = F)
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
 getwd()
 ########################################################################
 # 				Installing and loading required packages 			   #
 ########################################################################
 source("../Header_TT.R")
 #source("barplot_colour_function.R")
 require(data.table)
 require(dplyr)
 ########################################################################
 #		 Read file: call script for combining df for PS		   	   #
 ########################################################################
 source("../combining_two_df.R")
-###########################
+source("../combining_two_df.R")
-# This will return:
+source("../combining_two_df.R")
 # df with NA:
 # merged_df2
 # merged_df3
 # df without NA:
 # merged_df2_comp
 # merged_df3_comp
 ###########################
 #---------------------- PAY ATTENTION
 # the above changes the working dir
 #[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
 #---------------------- PAY ATTENTION
 ###########################
 # you need merged_df3
 # or
 # merged_df3_comp
 # since these have unique SNPs
 # I prefer to use the merged_df3
 # because using the _comp dataset means
 # we lose some muts and at this level, we should use
 # as much info as available
 ###########################
 # uncomment as necessary
 #%%%%%%%%%%%%%%%%%%%%%%%%
 # REASSIGNMENT
 my_df  = merged_df3
 #my_df = merged_df3_comp
 #%%%%%%%%%%%%%%%%%%%%%%%%
 # delete variables not required
 rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
 # quick checks
 colnames(my_df)
 str(my_df)
 ###########################
 # Data for bfactor figure
 # PS average
 # Lig average
 ###########################
 head(my_df$Position)
 head(my_df$ratioDUET)
 # order data frame
 df = my_df[order(my_df$Position),]
 head(df$Position)
 head(df$ratioDUET)
 #***********
 # PS: average by position
 #***********
 mean_DUET_by_position <- df %>%
 group_by(Position) %>%
 summarize(averaged.DUET = mean(ratioDUET))
 #***********
 # Lig: average by position
 #***********
 mean_Lig_by_position <- df %>%
 group_by(Position) %>%
 summarize(averaged.Lig = mean(ratioPredAff))
 #***********
 # cbind:mean_DUET_by_position and mean_Lig_by_position
 #***********
 combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
 # sanity check
 # mean_PS_Lig_Bfactor
 colnames(combined)
 colnames(combined) = c("Position"
 , "average_DUETR"
 , "Position2"
 , "average_PredAffR")
 colnames(combined)
 identical(combined$Position, combined$Position2)
 n = which(colnames(combined) == "Position2"); n
 combined_df = combined[,-n]
 max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
 max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
 #=============
 # output csv
 #============
 outDir = "~/git/Data/pyrazinamide/input/processed/"
 outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
 print(paste0("Output file with path will be:","", outFile))
 head(combined_df$Position); tail(combined_df$Position)
 write.csv(combined_df, outFile
 , row.names = F)
 # read in pdb file complex1
 inDir = "~/git/Data/pyrazinamide/input/structure"
 inFile = paste0(inDir, "complex1_no_water.pdb")
 # read in pdb file complex1
 inDir = "~/git/Data/pyrazinamide/input/structure/"
 inFile = paste0(inDir, "complex1_no_water.pdb")
 complex1 = inFile
 my_pdb = read.pdb(complex1
 , maxlines = -1
 , multi = FALSE
 , rm.insert = FALSE
 , rm.alt = TRUE
 , ATOM.only = FALSE
 , hex = FALSE
 , verbose = TRUE)
 #########################
 #3: Read complex pdb file
 ##########################
 source("Header_TT.R")
 # list of 8
 my_pdb = read.pdb(complex1
 , maxlines = -1
 , multi = FALSE
 , rm.insert = FALSE
 , rm.alt = TRUE
 , ATOM.only = FALSE
 , hex = FALSE
 , verbose = TRUE)
 rm(inDir, inFile)
 #====== end of script
 inDir = "~/git/Data/pyrazinamide/input/structure/"
 inFile = paste0(inDir, "complex1_no_water.pdb")
 complex1 = inFile
 complex1 = inFile
 my_pdb = read.pdb(complex1
 , maxlines = -1
 , multi = FALSE
 , rm.insert = FALSE
 , rm.alt = TRUE
 , ATOM.only = FALSE
 , hex = FALSE
 , verbose = TRUE)
 inFile
 inDir = "~/git/Data/pyrazinamide/input/structure/"
 inFile = paste0(inDir, "complex1_no_water.pdb")
 complex1 = inFile
 #inFile2 = paste0(inDir, "complex2_no_water.pdb")
 #complex2 = inFile2
 # list of 8
 my_pdb = read.pdb(complex1
 , maxlines = -1
 , multi = FALSE
 , rm.insert = FALSE
 , rm.alt = TRUE
 , ATOM.only = FALSE
 , hex = FALSE
 , verbose = TRUE)
 rm(inDir, inFile, complex1)
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
 getwd()
 source("Header_TT.R")
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
 getwd()
 ########################################################################
 # 				Installing and loading required packages 			               #
 ########################################################################
 source("Header_TT.R")
 #########################################################
 # TASK: replace B-factors in the pdb file with normalised values
 # use the complex file with no water as mCSM lig was
 # performed on this file. You can check it in the script: read_pdb file.
 #########################################################
 ###########################
 # 2: Read file: average stability values
 # or mcsm_normalised file, output of step 4 mcsm pipeline
 ###########################
 inDir = "~/git/Data/pyrazinamide/input/processed/"
 inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
 my_df <- read.csv(inFile
 #                  , row.names = 1
 #                  , stringsAsFactors = F
 , header = T)
 str(my_df)
 source("read_pdb.R") # list of 8
 # extract atom list into a variable
 # since in the list this corresponds to data frame, variable will be a df
 d = my_pdb[[1]]
 # make a copy: required for downstream sanity checks
 d2 = d
 # sanity checks: B factor
 max(d$b); min(d$b)
 par(oma = c(3,2,3,0)
 , mar = c(1,3,5,2)
 , mfrow = c(3,2))
 #par(mfrow = c(3,2))
 #1: Original B-factor
 hist(d$b
 , xlab = ""
 , main = "B-factor")
 plot(density(d$b)
 , xlab = ""
 , main = "B-factor")
 # 2: DUET scores
 hist(my_df$average_DUETR
 , xlab = ""
 , main = "Norm_DUET")
 plot(density(my_df$average_DUETR)
 , xlab = ""
 , main = "Norm_DUET")
 # Set the margin on all sides
 par(oma = c(3,2,3,0)
 , mar = c(1,3,5,2)
 , mfrow = c(3,2))
 #par(mfrow = c(3,2))
 #1: Original B-factor
 hist(d$b
 , xlab = ""
 , main = "B-factor")
 plot(density(d$b)
 , xlab = ""
 , main = "B-factor")
 # 2: DUET scores
 hist(my_df$average_DUETR
 , xlab = ""
 , main = "Norm_DUET")
 plot(density(my_df$average_DUETR)
 , xlab = ""
 , main = "Norm_DUET")
 #=========
 # step 1_P1
 #=========
 # Be brave and replace in place now (don't run sanity check)
 # this makes all the B-factor values in the non-matched positions as NA
 d$b = my_df$average_DUETR[match(d$resno, my_df$Position)]
 #=========
 # step 2_P1
 #=========
 # count NA in Bfactor
 b_na = sum(is.na(d$b)) ; b_na
 # count number of 0's in Bactor
 sum(d$b == 0)
 # replace all NA in b factor with 0
 d$b[is.na(d$b)] = 0
 # sanity check: should be 0
 sum(is.na(d$b))
 # sanity check: should be True
 if (sum(d$b == 0) == b_na){
 print ("Sanity check passed: NA's replaced with 0's successfully")
 } else {
 print("Error: NA replacement NOT successful, Debug code!")
 }
 max(d$b); min(d$b)
 # sanity checks: should be True
 if(max(d$b) == max(my_df$average_DUETR)){
 print("Sanity check passed: B-factors replaced correctly")
 } else {
 print ("Error: Debug code please")
 }
 if (min(d$b) == min(my_df$average_DUETR)){
 print("Sanity check passed: B-factors replaced correctly")
 } else {
 print ("Error: Debug code please")
 }
 #=========
 # step 3_P1
 #=========
 # sanity check: dim should be same before reassignment
 # should be TRUE
 dim(d) == dim(d2)
 #=========
 # step 4_P1
 #=========
 # assign it back to the pdb file
 my_pdb[[1]] = d
 max(d$b); min(d$b)
 #=========
 # step 5_P1
 #=========
 # output dir
 getwd()
 outDir = "~/git/Data/pyrazinamide/output/"
 getwd()
 outFile = paste0(outDir, "complex1_BwithNormDUET.pdb")
 outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
 outDir = "~/git/Data/pyrazinamide/input/structure"
 outDir = "~/git/Data/pyrazinamide/input/structure/"
 outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
 write.pdb(my_pdb, outFile)
 hist(d$b
 , xlab = ""
 , main = "repalced-B")
 plot(density(d$b)
 , xlab = ""
 , main = "replaced-B")
 # graph titles
 mtext(text = "Frequency"
 , side = 2
 , line = 0
 , outer = TRUE)
 mtext(text = "DUET_stability"
 , side = 3
 , line = 0
 , outer = TRUE)
 #=========================================================
 # Processing P2: Replacing  B values with PredAff Scores
 #=========================================================
 # clear workspace
 rm(list = ls())
 #=========================================================
 # Processing P2: Replacing  B values with PredAff Scores
 #=========================================================
 # clear workspace
 rm(list = ls())
 ###########################
 # 2: Read file: average stability values
 # or mcsm_normalised file, output of step 4 mcsm pipeline
 ###########################
 inDir = "~/git/Data/pyrazinamide/input/processed/"
 inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
 my_df <- read.csv("../Data/mean_PS_Lig_Bfactor.csv"
 #                  , row.names = 1
 #                  , stringsAsFactors = F
 , header = T)
 str(my_df)
 #=========================================================
 # Processing P2: Replacing B factor with mean ratioLig scores
 #=========================================================
 #########################
 # 3: Read complex pdb file
 # form the R script
 ##########################
 source("read_pdb.R") # list of 8
 # extract atom list into a vari
 inDir = "~/git/Data/pyrazinamide/input/processed/"
 inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
 my_df <- read.csv(inFile
 #                  , row.names = 1
 #                  , stringsAsFactors = F
 , header = T)
 str(my_df)
 # extract atom list into a variable
 # since in the list this corresponds to data frame, variable will be a df
 d = my_pdb[[1]]
 # make a copy: required for downstream sanity checks
 d2 = d
 # sanity checks: B factor
 max(d$b); min(d$b)
 par(oma = c(3,2,3,0)
 , mar = c(1,3,5,2)
 , mfrow = c(3,2))
 #par(mfrow = c(3,2))
 # 1: Original B-factor
 hist(d$b
 , xlab = ""
 , main = "B-factor")
 plot(density(d$b)
 , xlab = ""
 , main = "B-factor")
 # 2: Pred Aff scores
 hist(my_df$average_PredAffR
 , xlab = ""
 , main = "Norm_lig_average")
 plot(density(my_df$average_PredAffR)
 , xlab = ""
 , main = "Norm_lig_average")
 # 3: After the following replacement
 #********************************
 par(oma = c(3,2,3,0)
 , mar = c(1,3,5,2)
 , mfrow = c(3,2))
 #par(mfrow = c(3,2))
 # 1: Original B-factor
 hist(d$b
 , xlab = ""
 , main = "B-factor")
 plot(density(d$b)
 , xlab = ""
 , main = "B-factor")
 # 2: Pred Aff scores
 hist(my_df$average_PredAffR
 , xlab = ""
 , main = "Norm_lig_average")
 plot(density(my_df$average_PredAffR)
 , xlab = ""
 , main = "Norm_lig_average")
 # 3: After the following replacement
 #********************************
 #=========
 # step 1_P2: BE BRAVE and replace in place now (don't run step 0)
 #=========
 # this makes all the B-factor values in the non-matched positions as NA
 d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)]
 #=========
 # step 2_P2
 #=========
 # count NA in Bfactor
 b_na = sum(is.na(d$b)) ; b_na
 # count number of 0's in Bactor
 sum(d$b == 0)
 # replace all NA in b factor with 0
 d$b[is.na(d$b)] = 0
 # sanity check: should be 0
 sum(is.na(d$b))
 if (sum(d$b == 0) == b_na){
 print ("Sanity check passed: NA's replaced with 0's successfully")
 } else {
 print("Error: NA replacement NOT successful, Debug code!")
 }
 max(d$b); min(d$b)
 # sanity checks: should be True
 if (max(d$b) == max(my_df$average_PredAffR)){
 print("Sanity check passed: B-factors replaced correctly")
 } else {
 print ("Error: Debug code please")
 }
 if (min(d$b) == min(my_df$average_PredAffR)){
 print("Sanity check passed: B-factors replaced correctly")
 } else {
 print ("Error: Debug code please")
 }
 #=========
 # step 3_P2
 #=========
 # sanity check: dim should be same before reassignment
 # should be TRUE
 dim(d) == dim(d2)
 #=========
 # step 4_P2
 #=========
 # assign it back to the pdb file
 my_pdb[[1]] = d
 max(d$b); min(d$b)
 #=========
 # step 5_P2
 #=========
 write.pdb(my_pdb, "Plotting/structure/complex1_BwithNormLIG.pdb")
 # output dir
 getwd()
 # output dir
 outDir = "~/git/Data/pyrazinamide/input/structure/"
 outFile = paste0(outDir, "complex1_BwithNormLIG.pdb")
 outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile
 write.pdb(my_pdb, outFile)
--- a/mcsm_analysis/pyrazinamide/scripts/Header_TT.R
+++ b/mcsm_analysis/pyrazinamide/scripts/Header_TT.R
@ -1,25 +1,31 @@
 #########################################################
 ### A) Installing and loading required packages
 #########################################################
 #lib_loc = "/usr/local/lib/R/site-library")
 #if (!require("gplots")) {
 #  install.packages("gplots", dependencies = TRUE)
 #  library(gplots)
 #}
-if (!require("tidyverse")) {
+#if (!require("tidyverse")) {
-  install.packages("tidyverse", dependencies = TRUE)
+#  install.packages("tidyverse", dependencies = TRUE)
-  library(tidyverse)
+#  library(tidyverse)
-}
+#}
 if (!require("ggplot2")) {
  install.packages("ggplot2", dependencies = TRUE)
  library(ggplot2)
 }
 if (!require("plotly")) {
  install.packages("plotly", dependencies = TRUE)
  library(plotly)
 }
 if (!require("cowplot")) {
  install.packages("copwplot", dependencies = TRUE)
-  library(ggplot2)
+  library(cowplot)
 }
 if (!require("ggcorrplot")) {
@ -43,37 +49,33 @@ if (!require ("GOplot")) {
 }
 if(!require("VennDiagram")) {
  install.packages("VennDiagram", dependencies = T)
  library(VennDiagram)
 }
 if(!require("scales")) {
  install.packages("scales", dependencies = T)
  library(scales)
 }
 if(!require("plotrix")) {
  install.packages("plotrix", dependencies = T)
  library(plotrix)
 }
 if(!require("stats")) {
  install.packages("stats", dependencies = T)
  library(stats)
 }
 if(!require("stats4")) {
  install.packages("stats4", dependencies = T)
  library(stats4)
 }
 if(!require("data.table")) {
-  library(stats4)
+install.packages("data.table")
  library(data.table)
 }
 if (!require("PerformanceAnalytics")){
@ -98,18 +100,17 @@ if (!require ("psych")){
 if (!require ("dplyr")){
  install.packages("dplyr")
-  library(psych)
+  library(dplyr)
 }
 if (!require ("compare")){
  install.packages("compare")
-  library(psych)
+  library(compare)
 }
 if (!require ("arsenal")){
  install.packages("arsenal")
-  library(psych)
+  library(arsenal)
 }
--- a/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R
+++ b/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R
@ -11,7 +11,7 @@ getwd()
 # 				Installing and loading required packages 			   #
 ########################################################################
-source("Header_TT.R")
+#source("Header_TT.R")
 #require(data.table)
 #require(arsenal)
 #require(compare)
@ -286,7 +286,7 @@ outDir = "~/git/Data/pyrazinamide/output/"
 getwd()
 outFile1 = paste0(outDir, "merged_df3.csv"); outFile1
-write.csv(merged_df3, outFile1)
+#write.csv(merged_df3, outFile1)
 #outFile2 = paste0(outDir, "merged_df3_comp.csv"); outFile2
 #write.csv(merged_df3_comp, outFile2)
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
@ -1,5 +1,5 @@
 getwd()
-setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") # thinkpad
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
 getwd()
 ########################################################################
@ -24,11 +24,11 @@ source("../combining_two_df.R")
 #==========================
 # This will return:
-# df with NA:
+# df with NA for pyrazinamide:
 # merged_df2
 # merged_df3
-# df without NA:
+# df without NA for pyrazinamide:
 # merged_df2_comp
 # merged_df3_comp
 #===========================
@ -38,14 +38,17 @@ source("../combining_two_df.R")
 # you need merged_df2 or merged_df2_comp
 # since this is one-many relationship 
 # i.e the same SNP can belong to multiple lineages
 # using the _comp dataset means
 # we lose some muts and at this level, we should use
 # as much info as available, hence use df with NA
 ###########################
 # uncomment as necessary
-#<<<<<<<<<<<<<<<<<<<<<<<<<
+#!!!!!!!!!!!!!!!!!!!!!!!
 # REASSIGNMENT
 my_df  = merged_df2
 #my_df  = merged_df2_comp
-#<<<<<<<<<<<<<<<<<<<<<<<<<
+#!!!!!!!!!!!!!!!!!!!!!!!
 # delete variables not required
 rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
@ -59,12 +62,39 @@ is.factor(my_df$lineage)
 my_df$lineage = as.factor(my_df$lineage)
 is.factor(my_df$lineage)
-table(my_df$mutation_info)
+table(my_df$mutation_info); str(my_df$mutation_info)
 # subset df with dr muts only
 my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") 
 ########################################################################
 #               end of data extraction and cleaning for plots          #
 ########################################################################
 #==========================
 # Data for plot: assign as 
 # necessary
 #===========================
 # uncomment as necessary
 #!!!!!!!!!!!!!!!!!!!!!!!
 # REASSIGNMENT
 #==================
 # data for ALL muts
 #==================
 plot_df = my_df  
 my_plot_name = 'lineage_dist_PS.svg'
 #my_plot_name = 'lineage_dist_PS_comp.svg'
 #=======================
 # data for dr_muts ONLY
 #=======================
 #plot_df = my_df_dr 
 #my_plot_name = 'lineage_dist_dr_PS.svg'
 #my_plot_name = 'lineage_dist_dr_PS_comp.svg'
 #!!!!!!!!!!!!!!!!!!!!!!!
 #==========================
 # Plot: Lineage Distribution
 # x = mcsm_values, y = dist
@ -74,6 +104,7 @@ table(my_df$mutation_info)
 #===================
 # Data for plots
 #===================
 table(plot_df$lineage); str(plot_df$lineage)
 # subset only lineages1-4
 sel_lineages = c("lineage1"
@ -82,34 +113,29 @@ sel_lineages = c("lineage1"
                 , "lineage4")
 # uncomment as necessary
-df_lin = subset(my_df, subset = lineage %in% sel_lineages )
+df_lin = subset(plot_df, subset = lineage %in% sel_lineages )
 # refactor
 df_lin$lineage = factor(df_lin$lineage)
 table(df_lin$lineage) #{RESULT: No of samples within lineage}
 #lineage1 lineage2 lineage3 lineage4 
 #104     1293      264     1311 
 # when merged_df2_comp is used
 #lineage1 lineage2 lineage3 lineage4 
 #99     1275      263     1255
 length(unique(df_lin$Mutationinformation))
 #{Result: No. of unique mutations the 4 lineages contribute to}
 # sanity checks
 r1 = 2:5 # when merged_df2 used: because there is missing lineages 
-if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) {
+if(sum(table(plot_df$lineage)[r1]) == nrow(df_lin)) {
  print ("sanity check passed: numbers match")
 } else{
  print("Error!: check your numbers")
 } 
-#<<<<<<<<<<<<<<<<<<<<<<<<<
+#!!!!!!!!!!!!!!!!!!!!!!!!! 
 # REASSIGNMENT
 df <- df_lin
-#<<<<<<<<<<<<<<<<<<<<<<<<<
+#!!!!!!!!!!!!!!!!!!!!!!!!!
 rm(df_lin)
@ -117,8 +143,8 @@ rm(df_lin)
 # generate distribution plot of lineages
 #******************
 # basic: could improve this!
-library(plotly)
+#library(plotly)
-library(ggridges)
+#library(ggridges)
 g <- ggplot(df, aes(x = ratioDUET)) + 
  geom_density(aes(fill = DUET_outcome)
@ -129,42 +155,44 @@ g <- ggplot(df, aes(x = ratioDUET)) +
 ggplotly(g)
 # 2 : ggridges (good!)
 my_ats = 15 # axis text size
 my_als = 20 # axis label size
-fooNames=c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
+my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
-names(fooNames)=c('lineage1', 'lineage2', 'lineage3', 'lineage4')
+names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
 # set output dir for plots
 getwd()
 setwd("~/git/Data/pyrazinamide/output/plots")
 getwd()
-svg('lineage_dist_PS.svg')
+# check plot name
 my_plot_name
-printFile = ggplot( df, aes(x = ratioDUET
+# output svg
-                            , y = DUET_outcome) )+
+svg(my_plot_name)
 printFile = ggplot(df, aes(x = ratioDUET
                            , y = DUET_outcome))+
  #printFile=geom_density_ridges_gradient(
-  geom_density_ridges_gradient( aes(fill = ..x..)
+  geom_density_ridges_gradient(aes(fill = ..x..)
                                , scale = 3
                                , size = 0.3 ) +
  facet_wrap( ~lineage
              , scales = "free"
 #             , switch = 'x'
-              , labeller = labeller(lineage = fooNames) ) +
+              , labeller = labeller(lineage = my_labels) ) +
  coord_cartesian( xlim = c(-1, 1)
 #                   , ylim = c(0, 6)
 #                   , clip = "off" 
-                ) +
+) +
-  scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4")
+  scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
                        , name = "DUET" ) + 
-  theme( axis.text.x = element_text( size = my_ats
+  theme(axis.text.x = element_text(size = my_ats
                                     , angle = 90
                                     , hjust = 1
                                     , vjust = 0.4)
-#         , axis.text.y = element_text( size = my_ats
+#                  , axis.text.y = element_text(size = my_ats
 #                                                , angle = 0
 #                                                , hjust = 1
 #                                                , vjust = 0)
@ -173,9 +201,9 @@ printFile = ggplot( df, aes(x = ratioDUET
         , axis.title.y = element_blank()
         , axis.ticks.y = element_blank()
         , plot.title = element_blank()
-         , strip.text = element_text(size=my_als)
+         , strip.text = element_text(size = my_als)
-         , legend.text = element_text(size=10)
+         , legend.text = element_text(size = 10)
-         , legend.title = element_text(size=my_als)
+         , legend.title = element_text(size = my_als)
 #                  , legend.position = c(0.3, 0.8)
 #                  , legend.key.height = unit(1, 'mm')
  ) 
@ -183,10 +211,12 @@ printFile = ggplot( df, aes(x = ratioDUET
 print(printFile)
 dev.off()
-#=!=!=!=!=!=!
+#=!=!=!=!=!=!=!
-# COMMENT: When you look at all mutations, the lineage differences disappear...
+# COMMENT: Not much differences in the distributions
 # when using merged_df2 or merged_df2_comp.
 # Also, the lineage differences disappear when looking at all muts
 # The pattern we are interested in is possibly only for dr_mutations
-#=!=!=!=!=!=!
+#=!=!=!=!=!=!=!
 #===================================================
 # COMPARING DISTRIBUTIONS