graphs for PS lineage dist for all and dr muts

2020-01-22 10:12:09 +00:00 · 2020-01-22 10:12:09 +00:00 · 4de4549430
commit 4de4549430
parent 3c20be5615
4 changed files with 93 additions and 567 deletions
--- a/mcsm_analysis/pyrazinamide/scripts/.Rhistory
+++ b/mcsm_analysis/pyrazinamide/scripts/.Rhistory
@ -1,512 +1,7 @@
-###########################
-# you need merged_df3
-# or
-# merged_df3_comp
-# since these have unique SNPs
-# I prefer to use the merged_df3
-# because using the _comp dataset means
-# we lose some muts and at this level, we should use
-# as much info as available
-###########################
-# uncomment as necessary
-#%%%%%%%%%%%%%%%%%%%%%%%%
-# REASSIGNMENT
-my_df  = merged_df3
-#my_df = merged_df3_comp
-#%%%%%%%%%%%%%%%%%%%%%%%%
-# delete variables not required
-rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
-# quick checks
-colnames(my_df)
-str(my_df)
-###########################
-# Data for bfactor figure
-# PS average
-# Lig average
-###########################
-head(my_df$Position)
-head(my_df$ratioDUET)
-# order data frame
-df = my_df[order(my_df$Position),]
-head(df$Position)
-head(df$ratioDUET)
-#***********
-# PS: average by position
-#***********
-mean_DUET_by_position <- df %>%
-group_by(Position) %>%
-summarize(averaged.DUET = mean(ratioDUET))
-#***********
-# Lig: average by position
-#***********
-mean_Lig_by_position <- df %>%
-group_by(Position) %>%
-summarize(averaged.Lig = mean(ratioPredAff))
-#***********
-# cbind:mean_DUET_by_position and mean_Lig_by_position
-#***********
-combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
-# sanity check
-# mean_PS_Lig_Bfactor
-colnames(combined)
-colnames(combined) = c("Position"
-, "average_DUETR"
-, "Position2"
-, "average_PredAffR")
-colnames(combined)
-identical(combined$Position, combined$Position2)
-n = which(colnames(combined) == "Position2"); n
-combined_df = combined[,-n]
-max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
-max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
-#=============
-# output csv
-#============
-outDir = "~/Data/pyrazinamide/input/processed/"
-outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
-print(paste0("Output file with path will be:","", outFile))
-head(combined_df$Position); tail(combined_df$Position)
-write.csv(combined_df, outFile
-, row.names = F)
 getwd()
 setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
 getwd()
-########################################################################
-# 				Installing and loading required packages 			   #
-########################################################################
 source("../Header_TT.R")
-#source("barplot_colour_function.R")
-require(data.table)
-require(dplyr)
-########################################################################
-#		 Read file: call script for combining df for PS		   	   #
-########################################################################
 source("../combining_two_df.R")
-###########################
-# This will return:
-# df with NA:
-# merged_df2
-# merged_df3
-# df without NA:
-# merged_df2_comp
-# merged_df3_comp
-###########################
-#---------------------- PAY ATTENTION
-# the above changes the working dir
-#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
-#---------------------- PAY ATTENTION
-###########################
-# you need merged_df3
-# or
-# merged_df3_comp
-# since these have unique SNPs
-# I prefer to use the merged_df3
-# because using the _comp dataset means
-# we lose some muts and at this level, we should use
-# as much info as available
-###########################
-# uncomment as necessary
-#%%%%%%%%%%%%%%%%%%%%%%%%
-# REASSIGNMENT
-my_df  = merged_df3
-#my_df = merged_df3_comp
-#%%%%%%%%%%%%%%%%%%%%%%%%
-# delete variables not required
-rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
-# quick checks
-colnames(my_df)
-str(my_df)
-###########################
-# Data for bfactor figure
-# PS average
-# Lig average
-###########################
-head(my_df$Position)
-head(my_df$ratioDUET)
-# order data frame
-df = my_df[order(my_df$Position),]
-head(df$Position)
-head(df$ratioDUET)
-#***********
-# PS: average by position
-#***********
-mean_DUET_by_position <- df %>%
-group_by(Position) %>%
-summarize(averaged.DUET = mean(ratioDUET))
-#***********
-# Lig: average by position
-#***********
-mean_Lig_by_position <- df %>%
-group_by(Position) %>%
-summarize(averaged.Lig = mean(ratioPredAff))
-#***********
-# cbind:mean_DUET_by_position and mean_Lig_by_position
-#***********
-combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
-# sanity check
-# mean_PS_Lig_Bfactor
-colnames(combined)
-colnames(combined) = c("Position"
-, "average_DUETR"
-, "Position2"
-, "average_PredAffR")
-colnames(combined)
-identical(combined$Position, combined$Position2)
-n = which(colnames(combined) == "Position2"); n
-combined_df = combined[,-n]
-max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
-max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
-#=============
-# output csv
-#============
-outDir = "~/git/Data/pyrazinamide/input/processed/"
-outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
-print(paste0("Output file with path will be:","", outFile))
-head(combined_df$Position); tail(combined_df$Position)
-write.csv(combined_df, outFile
-, row.names = F)
-# read in pdb file complex1
-inDir = "~/git/Data/pyrazinamide/input/structure"
-inFile = paste0(inDir, "complex1_no_water.pdb")
-# read in pdb file complex1
-inDir = "~/git/Data/pyrazinamide/input/structure/"
-inFile = paste0(inDir, "complex1_no_water.pdb")
-complex1 = inFile
-my_pdb = read.pdb(complex1
-, maxlines = -1
-, multi = FALSE
-, rm.insert = FALSE
-, rm.alt = TRUE
-, ATOM.only = FALSE
-, hex = FALSE
-, verbose = TRUE)
-#########################
-#3: Read complex pdb file
-##########################
-source("Header_TT.R")
-# list of 8
-my_pdb = read.pdb(complex1
-, maxlines = -1
-, multi = FALSE
-, rm.insert = FALSE
-, rm.alt = TRUE
-, ATOM.only = FALSE
-, hex = FALSE
-, verbose = TRUE)
-rm(inDir, inFile)
-#====== end of script
-inDir = "~/git/Data/pyrazinamide/input/structure/"
-inFile = paste0(inDir, "complex1_no_water.pdb")
-complex1 = inFile
-complex1 = inFile
-my_pdb = read.pdb(complex1
-, maxlines = -1
-, multi = FALSE
-, rm.insert = FALSE
-, rm.alt = TRUE
-, ATOM.only = FALSE
-, hex = FALSE
-, verbose = TRUE)
-inFile
-inDir = "~/git/Data/pyrazinamide/input/structure/"
-inFile = paste0(inDir, "complex1_no_water.pdb")
-complex1 = inFile
-#inFile2 = paste0(inDir, "complex2_no_water.pdb")
-#complex2 = inFile2
-# list of 8
-my_pdb = read.pdb(complex1
-, maxlines = -1
-, multi = FALSE
-, rm.insert = FALSE
-, rm.alt = TRUE
-, ATOM.only = FALSE
-, hex = FALSE
-, verbose = TRUE)
-rm(inDir, inFile, complex1)
-getwd()
-setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
-getwd()
-source("Header_TT.R")
-getwd()
-setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
-getwd()
-########################################################################
-# 				Installing and loading required packages 			               #
-########################################################################
-source("Header_TT.R")
-#########################################################
-# TASK: replace B-factors in the pdb file with normalised values
-# use the complex file with no water as mCSM lig was
-# performed on this file. You can check it in the script: read_pdb file.
-#########################################################
-###########################
-# 2: Read file: average stability values
-# or mcsm_normalised file, output of step 4 mcsm pipeline
-###########################
-inDir = "~/git/Data/pyrazinamide/input/processed/"
-inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
-my_df <- read.csv(inFile
-#                  , row.names = 1
-#                  , stringsAsFactors = F
-, header = T)
-str(my_df)
-source("read_pdb.R") # list of 8
-# extract atom list into a variable
-# since in the list this corresponds to data frame, variable will be a df
-d = my_pdb[[1]]
-# make a copy: required for downstream sanity checks
-d2 = d
-# sanity checks: B factor
-max(d$b); min(d$b)
-par(oma = c(3,2,3,0)
-, mar = c(1,3,5,2)
-, mfrow = c(3,2))
-#par(mfrow = c(3,2))
-#1: Original B-factor
-hist(d$b
-, xlab = ""
-, main = "B-factor")
-plot(density(d$b)
-, xlab = ""
-, main = "B-factor")
-# 2: DUET scores
-hist(my_df$average_DUETR
-, xlab = ""
-, main = "Norm_DUET")
-plot(density(my_df$average_DUETR)
-, xlab = ""
-, main = "Norm_DUET")
-# Set the margin on all sides
-par(oma = c(3,2,3,0)
-, mar = c(1,3,5,2)
-, mfrow = c(3,2))
-#par(mfrow = c(3,2))
-#1: Original B-factor
-hist(d$b
-, xlab = ""
-, main = "B-factor")
-plot(density(d$b)
-, xlab = ""
-, main = "B-factor")
-# 2: DUET scores
-hist(my_df$average_DUETR
-, xlab = ""
-, main = "Norm_DUET")
-plot(density(my_df$average_DUETR)
-, xlab = ""
-, main = "Norm_DUET")
-#=========
-# step 1_P1
-#=========
-# Be brave and replace in place now (don't run sanity check)
-# this makes all the B-factor values in the non-matched positions as NA
-d$b = my_df$average_DUETR[match(d$resno, my_df$Position)]
-#=========
-# step 2_P1
-#=========
-# count NA in Bfactor
-b_na = sum(is.na(d$b)) ; b_na
-# count number of 0's in Bactor
-sum(d$b == 0)
-# replace all NA in b factor with 0
-d$b[is.na(d$b)] = 0
-# sanity check: should be 0
-sum(is.na(d$b))
-# sanity check: should be True
-if (sum(d$b == 0) == b_na){
-print ("Sanity check passed: NA's replaced with 0's successfully")
-} else {
-print("Error: NA replacement NOT successful, Debug code!")
-}
-max(d$b); min(d$b)
-# sanity checks: should be True
-if(max(d$b) == max(my_df$average_DUETR)){
-print("Sanity check passed: B-factors replaced correctly")
-} else {
-print ("Error: Debug code please")
-}
-if (min(d$b) == min(my_df$average_DUETR)){
-print("Sanity check passed: B-factors replaced correctly")
-} else {
-print ("Error: Debug code please")
-}
-#=========
-# step 3_P1
-#=========
-# sanity check: dim should be same before reassignment
-# should be TRUE
-dim(d) == dim(d2)
-#=========
-# step 4_P1
-#=========
-# assign it back to the pdb file
-my_pdb[[1]] = d
-max(d$b); min(d$b)
-#=========
-# step 5_P1
-#=========
-# output dir
-getwd()
-outDir = "~/git/Data/pyrazinamide/output/"
-getwd()
-outFile = paste0(outDir, "complex1_BwithNormDUET.pdb")
-outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
-outDir = "~/git/Data/pyrazinamide/input/structure"
-outDir = "~/git/Data/pyrazinamide/input/structure/"
-outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
-write.pdb(my_pdb, outFile)
-hist(d$b
-, xlab = ""
-, main = "repalced-B")
-plot(density(d$b)
-, xlab = ""
-, main = "replaced-B")
-# graph titles
-mtext(text = "Frequency"
-, side = 2
-, line = 0
-, outer = TRUE)
-mtext(text = "DUET_stability"
-, side = 3
-, line = 0
-, outer = TRUE)
-#=========================================================
-# Processing P2: Replacing  B values with PredAff Scores
-#=========================================================
-# clear workspace
-rm(list = ls())
-#=========================================================
-# Processing P2: Replacing  B values with PredAff Scores
-#=========================================================
-# clear workspace
-rm(list = ls())
-###########################
-# 2: Read file: average stability values
-# or mcsm_normalised file, output of step 4 mcsm pipeline
-###########################
-inDir = "~/git/Data/pyrazinamide/input/processed/"
-inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
-my_df <- read.csv("../Data/mean_PS_Lig_Bfactor.csv"
-#                  , row.names = 1
-#                  , stringsAsFactors = F
-, header = T)
-str(my_df)
-#=========================================================
-# Processing P2: Replacing B factor with mean ratioLig scores
-#=========================================================
-#########################
-# 3: Read complex pdb file
-# form the R script
-##########################
-source("read_pdb.R") # list of 8
-# extract atom list into a vari
-inDir = "~/git/Data/pyrazinamide/input/processed/"
-inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
-my_df <- read.csv(inFile
-#                  , row.names = 1
-#                  , stringsAsFactors = F
-, header = T)
-str(my_df)
-# extract atom list into a variable
-# since in the list this corresponds to data frame, variable will be a df
-d = my_pdb[[1]]
-# make a copy: required for downstream sanity checks
-d2 = d
-# sanity checks: B factor
-max(d$b); min(d$b)
-par(oma = c(3,2,3,0)
-, mar = c(1,3,5,2)
-, mfrow = c(3,2))
-#par(mfrow = c(3,2))
-# 1: Original B-factor
-hist(d$b
-, xlab = ""
-, main = "B-factor")
-plot(density(d$b)
-, xlab = ""
-, main = "B-factor")
-# 2: Pred Aff scores
-hist(my_df$average_PredAffR
-, xlab = ""
-, main = "Norm_lig_average")
-plot(density(my_df$average_PredAffR)
-, xlab = ""
-, main = "Norm_lig_average")
-# 3: After the following replacement
-#********************************
-par(oma = c(3,2,3,0)
-, mar = c(1,3,5,2)
-, mfrow = c(3,2))
-#par(mfrow = c(3,2))
-# 1: Original B-factor
-hist(d$b
-, xlab = ""
-, main = "B-factor")
-plot(density(d$b)
-, xlab = ""
-, main = "B-factor")
-# 2: Pred Aff scores
-hist(my_df$average_PredAffR
-, xlab = ""
-, main = "Norm_lig_average")
-plot(density(my_df$average_PredAffR)
-, xlab = ""
-, main = "Norm_lig_average")
-# 3: After the following replacement
-#********************************
-#=========
-# step 1_P2: BE BRAVE and replace in place now (don't run step 0)
-#=========
-# this makes all the B-factor values in the non-matched positions as NA
-d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)]
-#=========
-# step 2_P2
-#=========
-# count NA in Bfactor
-b_na = sum(is.na(d$b)) ; b_na
-# count number of 0's in Bactor
-sum(d$b == 0)
-# replace all NA in b factor with 0
-d$b[is.na(d$b)] = 0
-# sanity check: should be 0
-sum(is.na(d$b))
-if (sum(d$b == 0) == b_na){
-print ("Sanity check passed: NA's replaced with 0's successfully")
-} else {
-print("Error: NA replacement NOT successful, Debug code!")
-}
-max(d$b); min(d$b)
-# sanity checks: should be True
-if (max(d$b) == max(my_df$average_PredAffR)){
-print("Sanity check passed: B-factors replaced correctly")
-} else {
-print ("Error: Debug code please")
-}
-if (min(d$b) == min(my_df$average_PredAffR)){
-print("Sanity check passed: B-factors replaced correctly")
-} else {
-print ("Error: Debug code please")
-}
-#=========
-# step 3_P2
-#=========
-# sanity check: dim should be same before reassignment
-# should be TRUE
-dim(d) == dim(d2)
-#=========
-# step 4_P2
-#=========
-# assign it back to the pdb file
-my_pdb[[1]] = d
-max(d$b); min(d$b)
-#=========
-# step 5_P2
-#=========
-write.pdb(my_pdb, "Plotting/structure/complex1_BwithNormLIG.pdb")
-# output dir
-getwd()
-# output dir
-outDir = "~/git/Data/pyrazinamide/input/structure/"
-outFile = paste0(outDir, "complex1_BwithNormLIG.pdb")
-outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile
-write.pdb(my_pdb, outFile)
+source("../combining_two_df.R")
+source("../combining_two_df.R")
--- a/mcsm_analysis/pyrazinamide/scripts/Header_TT.R
+++ b/mcsm_analysis/pyrazinamide/scripts/Header_TT.R
@ -1,25 +1,31 @@
 #########################################################
 ### A) Installing and loading required packages
 #########################################################
+#lib_loc = "/usr/local/lib/R/site-library"

 #if (!require("gplots")) {
 #  install.packages("gplots", dependencies = TRUE)
 #  library(gplots)
 #}

-if (!require("tidyverse")) {
-  install.packages("tidyverse", dependencies = TRUE)
-  library(tidyverse)
-}
+#if (!require("tidyverse")) {
+#  install.packages("tidyverse", dependencies = TRUE)
+#  library(tidyverse)
+#}

 if (!require("ggplot2")) {
  install.packages("ggplot2", dependencies = TRUE)
  library(ggplot2)
 }

+if (!require("plotly")) {
+  install.packages("plotly", dependencies = TRUE)
+  library(plotly)
+}
+
 if (!require("cowplot")) {
-  install.packages("copwplot", dependencies = TRUE)
-  library(ggplot2)
+  install.packages("cowplot", dependencies = TRUE) # name must match the package being loaded
+  library(cowplot)
 }

 if (!require("ggcorrplot")) {
@ -43,37 +49,33 @@ if (!require ("GOplot")) {
 }

 if(!require("VennDiagram")) {
-  
  install.packages("VennDiagram", dependencies = T)
  library(VennDiagram)
 }

 if(!require("scales")) {
-  
  install.packages("scales", dependencies = T)
  library(scales)
 }

 if(!require("plotrix")) {
-  
  install.packages("plotrix", dependencies = T)
  library(plotrix)
 }

 if(!require("stats")) {
-  
  install.packages("stats", dependencies = T)
  library(stats)
 }

 if(!require("stats4")) {
-  
  install.packages("stats4", dependencies = T)
  library(stats4)
 }

 if(!require("data.table")) {
-  library(stats4)
+install.packages("data.table")
+  library(data.table)
 }

 if (!require("PerformanceAnalytics")){
@ -98,18 +100,17 @@ if (!require ("psych")){

 if (!require ("dplyr")){
  install.packages("dplyr")
-  library(psych)
+  library(dplyr)
 }

-
 if (!require ("compare")){
  install.packages("compare")
-  library(psych)
+  library(compare)
 }

 if (!require ("arsenal")){
  install.packages("arsenal")
-  library(psych)
+  library(arsenal)
 }


--- a/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R
+++ b/mcsm_analysis/pyrazinamide/scripts/combining_two_df.R
@ -11,7 +11,7 @@ getwd()
 # 				Installing and loading required packages 			   #
 ########################################################################

-source("Header_TT.R")
+#source("Header_TT.R")
 #require(data.table)
 #require(arsenal)
 #require(compare)
@ -286,7 +286,7 @@ outDir = "~/git/Data/pyrazinamide/output/"
 getwd()

 outFile1 = paste0(outDir, "merged_df3.csv"); outFile1
-write.csv(merged_df3, outFile1)
+#write.csv(merged_df3, outFile1)

 #outFile2 = paste0(outDir, "merged_df3_comp.csv"); outFile2
 #write.csv(merged_df3_comp, outFile2)
--- a/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
+++ b/mcsm_analysis/pyrazinamide/scripts/plotting/lineage_dist_PS.R
@ -1,5 +1,5 @@
 getwd()
-setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") # thinkpad
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
 getwd()

 ########################################################################
@ -24,11 +24,11 @@ source("../combining_two_df.R")
 #==========================
 # This will return:

-# df with NA:
+# df with NA for pyrazinamide:
 # merged_df2
 # merged_df3

-# df without NA:
+# df without NA for pyrazinamide:
 # merged_df2_comp
 # merged_df3_comp
 #===========================
@ -38,14 +38,17 @@ source("../combining_two_df.R")
 # you need merged_df2 or merged_df2_comp
 # since this is one-many relationship 
 # i.e the same SNP can belong to multiple lineages
+# using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available, hence use df with NA
 ###########################

 # uncomment as necessary
-#<<<<<<<<<<<<<<<<<<<<<<<<<
+#!!!!!!!!!!!!!!!!!!!!!!!
 # REASSIGNMENT
 my_df  = merged_df2
 #my_df  = merged_df2_comp
-#<<<<<<<<<<<<<<<<<<<<<<<<<
+#!!!!!!!!!!!!!!!!!!!!!!!

 # delete variables not required
 rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
@ -59,12 +62,39 @@ is.factor(my_df$lineage)
 my_df$lineage = as.factor(my_df$lineage)
 is.factor(my_df$lineage)

-table(my_df$mutation_info)
+table(my_df$mutation_info); str(my_df$mutation_info)
+
+# subset df with dr muts only
+my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") 

 ########################################################################
 #               end of data extraction and cleaning for plots          #
 ########################################################################

+#==========================
+# Data for plot: assign as 
+# necessary
+#===========================
+
+# uncomment as necessary
+#!!!!!!!!!!!!!!!!!!!!!!!
+# REASSIGNMENT
+
+#==================
+# data for ALL muts
+#==================
+plot_df = my_df  
+my_plot_name = 'lineage_dist_PS.svg'
+#my_plot_name = 'lineage_dist_PS_comp.svg'
+
+#=======================
+# data for dr_muts ONLY
+#=======================
+#plot_df = my_df_dr 
+#my_plot_name = 'lineage_dist_dr_PS.svg'
+#my_plot_name = 'lineage_dist_dr_PS_comp.svg'
+#!!!!!!!!!!!!!!!!!!!!!!!
+
 #==========================
 # Plot: Lineage Distribution
 # x = mcsm_values, y = dist
@ -74,6 +104,7 @@ table(my_df$mutation_info)
 #===================
 # Data for plots
 #===================
+table(plot_df$lineage); str(plot_df$lineage)

 # subset only lineages1-4
 sel_lineages = c("lineage1"
@ -82,34 +113,29 @@ sel_lineages = c("lineage1"
                 , "lineage4")

 # uncomment as necessary
-df_lin = subset(my_df, subset = lineage %in% sel_lineages )
+df_lin = subset(plot_df, subset = lineage %in% sel_lineages )

 # refactor
 df_lin$lineage = factor(df_lin$lineage)

 table(df_lin$lineage) #{RESULT: No of samples within lineage}
 #lineage1 lineage2 lineage3 lineage4 
-#104     1293      264     1311 
-
-# when merged_df2_comp is used
-#lineage1 lineage2 lineage3 lineage4 
-#99     1275      263     1255

 length(unique(df_lin$Mutationinformation))
 #{Result: No. of unique mutations the 4 lineages contribute to}

 # sanity checks
 r1 = 2:5 # when merged_df2 used: because there is missing lineages 
-if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) {
+if(sum(table(plot_df$lineage)[sel_lineages]) == nrow(df_lin)) { # select counts by lineage name, not by position
   print ("sanity check passed: numbers match")
 } else{
   print("Error!: check your numbers")
 } 

-#<<<<<<<<<<<<<<<<<<<<<<<<<
+#!!!!!!!!!!!!!!!!!!!!!!!!! 
 # REASSIGNMENT
 df <- df_lin
-#<<<<<<<<<<<<<<<<<<<<<<<<<
+#!!!!!!!!!!!!!!!!!!!!!!!!!

 rm(df_lin)

@ -117,8 +143,8 @@ rm(df_lin)
 # generate distribution plot of lineages
 #******************
 # basic: could improve this!
-library(plotly)
-library(ggridges)
+#library(plotly)
+#library(ggridges)

 g <- ggplot(df, aes(x = ratioDUET)) + 
  geom_density(aes(fill = DUET_outcome)
@ -129,20 +155,22 @@ g <- ggplot(df, aes(x = ratioDUET)) +
 ggplotly(g)

 # 2 : ggridges (good!)
-
 my_ats = 15 # axis text size
 my_als = 20 # axis label size

-fooNames=c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
-names(fooNames)=c('lineage1', 'lineage2', 'lineage3', 'lineage4')
+my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
+names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')

 # set output dir for plots
 getwd()
 setwd("~/git/Data/pyrazinamide/output/plots")
 getwd()

-svg('lineage_dist_PS.svg')
+# check plot name
+my_plot_name

+# output svg
+svg(my_plot_name)
 printFile = ggplot(df, aes(x = ratioDUET
                            , y = DUET_outcome))+
  
@ -153,7 +181,7 @@ printFile = ggplot( df, aes(x = ratioDUET
  facet_wrap( ~lineage
              , scales = "free"
 #             , switch = 'x'
-              , labeller = labeller(lineage = fooNames) ) +
+              , labeller = labeller(lineage = my_labels) ) +
  coord_cartesian( xlim = c(-1, 1)
 #                   , ylim = c(0, 6)
 #                   , clip = "off" 
@ -183,10 +211,12 @@ printFile = ggplot( df, aes(x = ratioDUET
 print(printFile)
 dev.off()

-#=!=!=!=!=!=!
-# COMMENT: When you look at all mutations, the lineage differences disappear...
+#=!=!=!=!=!=!=!
+# COMMENT: Not much difference in the distributions
+# when using merged_df2 or merged_df2_comp.
+# Also, the lineage differences disappear when looking at all muts
 # The pattern we are interested in is possibly only for dr_mutations
-#=!=!=!=!=!=!
+#=!=!=!=!=!=!=!
 #===================================================

 # COMPARING DISTRIBUTIONS