getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()

########################################################################
# 				Installing and loading required packages 			   #
########################################################################

source("../Header_TT.R")
#source("../barplot_colour_function.R")
#require(data.table)

########################################################################
#		 Read file: call script for combining df for PS			   	   #
########################################################################

source("../combining_two_df.R")

#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION

#==========================
# This will return:

# df with NA for pyrazinamide:
# merged_df2
# merged_df3

# df without NA for pyrazinamide:
# merged_df2_comp
# merged_df3_comp
#===========================

###########################
# Data for plots
# you need merged_df2 or merged_df2_comp
# since this is one-many relationship 
# i.e the same SNP can belong to multiple lineages
# using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available, hence use df with NA
###########################

# uncomment as necessary
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df  = merged_df2
#my_df  = merged_df2_comp
#%%%%%%%%%%%%%%%%%%%%%%%%

# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)

# quick checks
colnames(my_df)
str(my_df)

# Ensure correct data type in columns to plot: need to be factor
is.factor(my_df$lineage)
my_df$lineage = as.factor(my_df$lineage)
is.factor(my_df$lineage)

table(my_df$mutation_info); str(my_df$mutation_info)

# subset df with dr muts only
my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") 
table(my_df_dr$mutation_info)

########################################################################
#               end of data extraction and cleaning for plots          #
########################################################################

#==========================
# Run two times: 
# uncomment as necessary
# 1) for all muts
# 2) for dr_muts
#===========================

#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT

#================
# for ALL muts
#================
plot_df = my_df  
my_plot_name = 'lineage_dist_PS.svg'
#my_plot_name = 'lineage_dist_PS_comp.svg'

#================
# for dr muts ONLY
#================
#plot_df = my_df_dr 
#my_plot_name = 'lineage_dist_dr_PS.svg'
#my_plot_name = 'lineage_dist_dr_PS_comp.svg'
 
#%%%%%%%%%%%%%%%%%%%%%%%%

#==========================
# Plot: Lineage Distribution
# x = mcsm_values, y = dist
# fill = stability
#============================

#===================
# Data for plots
#===================
table(plot_df$lineage); str(plot_df$lineage)

# subset only lineages1-4
sel_lineages = c("lineage1"
                 , "lineage2"
                 , "lineage3"
                 , "lineage4")

# uncomment as necessary
df_lin = subset(plot_df, subset = lineage %in% sel_lineages )

# refactor
df_lin$lineage = factor(df_lin$lineage)

table(df_lin$lineage) #{RESULT: No of samples within lineage}
#lineage1 lineage2 lineage3 lineage4 

length(unique(df_lin$Mutationinformation))
#{Result: No. of unique mutations the 4 lineages contribute to}

# sanity checks
r1 = 2:5 # when merged_df2 used: because there is missing lineages 
if(sum(table(plot_df$lineage)[r1]) == nrow(df_lin)) {
  print ("sanity check passed: numbers match")
} else{
  print("Error!: check your numbers")
} 

#%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
df <- df_lin
#%%%%%%%%%%%%%%%%%%%%%%%%%

rm(df_lin)

#******************
# generate distribution plot of lineages
#******************
# basic: could improve this!
#library(plotly)
#library(ggridges)

g <- ggplot(df, aes(x = ratioDUET)) + 
  geom_density(aes(fill = DUET_outcome)
               , alpha = 0.5) + facet_wrap(~ lineage,
                                           scales = "free") +
  ggtitle("Kernel Density estimates of Protein stability by lineage")

ggplotly(g)

# 2 : ggridges (good!)
my_ats = 15 # axis text size
my_als = 20 # axis label size

my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')

# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()

# check plot name
my_plot_name

# output svg
svg(my_plot_name)
printFile = ggplot(df, aes(x = ratioDUET
                            , y = DUET_outcome))+
  
  #printFile=geom_density_ridges_gradient(
  geom_density_ridges_gradient(aes(fill = ..x..)
                                , scale = 3
                                , size = 0.3 ) +
  facet_wrap( ~lineage
              , scales = "free"
#             , switch = 'x'
              , labeller = labeller(lineage = my_labels) ) +
  coord_cartesian( xlim = c(-1, 1)
#                   , ylim = c(0, 6)
#                   , clip = "off" 
) +
  scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
                        , name = "DUET" ) + 
  theme(axis.text.x = element_text(size = my_ats
                                     , angle = 90
                                     , hjust = 1
                                     , vjust = 0.4)
#                  , axis.text.y = element_text(size = my_ats
#                                                , angle = 0
#                                                , hjust = 1
#                                                , vjust = 0)
         , axis.text.y = element_blank()
         , axis.title.x = element_blank()
         , axis.title.y = element_blank()
         , axis.ticks.y = element_blank()
         , plot.title = element_blank()
         , strip.text = element_text(size = my_als)
         , legend.text = element_text(size = 10)
         , legend.title = element_text(size = my_als)
#                  , legend.position = c(0.3, 0.8)
#                  , legend.key.height = unit(1, 'mm')
  ) 

print(printFile)
dev.off()

#=!=!=!=!=!=!=!
# COMMENT: Not much differences in the distributions
# when using merged_df2 or merged_df2_comp.
# Also, the lineage differences disappear when looking at all muts
# The pattern we are interested in is possibly only for dr_mutations
#=!=!=!=!=!=!=!
#===================================================

# COMPARING DISTRIBUTIONS: KS test 
# run: "../KS_test_PS.R"