# Print the current working directory, switch to the plotting scripts
# directory, then print it again to confirm the change.
# NOTE(review): the path below is specific to one machine layout, and the
# relative source() calls that follow are resolved against it.
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()

########################################################################
# 				Installing and loading required packages 			   #
########################################################################

# Header script; per the section title it installs/loads the packages
# used below -- TODO confirm against Header_TT.R.
source("../Header_TT.R")
#source("../barplot_colour_function.R")
#require(data.table)

########################################################################
#		 Read file: call script for combining df for PS			   	   #
########################################################################

# Runs the script that combines the two data frames; the rest of this
# file uses the merged_df* objects it leaves in the workspace.
source("../combining_two_df.R")

#---------------------- PAY ATTENTION
# the above changes the working dir
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION

#==========================
# This will return:

# df with NA for pyrazinamide:
# merged_df2
# merged_df3

# df without NA for pyrazinamide:
# merged_df2_comp
# merged_df3_comp
#===========================

###########################
# Data for plots
# You need merged_df2 or merged_df2_comp
# because this is a one-to-many relationship,
# i.e. the same SNP can belong to multiple lineages.
# Using the _comp dataset means
# we lose some muts, and at this level we should use
# as much info as available; hence use the df with NA.
###########################

# uncomment as necessary

#%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
# Working copy of the merged data (the version that keeps NA for
# pyrazinamide). Swap in the commented line to use the _comp version.
my_df <- merged_df2
#my_df <- merged_df2_comp
#%%%%%%%%%%%%%%%%%%%%%%%%%

# The merged inputs are not needed once the working copy exists.
rm(list = c("merged_df2", "merged_df2_comp", "merged_df3", "merged_df3_comp"))

# Quick look at the column names and structure of the working copy.
colnames(my_df)
str(my_df)

# The lineage column must be a factor for plotting:
# report its type, convert it, and report again.
is.factor(my_df$lineage)
my_df$lineage <- as.factor(my_df$lineage)
is.factor(my_df$lineage)

# Counts per mutation_info value, followed by the column's structure.
table(my_df$mutation_info)
str(my_df$mutation_info)

# Keep only the rows whose mutation_info is "dr_mutations_pyrazinamide",
# then tabulate to confirm the subset.
my_df_dr <- subset(my_df, mutation_info == "dr_mutations_pyrazinamide")
table(my_df_dr$mutation_info)

########################################################################
#               end of data extraction and cleaning for plots          #
########################################################################

#==========================
# This script is meant to be run twice.
# Leave exactly one of the assignments below uncommented:
#   1) all muts
#   2) dr muts only
#===========================
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT

# Option 1: ALL muts
#plot_df <- my_df

# Option 2: dr muts ONLY (currently active)
plot_df <- my_df_dr

#%%%%%%%%%%%%%%%%%%%%%%%%
#============================
# Plot: Lineage Distribution
# x = mcsm_values, y = dist
# fill = stability
#============================

# Counts per lineage in the data chosen for plotting, and the column's structure.
table(plot_df$lineage)
str(plot_df$lineage)

# Only lineages 1-4 are analysed.
sel_lineages <- c("lineage1", "lineage2", "lineage3", "lineage4")

# Keep the rows belonging to the selected lineages.
df_lin <- subset(plot_df, subset = lineage %in% sel_lineages)

# Re-factor so that levels no longer present in the subset are dropped.
df_lin$lineage <- factor(df_lin$lineage)

table(df_lin$lineage) #{RESULT: No of samples within lineage}
#lineage1 lineage2 lineage3 lineage4

length(unique(df_lin$Mutationinformation))
#{Result: No. of unique mutations the 4 lineages contribute to}

# Sanity check: the per-lineage counts in plot_df for the selected lineages
# must add up to the number of rows kept in df_lin.
# The cells are located by level name rather than by fixed position, so the
# check stays valid whichever input data set is used and whatever the
# order or number of lineage levels. r1 holds the matching positions.
lineage_counts <- table(plot_df$lineage)
r1 <- which(names(lineage_counts) %in% sel_lineages)
if (sum(lineage_counts[r1]) == nrow(df_lin)) {
  print("sanity check passed: numbers match")
} else {
  print("Error!: check your numbers")
}

#%%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT: carry on with the lineage subset under the name `df`
# and drop the intermediate object.
df <- df_lin
rm(df_lin)
#%%%%%%%%%%%%%%%%%%%%%%%%%%

# COMPARING DISTRIBUTIONS
# Peek at the lineage column, then store it as plain character strings.
head(df$lineage)
df$lineage <- as.character(df$lineage)

# ratioDUET values, one vector per lineage.
lin1 <- df$ratioDUET[df$lineage == "lineage1"]
lin2 <- df$ratioDUET[df$lineage == "lineage2"]
lin3 <- df$ratioDUET[df$lineage == "lineage3"]
lin4 <- df$ratioDUET[df$lineage == "lineage4"]

# Two-sample KS test for every pair of lineages.
# lineage1 against the others
ks.test(x = lin1, y = lin2)
ks.test(x = lin1, y = lin3)
ks.test(x = lin1, y = lin4)

# lineage2 against the remaining ones
ks.test(x = lin2, y = lin3)
ks.test(x = lin2, y = lin4)

# lineage3 against lineage4
ks.test(x = lin3, y = lin4)