#!/usr/bin/env Rscript
#########################################################
# TASK: KS test for PS/DUET lineage distributions
#=======================================================================
#=======================================================================
# working dir and loading libraries
getwd()
setwd("~/git/LSHTM_analysis/scripts/")
getwd()

#source("~/git/LSHTM_analysis/scripts/Header_TT.R")
#source("../barplot_colour_function.R")
#require(data.table)
source("plotting/combining_dfs_plotting.R")
# should return the following dfs, directories and variables

# PS combined: 
# 1) merged_df2
# 2) merged_df2_comp
# 3) merged_df3
# 4) merged_df3_comp

# LIG combined: 
# 5) merged_df2_lig
# 6) merged_df2_comp_lig
# 7) merged_df3_lig
# 8) merged_df3_comp_lig

# 9) my_df_u
# 10) my_df_u_lig

cat(paste0("Directories imported:"
           , "\ndatadir:", datadir
           , "\nindir:", indir
           , "\noutdir:", outdir
           , "\nplotdir:", plotdir))

cat(paste0("Variables imported:"
           , "\ndrug:", drug
           , "\ngene:", gene
           , "\ngene_match:", gene_match
           , "\nAngstrom symbol:", angstroms_symbol
           , "\nNo. of duplicated muts:", dup_muts_nu
           , "\nNA count for ORs:", na_count
           , "\nNA count in df2:", na_count_df2
           , "\nNA count in df3:", na_count_df3))       


#=============
# Output
#=============
# ks test by lineage
ks_dr_others_lineage = paste0(outdir, "/KS_lineage_dr_vs_others.csv")

###########################
# Data for  stats
# you need merged_df2 or merged_df2_comp
# since this is one-many relationship 
# i.e the same SNP can belong to multiple lineages
# using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available, hence use df with NA
###########################

# REASSIGNMENT
my_df  = merged_df2

# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
rm(merged_df2_lig, merged_df2_comp_lig, merged_df3_lig, merged_df3_comp_lig, my_df_u, my_df_u_lig)

# quick checks
colnames(my_df)
str(my_df)

# Ensure correct data type in columns to plot: need to be factor
#is.factor(my_df$lineage)
#my_df$lineage = as.factor(my_df$lineage)
#is.factor(my_df$lineage)
########################################################################
table(my_df$lineage); str(my_df$lineage)

# subset only lineages1-4
sel_lineages = c("lineage1"
                 , "lineage2"
                 , "lineage3"
                 , "lineage4")

# subset selected lineages
df_lin = subset(my_df, subset = lineage %in% sel_lineages)

#==============
# dr_muts_col
#==============
table(df_lin$mutation_info); str(df_lin$mutation_info)

# subset df with dr muts only
df_lin_dr = subset(df_lin, mutation_info == dr_muts_col) 
table(df_lin_dr$mutation_info)

#==============
# other_muts_col
#==============
df_lin_other = subset(df_lin, mutation_info == other_muts_col) 
table(df_lin_other$mutation_info)

#=======================================================================
# individual: CHECKS
ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == "lineage1"]
        , df_lin_other$duet_scaled[df_lin_other$lineage == "lineage1"])

#=======================================================================
my_lineages = levels(factor(df_lin$lineage)); my_lineages

ks_df = data.frame()

for (i in my_lineages){
   cat(i)
   df = data.frame(lineage = NA, method = NA, ks_statistic = NA, ks_pvalue = NA)
   
   lineage_comp = i
   ks_method = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == i]
                       , df_lin_other$duet_scaled[df_lin_other$lineage == i])$method
   
   ks_statistic = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == i]
                          , df_lin_other$duet_scaled[df_lin_other$lineage == i])$statistic
   
   ks_pvalue = ks.test(df_lin_dr$duet_scaled[df_lin_dr$lineage == i]
                       , df_lin_other$duet_scaled[df_lin_other$lineage == i])$p.value
   
   # print(c(lineage_comp, ks_method, ks_statistic[[1]], ks_pval))
   df$lineage = lineage_comp
   df$method = ks_method
   df$ks_statistic = ks_statistic[[1]]
   df$ks_pvalue = ks_pvalue
   
   print(df)
   ks_df = rbind(ks_df,df)
   
}

#=======================================================================
# formatting
#=======================================================================
# add total_n number
ks_df$n_drug = nrow(df_lin_dr)
ks_df$n_others = nrow(df_lin_other)

# add group
ks_df$group = "Drug vs Others"

str(ks_df)
ks_df$pvalue_signif = ks_df$ks_pvalue
str(ks_df$pvalue_signif)

# adding pvalue_signif
ks_df = dplyr::mutate(ks_df, pvalue_signif = case_when(pvalue_signif == 0.05 ~ "."
                                                      , pvalue_signif <=0.0001 ~ '****'
                                                      , pvalue_signif <=0.001 ~ '***'
                                                      , pvalue_signif <=0.01 ~ '**'
                                                      , pvalue_signif <0.05 ~ '*'
                                                      , TRUE ~ 'ns'))


#=======================================================================
#******************
# write output file: KS test
#******************
cat("Output of KS test bt lineage:", ks_dr_others_lineage )
write.csv(ks_df, ks_dr_others_lineage, row.names = FALSE)