#!/usr/bin/env Rscript  
#########################################################
# TASK: producing boxplots for dr and other muts

#########################################################
#=======================================================================
# working dir and loading libraries
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting")
getwd()

#source("Header_TT.R")
library(tidyverse)
library(ggplot2)
library(data.table)
library(dplyr)

#=========
# Input
#=========
#source("combining_dfs_plotting.R")

source("output_tables.R")
rm(df, merged_df3_short, df_output)

#===============================================================
df_comp = df_ordered[!is.na(df_ordered$af),]

#%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
df = df_comp
#%%%%%%%%%%%%%%%%%%%%%

cols_all_muts_table = c("mutationinformation"
                        , "mutation_info"
                        , "af"
                        , "af_percent"

                        , "or_mychisq"

                        , "pval_fisher"
                        , "or_kin"
   
                        , "pwald_kin"
                        , "duet_stability_change"
                        , "duet_outcome"
                        , "ligand_distance"
                        , "ligand_affinity_change"
                        , "ligand_outcome"
                        , "ddg"
                        , "foldx_outcome"
                        , "asa"
                        , "rsa"
                        , "kd_values"
                        , "rd_values")

df = df[,cols_all_muts_table]
#===============================================================

#Most Frequent mutation 

mf = df[df$af_percent == max(df$af_percent), ]
mf

# highest OR
hor = df[df$or_mychisq == max(df$or_mychisq), ]
hor


# Most Destabilising for protein stability (DUET)
df_d = df[df$duet_outcome == "Destabilising",]

hd_duet = df_d[df_d$duet_stability_change == min(df_d$duet_stability_change), ]
hd_duet

# Most Stabilising for protein stability (DUET)
df_s = df[df$duet_outcome == "Stabilising",]
hs_duet = df_s[df_s$duet_stability_change == max(df_s$duet_stability_change), ]
hs_duet

# Closest Destabilising for protein stability
close_d = df_d[order(df_d$ligand_distance, df_d$duet_stability_change),]

# Closest Stabilising for protein stability
close_s = df_s[order(df_s$ligand_distance, df_s$duet_stability_change),]


#===============
# ligand affinity: filtered
#================
df_lig = df[df$ligand_distance<10,]

df_d_lig = df_lig[df_lig$ligand_outcome == "Destabilising",]
hd_lig= df_d_lig[df_d_lig$ligand_affinity_change == min(df_d_lig$ligand_affinity_change), ]
hd_lig


df_s_lig = df[df$ligand_outcome == "Stabilising",]
hs_lig= df_s_lig[df_s_lig$ligand_affinity_change == max(df_s_lig$ligand_affinity_change), ]
hs_lig


# Closest Destabilising for ligand affintiy
close_d_lig = df_d_lig[order(df_d_lig$ligand_distance, df_d_lig$ligand_affinity_change),]

# Closest Stabilising for ligand affinity
close_s_lig = df_s_lig[order(df_s_lig$ligand_distance, df_s_lig$ligand_affinity_change),]

#===============
# foldx
#===============
df_d_foldx = df[df$foldx_outcome == "Destabilising",]
hd_foldx = df_d_foldx[df_d_foldx$ddg == max(df_d_foldx$ddg), ]
hd_foldx

# Most Stabilising for protein stability (DUET)
df_s_foldx = df[df$foldx_outcome == "Stabilising",]
hs_foldx = df_s_foldx[df_s_foldx$ddg == min(df_s_foldx$ddg), ]
hs_foldx

#===============
# active site muts
#===============

aa_muts = merged_df3[merged_df3$ligand_distance<5,]

aa_dist = paste0(5, angstroms_symbol)

cat("No. of active site residues within", aa_dist, ":", nrow(aa_muts))

#====================
# budding hotspots
#====================
# this is what you want
foo = merged_df3 %>% group_by(position) %>% tally()
bar = merged_df3 %>% group_by(position) %>% count()

# sanity check
all(table(foo$n) == table(bar$n))
table(foo$n)


n_budding_sites = table(foo$n)[[2]]
n_mult_muts_sites = sum(table(foo$n)) - (table(foo$n)[[1]] - table(foo$n)[[2]])

cat("No of budding hotspots (sites with 2 mutations):", n_budding_sites
    , "\nNo. of sites with mutiple (>2) mutations:", n_mult_muts_sites)

#==========================================================================

#==============================
# agreement of foldx and DUET
#==============================
mcsm_foldx = merged_df3[which(merged_df3$duet_outcome != merged_df3$foldx_outcome),]

mcsm_foldx$sign_comp = ifelse(sign(mcsm_foldx$duet_scaled)==sign(mcsm_foldx$ddg), "PASS", "FAIL")
table(mcsm_foldx$sign_comp)

# another way of checking
merged_df3$sign_comp = ifelse(sign(merged_df3$duet_scaled)==sign(merged_df3$ddg), "PASS", "FAIL")
table(merged_df3$sign_comp)

disagreement = table(merged_df3$sign_comp)[2]/nrow(merged_df3)*100
agreement = 100 - disagreement

cat("There is", agreement, "% between mcsm and foldx predictions")

##############################################################################