diff --git a/scripts/plotting/basic_barplots_LIG.R b/scripts/plotting/basic_barplots_LIG.R
index 8e64937..dd14d79 100755
--- a/scripts/plotting/basic_barplots_LIG.R
+++ b/scripts/plotting/basic_barplots_LIG.R
@@ -188,6 +188,6 @@ OutPlot_lig_pos_count = g + geom_bar(aes (alpha = 0.5)
 print(OutPlot_lig_pos_count)
 dev.off()
 ########################################################################
-#               			end of lig barplots         			   
+#               			end of LIG barplots         			   
 ########################################################################
 
diff --git a/scripts/plotting/basic_barplots_PS.R b/scripts/plotting/basic_barplots_PS.R
index 36c2483..aaab947 100755
--- a/scripts/plotting/basic_barplots_PS.R
+++ b/scripts/plotting/basic_barplots_PS.R
@@ -186,5 +186,5 @@ OutPlot_pos_count = g + geom_bar(aes (alpha = 0.5)
 print(OutPlot_pos_count)
 dev.off()
 ########################################################################
-#               			end of Ligand barplots         			   
+#               			end of PS barplots         			   
 ########################################################################
diff --git a/scripts/plotting/corr_PS_LIG_all.R b/scripts/plotting/corr_PS_LIG_all.R
deleted file mode 100644
index 138ef11..0000000
--- a/scripts/plotting/corr_PS_LIG_all.R
+++ /dev/null
@@ -1,166 +0,0 @@
-#!/usr/bin/env Rscript
-#########################################################
-# TASK: Corr plots for PS and Lig 
-
-# Output: 1 svg
-
-#=======================================================================
-# working dir and loading libraries
-getwd()
-setwd("~/git/LSHTM_analysis/scripts/plotting/")
-getwd()
-
-#source("combining_dfs_plotting.R")
-source("my_pairs_panel.R") # with lower panel turned off
-source("corr_data.R")
-
-#=======
-# output
-#=======
-# PS
-corr_ps_all_df2 = "corr_PS_ALL_df2.svg"
-plot_corr_ps_all_df2 =  paste0(plotdir,"/", corr_ps_all_df2)
-
-corr_ps_all_df3 = "corr_PS_ALL_df3.svg"
-plot_corr_ps_all_df3 =  paste0(plotdir,"/", corr_ps_all_df3)
-
-# LIG
-corr_lig_all_df2 = "corr_LIG_ALL_df2.svg"
-plot_corr_lig_all_df2 =  paste0(plotdir,"/", corr_lig_all_df2)
-
-corr_lig_all_df3 = "corr_LIG_ALL_df3.svg"
-plot_corr_lig_all_df3 =  paste0(plotdir,"/", corr_lig_all_df3)
-
-####################################################################
-#               end of loading libraries and functions                
-####################################################################
-#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Data for plots
-#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-# PS
-corr_ps_df2 = corr_ps_df2[-1]
-corr_ps_df3 = corr_ps_df3[-1]
-
-# Lig
-corr_lig_df2 = corr_lig_df2[-1]
-corr_lig_df3 = corr_lig_df3[-1]
-
-#---------------------------------------
-# generate corr PS plot 1: merged_df2
-#---------------------------------------
-cat("Corr plot PS DUET with coloured dots:", plot_corr_ps_all_df2)
-svg(plot_corr_ps_all_df2, width = 30, height = 30)
-
-OutPlot_ps_df2 = pairs.panels(corr_ps_df2[1:(length(corr_ps_df2)-2)]
-             , method = "spearman" # correlation method
-             , hist.col = "grey" ##00AFBB
-             , density = T  # show density plots
-             , ellipses = F # show correlation ellipses
-             , stars = T
-             , rug = F
-             , breaks = "Sturges"
-             , show.points = T
-             , bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_ps_df2$duet_outcome))] # can't use colour as duet and foldx are opposite
-             , pch = 21 # for bg
-             #, pch = 19
-             , jitter = T
-             , alpha = 1
-             #, points(pch = 19, col = c("#f8766d", "#00bfc4"))
-             , cex = 1.8
-             , cex.axis = 2
-             , cex.labels = 2
-             , cex.cor = 1
-             , smooth = F)
-
-print(OutPlot_ps_df2)
-dev.off()
-
-#----------------------------------------------
-# generate corr PS plot 2: merged_df3
-#----------------------------------------------
-cat("Corr plot PS DUET with coloured dots:", plot_corr_ps_all_df3)
-svg(plot_corr_ps_all_df3, width = 30, height = 30)
-
-OutPlot_ps_df3 = pairs.panels(corr_ps_df3[1:(length(corr_ps_df3)-2)]
-                 , method = "spearman" # correlation method
-                 , hist.col = "grey" ##00AFBB
-                 , density = T  # show density plots
-                 , ellipses = F # show correlation ellipses
-                 , stars = T
-                 , rug = F
-                 , breaks = "Sturges"
-                 , show.points = T
-                 , bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_ps_df3$duet_outcome))] # can't use colour as duet and foldx are opposite
-                 , pch = 21 # for bg
-                 , cex = 2
-                 , cex.axis = 1.6
-                 , cex.labels = 2
-                 , cex.cor = 1
-                 , smooth = F
-                 
-)
-
-print(OutPlot_ps_df3)
-dev.off()
-
-################################################################################################
-
-
-#---------------------------------------
-# generate corr lig plot 1: merged_df2
-#---------------------------------------
-cat("Corr plot lig DUET with coloured dots:", plot_corr_lig_all_df2)
-svg(plot_corr_lig_all_df2, width = 30, height = 30)
-
-OutPlot_lig_df2 = pairs.panels(corr_lig_df2[1:(length(corr_lig_df2)-2)]
-                               , method = "spearman" # correlation method
-                               , hist.col = "grey" ##00AFBB
-                               , density = T  # show density plots
-                               , ellipses = F # show correlation elliliges
-                               , stars = T
-                               , rug = F
-                               , breaks = "Sturges"
-                               , show.points = T
-                               , bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_lig_df2$ligand_outcome))] # can't use colour as duet and foldx are opposite
-                               , pch = 21 # for bg
-                               #, pch = 19
-                               , jitter = T
-                               , alpha = 1
-                               #, points(pch = 19, col = c("#f8766d", "#00bfc4"))
-                               , cex = 1.8
-                               , cex.axis = 2
-                               , cex.labels = 2
-                               , cex.cor = 1
-                               , smooth = F)
-
-print(OutPlot_lig_df2)
-dev.off()
-
-#----------------------------------------------
-# generate corr lig plot 2: merged_df3
-#----------------------------------------------
-cat("Corr plot lig DUET with coloured dots:", plot_corr_lig_all_df3)
-svg(plot_corr_lig_all_df3, width = 30, height = 30)
-
-OutPlot_lig_df3 = pairs.panels(corr_lig_df3[1:(length(corr_lig_df3)-2)]
-                               , method = "spearman" # correlation method
-                               , hist.col = "grey" ##00AFBB
-                               , density = T  # show density plots
-                               , ellipses = F # show correlation elliliges
-                               , stars = T
-                               , rug = F
-                               , breaks = "Sturges"
-                               , show.points = T
-                               , bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_lig_df3$ligand_outcome))] # can't use colour as duet and foldx are opposite
-                               , pch = 21 # for bg
-                               , cex = 2
-                               , cex.axis = 1.6
-                               , cex.labels = 2
-                               , cex.cor = 1
-                               , smooth = F
-                               
-)
-
-print(OutPlot_lig_df3)
-dev.off()
diff --git a/scripts/plotting/corr_PS_LIG_v2.R b/scripts/plotting/corr_PS_LIG_v2.R
deleted file mode 100644
index 908b7d4..0000000
--- a/scripts/plotting/corr_PS_LIG_v2.R
+++ /dev/null
@@ -1,176 +0,0 @@
-#!/usr/bin/env Rscript
-#########################################################
-# TASK: Corr plots for PS and Lig 
-
-# Output: 1 svg
-
-#=======================================================================
-# working dir and loading libraries
-getwd()
-setwd("~/git/LSHTM_analysis/scripts/plotting/")
-getwd()
-
-#source("combining_dfs_plotting.R")
-source("my_pairs_panel.R") # with lower panel turned off
-source("corr_data.R")
-
-#=======
-# output
-#=======
-# PS
-corrplot_ps_df2 = "corr_PS_df2.svg"
-plot_corr_ps_df2 =  paste0(plotdir,"/", corrplot_ps_df2)
-
-corrplot_ps_df3 = "corr_PS_df3.svg"
-plot_corr_ps_df3 =  paste0(plotdir,"/", corrplot_ps_df3)
-
-# LIG
-corrplot_lig_df2 = "corr_LIG_df2.svg"
-plot_corr_lig_df2 =  paste0(plotdir,"/", corrplot_lig_df2)
-
-corrplot_lig_df3 = "corr_LIG_df3.svg"
-plot_corr_lig_df3 =  paste0(plotdir,"/", corrplot_lig_df3)
-
-####################################################################
-#               end of loading libraries and functions                
-####################################################################
-#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# Data for plots
-#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-cols_to_drop = c("ASA", "AF_kin")
-
-
-# PS
-corr_ps_df2 = corr_ps_df2[!colnames(corr_ps_df2)%in%cols_to_drop]
-corr_ps_df2 = corr_ps_df2[-1]
-
-corr_ps_df3 = corr_ps_df3[!colnames(corr_ps_df3)%in%cols_to_drop]
-corr_ps_df3 = corr_ps_df3[-1]
-
-
-# Lig
-corr_lig_df2 = corr_lig_df2[!colnames(corr_lig_df2)%in%cols_to_drop]
-corr_lig_df2 = corr_lig_df2[-1]
-
-corr_lig_df3 = corr_lig_df3[!colnames(corr_lig_df3)%in%cols_to_drop]
-corr_lig_df3 = corr_lig_df3[-1]
-
-#---------------------------------------
-# generate corr PS plot 1: merged_df2
-#---------------------------------------
-cat("Corr plot PS DUET with coloured dots:", plot_corr_ps_df2)
-svg(plot_corr_ps_df2, width = 30, height = 25)
-
-OutPlot_ps_df2 = pairs.panels(corr_ps_df2[1:(length(corr_ps_df2)-2)]
-             , method = "spearman" # correlation method
-             , hist.col = "grey" ##00AFBB
-             , density = T  # show density plots
-             , ellipses = F # show correlation ellipses
-             , stars = T
-             , rug = F
-             , breaks = "Sturges"
-             , show.points = T
-             , bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_ps_df2$duet_outcome))] # can't use colour as duet and foldx are opposite
-             , pch = 21 # for bg
-             #, pch = 19
-             , jitter = T
-             , alpha = 1
-             #, points(pch = 19, col = c("#f8766d", "#00bfc4"))
-             , cex = 1.8
-             , cex.axis = 2
-             , cex.labels = 3.8
-             , cex.cor = 1
-             , smooth = F)
-
-print(OutPlot_ps_df2)
-dev.off()
-
-#----------------------------------------------
-# generate corr PS plot 2: merged_df3
-#----------------------------------------------
-cat("Corr plot PS DUET with coloured dots:", plot_corr_ps_df3)
-svg(plot_corr_ps_df3, width = 30, height = 25)
-
-OutPlot_ps_df3 = pairs.panels(corr_ps_df3[1:(length(corr_ps_df3)-2)]
-                 , method = "spearman" # correlation method
-                 , hist.col = "grey" ##00AFBB
-                 , density = T  # show density plots
-                 , ellipses = F # show correlation ellipses
-                 , stars = T
-                 , rug = F
-                 , breaks = "Sturges"
-                 , show.points = T
-                 , bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_ps_df3$duet_outcome))] # can't use colour as duet and foldx are opposite
-                 , pch = 21 # for bg
-                 , cex = 3
-                 , cex.axis = 1.6
-                 , cex.labels = 3.8
-                 , cex.cor = 1
-                 , smooth = F
-                 
-)
-
-print(OutPlot_ps_df3)
-dev.off()
-
-################################################################################################
-
-#---------------------------------------
-# generate corr lig plot 1: merged_df2
-#---------------------------------------
-cat("Corr plot lig DUET with coloured dots:", plot_corr_lig_df2)
-svg(plot_corr_lig_df2, width = 30, height = 25)
-
-OutPlot_lig_df2 = pairs.panels(corr_lig_df2[1:(length(corr_lig_df2)-2)]
-                               , method = "spearman" # correlation method
-                               , hist.col = "grey" ##00AFBB
-                               , density = T  # show density plots
-                               , ellipses = F # show correlation elliliges
-                               , stars = T
-                               , rug = F
-                               , breaks = "Sturges"
-                               , show.points = T
-                               , bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_lig_df2$ligand_outcome))] # can't use colour as duet and foldx are opposite
-                               , pch = 21 # for bg
-                               #, pch = 19
-                               , jitter = T
-                               , alpha = 1
-                               #, points(pch = 19, col = c("#f8766d", "#00bfc4"))
-                               , cex = 1.8
-                               , cex.axis = 2
-                               , cex.labels = 3.8
-                               , cex.cor = 1
-                               , smooth = F)
-
-print(OutPlot_lig_df2)
-dev.off()
-
-#----------------------------------------------
-# generate corr lig plot 2: merged_df3
-#----------------------------------------------
-cat("Corr plot lig DUET with coloured dots:", plot_corr_lig_df3)
-svg(plot_corr_lig_df3, width = 30, height = 25)
-
-OutPlot_lig_df3 = pairs.panels(corr_lig_df3[1:(length(corr_lig_df3)-2)]
-                               , method = "spearman" # correlation method
-                               , hist.col = "grey" ##00AFBB
-                               , density = T  # show density plots
-                               , ellipses = F # show correlation elliliges
-                               , stars = T
-                               , rug = F
-                               , breaks = "Sturges"
-                               , show.points = T
-                               , bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_lig_df3$ligand_outcome))] # can't use colour as duet and foldx are opposite
-                               , pch = 21 # for bg
-                               , cex = 3
-                               , cex.axis = 1.6
-                               , cex.labels = 3.8
-                               , cex.cor = 1
-                               , smooth = F
-                               
-)
-
-print(OutPlot_lig_df3)
-dev.off()
-
diff --git a/scripts/plotting/corr_foldx.R b/scripts/plotting/corr_foldx.R
deleted file mode 100644
index c5e9569..0000000
--- a/scripts/plotting/corr_foldx.R
+++ /dev/null
@@ -1,191 +0,0 @@
-#!/usr/bin/env Rscript
-#########################################################
-# TASK: Corr plots for PS and Lig 
-
-# Output: 1 svg
-
-#=======================================================================
-# working dir and loading libraries
-getwd()
-setwd("~/git/LSHTM_analysis/scripts/plotting/")
-getwd()
-
-
-source("Header_TT.R")
-require(cowplot)
-source("combining_dfs_plotting.R") # FIXME: add extra from other plots here
-
-# should return the following dfs, directories and variables
-
-
-#=======
-# output
-#=======
-# can't combine by cowplot because not ggplots
-#corr_plot_combined = "corr_combined.svg"
-#plot_corr_plot_combined  =  paste0(plotdir,"/", corr_plot_combined)
-
-# PS foldx
-corr_foldx = "corr_adjusted_foldx.svg"
-plot_corr_foldx =  paste0(plotdir,"/", corr_foldx)
-
-####################################################################
-#               end of loading libraries and functions                 #
-########################################################################
-
-#%%%%%%%%%%%%%%%%%%%%%%%%%
-df_ps = merged_df3
-#%%%%%%%%%%%%%%%%%%%%%%%%%
-
-rm( merged_df2, merged_df2_comp, merged_df2_lig
-    , merged_df2_comp_lig
-    , merged_df3_comp, merged_df3_comp_lig
-    , my_df_u, my_df_u_lig)
-
-########################################################################
-#               end of data extraction and cleaning for plots          #
-########################################################################
-
-#===========================
-# Data for Correlation plots:foldx
-#===========================
-#============================
-# adding foldx scaled values
-# scale data b/w -1 and 1
-#============================
-n = which(colnames(df_ps) == "ddg"); n 
-
-my_min = min(df_ps[,n]); my_min 
-my_max = max(df_ps[,n]); my_max 
-
-df_ps$foldx_scaled = ifelse(df_ps[,n] < 0
-                            , df_ps[,n]/abs(my_min)
-                            , df_ps[,n]/my_max) 
-# sanity check
-my_min = min(df_ps$foldx_scaled); my_min 
-my_max = max(df_ps$foldx_scaled); my_max
-
-if (my_min == -1 && my_max == 1){
-  cat("PASS: foldx ddg successfully scaled b/w -1 and 1"
-      , "\nProceeding with assigning foldx outcome category")
-}else{
-  cat("FAIL: could not scale foldx ddg values"
-      , "Aborting!")
-}
-
-
-#================================
-# adding foldx outcome category
-# ddg<0 = "Stabilising" (-ve)
-#=================================
-
-c1 = table(df_ps$ddg < 0)
-df_ps$foldx_outcome = ifelse(df_ps$ddg < 0, "Stabilising", "Destabilising")
-c2 = table(df_ps$ddg < 0)
-
-if ( all(c1 == c2) ){
-  cat("PASS: foldx outcome successfully created")
-}else{
-  cat("FAIL: foldx outcome could not be created. Aborting!")
-  exit()
-}
-
-table(df_ps$foldx_outcome)
-
-#======================
-# adding log cols 
-#======================
-df_ps$log10_or_mychisq = log10(df_ps$or_mychisq)
-df_ps$neglog_pval_fisher = -log10(df_ps$pval_fisher)
-
-df_ps$log10_or_kin = log10(df_ps$or_kin)
-df_ps$neglog_pwald_kin = -log10(df_ps$pwald_kin)
-
-# subset data to generate pairwise correlations
-cols_to_select_foldx =  c("foldx_scaled"
-                    
-                    , "duet_scaled"
-                    
-                    , "log10_or_mychisq"
-                    , "neglog_pval_fisher"
-                    
-                    , "log10_or_kin"
-                    , "neglog_pwald_kin"
-                    
-                    , "af"
-                    
-                    , "foldx_outcome"
-                    , drug)
-
-corr_data_foldx = df_ps[, cols_to_select_foldx]
-
-dim(corr_data_foldx)
-
-#p_italic = substitute(paste("-Log(", italic('P'), ")"));p_italic 
-#p_adjusted_italic = substitute(paste("-Log(", italic('P adjusted'), ")"));p_adjusted_italic
-
-# assign nice colnames (for display)
-my_corr_colnames_foldx = c("Foldx"
-                     
-                     ,"DUET"
-                     
-                     , "Log(OR)"
-                     , "-Log(P)"
-                     
-                     , "Log(OR adjusted)"
-                     , "-Log(P wald)"
-                     
-                     , "AF"
-                     
-                     , "foldx_outcome"
-                     , drug)
-
-length(my_corr_colnames_foldx)
-
-colnames(corr_data_foldx)
-colnames(corr_data_foldx) <- my_corr_colnames_foldx
-colnames(corr_data_foldx)
-
-#-----------------
-# generate corr foldx plot
-#-----------------
-start = 1
-end = which(colnames(corr_data_foldx) == drug); end # should be the last column
-offset = 1
-
-my_corr_foldx = corr_data_foldx[start:(end-offset)]
-head(my_corr_foldx)
-
-#my_cols = c("#f8766d", "#00bfc4")
-# deep blue :#007d85
-# deep red: #ae301e
-
-cat("Corr plot foldx:", plot_corr_foldx)
-svg(plot_corr_foldx, width = 15, height = 15)
-
-OutPlot_foldx= pairs.panels(my_corr_foldx[1:(length(my_corr_foldx)-1)]
-             , method = "spearman" # correlation method
-             , hist.col = "grey" ##00AFBB
-             , density = TRUE  # show density plots
-             , ellipses = F # show correlation ellipses
-             , stars = T
-             , rug = F
-             , breaks = "Sturges"
-             , show.points = T
-             , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr_foldx$foldx_outcome))]
-             , pch = 21
-             , jitter = T
-             #, alpha = .05
-             #, points(pch = 19, col = c("#f8766d", "#00bfc4"))
-             , cex = 3
-             , cex.axis = 2.5
-             , cex.labels = 2.1
-             , cex.cor = 1
-             , smooth = F
-)
-
-print(OutPlot_foldx)
-dev.off()
-
-
-
diff --git a/scripts/plotting/ggridges_lineage_country.R b/scripts/plotting/ggridges_lineage_country.R
deleted file mode 100644
index ef91623..0000000
--- a/scripts/plotting/ggridges_lineage_country.R
+++ /dev/null
@@ -1,289 +0,0 @@
-
-
-#########################################################
-# 1: Installing and loading required packages
-#########################################################
-
-#source("../Header_TT.R")
-install.packages("qqman")
-library(qqman)
-
-source("combining_dfs_plotting.R")
-#mcsm_data: raw file, 225, 15
-#merged_df2 = 2201, 35
-#merged_df3 =  205, 35 ("Can't trust non-numerical params')
-
-#===============================================
-# PLOTS: DUET vs GWAS: non-numerical
-# lineage, country_code, etc
-# merged_df2: 1592, 35
-#===============================================
-
-#########################
-# Data for plot
-#########################
-df = merged_df2
-#df = merged_df2_comp 
-
-
-#========================
-# Plot 1a: Lineage barplot
-# x = lineage y = No of samples
-# col = Lineage
-# fill = lineage
-#========================
-table(df$lineage)
-
-# subset only lineages1-4
-sel_lineages = c("lineage1"
-                 , "lineage2"
-                 , "lineage3"
-                 , "lineage4"
-                 #, "lineage5"
-                 #, "lineage6"
-                 #, "lineage7"
-)
-
-# uncomment as necessary
-df_lin = subset(df, subset = lineage %in% sel_lineages )
-table(df_lin$lineage)
-
-#%%%%%%%%%%%%%%%%%%%%%%%%%
-# REASSIGNMENT
-df <- df_lin
-#%%%%%%%%%%%%%%%%%%%%%%%%%
-
-#%%%%%%%%%%%%%%%%%%%%%%%%
-# REASSIGNMENT
-df2 = df
-#%%%%%%%%%%%%%%%%%%%%%%%%
-df2 = df2%>%
-  add_count(country_code)
-
-str(df2$country_code); str(df2$n)
-
-n = which(colnames(df2) == "n")
-colnames(df2)[n] = "count_country"
-
-table(df2$count_country>100 & df$country_code!= "")
-df3 = subset(df2, df2$count_country>100 & df2$country_code != "")
-
-
-#%%%%%%%%%%%%%%%%%%%%%%%%
-# REASSIGNMENT
-df = df3
-#%%%%%%%%%%%%%%%%%%%%%%%%
-
-sample = sum(table(unique(df$id))); sample
-table(df$country_code)
-tab = sum(table(df$country_code)); tab
-
-
-View(table(df$country_code))
-View(t1)
-
-############## begin plot
-g = ggplot(df, aes(x = lineage))
-g + geom_bar(aes(fill = lineage)) +
-  theme( axis.text.x = element_text(size = 13
-                                    , angle = 90
-                                    , hjust = 1
-                                    , vjust = 0.4)
-         , axis.text.y = element_text(size = 15
-                                      , angle = 0
-                                      , hjust = 1
-                                      , vjust = 0)
-         , axis.title.x = element_text(size = 15)
-         , axis.title.y = element_text(size = 15) ) +
-  labs(title = "Lineage"
-       , x = "Lineage"
-       , y = "No of samples")
-
-
-#========================
-# Plot 2: DUET, lineage, country_code and or_mychisq
-# x = lineage y = DUET
-# col = Lineage
-# fill = country_code
-#========================
-### begin plot
-g = ggplot(df, aes(x = country_code
-                   , y = duet_scaled))
-g + geom_point(aes(col = lineage
-                   , size = or_mychisq)) +
-  theme(axis.text.x = element_text(size = 13
-                                    , angle = 90
-                                    , hjust = 1
-                                    , vjust = 0.4)
-         , axis.text.y = element_text(size = 15
-                                      , angle = 0
-                                      , hjust = 1
-                                      , vjust = 0)
-         , axis.title.x = element_text(size = 15)
-         , axis.title.y = element_text(size = 15) ) +
-  labs(title = "DUET, country_code, lineage, or_mychisq"
-       , x = "Lineage"
-       , y = "DUET (PS)")
-
-
-#############
-#========================
-# Plot 3: DUET, lineage, or_mychisq
-# x = lineage y = DUET
-# col = Lineage
-# fill = country_code
-#========================
-
-### begin plot
-table(df$lineage)
-
-g = ggplot(df_lin, aes(x = lineage
-                   , y = duet_scaled))
-g + geom_point(aes(col = lineage
-                   , size = or_mychisq)) +
-  theme(axis.text.x = element_text(size = 13
-                                   , angle = 90
-                                   , hjust = 1
-                                   , vjust = 0.4)
-        , axis.text.y = element_text(size = 15
-                                     , angle = 0
-                                     , hjust = 1
-                                     , vjust = 0)
-        , axis.title.x = element_text(size = 15)
-        , axis.title.y = element_text(size = 15) ) +
-  labs(title = "DUET, lineage, or_mychisq"
-       , x = "Lineage"
-       , y = "DUET (PS)")
-
-#========================
-# Plot 4-5: Distributions
-# ggrdiges
-#========================
-
-
-#==================================================
-my_ats = 15 # axis text size
-my_als = 20 # axis label size
-
-my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4'
-              #, 'Lineage 5', 'Lineage 6', 'Lineage 7'
-              )
-names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4'
-                     # , 'lineage5', 'lineage6', 'lineage7'
-                     )
-      
-
-#========================
-# Plot 4: Distribution
-# x = duet_scaled
-# y = country
-# fill = country_code
-# facet = lineage
-#========================
-# works neatly!
-
-p1 = ggplot(df, aes(x = duet_scaled
-                    , y = country_code))+
-  
-  #printFile=geom_density_ridges_gradient(
-  geom_density_ridges_gradient(aes(fill = country_code)
-                               , jittered_points = TRUE
-                               , scale = 3
-                               , size = 0.3 ) +
-  facet_wrap( ~lineage
-              , scales = "free"
-              , switch = 'x'
-              , labeller = labeller(lineage = my_labels)
-              ) +
-  coord_cartesian( xlim = c(-1, 1)) +
-  #scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
-  #                     , name = "DUET" ) + 
-  theme(axis.text.x = element_text(size = my_ats
-                                   , angle = 90
-                                   , hjust = 1
-                                   , vjust = 0.4)
-        
-        #, axis.text.y = element_blank()
-        , axis.title.x = element_blank()
-        , axis.title.y = element_blank()
-        , axis.ticks.y = element_blank()
-        , plot.title = element_blank()
-        , strip.text = element_text(size = my_als)
-        , legend.text = element_text(size = my_als-5)
-        , legend.title = element_text(size = my_als)
-  ) 
-
-p1
-
-
-#========================
-# Plot 5: Distribution
-# x = duet_scaled
-# y = country_code
-# fill = lineage
-# facet = NONE
-#========================
-# no facet wrap
-
-p2 = ggplot(df, aes(x = duet_scaled
-                    , y = country_code))+
-  
-  geom_density_ridges_gradient(aes(fill = factor(lineage))
-                               , scale = 3
-                               , size = 0.3 ) +
-  coord_cartesian( xlim = c(-1, 1)) +
-  #scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
-  #                     , name = "DUET" ) + 
-  #scale_fill_continuous(colours = c("darkgreen", "pink", "orange", "brown")
-  #                     , name = "lineage" ) + 
-  theme(axis.text.x = element_text(size = my_ats
-                                   , angle = 90
-                                   , hjust = 1
-                                   , vjust = 0.4)
-        
-        #, axis.text.y = element_blank()
-        , axis.title.x = element_blank()
-        , axis.title.y = element_blank()
-        , axis.ticks.y = element_blank()
-        , plot.title = element_blank()
-        , strip.text = element_text(size = my_als)
-        , legend.text = element_text(size = my_als-5)
-        , legend.title = element_text(size = my_als)
-  ) 
-
-p2
-
-
-#===============
-# lineage only
-#================
-#svg(plot_lineage_duet)
-p3 = ggplot(df, aes(x = duet_scaled
-                    , y = duet_outcome))+
-  geom_density_ridges_gradient(aes(fill = ..x..)
-                               , jittered_points = TRUE
-                               , scale = 3
-                               , size = 0.3 ) +
-  facet_wrap( ~lineage
-              , scales = "free"
-              #, switch = 'x'
-              , labeller = labeller(lineage = my_labels) ) +
-  coord_cartesian( xlim = c(-1, 1)) +
-  scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
-                       , name = "DUET" ) + 
-  theme(axis.text.x = element_text(size = my_ats
-                                   , angle = 90
-                                   , hjust = 1
-                                   , vjust = 0.4)
-        
-        , axis.text.y = element_blank()
-        , axis.title.x = element_blank()
-        , axis.title.y = element_blank()
-        , axis.ticks.y = element_blank()
-        , plot.title = element_blank()
-        , strip.text = element_text(size = my_als)
-        , legend.text = element_text(size = my_als-5)
-        , legend.title = element_text(size = my_als)
-  ) 
-
-print(p3)
diff --git a/scripts/prediction_all.R b/scripts/prediction_all.R
deleted file mode 100644
index fa91239..0000000
--- a/scripts/prediction_all.R
+++ /dev/null
@@ -1,426 +0,0 @@
-#!/usr/bin/env Rscript
-#########################################################
-# TASK: prediction
-
-#=======================================================================
-# working dir and loading libraries
-getwd()
-setwd("~/git/LSHTM_analysis/scripts/")
-getwd()
-
-source("plotting/combining_dfs_plotting.R")
-
-####################################################################
-#               end of loading libraries and functions              
-####################################################################
-
-#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# ps
-table(merged_df2$mutation_info)
-merged_df2$mutation_info_labels = ifelse(merged_df2$mutation_info == dr_muts_col, 1, 0)
-table(merged_df2$mutation_info_labels)
-
-
-table(merged_df3$mutation_info)
-merged_df3$mutation_info_labels = ifelse(merged_df3$mutation_info == dr_muts_col, 1, 0)
-table(merged_df3$mutation_info_labels)
-
-
-# lig
-table(merged_df2_lig$mutation_info)
-merged_df2_lig$mutation_info_labels = ifelse(merged_df2_lig$mutation_info == dr_muts_col, 1, 0)
-table(merged_df2_lig$mutation_info_labels)
-
-
-table(merged_df3_lig$mutation_info)
-merged_df3_lig$mutation_info_labels = ifelse(merged_df3_lig$mutation_info == dr_muts_col, 1, 0)
-table(merged_df3_lig$mutation_info_labels)
-
-#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-###############################################################################
-model_ind = glm(mutation_info_labels ~ or_mychisq
-    , data = merged_df2
-    , family = "binomial")
-summary(model_ind)
-nobs(model_ind)
-
-#=============
-# try loop
-#=============
-my_ivs = c("or_mychisq", "or_kin", "pval_fisher", "af", "duet_stability_change", "duet_scaled")
-
-for( i in my_ivs){
-  cat ("===============================\n")
-  cat(i)
-  cat ("\n===============================\n")
-  print(summary(glm(mutation_info_labels ~ eval(parse(text=i))
-              , data = merged_df2
-              , family = "binomial")))
-}
-###############################################################################
-#========================================
-# merged_df2: UNadjusted,loop
-#========================================
-my_ivs = c("or_mychisq", "or_kin", "pval_fisher", "af"
-           , "ligand_distance"
-           , "rsa"
-           , "rd_values"
-           , "kd_values"
-           , "duet_stability_change" 
-           , "duet_scaled"
-           , "duet_outcome"
-           , "ddg"
-           , "foldx_scaled"
-           , "foldx_outcome")
-
-ps_logistic_df2  = data.frame()
-
-for( i in my_ivs){
-  print(i)
-  
-  df = data.frame(var_name = NA
-                  , number_samples = NA
-                  , beta = NA
-                  , odds_ratio = NA
-                  , pvalue = NA
-                  , se = NA
-                  , zvalue = NA
-                  , ci_lower = NA
-                  , ci_upper = NA)
-  
-  model = glm(mutation_info_labels ~ eval(parse(text=i))
-              , data = merged_df2
-              , family = "binomial")
-  
-  var_name = i
-  
-  number_samples = nobs(model)
-  
-  beta_logistic = summary(model)$coefficients[2,1]; beta_logistic
-  
-  or_logistic = exp(summary(model)$coefficients[2,1])
-  
-  pval_logistic = summary(model)$coefficients[2,4]
-  
-  se_logistic = summary(model)$coefficients[2,2]
-  
-  zval_logistic = summary(model)$coefficients[2,3]
-  
-  ci_mod = exp(confint(model))[2,]
-  
-  ci_lower_logistic = ci_mod[["2.5 %"]]
-  ci_upper_logistic = ci_mod[["97.5 %"]]
-  
-  print(c(var_name, beta_logistic, or_logistic, pval_logistic, se_logistic, zval_logistic, ci_mod))
-  
-  df$var_name = var_name 
-  df$number_samples = number_samples
-  df$beta = beta_logistic
-  df$odds_ratio = or_logistic
-  df$pvalue = pval_logistic
-  df$se = se_logistic
-  df$zvalue = zval_logistic
-  df$ci_lower = ci_lower_logistic 
-  df$ci_upper = ci_upper_logistic 
-  
-  print(df)
-  ps_logistic_df2 = rbind(ps_logistic_df2, df)
-  
-}
-#--------------------
-# formatting df
-#--------------------
-ps_logistic_df2$data_source = "df2"
-ps_logistic_df2$model = "unadjusted"
-
-ps_logistic_df2$odds_ratio = round(ps_logistic_df2$odds_ratio, 2)
-ps_logistic_df2$ci_lower = round(ps_logistic_df2$ci_lower, 2)
-ps_logistic_df2$ci_upper = round(ps_logistic_df2$ci_upper, 2)
-
-# adding pvalue_signif
-ps_logistic_df2$pvalue_signif = ps_logistic_df2$pvalue
-str(ps_logistic_df2$pvalue_signif)
-
-ps_logistic_df2 = dplyr::mutate(ps_logistic_df2
-                                , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "."
-                                                            , pvalue_signif <=0.0001 ~ '****'
-                                                            , pvalue_signif <=0.001 ~ '***'
-                                                            , pvalue_signif <=0.01 ~ '**'
-                                                            , pvalue_signif <0.05 ~ '*'
-                                                            , TRUE ~ 'ns'))
-# rearranging columns
-colnames(ps_logistic_df2)
-ps_logistic_df2_o = ps_logistic_df2 [c("var_name"    
-                   , "number_samples"
-                   , "model"
-                   , "odds_ratio"
-                   , "pvalue"
-                   , "pvalue_signif"
-                   , "beta"
-                   , "se"
-                   , "zvalue"
-                   , "ci_lower"
-                   , "ci_upper"
-                   , "data_source")]
-
-###############################################################################
-#========================================
-# merged_df2: adjusted, loop
-#========================================
-#model_adjusted = glm(mutation_info_labels ~ or_mychisq + rsa + rd_values + kd_values +
-#                       duet_stability_change + duet_scaled + duet_outcome + ddg + foldx_scaled + foldx_outcome
-#            , data = merged_df2
-#           , family = "binomial")
-
-model_adjusted_df2 = glm(mutation_info_labels ~ or_mychisq + or_kin + rd_values + kd_values +
-                           ligand_distance + duet_stability_change 
-                         , data = merged_df2
-                         , family = "binomial");summary(model_adjusted_df2)
-
-var_names_df = as.data.frame(names(model_adjusted_df2$coefficients))
-names(var_names_df) = c("var_name")
-
-ci_mod = exp(confint(model_adjusted_df2))
-ci_mod_df = as.data.frame(ci_mod)
-names(ci_mod_df) = c("ci_lower", "ci_upper")
-ci_mod_df$ci_lower = round(ci_mod_df$ci_lower, 2)
-ci_mod_df$ci_upper = round(ci_mod_df$ci_upper, 2)
-
-
-estimates_df = as.data.frame(summary(model_adjusted_df2)$coefficients)
-names(estimates_df) = c("beta", "se", "zvalue", "pvalue")
-estimates_df$odds_ratio = round(exp(estimates_df$beta), 2)
-number_samples = nobs(model_adjusted_df2)
-estimates_df$number_samples = number_samples
-estimates_df$data_source = "df2"
-estimates_df$model = "adjusted"
-
-names(ps_logistic_adjusted_df2)
-
-if ( all(rownames(estimates_df) == rownames(ci_mod_df)) ){
-  cat("PASS: rownames match. Preparing to merge...")
-  ps_logistic_adjusted_df2 = merge(estimates_df, ci_mod_df, by = "row.names", all = T)
-}
-
-
-colnames(ps_logistic_adjusted_df2)[1] <- c("var_name")
-d1 = which(ps_logistic_adjusted_df2$var_name ==  "(Intercept)")
-ps_logistic_adjusted_df2 = ps_logistic_adjusted_df2[-d1,]
-
-# adding pvalue_signif
-ps_logistic_adjusted_df2$pvalue_signif = ps_logistic_adjusted_df2$pvalue
-str(ps_logistic_adjusted_df2$pvalue_signif)
-
-ps_logistic_adjusted_df2 = dplyr::mutate(ps_logistic_adjusted_df2
-                                         , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "."
-                                                                     , pvalue_signif <=0.0001 ~ '****'
-                                                                     , pvalue_signif <=0.001 ~ '***'
-                                                                     , pvalue_signif <=0.01 ~ '**'
-                                                                     , pvalue_signif <0.05 ~ '*'
-                                                                     , TRUE ~ 'ns'))
-# rearranging columns
-colnames(ps_logistic_adjusted_df2)
-ps_logistic_adjusted_df2_o = ps_logistic_adjusted_df2[c("var_name"    
-                                                        , "number_samples"
-                                                        , "model"
-                                                        , "odds_ratio"
-                                                        , "pvalue"
-                                                        , "pvalue_signif"
-                                                        , "beta"
-                                                        , "se"
-                                                        , "zvalue"
-                                                        , "ci_lower"
-                                                        , "ci_upper"
-                                                        , "data_source")]
-
-###############################################################################
-
-###############################################################################
-#========================================
-# merged_df3: UNadjusted,loop
-#========================================
-
-my_ivs = c("or_mychisq", "or_kin", "pval_fisher", "af"
-           , "ligand_distance"
-           , "rsa"
-           , "rd_values"
-           , "kd_values"
-           , "duet_stability_change" 
-           , "duet_scaled"
-           , "duet_outcome"
-           , "ddg"
-           , "foldx_scaled"
-           , "foldx_outcome")
-
-ps_logistic_df3  = data.frame()
-
-for( i in my_ivs){
-  print(i)
-  
-  df = data.frame(var_name = NA
-                  , number_samples = NA
-                  , beta = NA
-                  , odds_ratio = NA
-                  , pvalue = NA
-                  , se = NA
-                  , zvalue = NA
-                  , ci_lower = NA
-                  , ci_upper = NA)
-  
-  model = glm(mutation_info_labels ~ eval(parse(text=i))
-              , data = merged_df3
-              , family = "binomial")
-  
-  var_name = i
-  
-  number_samples = nobs(model)
-  
-  beta_logistic = summary(model)$coefficients[2,1]; beta_logistic
-  
-  or_logistic = exp(summary(model)$coefficients[2,1])
-  
-  pval_logistic = summary(model)$coefficients[2,4]
-  
-  se_logistic = summary(model)$coefficients[2,2]
-  
-  zval_logistic = summary(model)$coefficients[2,3]
-  
-  ci_mod = exp(confint(model))[2,]
-  
-  ci_lower_logistic = ci_mod[["2.5 %"]]
-  ci_upper_logistic = ci_mod[["97.5 %"]]
-  
-  print(c(var_name, beta_logistic, or_logistic, pval_logistic, se_logistic, zval_logistic, ci_mod))
-  
-  df$var_name = var_name 
-  df$number_samples = number_samples
-  df$beta = beta_logistic
-  df$odds_ratio = or_logistic
-  df$pvalue = pval_logistic
-  df$se = se_logistic
-  df$zvalue = zval_logistic
-  df$ci_lower = ci_lower_logistic 
-  df$ci_upper = ci_upper_logistic 
-  
-  print(df)
-  ps_logistic_df3 = rbind(ps_logistic_df3, df)
-  
-}
-#--------------------
-# formatting df
-#--------------------
-ps_logistic_df3$data_source = "df3"
-ps_logistic_df3$model = "unadjusted"
-
-ps_logistic_df3$odds_ratio = round(ps_logistic_df3$odds_ratio, 2)
-ps_logistic_df3$ci_lower = round(ps_logistic_df3$ci_lower, 2)
-ps_logistic_df3$ci_upper = round(ps_logistic_df3$ci_upper, 2)
-
-# adding pvalue_signif
-ps_logistic_df3$pvalue_signif = ps_logistic_df3$pvalue
-str(ps_logistic_df3$pvalue_signif)
-
-ps_logistic_df3 = dplyr::mutate(ps_logistic_df3
-                                , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "."
-                                                            , pvalue_signif <=0.0001 ~ '****'
-                                                            , pvalue_signif <=0.001 ~ '***'
-                                                            , pvalue_signif <=0.01 ~ '**'
-                                                            , pvalue_signif <0.05 ~ '*'
-                                                            , TRUE ~ 'ns'))
-# rearranging columns
-ps_logistic_df3_o = ps_logistic_df3[c("var_name"
-                                      , "number_samples"
-                                      , "model"
-                                      , "odds_ratio"
-                                      , "pvalue"
-                                      , "pvalue_signif"
-                                      , "beta"
-                                      , "se"
-                                      , "zvalue"
-                                      , "ci_lower"
-                                      , "ci_upper"
-                                      , "data_source")]
-
-#========================================
-# merged_df3: adjusted, loop
-#========================================
-#model_adjusted_df3 = glm(mutation_info_labels ~ or_mychisq + rsa + rd_values + kd_values +
-#                       duet_stability_change + duet_scaled + duet_outcome + ddg + foldx_scaled + foldx_outcome
-#            , data = merged_df3
-#           , family = "binomial")
-
-model_adjusted_df3 = glm(mutation_info_labels ~ rd_values +
-                           ligand_distance + duet_stability_change 
-                         , data = merged_df3
-                         , family = "binomial");summary(model_adjusted_df3)
-
-var_names_df = as.data.frame(names(model_adjusted_df3$coefficients))
-names(var_names_df) = c("var_name")
-
-ci_mod = exp(confint(model_adjusted_df3))
-ci_mod_df = as.data.frame(ci_mod)
-names(ci_mod_df) = c("ci_lower", "ci_upper")
-ci_mod_df$ci_lower = round(ci_mod_df$ci_lower, 2)
-ci_mod_df$ci_upper = round(ci_mod_df$ci_upper, 2)
-
-
-estimates_df = as.data.frame(summary(model_adjusted_df3)$coefficients)
-names(estimates_df) = c("beta", "se", "zvalue", "pvalue")
-estimates_df$odds_ratio = round(exp(estimates_df$beta), 2)
-number_samples = nobs(model_adjusted_df3)
-estimates_df$number_samples = number_samples
-estimates_df$data_source = "df3"
-estimates_df$model = "adjusted"
-
-names(ps_logistic_adjusted_df3)
-
-if ( all(rownames(estimates_df) == rownames(ci_mod_df)) ){
-  cat("PASS: rownames match. Preparing to merge...")
-  ps_logistic_adjusted_df3 = merge(estimates_df, ci_mod_df, by = "row.names", all = T)
-}
-
-colnames(ps_logistic_adjusted_df3)[1] <- c("var_name")
-d2 = which(ps_logistic_adjusted_df3$var_name ==  "(Intercept)")
-ps_logistic_adjusted_df3 = ps_logistic_adjusted_df3[-d2,]
-
-# adding pvalue_signif
-ps_logistic_adjusted_df3$pvalue_signif = ps_logistic_adjusted_df3$pvalue
-str(ps_logistic_adjusted_df3$pvalue_signif)
-
-ps_logistic_adjusted_df3 = dplyr::mutate(ps_logistic_adjusted_df3
-                                         , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "."
-                                                                     , pvalue_signif <=0.0001 ~ '****'
-                                                                     , pvalue_signif <=0.001 ~ '***'
-                                                                     , pvalue_signif <=0.01 ~ '**'
-                                                                     , pvalue_signif <0.05 ~ '*'
-                                                                     , TRUE ~ 'ns'))
-# rearranging columns
-colnames(ps_logistic_adjusted_df3)
-ps_logistic_adjusted_df3_o = ps_logistic_adjusted_df3[c("var_name"    
-                                                        , "number_samples"
-                                                        , "model"
-                                                        , "odds_ratio"
-                                                        , "pvalue"
-                                                        , "pvalue_signif"
-                                                        , "beta"
-                                                        , "se"
-                                                        , "zvalue"
-                                                        , "ci_lower"
-                                                        , "ci_upper"
-                                                        ,"data_source")]
-
-#-------------
-# lm
-#-------------
-
-model_lm = lm(or_kin ~ rsa + rd_values + duet_stability_change +  ddg + mutation_info_labels
-              , data = merged_df3)
-
-summary(model_lm)
-
-
-model_lm1 = lm(or_mychisq ~ mutation_info_labels
-               , data = merged_df2)
-
-summary(model_lm1)
diff --git a/scripts/prediction_lig.R b/scripts/prediction_lig.R
deleted file mode 100644
index dd6ee19..0000000
--- a/scripts/prediction_lig.R
+++ /dev/null
@@ -1,203 +0,0 @@
-#!/usr/bin/env Rscript
-#########################################################
-# TASK: prediction lig
-
-#=======================================================================
-# working dir and loading libraries
-getwd()
-setwd("~/git/LSHTM_analysis/scripts/")
-getwd()
-
-source("plotting/combining_dfs_plotting.R")
-
-#=======
-# output
-#=======
-lig_unadjusted = paste0(outdir, "/results/", tolower(gene), "_unadjusted_logistic_LIG.csv")
-lig_adjusted = paste0(outdir, "/results/", tolower(gene), "_adjusted_logistic_LIG.csv")
-
-####################################################################
-#               end of loading libraries and functions              
-####################################################################
-
-#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# lig
-table(merged_df3_lig$mutation_info)
-merged_df3_lig$mutation_info_labels = ifelse(merged_df3_lig$mutation_info == dr_muts_col, 1, 0)
-table(merged_df3_lig$mutation_info_labels)
-
-#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-###############################################################################
-
-#========================================
-# merged_df3_lig: UNadjusted,loop
-#========================================
-my_ivs = c("or_mychisq", "or_kin", "pval_fisher", "af"
-           #, "ligand_distance"
-           , "rsa"
-           , "rd_values"
-           , "kd_values"
-           , "ligand_affinity_change" 
-           , "affinity_scaled"
-           , "ligand_outcome")
-
-lig_logistic_df3  = data.frame()
-
-for( i in my_ivs){
-  print(i)
-  
-  df = data.frame(var_name = NA
-                  , number_samples = NA
-                  , beta = NA
-                  , odds_ratio = NA
-                  , pvalue = NA
-                  , se = NA
-                  , zvalue = NA
-                  , ci_lower = NA
-                  , ci_upper = NA)
-  
-  model_lig = glm(mutation_info_labels ~ eval(parse(text=i))
-                  , data = merged_df3_lig
-                  , family = "binomial")
-  
-  var_name = i
-  
-  number_samples = nobs(model_lig)
-  
-  beta_logistic = summary(model_lig)$coefficients[2,1]; beta_logistic
-  
-  or_logistic = exp(summary(model_lig)$coefficients[2,1])
-  
-  pval_logistic = summary(model_lig)$coefficients[2,4]
-  
-  se_logistic = summary(model_lig)$coefficients[2,2]
-  
-  zval_logistic = summary(model_lig)$coefficients[2,3]
-  
-  ci_mod = exp(confint(model_lig))[2,]
-  
-  ci_lower_logistic = ci_mod[["2.5 %"]]
-  ci_upper_logistic = ci_mod[["97.5 %"]]
-  
-  print(c(var_name, beta_logistic, or_logistic, pval_logistic, se_logistic, zval_logistic, ci_mod))
-  
-  df$var_name = var_name 
-  df$number_samples = number_samples
-  df$beta = beta_logistic
-  df$odds_ratio = or_logistic
-  df$pvalue = pval_logistic
-  df$se = se_logistic
-  df$zvalue = zval_logistic
-  df$ci_lower = ci_lower_logistic 
-  df$ci_upper = ci_upper_logistic 
-  
-  print(df)
-  lig_logistic_df3 = rbind(lig_logistic_df3, df)
-  
-}
-#--------------------
-# formatting df
-#--------------------
-lig_logistic_df3$data_source = "df3_lig"
-lig_logistic_df3$model_lig = "unadjusted"
-
-lig_logistic_df3$odds_ratio = round(lig_logistic_df3$odds_ratio, 2)
-lig_logistic_df3$ci_lower = round(lig_logistic_df3$ci_lower, 2)
-lig_logistic_df3$ci_upper = round(lig_logistic_df3$ci_upper, 2)
-
-# adding pvalue_signif
-lig_logistic_df3$pvalue_signif = lig_logistic_df3$pvalue
-str(lig_logistic_df3$pvalue_signif)
-
-lig_logistic_df3 = dplyr::mutate(lig_logistic_df3
-                                 , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "."
-                                                             , pvalue_signif <=0.0001 ~ '****'
-                                                             , pvalue_signif <=0.001 ~ '***'
-                                                             , pvalue_signif <=0.01 ~ '**'
-                                                             , pvalue_signif <0.05 ~ '*'
-                                                             , TRUE ~ 'ns'))
-# rearranging columns
-lig_logistic_df3_o = lig_logistic_df3[c("var_name"
-                                        , "number_samples"
-                                        , "model_lig"
-                                        , "odds_ratio"
-                                        , "pvalue"
-                                        , "pvalue_signif"
-                                        , "beta"
-                                        , "se"
-                                        , "zvalue"
-                                        , "ci_lower"
-                                        , "ci_upper"
-                                        , "data_source")]
-# writing file
-write.csv(lig_logistic_df3_o, lig_unadjusted, row.names = F)
-
-#========================================
-# merged_df3_lig: adjusted, loop
-#========================================
-#model_lig_adjusted_df3 = glm(mutation_info_labels ~ or_mychisq + rsa + rd_values + kd_values +
-#                       ligand_affinity_change + affinity_scaled + ligand_outcome 
-#            , data = merged_df3_lig
-#           , family = "binomial")
-
-model_lig_adjusted_df3 = glm(mutation_info_labels ~ rd_values + ligand_affinity_change 
-                             , data = merged_df3_lig
-                             , family = "binomial");summary(model_lig_adjusted_df3)
-
-var_names_df = as.data.frame(names(model_lig_adjusted_df3$coefficients))
-names(var_names_df) = c("var_name")
-
-ci_mod = exp(confint(model_lig_adjusted_df3))
-ci_mod_df = as.data.frame(ci_mod)
-names(ci_mod_df) = c("ci_lower", "ci_upper")
-ci_mod_df$ci_lower = round(ci_mod_df$ci_lower, 2)
-ci_mod_df$ci_upper = round(ci_mod_df$ci_upper, 2)
-
-
-estimates_df = as.data.frame(summary(model_lig_adjusted_df3)$coefficients)
-names(estimates_df) = c("beta", "se", "zvalue", "pvalue")
-estimates_df$odds_ratio = round(exp(estimates_df$beta), 2)
-number_samples = nobs(model_lig_adjusted_df3)
-estimates_df$number_samples = number_samples
-estimates_df$data_source = "df3_lig"
-estimates_df$model_lig = "adjusted"
-
-names(lig_logistic_adjusted_df3)
-
-if ( all(rownames(estimates_df) == rownames(ci_mod_df)) ){
-  cat("PASS: rownames match. Preparing to merge...")
-  lig_logistic_adjusted_df3 = merge(estimates_df, ci_mod_df, by = "row.names", all = T)
-}
-
-colnames(lig_logistic_adjusted_df3)[1] <- c("var_name")
-d2 = which(lig_logistic_adjusted_df3$var_name ==  "(Intercept)")
-lig_logistic_adjusted_df3 = lig_logistic_adjusted_df3[-d2,]
-
-# adding pvalue_signif
-lig_logistic_adjusted_df3$pvalue_signif = lig_logistic_adjusted_df3$pvalue
-str(lig_logistic_adjusted_df3$pvalue_signif)
-
-lig_logistic_adjusted_df3 = dplyr::mutate(lig_logistic_adjusted_df3
-                                          , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "."
-                                                                      , pvalue_signif <=0.0001 ~ '****'
-                                                                      , pvalue_signif <=0.001 ~ '***'
-                                                                      , pvalue_signif <=0.01 ~ '**'
-                                                                      , pvalue_signif <0.05 ~ '*'
-                                                                      , TRUE ~ 'ns'))
-# rearranging columns
-colnames(lig_logistic_adjusted_df3)
-lig_logistic_adjusted_df3_o = lig_logistic_adjusted_df3[c("var_name"    
-                                                          , "number_samples"
-                                                          , "model_lig"
-                                                          , "odds_ratio"
-                                                          , "pvalue"
-                                                          , "pvalue_signif"
-                                                          , "beta"
-                                                          , "se"
-                                                          , "zvalue"
-                                                          , "ci_lower"
-                                                          , "ci_upper"
-                                                          ,"data_source")]
-# writing file
-write.csv(lig_logistic_adjusted_df3_o, lig_adjusted, row.names = F)
\ No newline at end of file
diff --git a/scripts/prediction_ps.R b/scripts/prediction_ps.R
deleted file mode 100644
index 605f058..0000000
--- a/scripts/prediction_ps.R
+++ /dev/null
@@ -1,207 +0,0 @@
-#!/usr/bin/env Rscript
-#########################################################
-# TASK: prediction_ps
-
-#=======================================================================
-# working dir and loading libraries
-getwd()
-setwd("~/git/LSHTM_analysis/scripts/")
-getwd()
-
-source("plotting/combining_dfs_plotting.R")
-
-#=======
-# output
-#=======
-ps_unadjusted = paste0(outdir, "/results/", tolower(gene), "_unadjusted_logistic_PS.csv")
-ps_adjusted = paste0(outdir, "/results/", tolower(gene), "_adjusted_logistic_PS.csv")
-
-####################################################################
-#               end of loading libraries and functions              
-####################################################################
-
-#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# ps
-table(merged_df3$mutation_info)
-merged_df3$mutation_info_labels = ifelse(merged_df3$mutation_info == dr_muts_col, 1, 0)
-table(merged_df3$mutation_info_labels)
-
-#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-###############################################################################
-#========================================
-# merged_df3: UNadjusted,loop
-#========================================
-my_ivs = c("or_mychisq", "or_kin", "pval_fisher", "af"
-           , "ligand_distance"
-           , "rsa"
-           , "rd_values"
-           , "kd_values"
-           , "duet_stability_change" 
-           , "duet_scaled"
-           , "duet_outcome"
-           , "ddg"
-           , "foldx_scaled"
-           , "foldx_outcome")
-
-ps_logistic_df3  = data.frame()
-
-for( i in my_ivs){
-  print(i)
-  
-  df = data.frame(var_name = NA
-                  , number_samples = NA
-                  , beta = NA
-                  , odds_ratio = NA
-                  , pvalue = NA
-                  , se = NA
-                  , zvalue = NA
-                  , ci_lower = NA
-                  , ci_upper = NA)
-  
-  model = glm(mutation_info_labels ~ eval(parse(text=i))
-              , data = merged_df3
-              , family = "binomial")
-  
-  var_name = i
-  
-  number_samples = nobs(model)
-  
-  beta_logistic = summary(model)$coefficients[2,1]; beta_logistic
-  
-  or_logistic = exp(summary(model)$coefficients[2,1])
-  
-  pval_logistic = summary(model)$coefficients[2,4]
-  
-  se_logistic = summary(model)$coefficients[2,2]
-  
-  zval_logistic = summary(model)$coefficients[2,3]
-  
-  ci_mod = exp(confint(model))[2,]
-  
-  ci_lower_logistic = ci_mod[["2.5 %"]]
-  ci_upper_logistic = ci_mod[["97.5 %"]]
-  
-  print(c(var_name, beta_logistic, or_logistic, pval_logistic, se_logistic, zval_logistic, ci_mod))
-  
-  df$var_name = var_name 
-  df$number_samples = number_samples
-  df$beta = beta_logistic
-  df$odds_ratio = or_logistic
-  df$pvalue = pval_logistic
-  df$se = se_logistic
-  df$zvalue = zval_logistic
-  df$ci_lower = ci_lower_logistic 
-  df$ci_upper = ci_upper_logistic 
-  
-  print(df)
-  ps_logistic_df3 = rbind(ps_logistic_df3, df)
-  
-}
-#--------------------
-# formatting df
-#--------------------
-ps_logistic_df3$data_source = "df3"
-ps_logistic_df3$model = "unadjusted"
-
-ps_logistic_df3$odds_ratio = round(ps_logistic_df3$odds_ratio, 2)
-ps_logistic_df3$ci_lower = round(ps_logistic_df3$ci_lower, 2)
-ps_logistic_df3$ci_upper = round(ps_logistic_df3$ci_upper, 2)
-
-# adding pvalue_signif
-ps_logistic_df3$pvalue_signif = ps_logistic_df3$pvalue
-str(ps_logistic_df3$pvalue_signif)
-
-ps_logistic_df3 = dplyr::mutate(ps_logistic_df3
-                                , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "."
-                                                            , pvalue_signif <=0.0001 ~ '****'
-                                                            , pvalue_signif <=0.001 ~ '***'
-                                                            , pvalue_signif <=0.01 ~ '**'
-                                                            , pvalue_signif <0.05 ~ '*'
-                                                            , TRUE ~ 'ns'))
-# rearranging columns
-ps_logistic_df3_o = ps_logistic_df3[c("var_name"
-                                      , "number_samples"
-                                      , "model"
-                                      , "odds_ratio"
-                                      , "pvalue"
-                                      , "pvalue_signif"
-                                      , "beta"
-                                      , "se"
-                                      , "zvalue"
-                                      , "ci_lower"
-                                      , "ci_upper"
-                                      , "data_source")]
-# writing file
-write.csv(ps_logistic_df3_o, ps_unadjusted, row.names = F)
-
-#========================================
-# merged_df3: adjusted, loop
-#========================================
-#model_adjusted_df3 = glm(mutation_info_labels ~ or_mychisq + rsa + rd_values + kd_values +
-#                       duet_stability_change + duet_scaled + duet_outcome + ddg + foldx_scaled + foldx_outcome
-#            , data = merged_df3
-#           , family = "binomial")
-
-model_adjusted_df3 = glm(mutation_info_labels ~ rd_values +
-                           ligand_distance + duet_stability_change 
-                         , data = merged_df3
-                         , family = "binomial");summary(model_adjusted_df3)
-
-var_names_df = as.data.frame(names(model_adjusted_df3$coefficients))
-names(var_names_df) = c("var_name")
-
-ci_mod = exp(confint(model_adjusted_df3))
-ci_mod_df = as.data.frame(ci_mod)
-names(ci_mod_df) = c("ci_lower", "ci_upper")
-ci_mod_df$ci_lower = round(ci_mod_df$ci_lower, 2)
-ci_mod_df$ci_upper = round(ci_mod_df$ci_upper, 2)
-
-
-estimates_df = as.data.frame(summary(model_adjusted_df3)$coefficients)
-names(estimates_df) = c("beta", "se", "zvalue", "pvalue")
-estimates_df$odds_ratio = round(exp(estimates_df$beta), 2)
-number_samples = nobs(model_adjusted_df3)
-estimates_df$number_samples = number_samples
-estimates_df$data_source = "df3"
-estimates_df$model = "adjusted"
-
-names(ps_logistic_adjusted_df3)
-
-if ( all(rownames(estimates_df) == rownames(ci_mod_df)) ){
-  cat("PASS: rownames match. Preparing to merge...")
-  ps_logistic_adjusted_df3 = merge(estimates_df, ci_mod_df, by = "row.names", all = T)
-}
-
-colnames(ps_logistic_adjusted_df3)[1] <- c("var_name")
-d2 = which(ps_logistic_adjusted_df3$var_name ==  "(Intercept)")
-ps_logistic_adjusted_df3 = ps_logistic_adjusted_df3[-d2,]
-
-# adding pvalue_signif
-ps_logistic_adjusted_df3$pvalue_signif = ps_logistic_adjusted_df3$pvalue
-str(ps_logistic_adjusted_df3$pvalue_signif)
-
-ps_logistic_adjusted_df3 = dplyr::mutate(ps_logistic_adjusted_df3
-                                         , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "."
-                                                                     , pvalue_signif <=0.0001 ~ '****'
-                                                                     , pvalue_signif <=0.001 ~ '***'
-                                                                     , pvalue_signif <=0.01 ~ '**'
-                                                                     , pvalue_signif <0.05 ~ '*'
-                                                                     , TRUE ~ 'ns'))
-# rearranging columns
-colnames(ps_logistic_adjusted_df3)
-ps_logistic_adjusted_df3_o = ps_logistic_adjusted_df3[c("var_name"    
-                                                        , "number_samples"
-                                                        , "model"
-                                                        , "odds_ratio"
-                                                        , "pvalue"
-                                                        , "pvalue_signif"
-                                                        , "beta"
-                                                        , "se"
-                                                        , "zvalue"
-                                                        , "ci_lower"
-                                                        , "ci_upper"
-                                                        ,"data_source")]
-# writing file
-write.csv(ps_logistic_adjusted_df3_o, ps_adjusted, row.names = F)
-###############################################################################
\ No newline at end of file