moved not required plots to scratch

2020-10-06 09:52:54 +01:00 · 2020-10-06 09:52:54 +01:00 · 4c345ea9f4
commit 4c345ea9f4
parent 9597997741
9 changed files with 2 additions and 1660 deletions
--- a/scripts/plotting/basic_barplots_LIG.R
+++ b/scripts/plotting/basic_barplots_LIG.R
@ -188,6 +188,6 @@ OutPlot_lig_pos_count = g + geom_bar(aes (alpha = 0.5)
 print(OutPlot_lig_pos_count)
 dev.off()
 ########################################################################
-#               			end of lig barplots         			   
+#               			end of LIG barplots         			   
 ########################################################################
--- a/scripts/plotting/basic_barplots_PS.R
+++ b/scripts/plotting/basic_barplots_PS.R
@ -186,5 +186,5 @@ OutPlot_pos_count = g + geom_bar(aes (alpha = 0.5)
 print(OutPlot_pos_count)
 dev.off()
 ########################################################################
-#               			end of Ligand barplots         			   
+#               			end of PS barplots         			   
 ########################################################################
--- a/scripts/plotting/corr_PS_LIG_all.R
+++ b/scripts/plotting/corr_PS_LIG_all.R
@ -1,166 +0,0 @@
 #!/usr/bin/env Rscript
 #########################################################
 # TASK: Corr plots for PS and Lig 
 # Output: 4 svgs (PS and LIG, each for df2 and df3)
 #=======================================================================
 # working dir and loading libraries
 # Switch into the plotting directory (the working dir is echoed before and
 # after) so that the relative source() calls below resolve.
 getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting/")
 getwd()
 #source("combining_dfs_plotting.R")
 source("my_pairs_panel.R") # with lower panel turned off
 source("corr_data.R")
 #=======
 # output
 #=======
 # NOTE(review): plotdir and the corr_* data frames are not defined in this
 # script; presumably they are created by corr_data.R -- confirm.
 # PS
 corr_ps_all_df2 <- "corr_PS_ALL_df2.svg"
 corr_ps_all_df3 <- "corr_PS_ALL_df3.svg"
 plot_corr_ps_all_df2 <- paste0(plotdir, "/", corr_ps_all_df2)
 plot_corr_ps_all_df3 <- paste0(plotdir, "/", corr_ps_all_df3)
 # LIG
 corr_lig_all_df2 <- "corr_LIG_ALL_df2.svg"
 corr_lig_all_df3 <- "corr_LIG_ALL_df3.svg"
 plot_corr_lig_all_df2 <- paste0(plotdir, "/", corr_lig_all_df2)
 plot_corr_lig_all_df3 <- paste0(plotdir, "/", corr_lig_all_df3)
 ####################################################################
 #               end of loading libraries and functions
 ####################################################################
 #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Data for plots: the first column of every data frame is dropped
 #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # PS
 corr_ps_df2 <- corr_ps_df2[-1]
 corr_ps_df3 <- corr_ps_df3[-1]
 # Lig
 corr_lig_df2 <- corr_lig_df2[-1]
 corr_lig_df3 <- corr_lig_df3[-1]
 #---------------------------------------
 # generate corr PS plot 1: merged_df2
 #---------------------------------------
 # Spearman pairs plot for PS (df2 data), written to plot_corr_ps_all_df2.
 # The last two columns are left out of the panels; points are filled
 # according to duet_outcome.
 cat("Corr plot PS DUET with coloured dots:", plot_corr_ps_all_df2)
 svg(plot_corr_ps_all_df2, width = 30, height = 30)
 OutPlot_ps_df2 <- pairs.panels(
   # head(..., -2) drops the last two columns and stays well-defined even
   # when fewer than three columns are present (unlike 1:(n - 2))
   corr_ps_df2[head(seq_along(corr_ps_df2), -2)],
   method = "spearman", # correlation method
   hist.col = "grey", ##00AFBB
   density = TRUE, # show density plots
   ellipses = FALSE, # show correlation ellipses
   stars = TRUE,
   rug = FALSE,
   breaks = "Sturges",
   show.points = TRUE,
   # can't use colour as duet and foldx are opposite
   bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_ps_df2$duet_outcome))],
   pch = 21, # for bg
   #, pch = 19
   jitter = TRUE,
   alpha = 1,
   #, points(pch = 19, col = c("#f8766d", "#00bfc4"))
   cex = 1.8,
   cex.axis = 2,
   cex.labels = 2,
   cex.cor = 1,
   smooth = FALSE
 )
 print(OutPlot_ps_df2)
 dev.off()
 #----------------------------------------------
 # generate corr PS plot 2: merged_df3
 #----------------------------------------------
 # Spearman pairs plot for PS (df3 data), written to plot_corr_ps_all_df3.
 # The last two columns are left out of the panels; points are filled
 # according to duet_outcome.
 cat("Corr plot PS DUET with coloured dots:", plot_corr_ps_all_df3)
 svg(plot_corr_ps_all_df3, width = 30, height = 30)
 OutPlot_ps_df3 <- pairs.panels(
   # head(..., -2) drops the last two columns and stays well-defined even
   # when fewer than three columns are present (unlike 1:(n - 2))
   corr_ps_df3[head(seq_along(corr_ps_df3), -2)],
   method = "spearman", # correlation method
   hist.col = "grey", ##00AFBB
   density = TRUE, # show density plots
   ellipses = FALSE, # show correlation ellipses
   stars = TRUE,
   rug = FALSE,
   breaks = "Sturges",
   show.points = TRUE,
   # can't use colour as duet and foldx are opposite
   bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_ps_df3$duet_outcome))],
   pch = 21, # for bg
   cex = 2,
   cex.axis = 1.6,
   cex.labels = 2,
   cex.cor = 1,
   smooth = FALSE
 )
 print(OutPlot_ps_df3)
 dev.off()
 ################################################################################################
 #---------------------------------------
 # generate corr lig plot 1: merged_df2
 #---------------------------------------
 # Spearman pairs plot for ligand data (df2), written to
 # plot_corr_lig_all_df2. The last two columns are left out of the panels;
 # points are filled according to ligand_outcome.
 cat("Corr plot lig DUET with coloured dots:", plot_corr_lig_all_df2)
 svg(plot_corr_lig_all_df2, width = 30, height = 30)
 OutPlot_lig_df2 <- pairs.panels(
   # head(..., -2) drops the last two columns and stays well-defined even
   # when fewer than three columns are present (unlike 1:(n - 2))
   corr_lig_df2[head(seq_along(corr_lig_df2), -2)],
   method = "spearman", # correlation method
   hist.col = "grey", ##00AFBB
   density = TRUE, # show density plots
   ellipses = FALSE, # show correlation ellipses
   stars = TRUE,
   rug = FALSE,
   breaks = "Sturges",
   show.points = TRUE,
   # can't use colour as duet and foldx are opposite
   bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_lig_df2$ligand_outcome))],
   pch = 21, # for bg
   #, pch = 19
   jitter = TRUE,
   alpha = 1,
   #, points(pch = 19, col = c("#f8766d", "#00bfc4"))
   cex = 1.8,
   cex.axis = 2,
   cex.labels = 2,
   cex.cor = 1,
   smooth = FALSE
 )
 print(OutPlot_lig_df2)
 dev.off()
 #----------------------------------------------
 # generate corr lig plot 2: merged_df3
 #----------------------------------------------
 # Spearman pairs plot for ligand data (df3), written to
 # plot_corr_lig_all_df3. The last two columns are left out of the panels;
 # points are filled according to ligand_outcome.
 cat("Corr plot lig DUET with coloured dots:", plot_corr_lig_all_df3)
 svg(plot_corr_lig_all_df3, width = 30, height = 30)
 OutPlot_lig_df3 <- pairs.panels(
   # head(..., -2) drops the last two columns and stays well-defined even
   # when fewer than three columns are present (unlike 1:(n - 2))
   corr_lig_df3[head(seq_along(corr_lig_df3), -2)],
   method = "spearman", # correlation method
   hist.col = "grey", ##00AFBB
   density = TRUE, # show density plots
   ellipses = FALSE, # show correlation ellipses
   stars = TRUE,
   rug = FALSE,
   breaks = "Sturges",
   show.points = TRUE,
   # can't use colour as duet and foldx are opposite
   bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_lig_df3$ligand_outcome))],
   pch = 21, # for bg
   cex = 2,
   cex.axis = 1.6,
   cex.labels = 2,
   cex.cor = 1,
   smooth = FALSE
 )
 print(OutPlot_lig_df3)
 dev.off()
--- a/scripts/plotting/corr_PS_LIG_v2.R
+++ b/scripts/plotting/corr_PS_LIG_v2.R
@ -1,176 +0,0 @@
 #!/usr/bin/env Rscript
 #########################################################
 # TASK: Corr plots for PS and Lig 
 # Output: 4 svgs (PS and LIG, each for df2 and df3)
 #=======================================================================
 # working dir and loading libraries
 # Switch into the plotting directory (the working dir is echoed before and
 # after) so that the relative source() calls below resolve.
 getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting/")
 getwd()
 #source("combining_dfs_plotting.R")
 source("my_pairs_panel.R") # with lower panel turned off
 source("corr_data.R")
 #=======
 # output
 #=======
 # NOTE(review): plotdir and the corr_* data frames are not defined in this
 # script; presumably they are created by corr_data.R -- confirm.
 # PS
 corrplot_ps_df2 <- "corr_PS_df2.svg"
 corrplot_ps_df3 <- "corr_PS_df3.svg"
 plot_corr_ps_df2 <- paste0(plotdir, "/", corrplot_ps_df2)
 plot_corr_ps_df3 <- paste0(plotdir, "/", corrplot_ps_df3)
 # LIG
 corrplot_lig_df2 <- "corr_LIG_df2.svg"
 corrplot_lig_df3 <- "corr_LIG_df3.svg"
 plot_corr_lig_df2 <- paste0(plotdir, "/", corrplot_lig_df2)
 plot_corr_lig_df3 <- paste0(plotdir, "/", corrplot_lig_df3)
 ####################################################################
 #               end of loading libraries and functions
 ####################################################################
 #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Data for plots: remove the unwanted columns by name, then drop the
 # first of the remaining columns
 #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 cols_to_drop <- c("ASA", "AF_kin")
 # PS
 corr_ps_df2 <- corr_ps_df2[!(colnames(corr_ps_df2) %in% cols_to_drop)][-1]
 corr_ps_df3 <- corr_ps_df3[!(colnames(corr_ps_df3) %in% cols_to_drop)][-1]
 # Lig
 corr_lig_df2 <- corr_lig_df2[!(colnames(corr_lig_df2) %in% cols_to_drop)][-1]
 corr_lig_df3 <- corr_lig_df3[!(colnames(corr_lig_df3) %in% cols_to_drop)][-1]
 #---------------------------------------
 # generate corr PS plot 1: merged_df2
 #---------------------------------------
 # Spearman pairs plot for PS (df2 data), written to plot_corr_ps_df2.
 # The last two columns are left out of the panels; points are filled
 # according to duet_outcome.
 cat("Corr plot PS DUET with coloured dots:", plot_corr_ps_df2)
 svg(plot_corr_ps_df2, width = 30, height = 25)
 OutPlot_ps_df2 <- pairs.panels(
   # head(..., -2) drops the last two columns and stays well-defined even
   # when fewer than three columns are present (unlike 1:(n - 2))
   corr_ps_df2[head(seq_along(corr_ps_df2), -2)],
   method = "spearman", # correlation method
   hist.col = "grey", ##00AFBB
   density = TRUE, # show density plots
   ellipses = FALSE, # show correlation ellipses
   stars = TRUE,
   rug = FALSE,
   breaks = "Sturges",
   show.points = TRUE,
   # can't use colour as duet and foldx are opposite
   bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_ps_df2$duet_outcome))],
   pch = 21, # for bg
   #, pch = 19
   jitter = TRUE,
   alpha = 1,
   #, points(pch = 19, col = c("#f8766d", "#00bfc4"))
   cex = 1.8,
   cex.axis = 2,
   cex.labels = 3.8,
   cex.cor = 1,
   smooth = FALSE
 )
 print(OutPlot_ps_df2)
 dev.off()
 #----------------------------------------------
 # generate corr PS plot 2: merged_df3
 #----------------------------------------------
 # Spearman pairs plot for PS (df3 data), written to plot_corr_ps_df3.
 # The last two columns are left out of the panels; points are filled
 # according to duet_outcome.
 cat("Corr plot PS DUET with coloured dots:", plot_corr_ps_df3)
 svg(plot_corr_ps_df3, width = 30, height = 25)
 OutPlot_ps_df3 <- pairs.panels(
   # head(..., -2) drops the last two columns and stays well-defined even
   # when fewer than three columns are present (unlike 1:(n - 2))
   corr_ps_df3[head(seq_along(corr_ps_df3), -2)],
   method = "spearman", # correlation method
   hist.col = "grey", ##00AFBB
   density = TRUE, # show density plots
   ellipses = FALSE, # show correlation ellipses
   stars = TRUE,
   rug = FALSE,
   breaks = "Sturges",
   show.points = TRUE,
   # can't use colour as duet and foldx are opposite
   bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_ps_df3$duet_outcome))],
   pch = 21, # for bg
   cex = 3,
   cex.axis = 1.6,
   cex.labels = 3.8,
   cex.cor = 1,
   smooth = FALSE
 )
 print(OutPlot_ps_df3)
 dev.off()
 ################################################################################################
 #---------------------------------------
 # generate corr lig plot 1: merged_df2
 #---------------------------------------
 # Spearman pairs plot for ligand data (df2), written to plot_corr_lig_df2.
 # The last two columns are left out of the panels; points are filled
 # according to ligand_outcome.
 cat("Corr plot lig DUET with coloured dots:", plot_corr_lig_df2)
 svg(plot_corr_lig_df2, width = 30, height = 25)
 OutPlot_lig_df2 <- pairs.panels(
   # head(..., -2) drops the last two columns and stays well-defined even
   # when fewer than three columns are present (unlike 1:(n - 2))
   corr_lig_df2[head(seq_along(corr_lig_df2), -2)],
   method = "spearman", # correlation method
   hist.col = "grey", ##00AFBB
   density = TRUE, # show density plots
   ellipses = FALSE, # show correlation ellipses
   stars = TRUE,
   rug = FALSE,
   breaks = "Sturges",
   show.points = TRUE,
   # can't use colour as duet and foldx are opposite
   bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_lig_df2$ligand_outcome))],
   pch = 21, # for bg
   #, pch = 19
   jitter = TRUE,
   alpha = 1,
   #, points(pch = 19, col = c("#f8766d", "#00bfc4"))
   cex = 1.8,
   cex.axis = 2,
   cex.labels = 3.8,
   cex.cor = 1,
   smooth = FALSE
 )
 print(OutPlot_lig_df2)
 dev.off()
 #----------------------------------------------
 # generate corr lig plot 2: merged_df3
 #----------------------------------------------
 # Spearman pairs plot for ligand data (df3), written to plot_corr_lig_df3.
 # The last two columns are left out of the panels; points are filled
 # according to ligand_outcome.
 cat("Corr plot lig DUET with coloured dots:", plot_corr_lig_df3)
 svg(plot_corr_lig_df3, width = 30, height = 25)
 OutPlot_lig_df3 <- pairs.panels(
   # head(..., -2) drops the last two columns and stays well-defined even
   # when fewer than three columns are present (unlike 1:(n - 2))
   corr_lig_df3[head(seq_along(corr_lig_df3), -2)],
   method = "spearman", # correlation method
   hist.col = "grey", ##00AFBB
   density = TRUE, # show density plots
   ellipses = FALSE, # show correlation ellipses
   stars = TRUE,
   rug = FALSE,
   breaks = "Sturges",
   show.points = TRUE,
   # can't use colour as duet and foldx are opposite
   bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_lig_df3$ligand_outcome))],
   pch = 21, # for bg
   cex = 3,
   cex.axis = 1.6,
   cex.labels = 3.8,
   cex.cor = 1,
   smooth = FALSE
 )
 print(OutPlot_lig_df3)
 dev.off()
--- a/scripts/plotting/corr_foldx.R
+++ b/scripts/plotting/corr_foldx.R
@ -1,191 +0,0 @@
 #!/usr/bin/env Rscript
 #########################################################
 # TASK: Corr plot for foldx (scaled ddg) against DUET, OR, P and AF values
 # Output: 1 svg
 #=======================================================================
 # working dir and loading libraries
 # Switch into the plotting directory (the working dir is echoed before and
 # after) so that the relative source() calls below resolve.
 getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting/")
 getwd()
 source("Header_TT.R")
 require(cowplot)
 source("combining_dfs_plotting.R") # FIXME: add extra from other plots here
 # should return the following dfs, directories and variables
 #=======
 # output
 #=======
 # can't combine by cowplot because not ggplots
 #corr_plot_combined = "corr_combined.svg"
 #plot_corr_plot_combined  =  paste0(plotdir,"/", corr_plot_combined)
 # PS foldx
 corr_foldx <- "corr_adjusted_foldx.svg"
 plot_corr_foldx <- paste0(plotdir, "/", corr_foldx)
 ########################################################################
 #               end of loading libraries and functions                 #
 ########################################################################
 #%%%%%%%%%%%%%%%%%%%%%%%%%
 # merged_df3 is the only data frame used from here on
 df_ps <- merged_df3
 #%%%%%%%%%%%%%%%%%%%%%%%%%
 # remove the other data frames, which this script does not use
 rm(list = c("merged_df2", "merged_df2_comp", "merged_df2_lig",
             "merged_df2_comp_lig",
             "merged_df3_comp", "merged_df3_comp_lig",
             "my_df_u", "my_df_u_lig"))
 ########################################################################
 #               end of data extraction and cleaning for plots          #
 ########################################################################
 #===========================
 # Data for Correlation plots:foldx
 #===========================
 #============================
 # adding foldx scaled values
 # scale data b/w -1 and 1
 #============================
 # Scale foldx ddg to [-1, 1]: negative values are divided by |min| and the
 # remaining values by max, so the sign of ddg is preserved.
 n <- which(colnames(df_ps) == "ddg"); n
 my_min <- min(df_ps[,n]); my_min
 my_max <- max(df_ps[,n]); my_max
 df_ps$foldx_scaled <- ifelse(df_ps[,n] < 0
                            , df_ps[,n]/abs(my_min)
                            , df_ps[,n]/my_max)
 # sanity check: after scaling the extremes must be exactly -1 and 1
 my_min <- min(df_ps$foldx_scaled); my_min
 my_max <- max(df_ps$foldx_scaled); my_max
 # isTRUE() makes an NA result (e.g. missing ddg values) fail the check
 # instead of crashing the if() condition
 if (isTRUE(my_min == -1 && my_max == 1)) {
   cat("PASS: foldx ddg successfully scaled b/w -1 and 1"
       , "\nProceeding with assigning foldx outcome category")
 } else {
   # actually abort: the original only printed the message and carried on
   stop("FAIL: could not scale foldx ddg values. Aborting!", call. = FALSE)
 }
 #================================
 # adding foldx outcome category
 # ddg<0 = "Stabilising" (-ve)
 #=================================
 # Label each mutation by the sign of ddg and verify that the labels agree
 # with the sign counts.
 c1 <- table(df_ps$ddg < 0)
 df_ps$foldx_outcome <- ifelse(df_ps$ddg < 0, "Stabilising", "Destabilising")
 # count from the NEW column; re-tabulating ddg < 0 would always match c1
 c2 <- table(df_ps$foldx_outcome == "Stabilising")
 if (identical(names(c1), names(c2)) &&
     identical(as.vector(c1), as.vector(c2))) {
   cat("PASS: foldx outcome successfully created")
 } else {
   # stop() replaces the original exit(), which is not an R function
   stop("FAIL: foldx outcome could not be created. Aborting!", call. = FALSE)
 }
 table(df_ps$foldx_outcome)
 #======================
 # adding log cols 
 #======================
 # log-transformed odds ratios and negative log10 p-values
 df_ps$log10_or_mychisq <- log10(df_ps$or_mychisq)
 df_ps$neglog_pval_fisher <- -log10(df_ps$pval_fisher)
 df_ps$log10_or_kin <- log10(df_ps$or_kin)
 df_ps$neglog_pwald_kin <- -log10(df_ps$pwald_kin)
 # subset data to generate pairwise correlations
 # NOTE(review): drug is not defined in this script; presumably it holds a
 # column name set by one of the sourced files -- confirm.
 cols_to_select_foldx <- c(
   "foldx_scaled",
   "duet_scaled",
   "log10_or_mychisq",
   "neglog_pval_fisher",
   "log10_or_kin",
   "neglog_pwald_kin",
   "af",
   "foldx_outcome",
   drug
 )
 corr_data_foldx <- df_ps[, cols_to_select_foldx]
 dim(corr_data_foldx)
 #p_italic = substitute(paste("-Log(", italic('P'), ")"));p_italic 
 #p_adjusted_italic = substitute(paste("-Log(", italic('P adjusted'), ")"));p_adjusted_italic
 # assign nice colnames (for display); same order as cols_to_select_foldx
 my_corr_colnames_foldx <- c(
   "Foldx",
   "DUET",
   "Log(OR)",
   "-Log(P)",
   "Log(OR adjusted)",
   "-Log(P wald)",
   "AF",
   "foldx_outcome",
   drug
 )
 length(my_corr_colnames_foldx)
 # column names are echoed before and after renaming
 colnames(corr_data_foldx)
 colnames(corr_data_foldx) <- my_corr_colnames_foldx
 colnames(corr_data_foldx)
 #-----------------
 # generate corr foldx plot
 #-----------------
 # Keep every column up to (but excluding) the drug column, then plot all of
 # those except the last one (foldx_outcome), which is only used to colour
 # the points.
 start <- 1
 end <- which(colnames(corr_data_foldx) == drug); end # should be the last column
 offset <- 1
 my_corr_foldx <- corr_data_foldx[start:(end-offset)]
 head(my_corr_foldx)
 #my_cols = c("#f8766d", "#00bfc4")
 # deep blue :#007d85
 # deep red: #ae301e
 cat("Corr plot foldx:", plot_corr_foldx)
 svg(plot_corr_foldx, width = 15, height = 15)
 OutPlot_foldx <- pairs.panels(
   # head(..., -1) drops the last column and stays well-defined for
   # short input (unlike 1:(n - 1))
   my_corr_foldx[head(seq_along(my_corr_foldx), -1)],
   method = "spearman", # correlation method
   hist.col = "grey", ##00AFBB
   density = TRUE, # show density plots
   ellipses = FALSE, # show correlation ellipses
   stars = TRUE,
   rug = FALSE,
   breaks = "Sturges",
   show.points = TRUE,
   bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr_foldx$foldx_outcome))],
   pch = 21,
   jitter = TRUE,
   #, alpha = .05
   #, points(pch = 19, col = c("#f8766d", "#00bfc4"))
   cex = 3,
   cex.axis = 2.5,
   cex.labels = 2.1,
   cex.cor = 1,
   smooth = FALSE
 )
 print(OutPlot_foldx)
 dev.off()
--- a/scripts/plotting/ggridges_lineage_country.R
+++ b/scripts/plotting/ggridges_lineage_country.R
@ -1,289 +0,0 @@
 #########################################################
 # 1: Installing and loading required packages
 #########################################################
 #source("../Header_TT.R")
 # Install qqman only when it is missing; an unconditional install.packages()
 # re-downloads the package (and needs network access) on every run.
 if (!requireNamespace("qqman", quietly = TRUE)) {
   install.packages("qqman")
 }
 library(qqman)
 source("combining_dfs_plotting.R")
 #mcsm_data: raw file, 225, 15
 #merged_df2 = 2201, 35
 #merged_df3 =  205, 35 ("Can't trust non-numerical params')
 #===============================================
 # PLOTS: DUET vs GWAS: non-numerical
 # lineage, country_code, etc
 # merged_df2: 1592, 35
 #===============================================
 #########################
 # Data for plot
 #########################
 df <- merged_df2
 #df = merged_df2_comp 
 #========================
 # Plot 1a: Lineage barplot
 # x = lineage y = No of samples
 # col = Lineage
 # fill = lineage
 #========================
 table(df$lineage)
 # subset only lineages1-4 (uncomment the others as necessary)
 sel_lineages <- c(
   "lineage1",
   "lineage2",
   "lineage3",
   "lineage4"
   #, "lineage5"
   #, "lineage6"
   #, "lineage7"
 )
 df_lin <- subset(df, subset = lineage %in% sel_lineages)
 table(df_lin$lineage)
 #%%%%%%%%%%%%%%%%%%%%%%%%%
 # REASSIGNMENT: df now holds only the selected lineages
 df <- df_lin
 #%%%%%%%%%%%%%%%%%%%%%%%%%
 #%%%%%%%%%%%%%%%%%%%%%%%%
 # REASSIGNMENT
 df2 <- df
 #%%%%%%%%%%%%%%%%%%%%%%%%
 # add the number of rows per country_code (add_count() names the column
 # "n"), then rename that column to count_country
 df2 <- df2 %>%
   add_count(country_code)
 str(df2$country_code); str(df2$n)
 n <- which(colnames(df2) == "n")
 colnames(df2)[n] <- "count_country"
 # keep countries with more than 100 rows and a non-empty code; both
 # conditions are evaluated on df2 (the original mixed df and df2 here)
 table(df2$count_country > 100 & df2$country_code != "")
 df3 <- subset(df2, df2$count_country > 100 & df2$country_code != "")
 #%%%%%%%%%%%%%%%%%%%%%%%%
 # REASSIGNMENT
 df <- df3
 #%%%%%%%%%%%%%%%%%%%%%%%%
 sample <- sum(table(unique(df$id))); sample
 table(df$country_code)
 tab <- sum(table(df$country_code)); tab
 # View() needs an interactive session, so skip it under Rscript.
 # The original also called View(t1), but t1 is never defined in this
 # script, so that call has been removed.
 if (interactive()) {
   View(table(df$country_code))
 }
 ############## begin plot
 # Bar chart of the number of rows per lineage, filled by lineage
 g <- ggplot(df, aes(x = lineage))
 g +
   geom_bar(aes(fill = lineage)) +
   theme(
     axis.text.x = element_text(size = 13, angle = 90, hjust = 1, vjust = 0.4),
     axis.text.y = element_text(size = 15, angle = 0, hjust = 1, vjust = 0),
     axis.title.x = element_text(size = 15),
     axis.title.y = element_text(size = 15)
   ) +
   labs(title = "Lineage", x = "Lineage", y = "No of samples")
 #========================
 # Plot 2: DUET, lineage, country_code and or_mychisq
 # x = country_code y = DUET
 # col = lineage
 # size = or_mychisq
 #========================
 ### begin plot
 g <- ggplot(df, aes(x = country_code
                   , y = duet_scaled))
 g + geom_point(aes(col = lineage
                   , size = or_mychisq)) +
   theme(axis.text.x = element_text(size = 13
                                    , angle = 90
                                    , hjust = 1
                                    , vjust = 0.4)
         , axis.text.y = element_text(size = 15
                                      , angle = 0
                                      , hjust = 1
                                      , vjust = 0)
         , axis.title.x = element_text(size = 15)
         , axis.title.y = element_text(size = 15) ) +
   # the x axis shows country_code, so label it accordingly (was "Lineage")
   labs(title = "DUET, country_code, lineage, or_mychisq"
        , x = "Country code"
        , y = "DUET (PS)")
 #############
 #========================
 # Plot 3: DUET, lineage, or_mychisq
 # x = lineage y = DUET
 # col = Lineage
 # fill = country_code
 #========================
 ### begin plot
 # Lineage counts of the country-filtered df are echoed; the plot itself is
 # drawn from df_lin (all rows of the selected lineages).
 table(df$lineage)
 g <- ggplot(df_lin, aes(x = lineage, y = duet_scaled))
 g +
   geom_point(aes(col = lineage, size = or_mychisq)) +
   theme(
     axis.text.x = element_text(size = 13, angle = 90, hjust = 1, vjust = 0.4),
     axis.text.y = element_text(size = 15, angle = 0, hjust = 1, vjust = 0),
     axis.title.x = element_text(size = 15),
     axis.title.y = element_text(size = 15)
   ) +
   labs(title = "DUET, lineage, or_mychisq", x = "Lineage", y = "DUET (PS)")
 #========================
 # Plot 4-5: Distributions
 # ggridges
 #========================
 #==================================================
 my_ats <- 15 # axis text size
 my_als <- 20 # axis label size
 # facet strip labels, keyed by the lineage values found in the data
 my_labels <- c(
   lineage1 = 'Lineage 1',
   lineage2 = 'Lineage 2',
   lineage3 = 'Lineage 3',
   lineage4 = 'Lineage 4'
   #, lineage5 = 'Lineage 5', lineage6 = 'Lineage 6', lineage7 = 'Lineage 7'
 )
 #========================
 # Plot 4: Distribution
 # x = duet_scaled
 # y = country
 # fill = country_code
 # facet = lineage
 #========================
 # works neatly!
 # Ridgeline densities of duet_scaled per country_code, one facet per lineage
 p1 <- ggplot(df, aes(x = duet_scaled, y = country_code)) +
   #printFile=geom_density_ridges_gradient(
   geom_density_ridges_gradient(
     aes(fill = country_code),
     jittered_points = TRUE,
     scale = 3,
     size = 0.3
   ) +
   facet_wrap(
     ~lineage,
     scales = "free",
     switch = 'x',
     labeller = labeller(lineage = my_labels)
   ) +
   coord_cartesian(xlim = c(-1, 1)) +
   #scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
   #                     , name = "DUET" ) + 
   theme(
     axis.text.x = element_text(size = my_ats, angle = 90, hjust = 1, vjust = 0.4),
     #, axis.text.y = element_blank()
     axis.title.x = element_blank(),
     axis.title.y = element_blank(),
     axis.ticks.y = element_blank(),
     plot.title = element_blank(),
     strip.text = element_text(size = my_als),
     legend.text = element_text(size = my_als-5),
     legend.title = element_text(size = my_als)
   )
 p1
 #========================
 # Plot 5: Distribution
 # x = duet_scaled
 # y = country_code
 # fill = lineage
 # facet = NONE
 #========================
 # no facet wrap
 # Ridgeline densities of duet_scaled per country_code, filled by lineage
 # (no faceting)
 p2 <- ggplot(df, aes(x = duet_scaled, y = country_code)) +
   geom_density_ridges_gradient(
     aes(fill = factor(lineage)),
     scale = 3,
     size = 0.3
   ) +
   coord_cartesian(xlim = c(-1, 1)) +
   #scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
   #                     , name = "DUET" ) + 
   #scale_fill_continuous(colours = c("darkgreen", "pink", "orange", "brown")
   #                     , name = "lineage" ) + 
   theme(
     axis.text.x = element_text(size = my_ats, angle = 90, hjust = 1, vjust = 0.4),
     #, axis.text.y = element_blank()
     axis.title.x = element_blank(),
     axis.title.y = element_blank(),
     axis.ticks.y = element_blank(),
     plot.title = element_blank(),
     strip.text = element_text(size = my_als),
     legend.text = element_text(size = my_als-5),
     legend.title = element_text(size = my_als)
   )
 p2
 #===============
 # lineage only
 #================
 #svg(plot_lineage_duet)
 # Ridgeline densities of duet_scaled per duet_outcome, one facet per
 # lineage; the fill follows the x value on a red-white-blue gradient
 p3 <- ggplot(df, aes(x = duet_scaled, y = duet_outcome)) +
   geom_density_ridges_gradient(
     aes(fill = ..x..),
     jittered_points = TRUE,
     scale = 3,
     size = 0.3
   ) +
   facet_wrap(
     ~lineage,
     scales = "free",
     #, switch = 'x'
     labeller = labeller(lineage = my_labels)
   ) +
   coord_cartesian(xlim = c(-1, 1)) +
   scale_fill_gradientn(
     colours = c("#f8766d", "white", "#00bfc4"),
     name = "DUET"
   ) +
   theme(
     axis.text.x = element_text(size = my_ats, angle = 90, hjust = 1, vjust = 0.4),
     axis.text.y = element_blank(),
     axis.title.x = element_blank(),
     axis.title.y = element_blank(),
     axis.ticks.y = element_blank(),
     plot.title = element_blank(),
     strip.text = element_text(size = my_als),
     legend.text = element_text(size = my_als-5),
     legend.title = element_text(size = my_als)
   )
 print(p3)
--- a/scripts/prediction_all.R
+++ b/scripts/prediction_all.R
@ -1,426 +0,0 @@
 #!/usr/bin/env Rscript
 #########################################################
 # TASK: prediction
 #=======================================================================
 # working dir and loading libraries
 getwd()
 setwd("~/git/LSHTM_analysis/scripts/")
 getwd()
 source("plotting/combining_dfs_plotting.R")
 ####################################################################
 #               end of loading libraries and functions              
 ####################################################################
 #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Binary outcome for the logistic models: mutation_info_labels is 1 where
 # mutation_info equals dr_muts_col and 0 otherwise. The table() calls show
 # the category counts before and after recoding.
 # ps
 table(merged_df2[["mutation_info"]])
 merged_df2[["mutation_info_labels"]] <- ifelse(merged_df2[["mutation_info"]] == dr_muts_col, 1, 0)
 table(merged_df2[["mutation_info_labels"]])

 table(merged_df3[["mutation_info"]])
 merged_df3[["mutation_info_labels"]] <- ifelse(merged_df3[["mutation_info"]] == dr_muts_col, 1, 0)
 table(merged_df3[["mutation_info_labels"]])

 # lig
 table(merged_df2_lig[["mutation_info"]])
 merged_df2_lig[["mutation_info_labels"]] <- ifelse(merged_df2_lig[["mutation_info"]] == dr_muts_col, 1, 0)
 table(merged_df2_lig[["mutation_info_labels"]])

 table(merged_df3_lig[["mutation_info"]])
 merged_df3_lig[["mutation_info_labels"]] <- ifelse(merged_df3_lig[["mutation_info"]] == dr_muts_col, 1, 0)
 table(merged_df3_lig[["mutation_info_labels"]])
 #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 ###############################################################################
 # Single-predictor logistic regression of mutation_info_labels on or_mychisq,
 # fitted on merged_df2, followed by its summary and number of observations.
 model_ind <- glm(mutation_info_labels ~ or_mychisq
                  , data = merged_df2
                  , family = "binomial")
 summary(model_ind)
 nobs(model_ind)
 #=============
 # try loop
 #=============
 # Fit the same kind of model once per predictor named in my_ivs and print
 # each summary under a banner showing the predictor name.
 my_ivs <- c("or_mychisq", "or_kin", "pval_fisher", "af", "duet_stability_change", "duet_scaled")
 for (i in my_ivs) {
   cat("===============================\n")
   cat(i)
   cat("\n===============================\n")
   # reformulate() builds `mutation_info_labels ~ <i>` from the column name, so
   # the coefficient table is labelled with the predictor instead of the
   # literal text `eval(parse(text = i))`.
   print(summary(glm(reformulate(i, response = "mutation_info_labels")
                     , data = merged_df2
                     , family = "binomial")))
 }
 ###############################################################################
 #========================================
 # merged_df2: UNadjusted,loop
 #========================================
 # One single-predictor logistic regression per entry of my_ivs, fitted on
 # merged_df2. Each fit contributes one row (estimate, odds ratio, p-value,
 # SE, z and confidence limits of coefficient row 2) to ps_logistic_df2.
 my_ivs <- c("or_mychisq", "or_kin", "pval_fisher", "af"
             , "ligand_distance"
             , "rsa"
             , "rd_values"
             , "kd_values"
             , "duet_stability_change"
             , "duet_scaled"
             , "duet_outcome"
             , "ddg"
             , "foldx_scaled"
             , "foldx_outcome")
 # Rows are collected in a preallocated list and bound once after the loop,
 # instead of growing a data frame with rbind() on every iteration.
 unadjusted_rows <- vector("list", length(my_ivs))
 for (k in seq_along(my_ivs)) {
   i <- my_ivs[[k]]
   print(i)
   # reformulate() builds `mutation_info_labels ~ <i>` from the column name
   # (replaces the eval(parse(text = i)) formula term).
   model <- glm(reformulate(i, response = "mutation_info_labels")
                , data = merged_df2
                , family = "binomial")
   # coefficient table computed once; row 1 is the intercept, row 2 the
   # first term of the predictor
   coefs <- summary(model)$coefficients
   var_name <- i
   number_samples <- nobs(model)
   beta_logistic <- coefs[2, 1]
   or_logistic <- exp(beta_logistic)
   pval_logistic <- coefs[2, 4]
   se_logistic <- coefs[2, 2]
   zval_logistic <- coefs[2, 3]
   # confidence limits of the same row, exponentiated to the odds-ratio scale
   ci_mod <- exp(confint(model))[2, ]
   ci_lower_logistic <- ci_mod[["2.5 %"]]
   ci_upper_logistic <- ci_mod[["97.5 %"]]
   print(c(var_name, beta_logistic, or_logistic, pval_logistic, se_logistic, zval_logistic, ci_mod))
   df <- data.frame(var_name = var_name
                    , number_samples = number_samples
                    , beta = beta_logistic
                    , odds_ratio = or_logistic
                    , pvalue = pval_logistic
                    , se = se_logistic
                    , zvalue = zval_logistic
                    , ci_lower = ci_lower_logistic
                    , ci_upper = ci_upper_logistic
                    , stringsAsFactors = FALSE)
   print(df)
   unadjusted_rows[[k]] <- df
 }
 ps_logistic_df2 <- do.call(rbind, unadjusted_rows)
 #--------------------
 # formatting df
 #--------------------
 ps_logistic_df2$data_source <- "df2"
 ps_logistic_df2$model <- "unadjusted"
 ps_logistic_df2$odds_ratio <- round(ps_logistic_df2$odds_ratio, 2)
 ps_logistic_df2$ci_lower <- round(ps_logistic_df2$ci_lower, 2)
 ps_logistic_df2$ci_upper <- round(ps_logistic_df2$ci_upper, 2)
 # adding pvalue_signif: start from the numeric p-value, then recode it to a
 # significance label (first matching rule wins)
 ps_logistic_df2$pvalue_signif <- ps_logistic_df2$pvalue
 str(ps_logistic_df2$pvalue_signif)
 ps_logistic_df2 <- dplyr::mutate(ps_logistic_df2
                                  , pvalue_signif = dplyr::case_when(pvalue_signif == 0.05 ~ "."
                                                                     , pvalue_signif <= 0.0001 ~ '****'
                                                                     , pvalue_signif <= 0.001 ~ '***'
                                                                     , pvalue_signif <= 0.01 ~ '**'
                                                                     , pvalue_signif < 0.05 ~ '*'
                                                                     , TRUE ~ 'ns'))
 # rearranging columns
 colnames(ps_logistic_df2)
 ps_logistic_df2_o <- ps_logistic_df2[c("var_name"
                                        , "number_samples"
                                        , "model"
                                        , "odds_ratio"
                                        , "pvalue"
                                        , "pvalue_signif"
                                        , "beta"
                                        , "se"
                                        , "zvalue"
                                        , "ci_lower"
                                        , "ci_upper"
                                        , "data_source")]
 ###############################################################################
 #========================================
 # merged_df2: adjusted (single multi-predictor model)
 #========================================
 # Fits one logistic regression with several predictors on merged_df2 and
 # assembles a per-term table of estimates, odds ratios and confidence limits.
 #model_adjusted = glm(mutation_info_labels ~ or_mychisq + rsa + rd_values + kd_values +
 #                       duet_stability_change + duet_scaled + duet_outcome + ddg + foldx_scaled + foldx_outcome
 #            , data = merged_df2
 #           , family = "binomial")
 model_adjusted_df2 <- glm(mutation_info_labels ~ or_mychisq + or_kin + rd_values + kd_values +
                             ligand_distance + duet_stability_change
                           , data = merged_df2
                           , family = "binomial")
 summary(model_adjusted_df2)
 var_names_df <- as.data.frame(names(model_adjusted_df2$coefficients))
 names(var_names_df) <- c("var_name")
 # confidence limits exponentiated to the odds-ratio scale, one row per term
 ci_mod <- exp(confint(model_adjusted_df2))
 ci_mod_df <- as.data.frame(ci_mod)
 names(ci_mod_df) <- c("ci_lower", "ci_upper")
 ci_mod_df$ci_lower <- round(ci_mod_df$ci_lower, 2)
 ci_mod_df$ci_upper <- round(ci_mod_df$ci_upper, 2)
 # coefficient table: estimate, standard error, z value and p-value per term
 estimates_df <- as.data.frame(summary(model_adjusted_df2)$coefficients)
 names(estimates_df) <- c("beta", "se", "zvalue", "pvalue")
 estimates_df$odds_ratio <- round(exp(estimates_df$beta), 2)
 number_samples <- nobs(model_adjusted_df2)
 estimates_df$number_samples <- number_samples
 estimates_df$data_source <- "df2"
 estimates_df$model <- "adjusted"
 # The two tables are merged on their row names, so they must list the same
 # terms in the same order. Stop with a clear message otherwise, rather than
 # carrying on without the merged table. (The earlier names() call on the
 # not-yet-created result was removed: it aborted a fresh run.)
 if (!identical(rownames(estimates_df), rownames(ci_mod_df))) {
   stop("FAIL: rownames of estimates and confidence intervals do not match; cannot merge")
 }
 cat("PASS: rownames match. Preparing to merge...")
 ps_logistic_adjusted_df2 <- merge(estimates_df, ci_mod_df, by = "row.names", all = TRUE)
 colnames(ps_logistic_adjusted_df2)[1] <- c("var_name")
 # drop the intercept row with a logical mask; negative indexing with an empty
 # which() result would have removed every row
 ps_logistic_adjusted_df2 <- ps_logistic_adjusted_df2[!(ps_logistic_adjusted_df2$var_name %in% "(Intercept)"), ]
 # adding pvalue_signif: start from the numeric p-value, then recode it to a
 # significance label (first matching rule wins)
 ps_logistic_adjusted_df2$pvalue_signif <- ps_logistic_adjusted_df2$pvalue
 str(ps_logistic_adjusted_df2$pvalue_signif)
 ps_logistic_adjusted_df2 <- dplyr::mutate(ps_logistic_adjusted_df2
                                           , pvalue_signif = dplyr::case_when(pvalue_signif == 0.05 ~ "."
                                                                              , pvalue_signif <= 0.0001 ~ '****'
                                                                              , pvalue_signif <= 0.001 ~ '***'
                                                                              , pvalue_signif <= 0.01 ~ '**'
                                                                              , pvalue_signif < 0.05 ~ '*'
                                                                              , TRUE ~ 'ns'))
 # rearranging columns
 colnames(ps_logistic_adjusted_df2)
 ps_logistic_adjusted_df2_o <- ps_logistic_adjusted_df2[c("var_name"
                                                          , "number_samples"
                                                          , "model"
                                                          , "odds_ratio"
                                                          , "pvalue"
                                                          , "pvalue_signif"
                                                          , "beta"
                                                          , "se"
                                                          , "zvalue"
                                                          , "ci_lower"
                                                          , "ci_upper"
                                                          , "data_source")]
 ###############################################################################
 ###############################################################################
 #========================================
 # merged_df3: UNadjusted,loop
 #========================================
 # One single-predictor logistic regression per entry of my_ivs, fitted on
 # merged_df3. Each fit contributes one row (estimate, odds ratio, p-value,
 # SE, z and confidence limits of coefficient row 2) to ps_logistic_df3.
 my_ivs <- c("or_mychisq", "or_kin", "pval_fisher", "af"
             , "ligand_distance"
             , "rsa"
             , "rd_values"
             , "kd_values"
             , "duet_stability_change"
             , "duet_scaled"
             , "duet_outcome"
             , "ddg"
             , "foldx_scaled"
             , "foldx_outcome")
 # Rows are collected in a preallocated list and bound once after the loop,
 # instead of growing a data frame with rbind() on every iteration.
 unadjusted_rows <- vector("list", length(my_ivs))
 for (k in seq_along(my_ivs)) {
   i <- my_ivs[[k]]
   print(i)
   # reformulate() builds `mutation_info_labels ~ <i>` from the column name
   # (replaces the eval(parse(text = i)) formula term).
   model <- glm(reformulate(i, response = "mutation_info_labels")
                , data = merged_df3
                , family = "binomial")
   # coefficient table computed once; row 1 is the intercept, row 2 the
   # first term of the predictor
   coefs <- summary(model)$coefficients
   var_name <- i
   number_samples <- nobs(model)
   beta_logistic <- coefs[2, 1]
   or_logistic <- exp(beta_logistic)
   pval_logistic <- coefs[2, 4]
   se_logistic <- coefs[2, 2]
   zval_logistic <- coefs[2, 3]
   # confidence limits of the same row, exponentiated to the odds-ratio scale
   ci_mod <- exp(confint(model))[2, ]
   ci_lower_logistic <- ci_mod[["2.5 %"]]
   ci_upper_logistic <- ci_mod[["97.5 %"]]
   print(c(var_name, beta_logistic, or_logistic, pval_logistic, se_logistic, zval_logistic, ci_mod))
   df <- data.frame(var_name = var_name
                    , number_samples = number_samples
                    , beta = beta_logistic
                    , odds_ratio = or_logistic
                    , pvalue = pval_logistic
                    , se = se_logistic
                    , zvalue = zval_logistic
                    , ci_lower = ci_lower_logistic
                    , ci_upper = ci_upper_logistic
                    , stringsAsFactors = FALSE)
   print(df)
   unadjusted_rows[[k]] <- df
 }
 ps_logistic_df3 <- do.call(rbind, unadjusted_rows)
 #--------------------
 # formatting df
 #--------------------
 ps_logistic_df3$data_source <- "df3"
 ps_logistic_df3$model <- "unadjusted"
 ps_logistic_df3$odds_ratio <- round(ps_logistic_df3$odds_ratio, 2)
 ps_logistic_df3$ci_lower <- round(ps_logistic_df3$ci_lower, 2)
 ps_logistic_df3$ci_upper <- round(ps_logistic_df3$ci_upper, 2)
 # adding pvalue_signif: start from the numeric p-value, then recode it to a
 # significance label (first matching rule wins)
 ps_logistic_df3$pvalue_signif <- ps_logistic_df3$pvalue
 str(ps_logistic_df3$pvalue_signif)
 ps_logistic_df3 <- dplyr::mutate(ps_logistic_df3
                                  , pvalue_signif = dplyr::case_when(pvalue_signif == 0.05 ~ "."
                                                                     , pvalue_signif <= 0.0001 ~ '****'
                                                                     , pvalue_signif <= 0.001 ~ '***'
                                                                     , pvalue_signif <= 0.01 ~ '**'
                                                                     , pvalue_signif < 0.05 ~ '*'
                                                                     , TRUE ~ 'ns'))
 # rearranging columns
 ps_logistic_df3_o <- ps_logistic_df3[c("var_name"
                                        , "number_samples"
                                        , "model"
                                        , "odds_ratio"
                                        , "pvalue"
                                        , "pvalue_signif"
                                        , "beta"
                                        , "se"
                                        , "zvalue"
                                        , "ci_lower"
                                        , "ci_upper"
                                        , "data_source")]
 #========================================
 # merged_df3: adjusted (single multi-predictor model)
 #========================================
 # Fits one logistic regression with several predictors on merged_df3 and
 # assembles a per-term table of estimates, odds ratios and confidence limits.
 #model_adjusted_df3 = glm(mutation_info_labels ~ or_mychisq + rsa + rd_values + kd_values +
 #                       duet_stability_change + duet_scaled + duet_outcome + ddg + foldx_scaled + foldx_outcome
 #            , data = merged_df3
 #           , family = "binomial")
 model_adjusted_df3 <- glm(mutation_info_labels ~ rd_values +
                             ligand_distance + duet_stability_change
                           , data = merged_df3
                           , family = "binomial")
 summary(model_adjusted_df3)
 var_names_df <- as.data.frame(names(model_adjusted_df3$coefficients))
 names(var_names_df) <- c("var_name")
 # confidence limits exponentiated to the odds-ratio scale, one row per term
 ci_mod <- exp(confint(model_adjusted_df3))
 ci_mod_df <- as.data.frame(ci_mod)
 names(ci_mod_df) <- c("ci_lower", "ci_upper")
 ci_mod_df$ci_lower <- round(ci_mod_df$ci_lower, 2)
 ci_mod_df$ci_upper <- round(ci_mod_df$ci_upper, 2)
 # coefficient table: estimate, standard error, z value and p-value per term
 estimates_df <- as.data.frame(summary(model_adjusted_df3)$coefficients)
 names(estimates_df) <- c("beta", "se", "zvalue", "pvalue")
 estimates_df$odds_ratio <- round(exp(estimates_df$beta), 2)
 number_samples <- nobs(model_adjusted_df3)
 estimates_df$number_samples <- number_samples
 estimates_df$data_source <- "df3"
 estimates_df$model <- "adjusted"
 # The two tables are merged on their row names, so they must list the same
 # terms in the same order. Stop with a clear message otherwise, rather than
 # carrying on without the merged table. (The earlier names() call on the
 # not-yet-created result was removed: it aborted a fresh run.)
 if (!identical(rownames(estimates_df), rownames(ci_mod_df))) {
   stop("FAIL: rownames of estimates and confidence intervals do not match; cannot merge")
 }
 cat("PASS: rownames match. Preparing to merge...")
 ps_logistic_adjusted_df3 <- merge(estimates_df, ci_mod_df, by = "row.names", all = TRUE)
 colnames(ps_logistic_adjusted_df3)[1] <- c("var_name")
 # drop the intercept row with a logical mask; negative indexing with an empty
 # which() result would have removed every row
 ps_logistic_adjusted_df3 <- ps_logistic_adjusted_df3[!(ps_logistic_adjusted_df3$var_name %in% "(Intercept)"), ]
 # adding pvalue_signif: start from the numeric p-value, then recode it to a
 # significance label (first matching rule wins)
 ps_logistic_adjusted_df3$pvalue_signif <- ps_logistic_adjusted_df3$pvalue
 str(ps_logistic_adjusted_df3$pvalue_signif)
 ps_logistic_adjusted_df3 <- dplyr::mutate(ps_logistic_adjusted_df3
                                           , pvalue_signif = dplyr::case_when(pvalue_signif == 0.05 ~ "."
                                                                              , pvalue_signif <= 0.0001 ~ '****'
                                                                              , pvalue_signif <= 0.001 ~ '***'
                                                                              , pvalue_signif <= 0.01 ~ '**'
                                                                              , pvalue_signif < 0.05 ~ '*'
                                                                              , TRUE ~ 'ns'))
 # rearranging columns
 colnames(ps_logistic_adjusted_df3)
 ps_logistic_adjusted_df3_o <- ps_logistic_adjusted_df3[c("var_name"
                                                          , "number_samples"
                                                          , "model"
                                                          , "odds_ratio"
                                                          , "pvalue"
                                                          , "pvalue_signif"
                                                          , "beta"
                                                          , "se"
                                                          , "zvalue"
                                                          , "ci_lower"
                                                          , "ci_upper"
                                                          , "data_source")]
 #-------------
 # lm
 #-------------
 # Two linear models with mutation_info_labels as a predictor:
 # or_kin on several covariates (merged_df3), then or_mychisq alone (merged_df2).
 model_lm <- lm(
   or_kin ~ rsa + rd_values + duet_stability_change +  ddg + mutation_info_labels,
   data = merged_df3
 )
 summary(model_lm)

 model_lm1 <- lm(
   or_mychisq ~ mutation_info_labels,
   data = merged_df2
 )
 summary(model_lm1)
--- a/scripts/prediction_lig.R
+++ b/scripts/prediction_lig.R
@ -1,203 +0,0 @@
 #!/usr/bin/env Rscript
 #########################################################
 # TASK: prediction lig
 #=======================================================================
 # working dir and loading libraries
 getwd()
 setwd("~/git/LSHTM_analysis/scripts/")
 getwd()
 source("plotting/combining_dfs_plotting.R")
 #=======
 # output
 #=======
 # Both result files live under <outdir>/results/ and start with the
 # lower-cased gene name; only the suffix differs.
 lig_results_prefix <- paste0(outdir, "/results/", tolower(gene))
 lig_unadjusted <- paste0(lig_results_prefix, "_unadjusted_logistic_LIG.csv")
 lig_adjusted <- paste0(lig_results_prefix, "_adjusted_logistic_LIG.csv")
 ####################################################################
 #               end of loading libraries and functions              
 ####################################################################
 #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # lig
 # Binary outcome: 1 where mutation_info equals dr_muts_col, 0 otherwise;
 # table() shows the counts before and after recoding.
 table(merged_df3_lig[["mutation_info"]])
 merged_df3_lig[["mutation_info_labels"]] <- ifelse(merged_df3_lig[["mutation_info"]] == dr_muts_col, 1, 0)
 table(merged_df3_lig[["mutation_info_labels"]])
 #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 ###############################################################################
 #========================================
 # merged_df3_lig: UNadjusted,loop
 #========================================
 # One single-predictor logistic regression per entry of my_ivs, fitted on
 # merged_df3_lig. Each fit contributes one row (estimate, odds ratio, p-value,
 # SE, z and confidence limits of coefficient row 2) to lig_logistic_df3,
 # which is then formatted and written to lig_unadjusted.
 my_ivs <- c("or_mychisq", "or_kin", "pval_fisher", "af"
             #, "ligand_distance"
             , "rsa"
             , "rd_values"
             , "kd_values"
             , "ligand_affinity_change"
             , "affinity_scaled"
             , "ligand_outcome")
 # Rows are collected in a preallocated list and bound once after the loop,
 # instead of growing a data frame with rbind() on every iteration.
 unadjusted_rows <- vector("list", length(my_ivs))
 for (k in seq_along(my_ivs)) {
   i <- my_ivs[[k]]
   print(i)
   # reformulate() builds `mutation_info_labels ~ <i>` from the column name
   # (replaces the eval(parse(text = i)) formula term).
   model_lig <- glm(reformulate(i, response = "mutation_info_labels")
                    , data = merged_df3_lig
                    , family = "binomial")
   # coefficient table computed once; row 1 is the intercept, row 2 the
   # first term of the predictor
   coefs <- summary(model_lig)$coefficients
   var_name <- i
   number_samples <- nobs(model_lig)
   beta_logistic <- coefs[2, 1]
   or_logistic <- exp(beta_logistic)
   pval_logistic <- coefs[2, 4]
   se_logistic <- coefs[2, 2]
   zval_logistic <- coefs[2, 3]
   # confidence limits of the same row, exponentiated to the odds-ratio scale
   ci_mod <- exp(confint(model_lig))[2, ]
   ci_lower_logistic <- ci_mod[["2.5 %"]]
   ci_upper_logistic <- ci_mod[["97.5 %"]]
   print(c(var_name, beta_logistic, or_logistic, pval_logistic, se_logistic, zval_logistic, ci_mod))
   df <- data.frame(var_name = var_name
                    , number_samples = number_samples
                    , beta = beta_logistic
                    , odds_ratio = or_logistic
                    , pvalue = pval_logistic
                    , se = se_logistic
                    , zvalue = zval_logistic
                    , ci_lower = ci_lower_logistic
                    , ci_upper = ci_upper_logistic
                    , stringsAsFactors = FALSE)
   print(df)
   unadjusted_rows[[k]] <- df
 }
 lig_logistic_df3 <- do.call(rbind, unadjusted_rows)
 #--------------------
 # formatting df
 #--------------------
 lig_logistic_df3$data_source <- "df3_lig"
 # NOTE(review): this column is called "model_lig" here but "model" in the PS
 # outputs; kept as-is because it is the column name written to the CSV.
 lig_logistic_df3$model_lig <- "unadjusted"
 lig_logistic_df3$odds_ratio <- round(lig_logistic_df3$odds_ratio, 2)
 lig_logistic_df3$ci_lower <- round(lig_logistic_df3$ci_lower, 2)
 lig_logistic_df3$ci_upper <- round(lig_logistic_df3$ci_upper, 2)
 # adding pvalue_signif: start from the numeric p-value, then recode it to a
 # significance label (first matching rule wins)
 lig_logistic_df3$pvalue_signif <- lig_logistic_df3$pvalue
 str(lig_logistic_df3$pvalue_signif)
 lig_logistic_df3 <- dplyr::mutate(lig_logistic_df3
                                   , pvalue_signif = dplyr::case_when(pvalue_signif == 0.05 ~ "."
                                                                      , pvalue_signif <= 0.0001 ~ '****'
                                                                      , pvalue_signif <= 0.001 ~ '***'
                                                                      , pvalue_signif <= 0.01 ~ '**'
                                                                      , pvalue_signif < 0.05 ~ '*'
                                                                      , TRUE ~ 'ns'))
 # rearranging columns
 lig_logistic_df3_o <- lig_logistic_df3[c("var_name"
                                          , "number_samples"
                                          , "model_lig"
                                          , "odds_ratio"
                                          , "pvalue"
                                          , "pvalue_signif"
                                          , "beta"
                                          , "se"
                                          , "zvalue"
                                          , "ci_lower"
                                          , "ci_upper"
                                          , "data_source")]
 # writing file
 write.csv(lig_logistic_df3_o, lig_unadjusted, row.names = FALSE)
 #========================================
 # merged_df3_lig: adjusted (single multi-predictor model)
 #========================================
 # Fits one logistic regression with two predictors on merged_df3_lig,
 # assembles a per-term table of estimates, odds ratios and confidence limits,
 # and writes it to lig_adjusted.
 #model_lig_adjusted_df3 = glm(mutation_info_labels ~ or_mychisq + rsa + rd_values + kd_values +
 #                       ligand_affinity_change + affinity_scaled + ligand_outcome 
 #            , data = merged_df3_lig
 #           , family = "binomial")
 model_lig_adjusted_df3 <- glm(mutation_info_labels ~ rd_values + ligand_affinity_change
                               , data = merged_df3_lig
                               , family = "binomial")
 summary(model_lig_adjusted_df3)
 var_names_df <- as.data.frame(names(model_lig_adjusted_df3$coefficients))
 names(var_names_df) <- c("var_name")
 # confidence limits exponentiated to the odds-ratio scale, one row per term
 ci_mod <- exp(confint(model_lig_adjusted_df3))
 ci_mod_df <- as.data.frame(ci_mod)
 names(ci_mod_df) <- c("ci_lower", "ci_upper")
 ci_mod_df$ci_lower <- round(ci_mod_df$ci_lower, 2)
 ci_mod_df$ci_upper <- round(ci_mod_df$ci_upper, 2)
 # coefficient table: estimate, standard error, z value and p-value per term
 estimates_df <- as.data.frame(summary(model_lig_adjusted_df3)$coefficients)
 names(estimates_df) <- c("beta", "se", "zvalue", "pvalue")
 estimates_df$odds_ratio <- round(exp(estimates_df$beta), 2)
 number_samples <- nobs(model_lig_adjusted_df3)
 estimates_df$number_samples <- number_samples
 estimates_df$data_source <- "df3_lig"
 estimates_df$model_lig <- "adjusted"
 # The two tables are merged on their row names, so they must list the same
 # terms in the same order. Stop with a clear message otherwise, rather than
 # carrying on without the merged table. (The earlier names() call on the
 # not-yet-created result was removed: it aborted a fresh run.)
 if (!identical(rownames(estimates_df), rownames(ci_mod_df))) {
   stop("FAIL: rownames of estimates and confidence intervals do not match; cannot merge")
 }
 cat("PASS: rownames match. Preparing to merge...")
 lig_logistic_adjusted_df3 <- merge(estimates_df, ci_mod_df, by = "row.names", all = TRUE)
 colnames(lig_logistic_adjusted_df3)[1] <- c("var_name")
 # drop the intercept row with a logical mask; negative indexing with an empty
 # which() result would have removed every row
 lig_logistic_adjusted_df3 <- lig_logistic_adjusted_df3[!(lig_logistic_adjusted_df3$var_name %in% "(Intercept)"), ]
 # adding pvalue_signif: start from the numeric p-value, then recode it to a
 # significance label (first matching rule wins)
 lig_logistic_adjusted_df3$pvalue_signif <- lig_logistic_adjusted_df3$pvalue
 str(lig_logistic_adjusted_df3$pvalue_signif)
 lig_logistic_adjusted_df3 <- dplyr::mutate(lig_logistic_adjusted_df3
                                            , pvalue_signif = dplyr::case_when(pvalue_signif == 0.05 ~ "."
                                                                               , pvalue_signif <= 0.0001 ~ '****'
                                                                               , pvalue_signif <= 0.001 ~ '***'
                                                                               , pvalue_signif <= 0.01 ~ '**'
                                                                               , pvalue_signif < 0.05 ~ '*'
                                                                               , TRUE ~ 'ns'))
 # rearranging columns
 colnames(lig_logistic_adjusted_df3)
 lig_logistic_adjusted_df3_o <- lig_logistic_adjusted_df3[c("var_name"
                                                            , "number_samples"
                                                            , "model_lig"
                                                            , "odds_ratio"
                                                            , "pvalue"
                                                            , "pvalue_signif"
                                                            , "beta"
                                                            , "se"
                                                            , "zvalue"
                                                            , "ci_lower"
                                                            , "ci_upper"
                                                            , "data_source")]
 # writing file
 write.csv(lig_logistic_adjusted_df3_o, lig_adjusted, row.names = FALSE)
--- a/scripts/prediction_ps.R
+++ b/scripts/prediction_ps.R
@ -1,207 +0,0 @@
 #!/usr/bin/env Rscript
 #########################################################
 # TASK: prediction_ps
 #=======================================================================
 # working dir and loading libraries
 getwd()
 setwd("~/git/LSHTM_analysis/scripts/")
 getwd()
 source("plotting/combining_dfs_plotting.R")
 #=======
 # output
 #=======
 # Both result files live under <outdir>/results/ and start with the
 # lower-cased gene name; only the suffix differs.
 ps_results_prefix <- paste0(outdir, "/results/", tolower(gene))
 ps_unadjusted <- paste0(ps_results_prefix, "_unadjusted_logistic_PS.csv")
 ps_adjusted <- paste0(ps_results_prefix, "_adjusted_logistic_PS.csv")
 ####################################################################
 #               end of loading libraries and functions              
 ####################################################################
 #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # ps
 # Binary outcome: 1 where mutation_info equals dr_muts_col, 0 otherwise;
 # table() shows the counts before and after recoding.
 table(merged_df3[["mutation_info"]])
 merged_df3[["mutation_info_labels"]] <- ifelse(merged_df3[["mutation_info"]] == dr_muts_col, 1, 0)
 table(merged_df3[["mutation_info_labels"]])
 #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 ###############################################################################
 #========================================
 # merged_df3: UNadjusted,loop
 #========================================
 # One single-predictor logistic regression per entry of my_ivs, fitted on
 # merged_df3. Each fit contributes one row (estimate, odds ratio, p-value,
 # SE, z and confidence limits of coefficient row 2) to ps_logistic_df3,
 # which is then formatted and written to ps_unadjusted.
 my_ivs <- c("or_mychisq", "or_kin", "pval_fisher", "af"
             , "ligand_distance"
             , "rsa"
             , "rd_values"
             , "kd_values"
             , "duet_stability_change"
             , "duet_scaled"
             , "duet_outcome"
             , "ddg"
             , "foldx_scaled"
             , "foldx_outcome")
 # Rows are collected in a preallocated list and bound once after the loop,
 # instead of growing a data frame with rbind() on every iteration.
 unadjusted_rows <- vector("list", length(my_ivs))
 for (k in seq_along(my_ivs)) {
   i <- my_ivs[[k]]
   print(i)
   # reformulate() builds `mutation_info_labels ~ <i>` from the column name
   # (replaces the eval(parse(text = i)) formula term).
   model <- glm(reformulate(i, response = "mutation_info_labels")
                , data = merged_df3
                , family = "binomial")
   # coefficient table computed once; row 1 is the intercept, row 2 the
   # first term of the predictor
   coefs <- summary(model)$coefficients
   var_name <- i
   number_samples <- nobs(model)
   beta_logistic <- coefs[2, 1]
   or_logistic <- exp(beta_logistic)
   pval_logistic <- coefs[2, 4]
   se_logistic <- coefs[2, 2]
   zval_logistic <- coefs[2, 3]
   # confidence limits of the same row, exponentiated to the odds-ratio scale
   ci_mod <- exp(confint(model))[2, ]
   ci_lower_logistic <- ci_mod[["2.5 %"]]
   ci_upper_logistic <- ci_mod[["97.5 %"]]
   print(c(var_name, beta_logistic, or_logistic, pval_logistic, se_logistic, zval_logistic, ci_mod))
   df <- data.frame(var_name = var_name
                    , number_samples = number_samples
                    , beta = beta_logistic
                    , odds_ratio = or_logistic
                    , pvalue = pval_logistic
                    , se = se_logistic
                    , zvalue = zval_logistic
                    , ci_lower = ci_lower_logistic
                    , ci_upper = ci_upper_logistic
                    , stringsAsFactors = FALSE)
   print(df)
   unadjusted_rows[[k]] <- df
 }
 ps_logistic_df3 <- do.call(rbind, unadjusted_rows)
 #--------------------
 # formatting df
 #--------------------
 ps_logistic_df3$data_source <- "df3"
 ps_logistic_df3$model <- "unadjusted"
 ps_logistic_df3$odds_ratio <- round(ps_logistic_df3$odds_ratio, 2)
 ps_logistic_df3$ci_lower <- round(ps_logistic_df3$ci_lower, 2)
 ps_logistic_df3$ci_upper <- round(ps_logistic_df3$ci_upper, 2)
 # adding pvalue_signif: start from the numeric p-value, then recode it to a
 # significance label (first matching rule wins)
 ps_logistic_df3$pvalue_signif <- ps_logistic_df3$pvalue
 str(ps_logistic_df3$pvalue_signif)
 ps_logistic_df3 <- dplyr::mutate(ps_logistic_df3
                                  , pvalue_signif = dplyr::case_when(pvalue_signif == 0.05 ~ "."
                                                                     , pvalue_signif <= 0.0001 ~ '****'
                                                                     , pvalue_signif <= 0.001 ~ '***'
                                                                     , pvalue_signif <= 0.01 ~ '**'
                                                                     , pvalue_signif < 0.05 ~ '*'
                                                                     , TRUE ~ 'ns'))
 # rearranging columns
 ps_logistic_df3_o <- ps_logistic_df3[c("var_name"
                                        , "number_samples"
                                        , "model"
                                        , "odds_ratio"
                                        , "pvalue"
                                        , "pvalue_signif"
                                        , "beta"
                                        , "se"
                                        , "zvalue"
                                        , "ci_lower"
                                        , "ci_upper"
                                        , "data_source")]
 # writing file
 write.csv(ps_logistic_df3_o, ps_unadjusted, row.names = FALSE)
 #========================================
 # merged_df3: adjusted (single multi-predictor model)
 #========================================
 # Fits one logistic regression with several predictors on merged_df3,
 # assembles a per-term table of estimates, odds ratios and confidence limits,
 # and writes it to ps_adjusted.
 #model_adjusted_df3 = glm(mutation_info_labels ~ or_mychisq + rsa + rd_values + kd_values +
 #                       duet_stability_change + duet_scaled + duet_outcome + ddg + foldx_scaled + foldx_outcome
 #            , data = merged_df3
 #           , family = "binomial")
 model_adjusted_df3 <- glm(mutation_info_labels ~ rd_values +
                             ligand_distance + duet_stability_change
                           , data = merged_df3
                           , family = "binomial")
 summary(model_adjusted_df3)
 var_names_df <- as.data.frame(names(model_adjusted_df3$coefficients))
 names(var_names_df) <- c("var_name")
 # confidence limits exponentiated to the odds-ratio scale, one row per term
 ci_mod <- exp(confint(model_adjusted_df3))
 ci_mod_df <- as.data.frame(ci_mod)
 names(ci_mod_df) <- c("ci_lower", "ci_upper")
 ci_mod_df$ci_lower <- round(ci_mod_df$ci_lower, 2)
 ci_mod_df$ci_upper <- round(ci_mod_df$ci_upper, 2)
 # coefficient table: estimate, standard error, z value and p-value per term
 estimates_df <- as.data.frame(summary(model_adjusted_df3)$coefficients)
 names(estimates_df) <- c("beta", "se", "zvalue", "pvalue")
 estimates_df$odds_ratio <- round(exp(estimates_df$beta), 2)
 number_samples <- nobs(model_adjusted_df3)
 estimates_df$number_samples <- number_samples
 estimates_df$data_source <- "df3"
 estimates_df$model <- "adjusted"
 # The two tables are merged on their row names, so they must list the same
 # terms in the same order. Stop with a clear message otherwise, rather than
 # carrying on without the merged table. (The earlier names() call on the
 # not-yet-created result was removed: it aborted a fresh run.)
 if (!identical(rownames(estimates_df), rownames(ci_mod_df))) {
   stop("FAIL: rownames of estimates and confidence intervals do not match; cannot merge")
 }
 cat("PASS: rownames match. Preparing to merge...")
 ps_logistic_adjusted_df3 <- merge(estimates_df, ci_mod_df, by = "row.names", all = TRUE)
 colnames(ps_logistic_adjusted_df3)[1] <- c("var_name")
 # drop the intercept row with a logical mask; negative indexing with an empty
 # which() result would have removed every row
 ps_logistic_adjusted_df3 <- ps_logistic_adjusted_df3[!(ps_logistic_adjusted_df3$var_name %in% "(Intercept)"), ]
 # adding pvalue_signif: start from the numeric p-value, then recode it to a
 # significance label (first matching rule wins)
 ps_logistic_adjusted_df3$pvalue_signif <- ps_logistic_adjusted_df3$pvalue
 str(ps_logistic_adjusted_df3$pvalue_signif)
 ps_logistic_adjusted_df3 <- dplyr::mutate(ps_logistic_adjusted_df3
                                           , pvalue_signif = dplyr::case_when(pvalue_signif == 0.05 ~ "."
                                                                              , pvalue_signif <= 0.0001 ~ '****'
                                                                              , pvalue_signif <= 0.001 ~ '***'
                                                                              , pvalue_signif <= 0.01 ~ '**'
                                                                              , pvalue_signif < 0.05 ~ '*'
                                                                              , TRUE ~ 'ns'))
 # rearranging columns
 colnames(ps_logistic_adjusted_df3)
 ps_logistic_adjusted_df3_o <- ps_logistic_adjusted_df3[c("var_name"
                                                          , "number_samples"
                                                          , "model"
                                                          , "odds_ratio"
                                                          , "pvalue"
                                                          , "pvalue_signif"
                                                          , "beta"
                                                          , "se"
                                                          , "zvalue"
                                                          , "ci_lower"
                                                          , "ci_upper"
                                                          , "data_source")]
 # writing file
 write.csv(ps_logistic_adjusted_df3_o, ps_adjusted, row.names = FALSE)
 ###############################################################################