diff --git a/scripts/plotting/basic_barplots_LIG.R b/scripts/plotting/basic_barplots_LIG.R index 8e64937..dd14d79 100755 --- a/scripts/plotting/basic_barplots_LIG.R +++ b/scripts/plotting/basic_barplots_LIG.R @@ -188,6 +188,6 @@ OutPlot_lig_pos_count = g + geom_bar(aes (alpha = 0.5) print(OutPlot_lig_pos_count) dev.off() ######################################################################## -# end of lig barplots +# end of LIG barplots ######################################################################## diff --git a/scripts/plotting/basic_barplots_PS.R b/scripts/plotting/basic_barplots_PS.R index 36c2483..aaab947 100755 --- a/scripts/plotting/basic_barplots_PS.R +++ b/scripts/plotting/basic_barplots_PS.R @@ -186,5 +186,5 @@ OutPlot_pos_count = g + geom_bar(aes (alpha = 0.5) print(OutPlot_pos_count) dev.off() ######################################################################## -# end of Ligand barplots +# end of PS barplots ######################################################################## diff --git a/scripts/plotting/corr_PS_LIG_all.R b/scripts/plotting/corr_PS_LIG_all.R deleted file mode 100644 index 138ef11..0000000 --- a/scripts/plotting/corr_PS_LIG_all.R +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env Rscript -######################################################### -# TASK: Corr plots for PS and Lig - -# Output: 1 svg - -#======================================================================= -# working dir and loading libraries -getwd() -setwd("~/git/LSHTM_analysis/scripts/plotting/") -getwd() - -#source("combining_dfs_plotting.R") -source("my_pairs_panel.R") # with lower panel turned off -source("corr_data.R") - -#======= -# output -#======= -# PS -corr_ps_all_df2 = "corr_PS_ALL_df2.svg" -plot_corr_ps_all_df2 = paste0(plotdir,"/", corr_ps_all_df2) - -corr_ps_all_df3 = "corr_PS_ALL_df3.svg" -plot_corr_ps_all_df3 = paste0(plotdir,"/", corr_ps_all_df3) - -# LIG -corr_lig_all_df2 = "corr_LIG_ALL_df2.svg" -plot_corr_lig_all_df2 = paste0(plotdir,"/", corr_lig_all_df2) - -corr_lig_all_df3 = "corr_LIG_ALL_df3.svg" -plot_corr_lig_all_df3 = paste0(plotdir,"/", corr_lig_all_df3) - -#################################################################### -# end of loading libraries and functions -#################################################################### -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# Data for plots -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -# PS -corr_ps_df2 = corr_ps_df2[-1] -corr_ps_df3 = corr_ps_df3[-1] - -# Lig -corr_lig_df2 = corr_lig_df2[-1] -corr_lig_df3 = corr_lig_df3[-1] - -#--------------------------------------- -# generate corr PS plot 1: merged_df2 -#--------------------------------------- -cat("Corr plot PS DUET with coloured dots:", plot_corr_ps_all_df2) -svg(plot_corr_ps_all_df2, width = 30, height = 30) - -OutPlot_ps_df2 = pairs.panels(corr_ps_df2[1:(length(corr_ps_df2)-2)] - , method = "spearman" # correlation method - , hist.col = "grey" ##00AFBB - , density = T # show density plots - , ellipses = F # show correlation ellipses - , stars = T - , rug = F - , breaks = "Sturges" - , show.points = T - , bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_ps_df2$duet_outcome))] # can't use colour as duet and foldx are opposite - , pch = 21 # for bg - #, pch = 19 - , jitter = T - , alpha = 1 - #, points(pch = 19, col = c("#f8766d", "#00bfc4")) - , cex = 1.8 - , cex.axis = 2 - , cex.labels = 2 - , cex.cor = 1 - , smooth = F) - -print(OutPlot_ps_df2) -dev.off() - -#---------------------------------------------- -# generate corr PS plot 2: merged_df3 -#---------------------------------------------- -cat("Corr plot PS DUET with coloured dots:", plot_corr_ps_all_df3) -svg(plot_corr_ps_all_df3, width = 30, height = 30) - -OutPlot_ps_df3 = pairs.panels(corr_ps_df3[1:(length(corr_ps_df3)-2)] - , method = "spearman" # correlation method - , hist.col = "grey" ##00AFBB - , density = T # show density plots - , ellipses = F # show correlation ellipses - , stars = T - , rug = F - , breaks = "Sturges" - , show.points = T - , bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_ps_df3$duet_outcome))] # can't use colour as duet and foldx are opposite - , pch = 21 # for bg - , cex = 2 - , cex.axis = 1.6 - , cex.labels = 2 - , cex.cor = 1 - , smooth = F - -) - -print(OutPlot_ps_df3) -dev.off() - -################################################################################################ - - -#--------------------------------------- -# generate corr lig plot 1: merged_df2 -#--------------------------------------- -cat("Corr plot lig DUET with coloured dots:", plot_corr_lig_all_df2) -svg(plot_corr_lig_all_df2, width = 30, height = 30) - -OutPlot_lig_df2 = pairs.panels(corr_lig_df2[1:(length(corr_lig_df2)-2)] - , method = "spearman" # correlation method - , hist.col = "grey" ##00AFBB - , density = T # show density plots - , ellipses = F # show correlation elliliges - , stars = T - , rug = F - , breaks = "Sturges" - , show.points = T - , bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_lig_df2$ligand_outcome))] # can't use colour as duet and foldx are opposite - , pch = 21 # for bg - #, pch = 19 - , jitter = T - , alpha = 1 - #, points(pch = 19, col = c("#f8766d", "#00bfc4")) - , cex = 1.8 - , cex.axis = 2 - , cex.labels = 2 - , cex.cor = 1 - , smooth = F) - -print(OutPlot_lig_df2) -dev.off() - -#---------------------------------------------- -# generate corr lig plot 2: merged_df3 -#---------------------------------------------- -cat("Corr plot lig DUET with coloured dots:", plot_corr_lig_all_df3) -svg(plot_corr_lig_all_df3, width = 30, height = 30) - -OutPlot_lig_df3 = pairs.panels(corr_lig_df3[1:(length(corr_lig_df3)-2)] - , method = "spearman" # correlation method - , hist.col = "grey" ##00AFBB - , density = T # show density plots - , ellipses = F # show correlation elliliges - , stars = T - , rug = F - , breaks = "Sturges" - , show.points = T - , bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_lig_df3$ligand_outcome))] # can't use colour as duet and foldx are opposite - , pch = 21 # for bg - , cex = 2 - , cex.axis = 1.6 - , cex.labels = 2 - , cex.cor = 1 - , smooth = F - -) - -print(OutPlot_lig_df3) -dev.off() diff --git a/scripts/plotting/corr_PS_LIG_v2.R b/scripts/plotting/corr_PS_LIG_v2.R deleted file mode 100644 index 908b7d4..0000000 --- a/scripts/plotting/corr_PS_LIG_v2.R +++ /dev/null @@ -1,176 +0,0 @@ -#!/usr/bin/env Rscript -######################################################### -# TASK: Corr plots for PS and Lig - -# Output: 1 svg - -#======================================================================= -# working dir and loading libraries -getwd() -setwd("~/git/LSHTM_analysis/scripts/plotting/") -getwd() - -#source("combining_dfs_plotting.R") -source("my_pairs_panel.R") # with lower panel turned off -source("corr_data.R") - -#======= -# output -#======= -# PS -corrplot_ps_df2 = "corr_PS_df2.svg" -plot_corr_ps_df2 = paste0(plotdir,"/", corrplot_ps_df2) - -corrplot_ps_df3 = "corr_PS_df3.svg" -plot_corr_ps_df3 = paste0(plotdir,"/", corrplot_ps_df3) - -# LIG -corrplot_lig_df2 = "corr_LIG_df2.svg" -plot_corr_lig_df2 = paste0(plotdir,"/", corrplot_lig_df2) - -corrplot_lig_df3 = "corr_LIG_df3.svg" -plot_corr_lig_df3 = paste0(plotdir,"/", corrplot_lig_df3) - -#################################################################### -# end of loading libraries and functions -#################################################################### -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# Data for plots -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -cols_to_drop = c("ASA", "AF_kin") - - -# PS -corr_ps_df2 = corr_ps_df2[!colnames(corr_ps_df2)%in%cols_to_drop] -corr_ps_df2 = corr_ps_df2[-1] - -corr_ps_df3 = corr_ps_df3[!colnames(corr_ps_df3)%in%cols_to_drop] -corr_ps_df3 = corr_ps_df3[-1] - - -# Lig -corr_lig_df2 = corr_lig_df2[!colnames(corr_lig_df2)%in%cols_to_drop] -corr_lig_df2 = corr_lig_df2[-1] - -corr_lig_df3 = corr_lig_df3[!colnames(corr_lig_df3)%in%cols_to_drop] -corr_lig_df3 = corr_lig_df3[-1] - -#--------------------------------------- -# generate corr PS plot 1: merged_df2 -#--------------------------------------- -cat("Corr plot PS DUET with coloured dots:", plot_corr_ps_df2) -svg(plot_corr_ps_df2, width = 30, height = 25) - -OutPlot_ps_df2 = pairs.panels(corr_ps_df2[1:(length(corr_ps_df2)-2)] - , method = "spearman" # correlation method - , hist.col = "grey" ##00AFBB - , density = T # show density plots - , ellipses = F # show correlation ellipses - , stars = T - , rug = F - , breaks = "Sturges" - , show.points = T - , bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_ps_df2$duet_outcome))] # can't use colour as duet and foldx are opposite - , pch = 21 # for bg - #, pch = 19 - , jitter = T - , alpha = 1 - #, points(pch = 19, col = c("#f8766d", "#00bfc4")) - , cex = 1.8 - , cex.axis = 2 - , cex.labels = 3.8 - , cex.cor = 1 - , smooth = F) - -print(OutPlot_ps_df2) -dev.off() - -#---------------------------------------------- -# generate corr PS plot 2: merged_df3 -#---------------------------------------------- -cat("Corr plot PS DUET with coloured dots:", plot_corr_ps_df3) -svg(plot_corr_ps_df3, width = 30, height = 25) - -OutPlot_ps_df3 = pairs.panels(corr_ps_df3[1:(length(corr_ps_df3)-2)] - , method = "spearman" # correlation method - , hist.col = "grey" ##00AFBB - , density = T # show density plots - , ellipses = F # show correlation ellipses - , stars = T - , rug = F - , breaks = "Sturges" - , show.points = T - , bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_ps_df3$duet_outcome))] # can't use colour as duet and foldx are opposite - , pch = 21 # for bg - , cex = 3 - , cex.axis = 1.6 - , cex.labels = 3.8 - , cex.cor = 1 - , smooth = F - -) - -print(OutPlot_ps_df3) -dev.off() - -################################################################################################ - -#--------------------------------------- -# generate corr lig plot 1: merged_df2 -#--------------------------------------- -cat("Corr plot lig DUET with coloured dots:", plot_corr_lig_df2) -svg(plot_corr_lig_df2, width = 30, height = 25) - -OutPlot_lig_df2 = pairs.panels(corr_lig_df2[1:(length(corr_lig_df2)-2)] - , method = "spearman" # correlation method - , hist.col = "grey" ##00AFBB - , density = T # show density plots - , ellipses = F # show correlation elliliges - , stars = T - , rug = F - , breaks = "Sturges" - , show.points = T - , bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_lig_df2$ligand_outcome))] # can't use colour as duet and foldx are opposite - , pch = 21 # for bg - #, pch = 19 - , jitter = T - , alpha = 1 - #, points(pch = 19, col = c("#f8766d", "#00bfc4")) - , cex = 1.8 - , cex.axis = 2 - , cex.labels = 3.8 - , cex.cor = 1 - , smooth = F) - -print(OutPlot_lig_df2) -dev.off() - -#---------------------------------------------- -# generate corr lig plot 2: merged_df3 -#---------------------------------------------- -cat("Corr plot lig DUET with coloured dots:", plot_corr_lig_df3) -svg(plot_corr_lig_df3, width = 30, height = 25) - -OutPlot_lig_df3 = pairs.panels(corr_lig_df3[1:(length(corr_lig_df3)-2)] - , method = "spearman" # correlation method - , hist.col = "grey" ##00AFBB - , density = T # show density plots - , ellipses = F # show correlation elliliges - , stars = T - , rug = F - , breaks = "Sturges" - , show.points = T - , bg = c("#f8766d", "#00bfc4")[unclass(factor(corr_lig_df3$ligand_outcome))] # can't use colour as duet and foldx are opposite - , pch = 21 # for bg - , cex = 3 - , cex.axis = 1.6 - , cex.labels = 3.8 - , cex.cor = 1 - , smooth = F - -) - -print(OutPlot_lig_df3) -dev.off() - diff --git a/scripts/plotting/corr_foldx.R b/scripts/plotting/corr_foldx.R deleted file mode 100644 index c5e9569..0000000 --- a/scripts/plotting/corr_foldx.R +++ /dev/null @@ -1,191 +0,0 @@ -#!/usr/bin/env Rscript -######################################################### -# TASK: Corr plots for PS and Lig - -# Output: 1 svg - -#======================================================================= -# working dir and loading libraries -getwd() -setwd("~/git/LSHTM_analysis/scripts/plotting/") -getwd() - - -source("Header_TT.R") -require(cowplot) -source("combining_dfs_plotting.R") # FIXME: add extra from other plots here - -# should return the following dfs, directories and variables - - -#======= -# output -#======= -# can't combine by cowplot because not ggplots -#corr_plot_combined = "corr_combined.svg" -#plot_corr_plot_combined = paste0(plotdir,"/", corr_plot_combined) - -# PS foldx -corr_foldx = "corr_adjusted_foldx.svg" -plot_corr_foldx = paste0(plotdir,"/", corr_foldx) - -#################################################################### -# end of loading libraries and functions # -######################################################################## - -#%%%%%%%%%%%%%%%%%%%%%%%%% -df_ps = merged_df3 -#%%%%%%%%%%%%%%%%%%%%%%%%% - -rm( merged_df2, merged_df2_comp, merged_df2_lig - , merged_df2_comp_lig - , merged_df3_comp, merged_df3_comp_lig - , my_df_u, my_df_u_lig) - -######################################################################## -# end of data extraction and cleaning for plots # -######################################################################## - -#=========================== -# Data for Correlation plots:foldx -#=========================== -#============================ -# adding foldx scaled values -# scale data b/w -1 and 1 -#============================ -n = which(colnames(df_ps) == "ddg"); n - -my_min = min(df_ps[,n]); my_min -my_max = max(df_ps[,n]); my_max - -df_ps$foldx_scaled = ifelse(df_ps[,n] < 0 - , df_ps[,n]/abs(my_min) - , df_ps[,n]/my_max) -# sanity check -my_min = min(df_ps$foldx_scaled); my_min -my_max = max(df_ps$foldx_scaled); my_max - -if (my_min == -1 && my_max == 1){ - cat("PASS: foldx ddg successfully scaled b/w -1 and 1" - , "\nProceeding with assigning foldx outcome category") -}else{ - cat("FAIL: could not scale foldx ddg values" - , "Aborting!") -} - - -#================================ -# adding foldx outcome category -# ddg<0 = "Stabilising" (-ve) -#================================= - -c1 = table(df_ps$ddg < 0) -df_ps$foldx_outcome = ifelse(df_ps$ddg < 0, "Stabilising", "Destabilising") -c2 = table(df_ps$ddg < 0) - -if ( all(c1 == c2) ){ - cat("PASS: foldx outcome successfully created") -}else{ - cat("FAIL: foldx outcome could not be created. Aborting!") - exit() -} - -table(df_ps$foldx_outcome) - -#====================== -# adding log cols -#====================== -df_ps$log10_or_mychisq = log10(df_ps$or_mychisq) -df_ps$neglog_pval_fisher = -log10(df_ps$pval_fisher) - -df_ps$log10_or_kin = log10(df_ps$or_kin) -df_ps$neglog_pwald_kin = -log10(df_ps$pwald_kin) - -# subset data to generate pairwise correlations -cols_to_select_foldx = c("foldx_scaled" - - , "duet_scaled" - - , "log10_or_mychisq" - , "neglog_pval_fisher" - - , "log10_or_kin" - , "neglog_pwald_kin" - - , "af" - - , "foldx_outcome" - , drug) - -corr_data_foldx = df_ps[, cols_to_select_foldx] - -dim(corr_data_foldx) - -#p_italic = substitute(paste("-Log(", italic('P'), ")"));p_italic -#p_adjusted_italic = substitute(paste("-Log(", italic('P adjusted'), ")"));p_adjusted_italic - -# assign nice colnames (for display) -my_corr_colnames_foldx = c("Foldx" - - ,"DUET" - - , "Log(OR)" - , "-Log(P)" - - , "Log(OR adjusted)" - , "-Log(P wald)" - - , "AF" - - , "foldx_outcome" - , drug) - -length(my_corr_colnames_foldx) - -colnames(corr_data_foldx) -colnames(corr_data_foldx) <- my_corr_colnames_foldx -colnames(corr_data_foldx) - -#----------------- -# generate corr foldx plot -#----------------- -start = 1 -end = which(colnames(corr_data_foldx) == drug); end # should be the last column -offset = 1 - -my_corr_foldx = corr_data_foldx[start:(end-offset)] -head(my_corr_foldx) - -#my_cols = c("#f8766d", "#00bfc4") -# deep blue :#007d85 -# deep red: #ae301e - -cat("Corr plot foldx:", plot_corr_foldx) -svg(plot_corr_foldx, width = 15, height = 15) - -OutPlot_foldx= pairs.panels(my_corr_foldx[1:(length(my_corr_foldx)-1)] - , method = "spearman" # correlation method - , hist.col = "grey" ##00AFBB - , density = TRUE # show density plots - , ellipses = F # show correlation ellipses - , stars = T - , rug = F - , breaks = "Sturges" - , show.points = T - , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr_foldx$foldx_outcome))] - , pch = 21 - , jitter = T - #, alpha = .05 - #, points(pch = 19, col = c("#f8766d", "#00bfc4")) - , cex = 3 - , cex.axis = 2.5 - , cex.labels = 2.1 - , cex.cor = 1 - , smooth = F -) - -print(OutPlot_foldx) -dev.off() - - - diff --git a/scripts/plotting/ggridges_lineage_country.R b/scripts/plotting/ggridges_lineage_country.R deleted file mode 100644 index ef91623..0000000 --- a/scripts/plotting/ggridges_lineage_country.R +++ /dev/null @@ -1,289 +0,0 @@ - - -######################################################### -# 1: Installing and loading required packages -######################################################### - -#source("../Header_TT.R") -install.packages("qqman") -library(qqman) - -source("combining_dfs_plotting.R") -#mcsm_data: raw file, 225, 15 -#merged_df2 = 2201, 35 -#merged_df3 = 205, 35 ("Can't trust non-numerical params') - -#=============================================== -# PLOTS: DUET vs GWAS: non-numerical -# lineage, country_code, etc -# merged_df2: 1592, 35 -#=============================================== - -######################### -# Data for plot -######################### -df = merged_df2 -#df = merged_df2_comp - - -#======================== -# Plot 1a: Lineage barplot -# x = lineage y = No of samples -# col = Lineage -# fill = lineage -#======================== -table(df$lineage) - -# subset only lineages1-4 -sel_lineages = c("lineage1" - , "lineage2" - , "lineage3" - , "lineage4" - #, "lineage5" - #, "lineage6" - #, "lineage7" -) - -# uncomment as necessary -df_lin = subset(df, subset = lineage %in% sel_lineages ) -table(df_lin$lineage) - -#%%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -df <- df_lin -#%%%%%%%%%%%%%%%%%%%%%%%%% - -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -df2 = df -#%%%%%%%%%%%%%%%%%%%%%%%% -df2 = df2%>% - add_count(country_code) - -str(df2$country_code); str(df2$n) - -n = which(colnames(df2) == "n") -colnames(df2)[n] = "count_country" - -table(df2$count_country>100 & df$country_code!= "") -df3 = subset(df2, df2$count_country>100 & df2$country_code != "") - - -#%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT -df = df3 -#%%%%%%%%%%%%%%%%%%%%%%%% - -sample = sum(table(unique(df$id))); sample -table(df$country_code) -tab = sum(table(df$country_code)); tab - - -View(table(df$country_code)) -View(t1) - -############## begin plot -g = ggplot(df, aes(x = lineage)) -g + geom_bar(aes(fill = lineage)) + - theme( axis.text.x = element_text(size = 13 - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = 15 - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = 15) - , axis.title.y = element_text(size = 15) ) + - labs(title = "Lineage" - , x = "Lineage" - , y = "No of samples") - - -#======================== -# Plot 2: DUET, lineage, country_code and or_mychisq -# x = lineage y = DUET -# col = Lineage -# fill = country_code -#======================== -### begin plot -g = ggplot(df, aes(x = country_code - , y = duet_scaled)) -g + geom_point(aes(col = lineage - , size = or_mychisq)) + - theme(axis.text.x = element_text(size = 13 - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = 15 - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = 15) - , axis.title.y = element_text(size = 15) ) + - labs(title = "DUET, country_code, lineage, or_mychisq" - , x = "Lineage" - , y = "DUET (PS)") - - -############# -#======================== -# Plot 3: DUET, lineage, or_mychisq -# x = lineage y = DUET -# col = Lineage -# fill = country_code -#======================== - -### begin plot -table(df$lineage) - -g = ggplot(df_lin, aes(x = lineage - , y = duet_scaled)) -g + geom_point(aes(col = lineage - , size = or_mychisq)) + - theme(axis.text.x = element_text(size = 13 - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = 15 - , angle = 0 - , hjust = 1 - , vjust = 0) - , axis.title.x = element_text(size = 15) - , axis.title.y = element_text(size = 15) ) + - labs(title = "DUET, lineage, or_mychisq" - , x = "Lineage" - , y = "DUET (PS)") - -#======================== -# Plot 4-5: Distributions -# ggrdiges -#======================== - - -#================================================== -my_ats = 15 # axis text size -my_als = 20 # axis label size - -my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4' - #, 'Lineage 5', 'Lineage 6', 'Lineage 7' - ) -names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4' - # , 'lineage5', 'lineage6', 'lineage7' - ) - - -#======================== -# Plot 4: Distribution -# x = duet_scaled -# y = country -# fill = country_code -# facet = lineage -#======================== -# works neatly! - -p1 = ggplot(df, aes(x = duet_scaled - , y = country_code))+ - - #printFile=geom_density_ridges_gradient( - geom_density_ridges_gradient(aes(fill = country_code) - , jittered_points = TRUE - , scale = 3 - , size = 0.3 ) + - facet_wrap( ~lineage - , scales = "free" - , switch = 'x' - , labeller = labeller(lineage = my_labels) - ) + - coord_cartesian( xlim = c(-1, 1)) + - #scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4") - # , name = "DUET" ) + - theme(axis.text.x = element_text(size = my_ats - , angle = 90 - , hjust = 1 - , vjust = 0.4) - - #, axis.text.y = element_blank() - , axis.title.x = element_blank() - , axis.title.y = element_blank() - , axis.ticks.y = element_blank() - , plot.title = element_blank() - , strip.text = element_text(size = my_als) - , legend.text = element_text(size = my_als-5) - , legend.title = element_text(size = my_als) - ) - -p1 - - -#======================== -# Plot 5: Distribution -# x = duet_scaled -# y = country_code -# fill = lineage -# facet = NONE -#======================== -# no facet wrap - -p2 = ggplot(df, aes(x = duet_scaled - , y = country_code))+ - - geom_density_ridges_gradient(aes(fill = factor(lineage)) - , scale = 3 - , size = 0.3 ) + - coord_cartesian( xlim = c(-1, 1)) + - #scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4") - # , name = "DUET" ) + - #scale_fill_continuous(colours = c("darkgreen", "pink", "orange", "brown") - # , name = "lineage" ) + - theme(axis.text.x = element_text(size = my_ats - , angle = 90 - , hjust = 1 - , vjust = 0.4) - - #, axis.text.y = element_blank() - , axis.title.x = element_blank() - , axis.title.y = element_blank() - , axis.ticks.y = element_blank() - , plot.title = element_blank() - , strip.text = element_text(size = my_als) - , legend.text = element_text(size = my_als-5) - , legend.title = element_text(size = my_als) - ) - -p2 - - -#=============== -# lineage only -#================ -#svg(plot_lineage_duet) -p3 = ggplot(df, aes(x = duet_scaled - , y = duet_outcome))+ - geom_density_ridges_gradient(aes(fill = ..x..) - , jittered_points = TRUE - , scale = 3 - , size = 0.3 ) + - facet_wrap( ~lineage - , scales = "free" - #, switch = 'x' - , labeller = labeller(lineage = my_labels) ) + - coord_cartesian( xlim = c(-1, 1)) + - scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4") - , name = "DUET" ) + - theme(axis.text.x = element_text(size = my_ats - , angle = 90 - , hjust = 1 - , vjust = 0.4) - - , axis.text.y = element_blank() - , axis.title.x = element_blank() - , axis.title.y = element_blank() - , axis.ticks.y = element_blank() - , plot.title = element_blank() - , strip.text = element_text(size = my_als) - , legend.text = element_text(size = my_als-5) - , legend.title = element_text(size = my_als) - ) - -print(p3) diff --git a/scripts/prediction_all.R b/scripts/prediction_all.R deleted file mode 100644 index fa91239..0000000 --- a/scripts/prediction_all.R +++ /dev/null @@ -1,426 +0,0 @@ -#!/usr/bin/env Rscript -######################################################### -# TASK: prediction - -#======================================================================= -# working dir and loading libraries -getwd() -setwd("~/git/LSHTM_analysis/scripts/") -getwd() - -source("plotting/combining_dfs_plotting.R") - -#################################################################### -# end of loading libraries and functions -#################################################################### - -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# ps -table(merged_df2$mutation_info) -merged_df2$mutation_info_labels = ifelse(merged_df2$mutation_info == dr_muts_col, 1, 0) -table(merged_df2$mutation_info_labels) - - -table(merged_df3$mutation_info) -merged_df3$mutation_info_labels = ifelse(merged_df3$mutation_info == dr_muts_col, 1, 0) -table(merged_df3$mutation_info_labels) - - -# lig -table(merged_df2_lig$mutation_info) -merged_df2_lig$mutation_info_labels = ifelse(merged_df2_lig$mutation_info == dr_muts_col, 1, 0) -table(merged_df2_lig$mutation_info_labels) - - -table(merged_df3_lig$mutation_info) -merged_df3_lig$mutation_info_labels = ifelse(merged_df3_lig$mutation_info == dr_muts_col, 1, 0) -table(merged_df3_lig$mutation_info_labels) - -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -############################################################################### -model_ind = glm(mutation_info_labels ~ or_mychisq - , data = merged_df2 - , family = "binomial") -summary(model_ind) -nobs(model_ind) - -#============= -# try loop -#============= -my_ivs = c("or_mychisq", "or_kin", "pval_fisher", "af", "duet_stability_change", "duet_scaled") - -for( i in my_ivs){ - cat ("===============================\n") - cat(i) - cat ("\n===============================\n") - print(summary(glm(mutation_info_labels ~ eval(parse(text=i)) - , data = merged_df2 - , family = "binomial"))) -} -############################################################################### -#======================================== -# merged_df2: UNadjusted,loop -#======================================== -my_ivs = c("or_mychisq", "or_kin", "pval_fisher", "af" - , "ligand_distance" - , "rsa" - , "rd_values" - , "kd_values" - , "duet_stability_change" - , "duet_scaled" - , "duet_outcome" - , "ddg" - , "foldx_scaled" - , "foldx_outcome") - -ps_logistic_df2 = data.frame() - -for( i in my_ivs){ - print(i) - - df = data.frame(var_name = NA - , number_samples = NA - , beta = NA - , odds_ratio = NA - , pvalue = NA - , se = NA - , zvalue = NA - , ci_lower = NA - , ci_upper = NA) - - model = glm(mutation_info_labels ~ eval(parse(text=i)) - , data = merged_df2 - , family = "binomial") - - var_name = i - - number_samples = nobs(model) - - beta_logistic = summary(model)$coefficients[2,1]; beta_logistic - - or_logistic = exp(summary(model)$coefficients[2,1]) - - pval_logistic = summary(model)$coefficients[2,4] - - se_logistic = summary(model)$coefficients[2,2] - - zval_logistic = summary(model)$coefficients[2,3] - - ci_mod = exp(confint(model))[2,] - - ci_lower_logistic = ci_mod[["2.5 %"]] - ci_upper_logistic = ci_mod[["97.5 %"]] - - print(c(var_name, beta_logistic, or_logistic, pval_logistic, se_logistic, zval_logistic, ci_mod)) - - df$var_name = var_name - df$number_samples = number_samples - df$beta = beta_logistic - df$odds_ratio = or_logistic - df$pvalue = pval_logistic - df$se = se_logistic - df$zvalue = zval_logistic - df$ci_lower = ci_lower_logistic - df$ci_upper = ci_upper_logistic - - print(df) - ps_logistic_df2 = rbind(ps_logistic_df2, df) - -} -#-------------------- -# formatting df -#-------------------- -ps_logistic_df2$data_source = "df2" -ps_logistic_df2$model = "unadjusted" - -ps_logistic_df2$odds_ratio = round(ps_logistic_df2$odds_ratio, 2) -ps_logistic_df2$ci_lower = round(ps_logistic_df2$ci_lower, 2) -ps_logistic_df2$ci_upper = round(ps_logistic_df2$ci_upper, 2) - -# adding pvalue_signif -ps_logistic_df2$pvalue_signif = ps_logistic_df2$pvalue -str(ps_logistic_df2$pvalue_signif) - -ps_logistic_df2 = dplyr::mutate(ps_logistic_df2 - , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "." - , pvalue_signif <=0.0001 ~ '****' - , pvalue_signif <=0.001 ~ '***' - , pvalue_signif <=0.01 ~ '**' - , pvalue_signif <0.05 ~ '*' - , TRUE ~ 'ns')) -# rearranging columns -colnames(ps_logistic_df2) -ps_logistic_df2_o = ps_logistic_df2 [c("var_name" - , "number_samples" - , "model" - , "odds_ratio" - , "pvalue" - , "pvalue_signif" - , "beta" - , "se" - , "zvalue" - , "ci_lower" - , "ci_upper" - , "data_source")] - -############################################################################### -#======================================== -# merged_df2: adjusted, loop -#======================================== -#model_adjusted = glm(mutation_info_labels ~ or_mychisq + rsa + rd_values + kd_values + -# duet_stability_change + duet_scaled + duet_outcome + ddg + foldx_scaled + foldx_outcome -# , data = merged_df2 -# , family = "binomial") - -model_adjusted_df2 = glm(mutation_info_labels ~ or_mychisq + or_kin + rd_values + kd_values + - ligand_distance + duet_stability_change - , data = merged_df2 - , family = "binomial");summary(model_adjusted_df2) - -var_names_df = as.data.frame(names(model_adjusted_df2$coefficients)) -names(var_names_df) = c("var_name") - -ci_mod = exp(confint(model_adjusted_df2)) -ci_mod_df = as.data.frame(ci_mod) -names(ci_mod_df) = c("ci_lower", "ci_upper") -ci_mod_df$ci_lower = round(ci_mod_df$ci_lower, 2) -ci_mod_df$ci_upper = round(ci_mod_df$ci_upper, 2) - - -estimates_df = as.data.frame(summary(model_adjusted_df2)$coefficients) -names(estimates_df) = c("beta", "se", "zvalue", "pvalue") -estimates_df$odds_ratio = round(exp(estimates_df$beta), 2) -number_samples = nobs(model_adjusted_df2) -estimates_df$number_samples = number_samples -estimates_df$data_source = "df2" -estimates_df$model = "adjusted" - -names(ps_logistic_adjusted_df2) - -if ( all(rownames(estimates_df) == rownames(ci_mod_df)) ){ - cat("PASS: rownames match. Preparing to merge...") - ps_logistic_adjusted_df2 = merge(estimates_df, ci_mod_df, by = "row.names", all = T) -} - - -colnames(ps_logistic_adjusted_df2)[1] <- c("var_name") -d1 = which(ps_logistic_adjusted_df2$var_name == "(Intercept)") -ps_logistic_adjusted_df2 = ps_logistic_adjusted_df2[-d1,] - -# adding pvalue_signif -ps_logistic_adjusted_df2$pvalue_signif = ps_logistic_adjusted_df2$pvalue -str(ps_logistic_adjusted_df2$pvalue_signif) - -ps_logistic_adjusted_df2 = dplyr::mutate(ps_logistic_adjusted_df2 - , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "." - , pvalue_signif <=0.0001 ~ '****' - , pvalue_signif <=0.001 ~ '***' - , pvalue_signif <=0.01 ~ '**' - , pvalue_signif <0.05 ~ '*' - , TRUE ~ 'ns')) -# rearranging columns -colnames(ps_logistic_adjusted_df2) -ps_logistic_adjusted_df2_o = ps_logistic_adjusted_df2[c("var_name" - , "number_samples" - , "model" - , "odds_ratio" - , "pvalue" - , "pvalue_signif" - , "beta" - , "se" - , "zvalue" - , "ci_lower" - , "ci_upper" - , "data_source")] - -############################################################################### - -############################################################################### -#======================================== -# merged_df3: UNadjusted,loop -#======================================== - -my_ivs = c("or_mychisq", "or_kin", "pval_fisher", "af" - , "ligand_distance" - , "rsa" - , "rd_values" - , "kd_values" - , "duet_stability_change" - , "duet_scaled" - , "duet_outcome" - , "ddg" - , "foldx_scaled" - , "foldx_outcome") - -ps_logistic_df3 = data.frame() - -for( i in my_ivs){ - print(i) - - df = data.frame(var_name = NA - , number_samples = NA - , beta = NA - , odds_ratio = NA - , pvalue = NA - , se = NA - , zvalue = NA - , ci_lower = NA - , ci_upper = NA) - - model = glm(mutation_info_labels ~ eval(parse(text=i)) - , data = merged_df3 - , family = "binomial") - - var_name = i - - number_samples = nobs(model) - - beta_logistic = summary(model)$coefficients[2,1]; beta_logistic - - or_logistic = exp(summary(model)$coefficients[2,1]) - - pval_logistic = summary(model)$coefficients[2,4] - - se_logistic = summary(model)$coefficients[2,2] - - zval_logistic = summary(model)$coefficients[2,3] - - ci_mod = exp(confint(model))[2,] - - ci_lower_logistic = ci_mod[["2.5 %"]] - ci_upper_logistic = ci_mod[["97.5 %"]] - - print(c(var_name, beta_logistic, or_logistic, pval_logistic, se_logistic, zval_logistic, ci_mod)) - - df$var_name = var_name - df$number_samples = number_samples - df$beta = beta_logistic - df$odds_ratio = or_logistic - df$pvalue = pval_logistic - df$se = se_logistic - df$zvalue = zval_logistic - df$ci_lower = ci_lower_logistic - df$ci_upper = ci_upper_logistic - - print(df) - ps_logistic_df3 = rbind(ps_logistic_df3, df) - -} -#-------------------- -# formatting df -#-------------------- -ps_logistic_df3$data_source = "df3" -ps_logistic_df3$model = "unadjusted" - -ps_logistic_df3$odds_ratio = round(ps_logistic_df3$odds_ratio, 2) -ps_logistic_df3$ci_lower = round(ps_logistic_df3$ci_lower, 2) -ps_logistic_df3$ci_upper = round(ps_logistic_df3$ci_upper, 2) - -# adding pvalue_signif -ps_logistic_df3$pvalue_signif = ps_logistic_df3$pvalue -str(ps_logistic_df3$pvalue_signif) - -ps_logistic_df3 = dplyr::mutate(ps_logistic_df3 - , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "." - , pvalue_signif <=0.0001 ~ '****' - , pvalue_signif <=0.001 ~ '***' - , pvalue_signif <=0.01 ~ '**' - , pvalue_signif <0.05 ~ '*' - , TRUE ~ 'ns')) -# rearranging columns -ps_logistic_df3_o = ps_logistic_df3[c("var_name" - , "number_samples" - , "model" - , "odds_ratio" - , "pvalue" - , "pvalue_signif" - , "beta" - , "se" - , "zvalue" - , "ci_lower" - , "ci_upper" - , "data_source")] - -#======================================== -# merged_df3: adjusted, loop -#======================================== -#model_adjusted_df3 = glm(mutation_info_labels ~ or_mychisq + rsa + rd_values + kd_values + -# duet_stability_change + duet_scaled + duet_outcome + ddg + foldx_scaled + foldx_outcome -# , data = merged_df3 -# , family = "binomial") - -model_adjusted_df3 = glm(mutation_info_labels ~ rd_values + - ligand_distance + duet_stability_change - , data = merged_df3 - , family = "binomial");summary(model_adjusted_df3) - -var_names_df = as.data.frame(names(model_adjusted_df3$coefficients)) -names(var_names_df) = c("var_name") - -ci_mod = exp(confint(model_adjusted_df3)) -ci_mod_df = as.data.frame(ci_mod) -names(ci_mod_df) = c("ci_lower", "ci_upper") -ci_mod_df$ci_lower = round(ci_mod_df$ci_lower, 2) -ci_mod_df$ci_upper = round(ci_mod_df$ci_upper, 2) - - -estimates_df = as.data.frame(summary(model_adjusted_df3)$coefficients) -names(estimates_df) = c("beta", "se", "zvalue", "pvalue") -estimates_df$odds_ratio = round(exp(estimates_df$beta), 2) -number_samples = nobs(model_adjusted_df3) -estimates_df$number_samples = number_samples -estimates_df$data_source = "df3" -estimates_df$model = "adjusted" - -names(ps_logistic_adjusted_df3) - -if ( all(rownames(estimates_df) == rownames(ci_mod_df)) ){ - cat("PASS: rownames match. Preparing to merge...") - ps_logistic_adjusted_df3 = merge(estimates_df, ci_mod_df, by = "row.names", all = T) -} - -colnames(ps_logistic_adjusted_df3)[1] <- c("var_name") -d2 = which(ps_logistic_adjusted_df3$var_name == "(Intercept)") -ps_logistic_adjusted_df3 = ps_logistic_adjusted_df3[-d2,] - -# adding pvalue_signif -ps_logistic_adjusted_df3$pvalue_signif = ps_logistic_adjusted_df3$pvalue -str(ps_logistic_adjusted_df3$pvalue_signif) - -ps_logistic_adjusted_df3 = dplyr::mutate(ps_logistic_adjusted_df3 - , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "." - , pvalue_signif <=0.0001 ~ '****' - , pvalue_signif <=0.001 ~ '***' - , pvalue_signif <=0.01 ~ '**' - , pvalue_signif <0.05 ~ '*' - , TRUE ~ 'ns')) -# rearranging columns -colnames(ps_logistic_adjusted_df3) -ps_logistic_adjusted_df3_o = ps_logistic_adjusted_df3[c("var_name" - , "number_samples" - , "model" - , "odds_ratio" - , "pvalue" - , "pvalue_signif" - , "beta" - , "se" - , "zvalue" - , "ci_lower" - , "ci_upper" - ,"data_source")] - -#------------- -# lm -#------------- - -model_lm = lm(or_kin ~ rsa + rd_values + duet_stability_change + ddg + mutation_info_labels - , data = merged_df3) - -summary(model_lm) - - -model_lm1 = lm(or_mychisq ~ mutation_info_labels - , data = merged_df2) - -summary(model_lm1) diff --git a/scripts/prediction_lig.R b/scripts/prediction_lig.R deleted file mode 100644 index dd6ee19..0000000 --- a/scripts/prediction_lig.R +++ /dev/null @@ -1,203 +0,0 @@ -#!/usr/bin/env Rscript -######################################################### -# TASK: prediction lig - -#======================================================================= -# working dir and loading libraries -getwd() -setwd("~/git/LSHTM_analysis/scripts/") -getwd() - -source("plotting/combining_dfs_plotting.R") - -#======= -# output -#======= -lig_unadjusted = paste0(outdir, "/results/", tolower(gene), "_unadjusted_logistic_LIG.csv") -lig_adjusted = paste0(outdir, "/results/", tolower(gene), "_adjusted_logistic_LIG.csv") - -#################################################################### -# end of loading libraries and functions -#################################################################### - -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# lig -table(merged_df3_lig$mutation_info) -merged_df3_lig$mutation_info_labels = ifelse(merged_df3_lig$mutation_info == dr_muts_col, 1, 0) -table(merged_df3_lig$mutation_info_labels) - -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -############################################################################### - -#======================================== -# merged_df3_lig: UNadjusted,loop -#======================================== -my_ivs = c("or_mychisq", "or_kin", "pval_fisher", "af" - #, "ligand_distance" - , "rsa" - , "rd_values" - , "kd_values" - , "ligand_affinity_change" - , "affinity_scaled" - , "ligand_outcome") - -lig_logistic_df3 = data.frame() - -for( i in my_ivs){ - print(i) - - df = data.frame(var_name = NA - , number_samples = NA - , beta = NA - , odds_ratio = NA - , pvalue = NA - , se = NA - , zvalue = NA - , ci_lower = NA - , ci_upper = NA) - - model_lig = glm(mutation_info_labels ~ eval(parse(text=i)) - , data = merged_df3_lig - , family = "binomial") - - var_name = i - - number_samples = nobs(model_lig) - - beta_logistic = summary(model_lig)$coefficients[2,1]; beta_logistic - - or_logistic = exp(summary(model_lig)$coefficients[2,1]) - - pval_logistic = summary(model_lig)$coefficients[2,4] - - se_logistic = summary(model_lig)$coefficients[2,2] - - zval_logistic = summary(model_lig)$coefficients[2,3] - - ci_mod = exp(confint(model_lig))[2,] - - ci_lower_logistic = ci_mod[["2.5 %"]] - ci_upper_logistic = ci_mod[["97.5 %"]] - - print(c(var_name, beta_logistic, or_logistic, pval_logistic, se_logistic, zval_logistic, ci_mod)) - - df$var_name = var_name - df$number_samples = number_samples - df$beta = beta_logistic - df$odds_ratio = or_logistic - df$pvalue = pval_logistic - df$se = se_logistic - df$zvalue = zval_logistic - df$ci_lower = ci_lower_logistic - df$ci_upper = ci_upper_logistic - - print(df) - lig_logistic_df3 = rbind(lig_logistic_df3, df) - -} -#-------------------- -# formatting df -#-------------------- -lig_logistic_df3$data_source = "df3_lig" -lig_logistic_df3$model_lig = "unadjusted" - -lig_logistic_df3$odds_ratio = round(lig_logistic_df3$odds_ratio, 2) -lig_logistic_df3$ci_lower = round(lig_logistic_df3$ci_lower, 2) -lig_logistic_df3$ci_upper = round(lig_logistic_df3$ci_upper, 2) - -# adding pvalue_signif -lig_logistic_df3$pvalue_signif = lig_logistic_df3$pvalue -str(lig_logistic_df3$pvalue_signif) - -lig_logistic_df3 = dplyr::mutate(lig_logistic_df3 - , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "." - , pvalue_signif <=0.0001 ~ '****' - , pvalue_signif <=0.001 ~ '***' - , pvalue_signif <=0.01 ~ '**' - , pvalue_signif <0.05 ~ '*' - , TRUE ~ 'ns')) -# rearranging columns -lig_logistic_df3_o = lig_logistic_df3[c("var_name" - , "number_samples" - , "model_lig" - , "odds_ratio" - , "pvalue" - , "pvalue_signif" - , "beta" - , "se" - , "zvalue" - , "ci_lower" - , "ci_upper" - , "data_source")] -# writing file -write.csv(lig_logistic_df3_o, lig_unadjusted, row.names = F) - -#======================================== -# merged_df3_lig: adjusted, loop -#======================================== -#model_lig_adjusted_df3 = glm(mutation_info_labels ~ or_mychisq + rsa + rd_values + kd_values + -# ligand_affinity_change + affinity_scaled + ligand_outcome -# , data = merged_df3_lig -# , family = "binomial") - -model_lig_adjusted_df3 = glm(mutation_info_labels ~ rd_values + ligand_affinity_change - , data = merged_df3_lig - , family = "binomial");summary(model_lig_adjusted_df3) - -var_names_df = as.data.frame(names(model_lig_adjusted_df3$coefficients)) -names(var_names_df) = c("var_name") - -ci_mod = exp(confint(model_lig_adjusted_df3)) -ci_mod_df = as.data.frame(ci_mod) -names(ci_mod_df) = c("ci_lower", "ci_upper") -ci_mod_df$ci_lower = round(ci_mod_df$ci_lower, 2) -ci_mod_df$ci_upper = round(ci_mod_df$ci_upper, 2) - - -estimates_df = as.data.frame(summary(model_lig_adjusted_df3)$coefficients) -names(estimates_df) = c("beta", "se", "zvalue", "pvalue") -estimates_df$odds_ratio = round(exp(estimates_df$beta), 2) -number_samples = nobs(model_lig_adjusted_df3) -estimates_df$number_samples = number_samples -estimates_df$data_source = "df3_lig" -estimates_df$model_lig = "adjusted" - -names(lig_logistic_adjusted_df3) - -if ( all(rownames(estimates_df) == rownames(ci_mod_df)) ){ - cat("PASS: rownames match. Preparing to merge...") - lig_logistic_adjusted_df3 = merge(estimates_df, ci_mod_df, by = "row.names", all = T) -} - -colnames(lig_logistic_adjusted_df3)[1] <- c("var_name") -d2 = which(lig_logistic_adjusted_df3$var_name == "(Intercept)") -lig_logistic_adjusted_df3 = lig_logistic_adjusted_df3[-d2,] - -# adding pvalue_signif -lig_logistic_adjusted_df3$pvalue_signif = lig_logistic_adjusted_df3$pvalue -str(lig_logistic_adjusted_df3$pvalue_signif) - -lig_logistic_adjusted_df3 = dplyr::mutate(lig_logistic_adjusted_df3 - , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "." - , pvalue_signif <=0.0001 ~ '****' - , pvalue_signif <=0.001 ~ '***' - , pvalue_signif <=0.01 ~ '**' - , pvalue_signif <0.05 ~ '*' - , TRUE ~ 'ns')) -# rearranging columns -colnames(lig_logistic_adjusted_df3) -lig_logistic_adjusted_df3_o = lig_logistic_adjusted_df3[c("var_name" - , "number_samples" - , "model_lig" - , "odds_ratio" - , "pvalue" - , "pvalue_signif" - , "beta" - , "se" - , "zvalue" - , "ci_lower" - , "ci_upper" - ,"data_source")] -# writing file -write.csv(lig_logistic_adjusted_df3_o, lig_adjusted, row.names = F) \ No newline at end of file diff --git a/scripts/prediction_ps.R b/scripts/prediction_ps.R deleted file mode 100644 index 605f058..0000000 --- a/scripts/prediction_ps.R +++ /dev/null @@ -1,207 +0,0 @@ -#!/usr/bin/env Rscript -######################################################### -# TASK: prediction_ps - -#======================================================================= -# working dir and loading libraries -getwd() -setwd("~/git/LSHTM_analysis/scripts/") -getwd() - -source("plotting/combining_dfs_plotting.R") - -#======= -# output -#======= -ps_unadjusted = paste0(outdir, "/results/", tolower(gene), "_unadjusted_logistic_PS.csv") -ps_adjusted = paste0(outdir, "/results/", tolower(gene), "_adjusted_logistic_PS.csv") - -#################################################################### -# end of loading libraries and functions -#################################################################### - -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# ps -table(merged_df3$mutation_info) -merged_df3$mutation_info_labels = ifelse(merged_df3$mutation_info == dr_muts_col, 1, 0) -table(merged_df3$mutation_info_labels) - -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -############################################################################### -#======================================== -# merged_df3: UNadjusted,loop -#======================================== -my_ivs = c("or_mychisq", "or_kin", "pval_fisher", "af" - , "ligand_distance" - , "rsa" - , "rd_values" - , "kd_values" - , "duet_stability_change" - , "duet_scaled" - , "duet_outcome" - , "ddg" - , "foldx_scaled" - , "foldx_outcome") - -ps_logistic_df3 = data.frame() - -for( i in my_ivs){ - print(i) - - df = data.frame(var_name = NA - , number_samples = NA - , beta = NA - , odds_ratio = NA - , pvalue = NA - , se = NA - , zvalue = NA - , ci_lower = NA - , ci_upper = NA) - - model = glm(mutation_info_labels ~ eval(parse(text=i)) - , data = merged_df3 - , family = "binomial") - - var_name = i - - number_samples = nobs(model) - - beta_logistic = summary(model)$coefficients[2,1]; beta_logistic - - or_logistic = exp(summary(model)$coefficients[2,1]) - - pval_logistic = summary(model)$coefficients[2,4] - - se_logistic = summary(model)$coefficients[2,2] - - zval_logistic = summary(model)$coefficients[2,3] - - ci_mod = exp(confint(model))[2,] - - ci_lower_logistic = ci_mod[["2.5 %"]] - ci_upper_logistic = ci_mod[["97.5 %"]] - - print(c(var_name, beta_logistic, or_logistic, pval_logistic, se_logistic, zval_logistic, ci_mod)) - - df$var_name = var_name - df$number_samples = number_samples - df$beta = beta_logistic - df$odds_ratio = or_logistic - df$pvalue = pval_logistic - df$se = se_logistic - df$zvalue = zval_logistic - df$ci_lower = ci_lower_logistic - df$ci_upper = ci_upper_logistic - - print(df) - ps_logistic_df3 = rbind(ps_logistic_df3, df) - -} -#-------------------- -# formatting df -#-------------------- -ps_logistic_df3$data_source = "df3" -ps_logistic_df3$model = "unadjusted" - -ps_logistic_df3$odds_ratio = round(ps_logistic_df3$odds_ratio, 2) -ps_logistic_df3$ci_lower = round(ps_logistic_df3$ci_lower, 2) -ps_logistic_df3$ci_upper = round(ps_logistic_df3$ci_upper, 2) - -# adding pvalue_signif -ps_logistic_df3$pvalue_signif = ps_logistic_df3$pvalue -str(ps_logistic_df3$pvalue_signif) - -ps_logistic_df3 = dplyr::mutate(ps_logistic_df3 - , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "." - , pvalue_signif <=0.0001 ~ '****' - , pvalue_signif <=0.001 ~ '***' - , pvalue_signif <=0.01 ~ '**' - , pvalue_signif <0.05 ~ '*' - , TRUE ~ 'ns')) -# rearranging columns -ps_logistic_df3_o = ps_logistic_df3[c("var_name" - , "number_samples" - , "model" - , "odds_ratio" - , "pvalue" - , "pvalue_signif" - , "beta" - , "se" - , "zvalue" - , "ci_lower" - , "ci_upper" - , "data_source")] -# writing file -write.csv(ps_logistic_df3_o, ps_unadjusted, row.names = F) - -#======================================== -# merged_df3: adjusted, loop -#======================================== -#model_adjusted_df3 = glm(mutation_info_labels ~ or_mychisq + rsa + rd_values + kd_values + -# duet_stability_change + duet_scaled + duet_outcome + ddg + foldx_scaled + foldx_outcome -# , data = merged_df3 -# , family = "binomial") - -model_adjusted_df3 = glm(mutation_info_labels ~ rd_values + - ligand_distance + duet_stability_change - , data = merged_df3 - , family = "binomial");summary(model_adjusted_df3) - -var_names_df = as.data.frame(names(model_adjusted_df3$coefficients)) -names(var_names_df) = c("var_name") - -ci_mod = exp(confint(model_adjusted_df3)) -ci_mod_df = as.data.frame(ci_mod) -names(ci_mod_df) = c("ci_lower", "ci_upper") -ci_mod_df$ci_lower = round(ci_mod_df$ci_lower, 2) -ci_mod_df$ci_upper = round(ci_mod_df$ci_upper, 2) - - -estimates_df = as.data.frame(summary(model_adjusted_df3)$coefficients) -names(estimates_df) = c("beta", "se", "zvalue", "pvalue") -estimates_df$odds_ratio = round(exp(estimates_df$beta), 2) -number_samples = nobs(model_adjusted_df3) -estimates_df$number_samples = number_samples -estimates_df$data_source = "df3" -estimates_df$model = "adjusted" - -names(ps_logistic_adjusted_df3) - -if ( all(rownames(estimates_df) == rownames(ci_mod_df)) ){ - cat("PASS: rownames match. Preparing to merge...") - ps_logistic_adjusted_df3 = merge(estimates_df, ci_mod_df, by = "row.names", all = T) -} - -colnames(ps_logistic_adjusted_df3)[1] <- c("var_name") -d2 = which(ps_logistic_adjusted_df3$var_name == "(Intercept)") -ps_logistic_adjusted_df3 = ps_logistic_adjusted_df3[-d2,] - -# adding pvalue_signif -ps_logistic_adjusted_df3$pvalue_signif = ps_logistic_adjusted_df3$pvalue -str(ps_logistic_adjusted_df3$pvalue_signif) - -ps_logistic_adjusted_df3 = dplyr::mutate(ps_logistic_adjusted_df3 - , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "." - , pvalue_signif <=0.0001 ~ '****' - , pvalue_signif <=0.001 ~ '***' - , pvalue_signif <=0.01 ~ '**' - , pvalue_signif <0.05 ~ '*' - , TRUE ~ 'ns')) -# rearranging columns -colnames(ps_logistic_adjusted_df3) -ps_logistic_adjusted_df3_o = ps_logistic_adjusted_df3[c("var_name" - , "number_samples" - , "model" - , "odds_ratio" - , "pvalue" - , "pvalue_signif" - , "beta" - , "se" - , "zvalue" - , "ci_lower" - , "ci_upper" - ,"data_source")] -# writing file -write.csv(ps_logistic_adjusted_df3_o, ps_adjusted, row.names = F) -############################################################################### \ No newline at end of file