From 8a6c7968f5768de5a526fd8b8adcbb892b17e156 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 30 Sep 2020 13:12:05 +0100 Subject: [PATCH] added predictions for ps and lig and output to results --- scripts/prediction.R | 255 ----------------------- scripts/prediction_all.R | 426 +++++++++++++++++++++++++++++++++++++++ scripts/prediction_lig.R | 203 +++++++++++++++++++ scripts/prediction_ps.R | 213 ++++++++++++++++++++ 4 files changed, 842 insertions(+), 255 deletions(-) delete mode 100644 scripts/prediction.R create mode 100644 scripts/prediction_all.R create mode 100644 scripts/prediction_lig.R create mode 100644 scripts/prediction_ps.R diff --git a/scripts/prediction.R b/scripts/prediction.R deleted file mode 100644 index 7d24fbf..0000000 --- a/scripts/prediction.R +++ /dev/null @@ -1,255 +0,0 @@ -#!/usr/bin/env Rscript -######################################################### -# TASK: prediction - -#======================================================================= -# working dir and loading libraries -getwd() -setwd("~/git/LSHTM_analysis/scripts") -getwd() - -source("combining_dfs_plotting.R") -#source("plotting/corr_data.R") - -#======= -# output -#======= - -#################################################################### -# end of loading libraries and functions -#################################################################### - -#%%%%%%%%%%%%%%%%%%%%%%%%% - - -table(merged_df2$mutation_info) -merged_df2$mutation_info_labels = ifelse(merged_df2$mutation_info == dr_muts_col, 1, 0) -table(merged_df2$mutation_info_labels) - - -table(merged_df3$mutation_info) -merged_df3$mutation_info_labels = ifelse(merged_df3$mutation_info == dr_muts_col, 1, 0) -table(merged_df3$mutation_info_labels) - - -# lig -table(merged_df3$mutation_info) -merged_df3$mutation_info_labels = ifelse(merged_df3$mutation_info == dr_muts_col, 1, 0) -table(merged_df3$mutation_info_labels) - - -table(merged_df3$mutation_info) -merged_df3$mutation_info_labels = ifelse(merged_df3$mutation_info == dr_muts_col, 1, 0) -table(merged_df3$mutation_info_labels) - -#%%%%%%%%%%%%%%%%%%%%%%%%% - -############################################################################### -model_ind = glm(mutation_info_labels ~ or_mychisq - , data = merged_df2 - , family = "binomial") -summary(model_ind) -nobs(model_ind) - -#============= -# try loop -#============= - -my_ivs = c("or_mychisq", "or_kin", "pval_fisher", "af", "duet_stability_change", "duet_scaled") - - -for( i in my_ivs){ - cat ("===============================\n") - cat(i) - cat ("\n===============================\n") - print(summary(glm(mutation_info_labels ~ eval(parse(text=i)) - , data = merged_df2 - , family = "binomial"))) -} -############################################################################### -model1 = glm(mutation_info_labels ~ or_mychisq - , data = merged_df2 - , family = "binomial"); summary(model1) - -model2 = glm(mutation_info_labels ~ or_kin - , data = merged_df2 - , family = "binomial"); summary(model2) - -model3 = glm(mutation_info_labels ~ pval_fisher - , data = merged_df2 - , family = "binomial"); summary(model3) - -model4 = glm(mutation_info_labels ~ af - , data = merged_df2 - , family = "binomial"); summary(model4) - -model5 = glm(mutation_info_labels ~ duet_stability_change - , data = merged_df2 - , family = "binomial"); summary(model5) - -model6 = glm(mutation_info_labels ~ duet_scaled - , data = merged_df2 - , family = "binomial"); summary(model6) -############################################################################### -#======================================== -# loop and output in df: individually -#======================================== - -my_ivs = c("or_mychisq", "or_kin", "pval_fisher", "af" - , "rsa" - , "rd_values" - , "kd_values" - , "duet_stability_change" - , "duet_scaled" - , "duet_outcome" - , "ddg" - , "foldx_scaled" - , "foldx_outcome") - -ps_logistic_df2 = data.frame() - -for( i in my_ivs){ - print(i) - - df = data.frame(var_name = NA - , number_samples = NA - , beta = NA - , odds_ratio = NA - , pvalue = NA - , se = NA - , zvalue = NA - , ci_lower = NA - , ci_upper = NA) - - model = glm(mutation_info_labels ~ eval(parse(text=i)) - , data = merged_df2 - , family = "binomial") - - var_name = i - - number_samples = nobs(model) - - beta_logistic = summary(model)$coefficients[2,1]; beta_logistic - - or_logistic = exp(summary(model)$coefficients[2,1]) - - pval_logistic = summary(model)$coefficients[2,4] - - se_logistic = summary(model)$coefficients[2,2] - - zval_logistic = summary(model)$coefficients[2,3] - - ci_mod = exp(confint(model))[2,] - - ci_lower_logistic = ci_mod[["2.5 %"]] - ci_upper_logistic = ci_mod[["97.5 %"]] - - print(c(var_name, beta_logistic, or_logistic, pval_logistic, se_logistic, zval_logistic, ci_mod)) - - df$var_name = var_name - df$number_samples = number_samples - df$beta = beta_logistic - df$odds_ratio = or_logistic - df$pvalue = pval_logistic - df$se = se_logistic - df$zvalue = zval_logistic - df$ci_lower = ci_lower_logistic - df$ci_upper = ci_upper_logistic - - print(df) - ps_logistic_df2 = rbind(ps_logistic_df2, df) - -} -#-------------------- -# formatting df -#-------------------- -ps_logistic_df2$data_source = "df2" - -# adding pvalue_signif -ps_logistic_df2$pvalue_signif = ps_logistic_df2$pvalue -str(ps_logistic_df2$pvalue_signif) - -ps_logistic_df2 = dplyr::mutate(ps_logistic_df2 - , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "." - , pvalue_signif <=0.0001 ~ '****' - , pvalue_signif <=0.001 ~ '***' - , pvalue_signif <=0.01 ~ '**' - , pvalue_signif <0.05 ~ '*' - , TRUE ~ 'ns')) - -############################################################################### -#======================================== -# loop and output in df: adjusted -#======================================== -my_ivs = c("or_mychisq" - , "rsa", "rd_values", "kd_values" - , "duet_stability_change", "duet_scaled", "duet_outcome" - , "ddg", "foldx_scaled", "foldx_outcome") - -model_adjusted = glm(mutation_info_labels ~ or_mychisq + rsa + rd_values + kd_values + - duet_stability_change + duet_scaled + duet_outcome + ddg + foldx_scaled + foldx_outcome - , data = merged_df2 - , family = "binomial") - -var_names = model_adjusted$terms -number_samples = nobs(model_adjusted) -summary(model_adjusted) -ci_mod = exp(confint(model_adjusted)) - - -my_ivs_adjusted = names(model_adjusted$coefficients) - -ps_logistic_adjusted_df2 = data.frame(var_name = NA - , number_samples = NA - , beta = NA - , odds_ratio = NA - , pvalue = NA - , se = NA - , zvalue = NA - , ci_lower = NA - , ci_upper = NA) - - -for (i in my_ivs_adjusted){ - print(i) - - ps_logistic_adjusted_df2$var_name = i - ps_logistic_adjusted_df2$number_samples = number_samples - ps_logistic_adjusted_df2$beta = model_adjusted$coefficients[[i]] - ps_logistic_adjusted_df2$odds_ratio = exp(model_adjusted$coefficients[[i]]) - ps_logistic_adjusted_df2$pvalue = - ps_logistic_adjusted_df2$se = - ps_logistic_adjusted_df2$zvalue = - ps_logistic_adjusted_df2$ci_lower = ci_mod[[i]] - ps_logistic_adjusted_df2$ci_upper = - -} - - - - - -ci_lower_logistic = ci_mod[["2.5 %"]] -ci_upper_logistic = ci_mod[["97.5 %"]] - -pval_logistic = summary(model_adjusted)$coefficients[2,4] - -se_logistic = summary(model_adjusted)$coefficients[2,2] - -zval_logistic = summary(model_adjusted)$coefficients[2,3] - -ci_mod = exp(confint(model_adjusted))[2,] - -ci_lower_logistic = ci_mod[["2.5 %"]] -ci_upper_logistic = ci_mod[["97.5 %"]] - -df$var_name = var_name -df$number_samples = number_samples -df$beta = beta_logistic -df$odds_ratio = or_logistic -df$pvalue = pval_logistic -df$se = se_logistic -df$zvalue = zval_logistic -df$ci_lower = ci_lower_logistic -df$ci_upper = ci_upper_logistic - diff --git a/scripts/prediction_all.R b/scripts/prediction_all.R new file mode 100644 index 0000000..fa91239 --- /dev/null +++ b/scripts/prediction_all.R @@ -0,0 +1,426 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: prediction + +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/") +getwd() + +source("plotting/combining_dfs_plotting.R") + +#################################################################### +# end of loading libraries and functions +#################################################################### + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# ps +table(merged_df2$mutation_info) +merged_df2$mutation_info_labels = ifelse(merged_df2$mutation_info == dr_muts_col, 1, 0) +table(merged_df2$mutation_info_labels) + + +table(merged_df3$mutation_info) +merged_df3$mutation_info_labels = ifelse(merged_df3$mutation_info == dr_muts_col, 1, 0) +table(merged_df3$mutation_info_labels) + + +# lig +table(merged_df2_lig$mutation_info) +merged_df2_lig$mutation_info_labels = ifelse(merged_df2_lig$mutation_info == dr_muts_col, 1, 0) +table(merged_df2_lig$mutation_info_labels) + + +table(merged_df3_lig$mutation_info) +merged_df3_lig$mutation_info_labels = ifelse(merged_df3_lig$mutation_info == dr_muts_col, 1, 0) +table(merged_df3_lig$mutation_info_labels) + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +############################################################################### +model_ind = glm(mutation_info_labels ~ or_mychisq + , data = merged_df2 + , family = "binomial") +summary(model_ind) +nobs(model_ind) + +#============= +# try loop +#============= +my_ivs = c("or_mychisq", "or_kin", "pval_fisher", "af", "duet_stability_change", "duet_scaled") + +for( i in my_ivs){ + cat ("===============================\n") + cat(i) + cat ("\n===============================\n") + print(summary(glm(mutation_info_labels ~ eval(parse(text=i)) + , data = merged_df2 + , family = "binomial"))) +} +############################################################################### +#======================================== +# merged_df2: UNadjusted,loop +#======================================== +my_ivs = c("or_mychisq", "or_kin", "pval_fisher", "af" + , "ligand_distance" + , "rsa" + , "rd_values" + , "kd_values" + , "duet_stability_change" + , "duet_scaled" + , "duet_outcome" + , "ddg" + , "foldx_scaled" + , "foldx_outcome") + +ps_logistic_df2 = data.frame() + +for( i in my_ivs){ + print(i) + + df = data.frame(var_name = NA + , number_samples = NA + , beta = NA + , odds_ratio = NA + , pvalue = NA + , se = NA + , zvalue = NA + , ci_lower = NA + , ci_upper = NA) + + model = glm(mutation_info_labels ~ eval(parse(text=i)) + , data = merged_df2 + , family = "binomial") + + var_name = i + + number_samples = nobs(model) + + beta_logistic = summary(model)$coefficients[2,1]; beta_logistic + + or_logistic = exp(summary(model)$coefficients[2,1]) + + pval_logistic = summary(model)$coefficients[2,4] + + se_logistic = summary(model)$coefficients[2,2] + + zval_logistic = summary(model)$coefficients[2,3] + + ci_mod = exp(confint(model))[2,] + + ci_lower_logistic = ci_mod[["2.5 %"]] + ci_upper_logistic = ci_mod[["97.5 %"]] + + print(c(var_name, beta_logistic, or_logistic, pval_logistic, se_logistic, zval_logistic, ci_mod)) + + df$var_name = var_name + df$number_samples = number_samples + df$beta = beta_logistic + df$odds_ratio = or_logistic + df$pvalue = pval_logistic + df$se = se_logistic + df$zvalue = zval_logistic + df$ci_lower = ci_lower_logistic + df$ci_upper = ci_upper_logistic + + print(df) + ps_logistic_df2 = rbind(ps_logistic_df2, df) + +} +#-------------------- +# formatting df +#-------------------- +ps_logistic_df2$data_source = "df2" +ps_logistic_df2$model = "unadjusted" + +ps_logistic_df2$odds_ratio = round(ps_logistic_df2$odds_ratio, 2) +ps_logistic_df2$ci_lower = round(ps_logistic_df2$ci_lower, 2) +ps_logistic_df2$ci_upper = round(ps_logistic_df2$ci_upper, 2) + +# adding pvalue_signif +ps_logistic_df2$pvalue_signif = ps_logistic_df2$pvalue +str(ps_logistic_df2$pvalue_signif) + +ps_logistic_df2 = dplyr::mutate(ps_logistic_df2 + , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "." + , pvalue_signif <=0.0001 ~ '****' + , pvalue_signif <=0.001 ~ '***' + , pvalue_signif <=0.01 ~ '**' + , pvalue_signif <0.05 ~ '*' + , TRUE ~ 'ns')) +# rearranging columns +colnames(ps_logistic_df2) +ps_logistic_df2_o = ps_logistic_df2 [c("var_name" + , "number_samples" + , "model" + , "odds_ratio" + , "pvalue" + , "pvalue_signif" + , "beta" + , "se" + , "zvalue" + , "ci_lower" + , "ci_upper" + , "data_source")] + +############################################################################### +#======================================== +# merged_df2: adjusted, loop +#======================================== +#model_adjusted = glm(mutation_info_labels ~ or_mychisq + rsa + rd_values + kd_values + +# duet_stability_change + duet_scaled + duet_outcome + ddg + foldx_scaled + foldx_outcome +# , data = merged_df2 +# , family = "binomial") + +model_adjusted_df2 = glm(mutation_info_labels ~ or_mychisq + or_kin + rd_values + kd_values + + ligand_distance + duet_stability_change + , data = merged_df2 + , family = "binomial");summary(model_adjusted_df2) + +var_names_df = as.data.frame(names(model_adjusted_df2$coefficients)) +names(var_names_df) = c("var_name") + +ci_mod = exp(confint(model_adjusted_df2)) +ci_mod_df = as.data.frame(ci_mod) +names(ci_mod_df) = c("ci_lower", "ci_upper") +ci_mod_df$ci_lower = round(ci_mod_df$ci_lower, 2) +ci_mod_df$ci_upper = round(ci_mod_df$ci_upper, 2) + + +estimates_df = as.data.frame(summary(model_adjusted_df2)$coefficients) +names(estimates_df) = c("beta", "se", "zvalue", "pvalue") +estimates_df$odds_ratio = round(exp(estimates_df$beta), 2) +number_samples = nobs(model_adjusted_df2) +estimates_df$number_samples = number_samples +estimates_df$data_source = "df2" +estimates_df$model = "adjusted" + +names(ps_logistic_adjusted_df2) + +if ( all(rownames(estimates_df) == rownames(ci_mod_df)) ){ + cat("PASS: rownames match. Preparing to merge...") + ps_logistic_adjusted_df2 = merge(estimates_df, ci_mod_df, by = "row.names", all = T) +} + + +colnames(ps_logistic_adjusted_df2)[1] <- c("var_name") +d1 = which(ps_logistic_adjusted_df2$var_name == "(Intercept)") +ps_logistic_adjusted_df2 = ps_logistic_adjusted_df2[-d1,] + +# adding pvalue_signif +ps_logistic_adjusted_df2$pvalue_signif = ps_logistic_adjusted_df2$pvalue +str(ps_logistic_adjusted_df2$pvalue_signif) + +ps_logistic_adjusted_df2 = dplyr::mutate(ps_logistic_adjusted_df2 + , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "." + , pvalue_signif <=0.0001 ~ '****' + , pvalue_signif <=0.001 ~ '***' + , pvalue_signif <=0.01 ~ '**' + , pvalue_signif <0.05 ~ '*' + , TRUE ~ 'ns')) +# rearranging columns +colnames(ps_logistic_adjusted_df2) +ps_logistic_adjusted_df2_o = ps_logistic_adjusted_df2[c("var_name" + , "number_samples" + , "model" + , "odds_ratio" + , "pvalue" + , "pvalue_signif" + , "beta" + , "se" + , "zvalue" + , "ci_lower" + , "ci_upper" + , "data_source")] + +############################################################################### + +############################################################################### +#======================================== +# merged_df3: UNadjusted,loop +#======================================== + +my_ivs = c("or_mychisq", "or_kin", "pval_fisher", "af" + , "ligand_distance" + , "rsa" + , "rd_values" + , "kd_values" + , "duet_stability_change" + , "duet_scaled" + , "duet_outcome" + , "ddg" + , "foldx_scaled" + , "foldx_outcome") + +ps_logistic_df3 = data.frame() + +for( i in my_ivs){ + print(i) + + df = data.frame(var_name = NA + , number_samples = NA + , beta = NA + , odds_ratio = NA + , pvalue = NA + , se = NA + , zvalue = NA + , ci_lower = NA + , ci_upper = NA) + + model = glm(mutation_info_labels ~ eval(parse(text=i)) + , data = merged_df3 + , family = "binomial") + + var_name = i + + number_samples = nobs(model) + + beta_logistic = summary(model)$coefficients[2,1]; beta_logistic + + or_logistic = exp(summary(model)$coefficients[2,1]) + + pval_logistic = summary(model)$coefficients[2,4] + + se_logistic = summary(model)$coefficients[2,2] + + zval_logistic = summary(model)$coefficients[2,3] + + ci_mod = exp(confint(model))[2,] + + ci_lower_logistic = ci_mod[["2.5 %"]] + ci_upper_logistic = ci_mod[["97.5 %"]] + + print(c(var_name, beta_logistic, or_logistic, pval_logistic, se_logistic, zval_logistic, ci_mod)) + + df$var_name = var_name + df$number_samples = number_samples + df$beta = beta_logistic + df$odds_ratio = or_logistic + df$pvalue = pval_logistic + df$se = se_logistic + df$zvalue = zval_logistic + df$ci_lower = ci_lower_logistic + df$ci_upper = ci_upper_logistic + + print(df) + ps_logistic_df3 = rbind(ps_logistic_df3, df) + +} +#-------------------- +# formatting df +#-------------------- +ps_logistic_df3$data_source = "df3" +ps_logistic_df3$model = "unadjusted" + +ps_logistic_df3$odds_ratio = round(ps_logistic_df3$odds_ratio, 2) +ps_logistic_df3$ci_lower = round(ps_logistic_df3$ci_lower, 2) +ps_logistic_df3$ci_upper = round(ps_logistic_df3$ci_upper, 2) + +# adding pvalue_signif +ps_logistic_df3$pvalue_signif = ps_logistic_df3$pvalue +str(ps_logistic_df3$pvalue_signif) + +ps_logistic_df3 = dplyr::mutate(ps_logistic_df3 + , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "." + , pvalue_signif <=0.0001 ~ '****' + , pvalue_signif <=0.001 ~ '***' + , pvalue_signif <=0.01 ~ '**' + , pvalue_signif <0.05 ~ '*' + , TRUE ~ 'ns')) +# rearranging columns +ps_logistic_df3_o = ps_logistic_df3[c("var_name" + , "number_samples" + , "model" + , "odds_ratio" + , "pvalue" + , "pvalue_signif" + , "beta" + , "se" + , "zvalue" + , "ci_lower" + , "ci_upper" + , "data_source")] + +#======================================== +# merged_df3: adjusted, loop +#======================================== +#model_adjusted_df3 = glm(mutation_info_labels ~ or_mychisq + rsa + rd_values + kd_values + +# duet_stability_change + duet_scaled + duet_outcome + ddg + foldx_scaled + foldx_outcome +# , data = merged_df3 +# , family = "binomial") + +model_adjusted_df3 = glm(mutation_info_labels ~ rd_values + + ligand_distance + duet_stability_change + , data = merged_df3 + , family = "binomial");summary(model_adjusted_df3) + +var_names_df = as.data.frame(names(model_adjusted_df3$coefficients)) +names(var_names_df) = c("var_name") + +ci_mod = exp(confint(model_adjusted_df3)) +ci_mod_df = as.data.frame(ci_mod) +names(ci_mod_df) = c("ci_lower", "ci_upper") +ci_mod_df$ci_lower = round(ci_mod_df$ci_lower, 2) +ci_mod_df$ci_upper = round(ci_mod_df$ci_upper, 2) + + +estimates_df = as.data.frame(summary(model_adjusted_df3)$coefficients) +names(estimates_df) = c("beta", "se", "zvalue", "pvalue") +estimates_df$odds_ratio = round(exp(estimates_df$beta), 2) +number_samples = nobs(model_adjusted_df3) +estimates_df$number_samples = number_samples +estimates_df$data_source = "df3" +estimates_df$model = "adjusted" + +names(ps_logistic_adjusted_df3) + +if ( all(rownames(estimates_df) == rownames(ci_mod_df)) ){ + cat("PASS: rownames match. Preparing to merge...") + ps_logistic_adjusted_df3 = merge(estimates_df, ci_mod_df, by = "row.names", all = T) +} + +colnames(ps_logistic_adjusted_df3)[1] <- c("var_name") +d2 = which(ps_logistic_adjusted_df3$var_name == "(Intercept)") +ps_logistic_adjusted_df3 = ps_logistic_adjusted_df3[-d2,] + +# adding pvalue_signif +ps_logistic_adjusted_df3$pvalue_signif = ps_logistic_adjusted_df3$pvalue +str(ps_logistic_adjusted_df3$pvalue_signif) + +ps_logistic_adjusted_df3 = dplyr::mutate(ps_logistic_adjusted_df3 + , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "." + , pvalue_signif <=0.0001 ~ '****' + , pvalue_signif <=0.001 ~ '***' + , pvalue_signif <=0.01 ~ '**' + , pvalue_signif <0.05 ~ '*' + , TRUE ~ 'ns')) +# rearranging columns +colnames(ps_logistic_adjusted_df3) +ps_logistic_adjusted_df3_o = ps_logistic_adjusted_df3[c("var_name" + , "number_samples" + , "model" + , "odds_ratio" + , "pvalue" + , "pvalue_signif" + , "beta" + , "se" + , "zvalue" + , "ci_lower" + , "ci_upper" + ,"data_source")] + +#------------- +# lm +#------------- + +model_lm = lm(or_kin ~ rsa + rd_values + duet_stability_change + ddg + mutation_info_labels + , data = merged_df3) + +summary(model_lm) + + +model_lm1 = lm(or_mychisq ~ mutation_info_labels + , data = merged_df2) + +summary(model_lm1) diff --git a/scripts/prediction_lig.R b/scripts/prediction_lig.R new file mode 100644 index 0000000..86eb2aa --- /dev/null +++ b/scripts/prediction_lig.R @@ -0,0 +1,203 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: prediction + +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/") +getwd() + +source("plotting/combining_dfs_plotting.R") + +#======= +# output +#======= +lig_unadjusted = paste0(outdir, "/results/", tolower(gene), "_unadjusted_logistic_LIG.csv") +lig_adjusted = paste0(outdir, "/results/", tolower(gene), "_adjusted_logistic_LIG.csv") + +#################################################################### +# end of loading libraries and functions +#################################################################### + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# lig +table(merged_df3_lig$mutation_info) +merged_df3_lig$mutation_info_labels = ifelse(merged_df3_lig$mutation_info == dr_muts_col, 1, 0) +table(merged_df3_lig$mutation_info_labels) + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +############################################################################### + +#======================================== +# merged_df3_lig: UNadjusted,loop +#======================================== +my_ivs = c("or_mychisq", "or_kin", "pval_fisher", "af" + #, "ligand_distance" + , "rsa" + , "rd_values" + , "kd_values" + , "ligand_affinity_change" + , "affinity_scaled" + , "ligand_outcome") + +lig_logistic_df3 = data.frame() + +for( i in my_ivs){ + print(i) + + df = data.frame(var_name = NA + , number_samples = NA + , beta = NA + , odds_ratio = NA + , pvalue = NA + , se = NA + , zvalue = NA + , ci_lower = NA + , ci_upper = NA) + + model_lig = glm(mutation_info_labels ~ eval(parse(text=i)) + , data = merged_df3_lig + , family = "binomial") + + var_name = i + + number_samples = nobs(model_lig) + + beta_logistic = summary(model_lig)$coefficients[2,1]; beta_logistic + + or_logistic = exp(summary(model_lig)$coefficients[2,1]) + + pval_logistic = summary(model_lig)$coefficients[2,4] + + se_logistic = summary(model_lig)$coefficients[2,2] + + zval_logistic = summary(model_lig)$coefficients[2,3] + + ci_mod = exp(confint(model_lig))[2,] + + ci_lower_logistic = ci_mod[["2.5 %"]] + ci_upper_logistic = ci_mod[["97.5 %"]] + + print(c(var_name, beta_logistic, or_logistic, pval_logistic, se_logistic, zval_logistic, ci_mod)) + + df$var_name = var_name + df$number_samples = number_samples + df$beta = beta_logistic + df$odds_ratio = or_logistic + df$pvalue = pval_logistic + df$se = se_logistic + df$zvalue = zval_logistic + df$ci_lower = ci_lower_logistic + df$ci_upper = ci_upper_logistic + + print(df) + lig_logistic_df3 = rbind(lig_logistic_df3, df) + +} +#-------------------- +# formatting df +#-------------------- +lig_logistic_df3$data_source = "df3_lig" +lig_logistic_df3$model_lig = "unadjusted" + +lig_logistic_df3$odds_ratio = round(lig_logistic_df3$odds_ratio, 2) +lig_logistic_df3$ci_lower = round(lig_logistic_df3$ci_lower, 2) +lig_logistic_df3$ci_upper = round(lig_logistic_df3$ci_upper, 2) + +# adding pvalue_signif +lig_logistic_df3$pvalue_signif = lig_logistic_df3$pvalue +str(lig_logistic_df3$pvalue_signif) + +lig_logistic_df3 = dplyr::mutate(lig_logistic_df3 + , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "." + , pvalue_signif <=0.0001 ~ '****' + , pvalue_signif <=0.001 ~ '***' + , pvalue_signif <=0.01 ~ '**' + , pvalue_signif <0.05 ~ '*' + , TRUE ~ 'ns')) +# rearranging columns +lig_logistic_df3_o = lig_logistic_df3[c("var_name" + , "number_samples" + , "model_lig" + , "odds_ratio" + , "pvalue" + , "pvalue_signif" + , "beta" + , "se" + , "zvalue" + , "ci_lower" + , "ci_upper" + , "data_source")] +# writing file +write.csv(lig_logistic_df3_o, lig_unadjusted, row.names = F) + +#======================================== +# merged_df3_lig: adjusted, loop +#======================================== +#model_lig_adjusted_df3 = glm(mutation_info_labels ~ or_mychisq + rsa + rd_values + kd_values + +# ligand_affinity_change + affinity_scaled + ligand_outcome +# , data = merged_df3_lig +# , family = "binomial") + +model_lig_adjusted_df3 = glm(mutation_info_labels ~ rd_values + ligand_affinity_change + , data = merged_df3_lig + , family = "binomial");summary(model_lig_adjusted_df3) + +var_names_df = as.data.frame(names(model_lig_adjusted_df3$coefficients)) +names(var_names_df) = c("var_name") + +ci_mod = exp(confint(model_lig_adjusted_df3)) +ci_mod_df = as.data.frame(ci_mod) +names(ci_mod_df) = c("ci_lower", "ci_upper") +ci_mod_df$ci_lower = round(ci_mod_df$ci_lower, 2) +ci_mod_df$ci_upper = round(ci_mod_df$ci_upper, 2) + + +estimates_df = as.data.frame(summary(model_lig_adjusted_df3)$coefficients) +names(estimates_df) = c("beta", "se", "zvalue", "pvalue") +estimates_df$odds_ratio = round(exp(estimates_df$beta), 2) +number_samples = nobs(model_lig_adjusted_df3) +estimates_df$number_samples = number_samples +estimates_df$data_source = "df3_lig" +estimates_df$model_lig = "adjusted" + +names(lig_logistic_adjusted_df3) + +if ( all(rownames(estimates_df) == rownames(ci_mod_df)) ){ + cat("PASS: rownames match. Preparing to merge...") + lig_logistic_adjusted_df3 = merge(estimates_df, ci_mod_df, by = "row.names", all = T) +} + +colnames(lig_logistic_adjusted_df3)[1] <- c("var_name") +d2 = which(lig_logistic_adjusted_df3$var_name == "(Intercept)") +lig_logistic_adjusted_df3 = lig_logistic_adjusted_df3[-d2,] + +# adding pvalue_signif +lig_logistic_adjusted_df3$pvalue_signif = lig_logistic_adjusted_df3$pvalue +str(lig_logistic_adjusted_df3$pvalue_signif) + +lig_logistic_adjusted_df3 = dplyr::mutate(lig_logistic_adjusted_df3 + , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "." + , pvalue_signif <=0.0001 ~ '****' + , pvalue_signif <=0.001 ~ '***' + , pvalue_signif <=0.01 ~ '**' + , pvalue_signif <0.05 ~ '*' + , TRUE ~ 'ns')) +# rearranging columns +colnames(lig_logistic_adjusted_df3) +lig_logistic_adjusted_df3_o = lig_logistic_adjusted_df3[c("var_name" + , "number_samples" + , "model_lig" + , "odds_ratio" + , "pvalue" + , "pvalue_signif" + , "beta" + , "se" + , "zvalue" + , "ci_lower" + , "ci_upper" + ,"data_source")] +# writing file +write.csv(lig_logistic_adjusted_df3_o, lig_adjusted, row.names = F) \ No newline at end of file diff --git a/scripts/prediction_ps.R b/scripts/prediction_ps.R new file mode 100644 index 0000000..c9a7239 --- /dev/null +++ b/scripts/prediction_ps.R @@ -0,0 +1,213 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: prediction + +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/") +getwd() + +source("plotting/combining_dfs_plotting.R") + +#======= +# output +#======= +ps_unadjusted = paste0(outdir, "/results/", tolower(gene), "_unadjusted_logistic_PS.csv") +ps_adjusted = paste0(outdir, "/results/", tolower(gene), "_adjusted_logistic_PS.csv") + +#################################################################### +# end of loading libraries and functions +#################################################################### + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# ps +table(merged_df3$mutation_info) +merged_df3$mutation_info_labels = ifelse(merged_df3$mutation_info == dr_muts_col, 1, 0) +table(merged_df3$mutation_info_labels) + + +# lig +table(merged_df3_lig$mutation_info) +merged_df3_lig$mutation_info_labels = ifelse(merged_df3_lig$mutation_info == dr_muts_col, 1, 0) +table(merged_df3_lig$mutation_info_labels) + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +############################################################################### +#======================================== +# merged_df3: UNadjusted,loop +#======================================== +my_ivs = c("or_mychisq", "or_kin", "pval_fisher", "af" + , "ligand_distance" + , "rsa" + , "rd_values" + , "kd_values" + , "duet_stability_change" + , "duet_scaled" + , "duet_outcome" + , "ddg" + , "foldx_scaled" + , "foldx_outcome") + +ps_logistic_df3 = data.frame() + +for( i in my_ivs){ + print(i) + + df = data.frame(var_name = NA + , number_samples = NA + , beta = NA + , odds_ratio = NA + , pvalue = NA + , se = NA + , zvalue = NA + , ci_lower = NA + , ci_upper = NA) + + model = glm(mutation_info_labels ~ eval(parse(text=i)) + , data = merged_df3 + , family = "binomial") + + var_name = i + + number_samples = nobs(model) + + beta_logistic = summary(model)$coefficients[2,1]; beta_logistic + + or_logistic = exp(summary(model)$coefficients[2,1]) + + pval_logistic = summary(model)$coefficients[2,4] + + se_logistic = summary(model)$coefficients[2,2] + + zval_logistic = summary(model)$coefficients[2,3] + + ci_mod = exp(confint(model))[2,] + + ci_lower_logistic = ci_mod[["2.5 %"]] + ci_upper_logistic = ci_mod[["97.5 %"]] + + print(c(var_name, beta_logistic, or_logistic, pval_logistic, se_logistic, zval_logistic, ci_mod)) + + df$var_name = var_name + df$number_samples = number_samples + df$beta = beta_logistic + df$odds_ratio = or_logistic + df$pvalue = pval_logistic + df$se = se_logistic + df$zvalue = zval_logistic + df$ci_lower = ci_lower_logistic + df$ci_upper = ci_upper_logistic + + print(df) + ps_logistic_df3 = rbind(ps_logistic_df3, df) + +} +#-------------------- +# formatting df +#-------------------- +ps_logistic_df3$data_source = "df3" +ps_logistic_df3$model = "unadjusted" + +ps_logistic_df3$odds_ratio = round(ps_logistic_df3$odds_ratio, 2) +ps_logistic_df3$ci_lower = round(ps_logistic_df3$ci_lower, 2) +ps_logistic_df3$ci_upper = round(ps_logistic_df3$ci_upper, 2) + +# adding pvalue_signif +ps_logistic_df3$pvalue_signif = ps_logistic_df3$pvalue +str(ps_logistic_df3$pvalue_signif) + +ps_logistic_df3 = dplyr::mutate(ps_logistic_df3 + , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "." + , pvalue_signif <=0.0001 ~ '****' + , pvalue_signif <=0.001 ~ '***' + , pvalue_signif <=0.01 ~ '**' + , pvalue_signif <0.05 ~ '*' + , TRUE ~ 'ns')) +# rearranging columns +ps_logistic_df3_o = ps_logistic_df3[c("var_name" + , "number_samples" + , "model" + , "odds_ratio" + , "pvalue" + , "pvalue_signif" + , "beta" + , "se" + , "zvalue" + , "ci_lower" + , "ci_upper" + , "data_source")] +# writing file +write.csv(ps_logistic_df3_o, ps_unadjusted, row.names = F) + +#======================================== +# merged_df3: adjusted, loop +#======================================== +#model_adjusted_df3 = glm(mutation_info_labels ~ or_mychisq + rsa + rd_values + kd_values + +# duet_stability_change + duet_scaled + duet_outcome + ddg + foldx_scaled + foldx_outcome +# , data = merged_df3 +# , family = "binomial") + +model_adjusted_df3 = glm(mutation_info_labels ~ rd_values + + ligand_distance + duet_stability_change + , data = merged_df3 + , family = "binomial");summary(model_adjusted_df3) + +var_names_df = as.data.frame(names(model_adjusted_df3$coefficients)) +names(var_names_df) = c("var_name") + +ci_mod = exp(confint(model_adjusted_df3)) +ci_mod_df = as.data.frame(ci_mod) +names(ci_mod_df) = c("ci_lower", "ci_upper") +ci_mod_df$ci_lower = round(ci_mod_df$ci_lower, 2) +ci_mod_df$ci_upper = round(ci_mod_df$ci_upper, 2) + + +estimates_df = as.data.frame(summary(model_adjusted_df3)$coefficients) +names(estimates_df) = c("beta", "se", "zvalue", "pvalue") +estimates_df$odds_ratio = round(exp(estimates_df$beta), 2) +number_samples = nobs(model_adjusted_df3) +estimates_df$number_samples = number_samples +estimates_df$data_source = "df3" +estimates_df$model = "adjusted" + +names(ps_logistic_adjusted_df3) + +if ( all(rownames(estimates_df) == rownames(ci_mod_df)) ){ + cat("PASS: rownames match. Preparing to merge...") + ps_logistic_adjusted_df3 = merge(estimates_df, ci_mod_df, by = "row.names", all = T) +} + +colnames(ps_logistic_adjusted_df3)[1] <- c("var_name") +d2 = which(ps_logistic_adjusted_df3$var_name == "(Intercept)") +ps_logistic_adjusted_df3 = ps_logistic_adjusted_df3[-d2,] + +# adding pvalue_signif +ps_logistic_adjusted_df3$pvalue_signif = ps_logistic_adjusted_df3$pvalue +str(ps_logistic_adjusted_df3$pvalue_signif) + +ps_logistic_adjusted_df3 = dplyr::mutate(ps_logistic_adjusted_df3 + , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "." + , pvalue_signif <=0.0001 ~ '****' + , pvalue_signif <=0.001 ~ '***' + , pvalue_signif <=0.01 ~ '**' + , pvalue_signif <0.05 ~ '*' + , TRUE ~ 'ns')) +# rearranging columns +colnames(ps_logistic_adjusted_df3) +ps_logistic_adjusted_df3_o = ps_logistic_adjusted_df3[c("var_name" + , "number_samples" + , "model" + , "odds_ratio" + , "pvalue" + , "pvalue_signif" + , "beta" + , "se" + , "zvalue" + , "ci_lower" + , "ci_upper" + ,"data_source")] +# writing file +write.csv(ps_logistic_adjusted_df3_o, ps_adjusted, row.names = F) +############################################################################### \ No newline at end of file