From a77b472dfaa4f5bdbd42c04b81d098aa9ba5d195 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 30 Sep 2020 10:04:49 +0100 Subject: [PATCH] added prediction.R to do logistic regression --- scripts/prediction.R | 255 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 255 insertions(+) create mode 100644 scripts/prediction.R diff --git a/scripts/prediction.R b/scripts/prediction.R new file mode 100644 index 0000000..7d24fbf --- /dev/null +++ b/scripts/prediction.R @@ -0,0 +1,255 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: prediction + +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts") +getwd() + +source("combining_dfs_plotting.R") +#source("plotting/corr_data.R") + +#======= +# output +#======= + +#################################################################### +# end of loading libraries and functions +#################################################################### + +#%%%%%%%%%%%%%%%%%%%%%%%%% + + +table(merged_df2$mutation_info) +merged_df2$mutation_info_labels = ifelse(merged_df2$mutation_info == dr_muts_col, 1, 0) +table(merged_df2$mutation_info_labels) + + +table(merged_df3$mutation_info) +merged_df3$mutation_info_labels = ifelse(merged_df3$mutation_info == dr_muts_col, 1, 0) +table(merged_df3$mutation_info_labels) + + +# lig +table(merged_df3$mutation_info) +merged_df3$mutation_info_labels = ifelse(merged_df3$mutation_info == dr_muts_col, 1, 0) +table(merged_df3$mutation_info_labels) + + +table(merged_df3$mutation_info) +merged_df3$mutation_info_labels = ifelse(merged_df3$mutation_info == dr_muts_col, 1, 0) +table(merged_df3$mutation_info_labels) + +#%%%%%%%%%%%%%%%%%%%%%%%%% + +############################################################################### +model_ind = glm(mutation_info_labels ~ or_mychisq + , data = merged_df2 + , family = "binomial") +summary(model_ind) +nobs(model_ind) + +#============= +# try loop +#============= + +my_ivs = c("or_mychisq", "or_kin", "pval_fisher", "af", "duet_stability_change", "duet_scaled") + + +for( i in my_ivs){ + cat ("===============================\n") + cat(i) + cat ("\n===============================\n") + print(summary(glm(mutation_info_labels ~ eval(parse(text=i)) + , data = merged_df2 + , family = "binomial"))) +} +############################################################################### +model1 = glm(mutation_info_labels ~ or_mychisq + , data = merged_df2 + , family = "binomial"); summary(model1) + +model2 = glm(mutation_info_labels ~ or_kin + , data = merged_df2 + , family = "binomial"); summary(model2) + +model3 = glm(mutation_info_labels ~ pval_fisher + , data = merged_df2 + , family = "binomial"); summary(model3) + +model4 = glm(mutation_info_labels ~ af + , data = merged_df2 + , family = "binomial"); summary(model4) + +model5 = glm(mutation_info_labels ~ duet_stability_change + , data = merged_df2 + , family = "binomial"); summary(model5) + +model6 = glm(mutation_info_labels ~ duet_scaled + , data = merged_df2 + , family = "binomial"); summary(model6) +############################################################################### +#======================================== +# loop and output in df: individually +#======================================== + +my_ivs = c("or_mychisq", "or_kin", "pval_fisher", "af" + , "rsa" + , "rd_values" + , "kd_values" + , "duet_stability_change" + , "duet_scaled" + , "duet_outcome" + , "ddg" + , "foldx_scaled" + , "foldx_outcome") + +ps_logistic_df2 = data.frame() + +for( i in my_ivs){ + print(i) + + df = data.frame(var_name = NA + , number_samples = NA + , beta = NA + , odds_ratio = NA + , pvalue = NA + , se = NA + , zvalue = NA + , ci_lower = NA + , ci_upper = NA) + + model = glm(mutation_info_labels ~ eval(parse(text=i)) + , data = merged_df2 + , family = "binomial") + + var_name = i + + number_samples = nobs(model) + + beta_logistic = summary(model)$coefficients[2,1]; beta_logistic + + or_logistic = exp(summary(model)$coefficients[2,1]) + + pval_logistic = summary(model)$coefficients[2,4] + + se_logistic = summary(model)$coefficients[2,2] + + zval_logistic = summary(model)$coefficients[2,3] + + ci_mod = exp(confint(model))[2,] + + ci_lower_logistic = ci_mod[["2.5 %"]] + ci_upper_logistic = ci_mod[["97.5 %"]] + + print(c(var_name, beta_logistic, or_logistic, pval_logistic, se_logistic, zval_logistic, ci_mod)) + + df$var_name = var_name + df$number_samples = number_samples + df$beta = beta_logistic + df$odds_ratio = or_logistic + df$pvalue = pval_logistic + df$se = se_logistic + df$zvalue = zval_logistic + df$ci_lower = ci_lower_logistic + df$ci_upper = ci_upper_logistic + + print(df) + ps_logistic_df2 = rbind(ps_logistic_df2, df) + +} +#-------------------- +# formatting df +#-------------------- +ps_logistic_df2$data_source = "df2" + +# adding pvalue_signif +ps_logistic_df2$pvalue_signif = ps_logistic_df2$pvalue +str(ps_logistic_df2$pvalue_signif) + +ps_logistic_df2 = dplyr::mutate(ps_logistic_df2 + , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "." + , pvalue_signif <=0.0001 ~ '****' + , pvalue_signif <=0.001 ~ '***' + , pvalue_signif <=0.01 ~ '**' + , pvalue_signif <0.05 ~ '*' + , TRUE ~ 'ns')) + +############################################################################### +#======================================== +# loop and output in df: adjusted +#======================================== +my_ivs = c("or_mychisq" + , "rsa", "rd_values", "kd_values" + , "duet_stability_change", "duet_scaled", "duet_outcome" + , "ddg", "foldx_scaled", "foldx_outcome") + +model_adjusted = glm(mutation_info_labels ~ or_mychisq + rsa + rd_values + kd_values + + duet_stability_change + duet_scaled + duet_outcome + ddg + foldx_scaled + foldx_outcome + , data = merged_df2 + , family = "binomial") + +var_names = model_adjusted$terms +number_samples = nobs(model_adjusted) +summary(model_adjusted) +ci_mod = exp(confint(model_adjusted)) + + +my_ivs_adjusted = names(model_adjusted$coefficients) + +ps_logistic_adjusted_df2 = data.frame(var_name = NA + , number_samples = NA + , beta = NA + , odds_ratio = NA + , pvalue = NA + , se = NA + , zvalue = NA + , ci_lower = NA + , ci_upper = NA) + + +for (i in my_ivs_adjusted){ + print(i) + + ps_logistic_adjusted_df2$var_name = i + ps_logistic_adjusted_df2$number_samples = number_samples + ps_logistic_adjusted_df2$beta = model_adjusted$coefficients[[i]] + ps_logistic_adjusted_df2$odds_ratio = exp(model_adjusted$coefficients[[i]]) + ps_logistic_adjusted_df2$pvalue = + ps_logistic_adjusted_df2$se = + ps_logistic_adjusted_df2$zvalue = + ps_logistic_adjusted_df2$ci_lower = ci_mod[[i]] + ps_logistic_adjusted_df2$ci_upper = + +} + + + + + +ci_lower_logistic = ci_mod[["2.5 %"]] +ci_upper_logistic = ci_mod[["97.5 %"]] + +pval_logistic = summary(model_adjusted)$coefficients[2,4] + +se_logistic = summary(model_adjusted)$coefficients[2,2] + +zval_logistic = summary(model_adjusted)$coefficients[2,3] + +ci_mod = exp(confint(model_adjusted))[2,] + +ci_lower_logistic = ci_mod[["2.5 %"]] +ci_upper_logistic = ci_mod[["97.5 %"]] + +df$var_name = var_name +df$number_samples = number_samples +df$beta = beta_logistic +df$odds_ratio = or_logistic +df$pvalue = pval_logistic +df$se = se_logistic +df$zvalue = zval_logistic +df$ci_lower = ci_lower_logistic +df$ci_upper = ci_upper_logistic +