#!/usr/bin/env Rscript ######################################################### # TASK: prediction #======================================================================= # working dir and loading libraries getwd() setwd("~/git/LSHTM_analysis/scripts") getwd() source("combining_dfs_plotting.R") #source("plotting/corr_data.R") #======= # output #======= #################################################################### # end of loading libraries and functions #################################################################### #%%%%%%%%%%%%%%%%%%%%%%%%% table(merged_df2$mutation_info) merged_df2$mutation_info_labels = ifelse(merged_df2$mutation_info == dr_muts_col, 1, 0) table(merged_df2$mutation_info_labels) table(merged_df3$mutation_info) merged_df3$mutation_info_labels = ifelse(merged_df3$mutation_info == dr_muts_col, 1, 0) table(merged_df3$mutation_info_labels) # lig table(merged_df3$mutation_info) merged_df3$mutation_info_labels = ifelse(merged_df3$mutation_info == dr_muts_col, 1, 0) table(merged_df3$mutation_info_labels) table(merged_df3$mutation_info) merged_df3$mutation_info_labels = ifelse(merged_df3$mutation_info == dr_muts_col, 1, 0) table(merged_df3$mutation_info_labels) #%%%%%%%%%%%%%%%%%%%%%%%%% ############################################################################### model_ind = glm(mutation_info_labels ~ or_mychisq , data = merged_df2 , family = "binomial") summary(model_ind) nobs(model_ind) #============= # try loop #============= my_ivs = c("or_mychisq", "or_kin", "pval_fisher", "af", "duet_stability_change", "duet_scaled") for( i in my_ivs){ cat ("===============================\n") cat(i) cat ("\n===============================\n") print(summary(glm(mutation_info_labels ~ eval(parse(text=i)) , data = merged_df2 , family = "binomial"))) } ############################################################################### model1 = glm(mutation_info_labels ~ or_mychisq , data = merged_df2 , family = "binomial"); summary(model1) model2 = glm(mutation_info_labels ~ or_kin , data = merged_df2 , family = "binomial"); summary(model2) model3 = glm(mutation_info_labels ~ pval_fisher , data = merged_df2 , family = "binomial"); summary(model3) model4 = glm(mutation_info_labels ~ af , data = merged_df2 , family = "binomial"); summary(model4) model5 = glm(mutation_info_labels ~ duet_stability_change , data = merged_df2 , family = "binomial"); summary(model5) model6 = glm(mutation_info_labels ~ duet_scaled , data = merged_df2 , family = "binomial"); summary(model6) ############################################################################### #======================================== # loop and output in df: individually #======================================== my_ivs = c("or_mychisq", "or_kin", "pval_fisher", "af" , "rsa" , "rd_values" , "kd_values" , "duet_stability_change" , "duet_scaled" , "duet_outcome" , "ddg" , "foldx_scaled" , "foldx_outcome") ps_logistic_df2 = data.frame() for( i in my_ivs){ print(i) df = data.frame(var_name = NA , number_samples = NA , beta = NA , odds_ratio = NA , pvalue = NA , se = NA , zvalue = NA , ci_lower = NA , ci_upper = NA) model = glm(mutation_info_labels ~ eval(parse(text=i)) , data = merged_df2 , family = "binomial") var_name = i number_samples = nobs(model) beta_logistic = summary(model)$coefficients[2,1]; beta_logistic or_logistic = exp(summary(model)$coefficients[2,1]) pval_logistic = summary(model)$coefficients[2,4] se_logistic = summary(model)$coefficients[2,2] zval_logistic = summary(model)$coefficients[2,3] ci_mod = exp(confint(model))[2,] ci_lower_logistic = ci_mod[["2.5 %"]] ci_upper_logistic = ci_mod[["97.5 %"]] print(c(var_name, beta_logistic, or_logistic, pval_logistic, se_logistic, zval_logistic, ci_mod)) df$var_name = var_name df$number_samples = number_samples df$beta = beta_logistic df$odds_ratio = or_logistic df$pvalue = pval_logistic df$se = se_logistic df$zvalue = zval_logistic df$ci_lower = ci_lower_logistic df$ci_upper = ci_upper_logistic print(df) ps_logistic_df2 = rbind(ps_logistic_df2, df) } #-------------------- # formatting df #-------------------- ps_logistic_df2$data_source = "df2" # adding pvalue_signif ps_logistic_df2$pvalue_signif = ps_logistic_df2$pvalue str(ps_logistic_df2$pvalue_signif) ps_logistic_df2 = dplyr::mutate(ps_logistic_df2 , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "." , pvalue_signif <=0.0001 ~ '****' , pvalue_signif <=0.001 ~ '***' , pvalue_signif <=0.01 ~ '**' , pvalue_signif <0.05 ~ '*' , TRUE ~ 'ns')) ############################################################################### #======================================== # loop and output in df: adjusted #======================================== my_ivs = c("or_mychisq" , "rsa", "rd_values", "kd_values" , "duet_stability_change", "duet_scaled", "duet_outcome" , "ddg", "foldx_scaled", "foldx_outcome") model_adjusted = glm(mutation_info_labels ~ or_mychisq + rsa + rd_values + kd_values + duet_stability_change + duet_scaled + duet_outcome + ddg + foldx_scaled + foldx_outcome , data = merged_df2 , family = "binomial") var_names = model_adjusted$terms number_samples = nobs(model_adjusted) summary(model_adjusted) ci_mod = exp(confint(model_adjusted)) my_ivs_adjusted = names(model_adjusted$coefficients) ps_logistic_adjusted_df2 = data.frame(var_name = NA , number_samples = NA , beta = NA , odds_ratio = NA , pvalue = NA , se = NA , zvalue = NA , ci_lower = NA , ci_upper = NA) for (i in my_ivs_adjusted){ print(i) ps_logistic_adjusted_df2$var_name = i ps_logistic_adjusted_df2$number_samples = number_samples ps_logistic_adjusted_df2$beta = model_adjusted$coefficients[[i]] ps_logistic_adjusted_df2$odds_ratio = exp(model_adjusted$coefficients[[i]]) ps_logistic_adjusted_df2$pvalue = ps_logistic_adjusted_df2$se = ps_logistic_adjusted_df2$zvalue = ps_logistic_adjusted_df2$ci_lower = ci_mod[[i]] ps_logistic_adjusted_df2$ci_upper = } ci_lower_logistic = ci_mod[["2.5 %"]] ci_upper_logistic = ci_mod[["97.5 %"]] pval_logistic = summary(model_adjusted)$coefficients[2,4] se_logistic = summary(model_adjusted)$coefficients[2,2] zval_logistic = summary(model_adjusted)$coefficients[2,3] ci_mod = exp(confint(model_adjusted))[2,] ci_lower_logistic = ci_mod[["2.5 %"]] ci_upper_logistic = ci_mod[["97.5 %"]] df$var_name = var_name df$number_samples = number_samples df$beta = beta_logistic df$odds_ratio = or_logistic df$pvalue = pval_logistic df$se = se_logistic df$zvalue = zval_logistic df$ci_lower = ci_lower_logistic df$ci_upper = ci_upper_logistic