#!/usr/bin/env Rscript ######################################################### # TASK: prediction #======================================================================= # working dir and loading libraries getwd() setwd("~/git/LSHTM_analysis/scripts/") getwd() source("plotting/combining_dfs_plotting.R") #======= # output #======= lig_unadjusted = paste0(outdir, "/results/", tolower(gene), "_unadjusted_logistic_LIG.csv") lig_adjusted = paste0(outdir, "/results/", tolower(gene), "_adjusted_logistic_LIG.csv") #################################################################### # end of loading libraries and functions #################################################################### #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # lig table(merged_df3_lig$mutation_info) merged_df3_lig$mutation_info_labels = ifelse(merged_df3_lig$mutation_info == dr_muts_col, 1, 0) table(merged_df3_lig$mutation_info_labels) #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ############################################################################### #======================================== # merged_df3_lig: UNadjusted,loop #======================================== my_ivs = c("or_mychisq", "or_kin", "pval_fisher", "af" #, "ligand_distance" , "rsa" , "rd_values" , "kd_values" , "ligand_affinity_change" , "affinity_scaled" , "ligand_outcome") lig_logistic_df3 = data.frame() for( i in my_ivs){ print(i) df = data.frame(var_name = NA , number_samples = NA , beta = NA , odds_ratio = NA , pvalue = NA , se = NA , zvalue = NA , ci_lower = NA , ci_upper = NA) model_lig = glm(mutation_info_labels ~ eval(parse(text=i)) , data = merged_df3_lig , family = "binomial") var_name = i number_samples = nobs(model_lig) beta_logistic = summary(model_lig)$coefficients[2,1]; beta_logistic or_logistic = exp(summary(model_lig)$coefficients[2,1]) pval_logistic = summary(model_lig)$coefficients[2,4] se_logistic = summary(model_lig)$coefficients[2,2] zval_logistic = summary(model_lig)$coefficients[2,3] ci_mod = exp(confint(model_lig))[2,] ci_lower_logistic = ci_mod[["2.5 %"]] ci_upper_logistic = ci_mod[["97.5 %"]] print(c(var_name, beta_logistic, or_logistic, pval_logistic, se_logistic, zval_logistic, ci_mod)) df$var_name = var_name df$number_samples = number_samples df$beta = beta_logistic df$odds_ratio = or_logistic df$pvalue = pval_logistic df$se = se_logistic df$zvalue = zval_logistic df$ci_lower = ci_lower_logistic df$ci_upper = ci_upper_logistic print(df) lig_logistic_df3 = rbind(lig_logistic_df3, df) } #-------------------- # formatting df #-------------------- lig_logistic_df3$data_source = "df3_lig" lig_logistic_df3$model_lig = "unadjusted" lig_logistic_df3$odds_ratio = round(lig_logistic_df3$odds_ratio, 2) lig_logistic_df3$ci_lower = round(lig_logistic_df3$ci_lower, 2) lig_logistic_df3$ci_upper = round(lig_logistic_df3$ci_upper, 2) # adding pvalue_signif lig_logistic_df3$pvalue_signif = lig_logistic_df3$pvalue str(lig_logistic_df3$pvalue_signif) lig_logistic_df3 = dplyr::mutate(lig_logistic_df3 , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "." , pvalue_signif <=0.0001 ~ '****' , pvalue_signif <=0.001 ~ '***' , pvalue_signif <=0.01 ~ '**' , pvalue_signif <0.05 ~ '*' , TRUE ~ 'ns')) # rearranging columns lig_logistic_df3_o = lig_logistic_df3[c("var_name" , "number_samples" , "model_lig" , "odds_ratio" , "pvalue" , "pvalue_signif" , "beta" , "se" , "zvalue" , "ci_lower" , "ci_upper" , "data_source")] # writing file write.csv(lig_logistic_df3_o, lig_unadjusted, row.names = F) #======================================== # merged_df3_lig: adjusted, loop #======================================== #model_lig_adjusted_df3 = glm(mutation_info_labels ~ or_mychisq + rsa + rd_values + kd_values + # ligand_affinity_change + affinity_scaled + ligand_outcome # , data = merged_df3_lig # , family = "binomial") model_lig_adjusted_df3 = glm(mutation_info_labels ~ rd_values + ligand_affinity_change , data = merged_df3_lig , family = "binomial");summary(model_lig_adjusted_df3) var_names_df = as.data.frame(names(model_lig_adjusted_df3$coefficients)) names(var_names_df) = c("var_name") ci_mod = exp(confint(model_lig_adjusted_df3)) ci_mod_df = as.data.frame(ci_mod) names(ci_mod_df) = c("ci_lower", "ci_upper") ci_mod_df$ci_lower = round(ci_mod_df$ci_lower, 2) ci_mod_df$ci_upper = round(ci_mod_df$ci_upper, 2) estimates_df = as.data.frame(summary(model_lig_adjusted_df3)$coefficients) names(estimates_df) = c("beta", "se", "zvalue", "pvalue") estimates_df$odds_ratio = round(exp(estimates_df$beta), 2) number_samples = nobs(model_lig_adjusted_df3) estimates_df$number_samples = number_samples estimates_df$data_source = "df3_lig" estimates_df$model_lig = "adjusted" names(lig_logistic_adjusted_df3) if ( all(rownames(estimates_df) == rownames(ci_mod_df)) ){ cat("PASS: rownames match. Preparing to merge...") lig_logistic_adjusted_df3 = merge(estimates_df, ci_mod_df, by = "row.names", all = T) } colnames(lig_logistic_adjusted_df3)[1] <- c("var_name") d2 = which(lig_logistic_adjusted_df3$var_name == "(Intercept)") lig_logistic_adjusted_df3 = lig_logistic_adjusted_df3[-d2,] # adding pvalue_signif lig_logistic_adjusted_df3$pvalue_signif = lig_logistic_adjusted_df3$pvalue str(lig_logistic_adjusted_df3$pvalue_signif) lig_logistic_adjusted_df3 = dplyr::mutate(lig_logistic_adjusted_df3 , pvalue_signif = case_when(pvalue_signif == 0.05 ~ "." , pvalue_signif <=0.0001 ~ '****' , pvalue_signif <=0.001 ~ '***' , pvalue_signif <=0.01 ~ '**' , pvalue_signif <0.05 ~ '*' , TRUE ~ 'ns')) # rearranging columns colnames(lig_logistic_adjusted_df3) lig_logistic_adjusted_df3_o = lig_logistic_adjusted_df3[c("var_name" , "number_samples" , "model_lig" , "odds_ratio" , "pvalue" , "pvalue_signif" , "beta" , "se" , "zvalue" , "ci_lower" , "ci_upper" ,"data_source")] # writing file write.csv(lig_logistic_adjusted_df3_o, lig_adjusted, row.names = F)