LSHTM_analysis/scripts/prediction_lig.R

203 lines
No EOL
8 KiB
R

#!/usr/bin/env Rscript
#########################################################
# TASK: prediction
#=======================================================================
# working dir and loading libraries
getwd()
setwd("~/git/LSHTM_analysis/scripts/")
getwd()
source("plotting/combining_dfs_plotting.R")
#=======
# output
#=======
lig_unadjusted = paste0(outdir, "/results/", tolower(gene), "_unadjusted_logistic_LIG.csv")
lig_adjusted = paste0(outdir, "/results/", tolower(gene), "_adjusted_logistic_LIG.csv")
####################################################################
# end of loading libraries and functions
####################################################################
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# lig
table(merged_df3_lig$mutation_info)
merged_df3_lig$mutation_info_labels = ifelse(merged_df3_lig$mutation_info == dr_muts_col, 1, 0)
table(merged_df3_lig$mutation_info_labels)
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
###############################################################################
#========================================
# merged_df3_lig: UNadjusted,loop
#========================================
my_ivs = c("or_mychisq", "or_kin", "pval_fisher", "af"
#, "ligand_distance"
, "rsa"
, "rd_values"
, "kd_values"
, "ligand_affinity_change"
, "affinity_scaled"
, "ligand_outcome")
lig_logistic_df3 = data.frame()
for( i in my_ivs){
print(i)
df = data.frame(var_name = NA
, number_samples = NA
, beta = NA
, odds_ratio = NA
, pvalue = NA
, se = NA
, zvalue = NA
, ci_lower = NA
, ci_upper = NA)
model_lig = glm(mutation_info_labels ~ eval(parse(text=i))
, data = merged_df3_lig
, family = "binomial")
var_name = i
number_samples = nobs(model_lig)
beta_logistic = summary(model_lig)$coefficients[2,1]; beta_logistic
or_logistic = exp(summary(model_lig)$coefficients[2,1])
pval_logistic = summary(model_lig)$coefficients[2,4]
se_logistic = summary(model_lig)$coefficients[2,2]
zval_logistic = summary(model_lig)$coefficients[2,3]
ci_mod = exp(confint(model_lig))[2,]
ci_lower_logistic = ci_mod[["2.5 %"]]
ci_upper_logistic = ci_mod[["97.5 %"]]
print(c(var_name, beta_logistic, or_logistic, pval_logistic, se_logistic, zval_logistic, ci_mod))
df$var_name = var_name
df$number_samples = number_samples
df$beta = beta_logistic
df$odds_ratio = or_logistic
df$pvalue = pval_logistic
df$se = se_logistic
df$zvalue = zval_logistic
df$ci_lower = ci_lower_logistic
df$ci_upper = ci_upper_logistic
print(df)
lig_logistic_df3 = rbind(lig_logistic_df3, df)
}
#--------------------
# formatting df
#--------------------
lig_logistic_df3$data_source = "df3_lig"
lig_logistic_df3$model_lig = "unadjusted"
lig_logistic_df3$odds_ratio = round(lig_logistic_df3$odds_ratio, 2)
lig_logistic_df3$ci_lower = round(lig_logistic_df3$ci_lower, 2)
lig_logistic_df3$ci_upper = round(lig_logistic_df3$ci_upper, 2)
# adding pvalue_signif
lig_logistic_df3$pvalue_signif = lig_logistic_df3$pvalue
str(lig_logistic_df3$pvalue_signif)
lig_logistic_df3 = dplyr::mutate(lig_logistic_df3
, pvalue_signif = case_when(pvalue_signif == 0.05 ~ "."
, pvalue_signif <=0.0001 ~ '****'
, pvalue_signif <=0.001 ~ '***'
, pvalue_signif <=0.01 ~ '**'
, pvalue_signif <0.05 ~ '*'
, TRUE ~ 'ns'))
# rearranging columns
lig_logistic_df3_o = lig_logistic_df3[c("var_name"
, "number_samples"
, "model_lig"
, "odds_ratio"
, "pvalue"
, "pvalue_signif"
, "beta"
, "se"
, "zvalue"
, "ci_lower"
, "ci_upper"
, "data_source")]
# writing file
write.csv(lig_logistic_df3_o, lig_unadjusted, row.names = F)
#========================================
# merged_df3_lig: adjusted, loop
#========================================
#model_lig_adjusted_df3 = glm(mutation_info_labels ~ or_mychisq + rsa + rd_values + kd_values +
# ligand_affinity_change + affinity_scaled + ligand_outcome
# , data = merged_df3_lig
# , family = "binomial")
model_lig_adjusted_df3 = glm(mutation_info_labels ~ rd_values + ligand_affinity_change
, data = merged_df3_lig
, family = "binomial");summary(model_lig_adjusted_df3)
var_names_df = as.data.frame(names(model_lig_adjusted_df3$coefficients))
names(var_names_df) = c("var_name")
ci_mod = exp(confint(model_lig_adjusted_df3))
ci_mod_df = as.data.frame(ci_mod)
names(ci_mod_df) = c("ci_lower", "ci_upper")
ci_mod_df$ci_lower = round(ci_mod_df$ci_lower, 2)
ci_mod_df$ci_upper = round(ci_mod_df$ci_upper, 2)
estimates_df = as.data.frame(summary(model_lig_adjusted_df3)$coefficients)
names(estimates_df) = c("beta", "se", "zvalue", "pvalue")
estimates_df$odds_ratio = round(exp(estimates_df$beta), 2)
number_samples = nobs(model_lig_adjusted_df3)
estimates_df$number_samples = number_samples
estimates_df$data_source = "df3_lig"
estimates_df$model_lig = "adjusted"
names(lig_logistic_adjusted_df3)
if ( all(rownames(estimates_df) == rownames(ci_mod_df)) ){
cat("PASS: rownames match. Preparing to merge...")
lig_logistic_adjusted_df3 = merge(estimates_df, ci_mod_df, by = "row.names", all = T)
}
colnames(lig_logistic_adjusted_df3)[1] <- c("var_name")
d2 = which(lig_logistic_adjusted_df3$var_name == "(Intercept)")
lig_logistic_adjusted_df3 = lig_logistic_adjusted_df3[-d2,]
# adding pvalue_signif
lig_logistic_adjusted_df3$pvalue_signif = lig_logistic_adjusted_df3$pvalue
str(lig_logistic_adjusted_df3$pvalue_signif)
lig_logistic_adjusted_df3 = dplyr::mutate(lig_logistic_adjusted_df3
, pvalue_signif = case_when(pvalue_signif == 0.05 ~ "."
, pvalue_signif <=0.0001 ~ '****'
, pvalue_signif <=0.001 ~ '***'
, pvalue_signif <=0.01 ~ '**'
, pvalue_signif <0.05 ~ '*'
, TRUE ~ 'ns'))
# rearranging columns
colnames(lig_logistic_adjusted_df3)
lig_logistic_adjusted_df3_o = lig_logistic_adjusted_df3[c("var_name"
, "number_samples"
, "model_lig"
, "odds_ratio"
, "pvalue"
, "pvalue_signif"
, "beta"
, "se"
, "zvalue"
, "ci_lower"
, "ci_upper"
,"data_source")]
# writing file
write.csv(lig_logistic_adjusted_df3_o, lig_adjusted, row.names = F)