added prediction.R to do logistic regression
This commit is contained in:
parent
8e16b2635e
commit
c95db27b06
1 changed files with 255 additions and 0 deletions
255
scripts/prediction.R
Normal file
255
scripts/prediction.R
Normal file
|
@ -0,0 +1,255 @@
|
||||||
|
#!/usr/bin/env Rscript
|
||||||
|
#########################################################
|
||||||
|
# TASK: prediction
|
||||||
|
|
||||||
|
#=======================================================================
|
||||||
|
# working dir and loading libraries
|
||||||
|
getwd()
|
||||||
|
setwd("~/git/LSHTM_analysis/scripts")
|
||||||
|
getwd()
|
||||||
|
|
||||||
|
source("combining_dfs_plotting.R")
|
||||||
|
#source("plotting/corr_data.R")
|
||||||
|
|
||||||
|
#=======
|
||||||
|
# output
|
||||||
|
#=======
|
||||||
|
|
||||||
|
####################################################################
|
||||||
|
# end of loading libraries and functions
|
||||||
|
####################################################################
|
||||||
|
|
||||||
|
#%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
|
|
||||||
|
|
||||||
|
table(merged_df2$mutation_info)
|
||||||
|
merged_df2$mutation_info_labels = ifelse(merged_df2$mutation_info == dr_muts_col, 1, 0)
|
||||||
|
table(merged_df2$mutation_info_labels)
|
||||||
|
|
||||||
|
|
||||||
|
table(merged_df3$mutation_info)
|
||||||
|
merged_df3$mutation_info_labels = ifelse(merged_df3$mutation_info == dr_muts_col, 1, 0)
|
||||||
|
table(merged_df3$mutation_info_labels)
|
||||||
|
|
||||||
|
|
||||||
|
# lig
|
||||||
|
table(merged_df3$mutation_info)
|
||||||
|
merged_df3$mutation_info_labels = ifelse(merged_df3$mutation_info == dr_muts_col, 1, 0)
|
||||||
|
table(merged_df3$mutation_info_labels)
|
||||||
|
|
||||||
|
|
||||||
|
table(merged_df3$mutation_info)
|
||||||
|
merged_df3$mutation_info_labels = ifelse(merged_df3$mutation_info == dr_muts_col, 1, 0)
|
||||||
|
table(merged_df3$mutation_info_labels)
|
||||||
|
|
||||||
|
#%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
model_ind = glm(mutation_info_labels ~ or_mychisq
|
||||||
|
, data = merged_df2
|
||||||
|
, family = "binomial")
|
||||||
|
summary(model_ind)
|
||||||
|
nobs(model_ind)
|
||||||
|
|
||||||
|
#=============
|
||||||
|
# try loop
|
||||||
|
#=============
|
||||||
|
|
||||||
|
my_ivs = c("or_mychisq", "or_kin", "pval_fisher", "af", "duet_stability_change", "duet_scaled")
|
||||||
|
|
||||||
|
|
||||||
|
for( i in my_ivs){
|
||||||
|
cat ("===============================\n")
|
||||||
|
cat(i)
|
||||||
|
cat ("\n===============================\n")
|
||||||
|
print(summary(glm(mutation_info_labels ~ eval(parse(text=i))
|
||||||
|
, data = merged_df2
|
||||||
|
, family = "binomial")))
|
||||||
|
}
|
||||||
|
###############################################################################
|
||||||
|
model1 = glm(mutation_info_labels ~ or_mychisq
|
||||||
|
, data = merged_df2
|
||||||
|
, family = "binomial"); summary(model1)
|
||||||
|
|
||||||
|
model2 = glm(mutation_info_labels ~ or_kin
|
||||||
|
, data = merged_df2
|
||||||
|
, family = "binomial"); summary(model2)
|
||||||
|
|
||||||
|
model3 = glm(mutation_info_labels ~ pval_fisher
|
||||||
|
, data = merged_df2
|
||||||
|
, family = "binomial"); summary(model3)
|
||||||
|
|
||||||
|
model4 = glm(mutation_info_labels ~ af
|
||||||
|
, data = merged_df2
|
||||||
|
, family = "binomial"); summary(model4)
|
||||||
|
|
||||||
|
model5 = glm(mutation_info_labels ~ duet_stability_change
|
||||||
|
, data = merged_df2
|
||||||
|
, family = "binomial"); summary(model5)
|
||||||
|
|
||||||
|
model6 = glm(mutation_info_labels ~ duet_scaled
|
||||||
|
, data = merged_df2
|
||||||
|
, family = "binomial"); summary(model6)
|
||||||
|
###############################################################################
|
||||||
|
#========================================
|
||||||
|
# loop and output in df: individually
|
||||||
|
#========================================
|
||||||
|
|
||||||
|
my_ivs = c("or_mychisq", "or_kin", "pval_fisher", "af"
|
||||||
|
, "rsa"
|
||||||
|
, "rd_values"
|
||||||
|
, "kd_values"
|
||||||
|
, "duet_stability_change"
|
||||||
|
, "duet_scaled"
|
||||||
|
, "duet_outcome"
|
||||||
|
, "ddg"
|
||||||
|
, "foldx_scaled"
|
||||||
|
, "foldx_outcome")
|
||||||
|
|
||||||
|
ps_logistic_df2 = data.frame()
|
||||||
|
|
||||||
|
for( i in my_ivs){
|
||||||
|
print(i)
|
||||||
|
|
||||||
|
df = data.frame(var_name = NA
|
||||||
|
, number_samples = NA
|
||||||
|
, beta = NA
|
||||||
|
, odds_ratio = NA
|
||||||
|
, pvalue = NA
|
||||||
|
, se = NA
|
||||||
|
, zvalue = NA
|
||||||
|
, ci_lower = NA
|
||||||
|
, ci_upper = NA)
|
||||||
|
|
||||||
|
model = glm(mutation_info_labels ~ eval(parse(text=i))
|
||||||
|
, data = merged_df2
|
||||||
|
, family = "binomial")
|
||||||
|
|
||||||
|
var_name = i
|
||||||
|
|
||||||
|
number_samples = nobs(model)
|
||||||
|
|
||||||
|
beta_logistic = summary(model)$coefficients[2,1]; beta_logistic
|
||||||
|
|
||||||
|
or_logistic = exp(summary(model)$coefficients[2,1])
|
||||||
|
|
||||||
|
pval_logistic = summary(model)$coefficients[2,4]
|
||||||
|
|
||||||
|
se_logistic = summary(model)$coefficients[2,2]
|
||||||
|
|
||||||
|
zval_logistic = summary(model)$coefficients[2,3]
|
||||||
|
|
||||||
|
ci_mod = exp(confint(model))[2,]
|
||||||
|
|
||||||
|
ci_lower_logistic = ci_mod[["2.5 %"]]
|
||||||
|
ci_upper_logistic = ci_mod[["97.5 %"]]
|
||||||
|
|
||||||
|
print(c(var_name, beta_logistic, or_logistic, pval_logistic, se_logistic, zval_logistic, ci_mod))
|
||||||
|
|
||||||
|
df$var_name = var_name
|
||||||
|
df$number_samples = number_samples
|
||||||
|
df$beta = beta_logistic
|
||||||
|
df$odds_ratio = or_logistic
|
||||||
|
df$pvalue = pval_logistic
|
||||||
|
df$se = se_logistic
|
||||||
|
df$zvalue = zval_logistic
|
||||||
|
df$ci_lower = ci_lower_logistic
|
||||||
|
df$ci_upper = ci_upper_logistic
|
||||||
|
|
||||||
|
print(df)
|
||||||
|
ps_logistic_df2 = rbind(ps_logistic_df2, df)
|
||||||
|
|
||||||
|
}
|
||||||
|
#--------------------
|
||||||
|
# formatting df
|
||||||
|
#--------------------
|
||||||
|
ps_logistic_df2$data_source = "df2"
|
||||||
|
|
||||||
|
# adding pvalue_signif
|
||||||
|
ps_logistic_df2$pvalue_signif = ps_logistic_df2$pvalue
|
||||||
|
str(ps_logistic_df2$pvalue_signif)
|
||||||
|
|
||||||
|
ps_logistic_df2 = dplyr::mutate(ps_logistic_df2
|
||||||
|
, pvalue_signif = case_when(pvalue_signif == 0.05 ~ "."
|
||||||
|
, pvalue_signif <=0.0001 ~ '****'
|
||||||
|
, pvalue_signif <=0.001 ~ '***'
|
||||||
|
, pvalue_signif <=0.01 ~ '**'
|
||||||
|
, pvalue_signif <0.05 ~ '*'
|
||||||
|
, TRUE ~ 'ns'))
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
#========================================
|
||||||
|
# loop and output in df: adjusted
|
||||||
|
#========================================
|
||||||
|
my_ivs = c("or_mychisq"
|
||||||
|
, "rsa", "rd_values", "kd_values"
|
||||||
|
, "duet_stability_change", "duet_scaled", "duet_outcome"
|
||||||
|
, "ddg", "foldx_scaled", "foldx_outcome")
|
||||||
|
|
||||||
|
model_adjusted = glm(mutation_info_labels ~ or_mychisq + rsa + rd_values + kd_values +
|
||||||
|
duet_stability_change + duet_scaled + duet_outcome + ddg + foldx_scaled + foldx_outcome
|
||||||
|
, data = merged_df2
|
||||||
|
, family = "binomial")
|
||||||
|
|
||||||
|
var_names = model_adjusted$terms
|
||||||
|
number_samples = nobs(model_adjusted)
|
||||||
|
summary(model_adjusted)
|
||||||
|
ci_mod = exp(confint(model_adjusted))
|
||||||
|
|
||||||
|
|
||||||
|
my_ivs_adjusted = names(model_adjusted$coefficients)
|
||||||
|
|
||||||
|
ps_logistic_adjusted_df2 = data.frame(var_name = NA
|
||||||
|
, number_samples = NA
|
||||||
|
, beta = NA
|
||||||
|
, odds_ratio = NA
|
||||||
|
, pvalue = NA
|
||||||
|
, se = NA
|
||||||
|
, zvalue = NA
|
||||||
|
, ci_lower = NA
|
||||||
|
, ci_upper = NA)
|
||||||
|
|
||||||
|
|
||||||
|
for (i in my_ivs_adjusted){
|
||||||
|
print(i)
|
||||||
|
|
||||||
|
ps_logistic_adjusted_df2$var_name = i
|
||||||
|
ps_logistic_adjusted_df2$number_samples = number_samples
|
||||||
|
ps_logistic_adjusted_df2$beta = model_adjusted$coefficients[[i]]
|
||||||
|
ps_logistic_adjusted_df2$odds_ratio = exp(model_adjusted$coefficients[[i]])
|
||||||
|
ps_logistic_adjusted_df2$pvalue =
|
||||||
|
ps_logistic_adjusted_df2$se =
|
||||||
|
ps_logistic_adjusted_df2$zvalue =
|
||||||
|
ps_logistic_adjusted_df2$ci_lower = ci_mod[[i]]
|
||||||
|
ps_logistic_adjusted_df2$ci_upper =
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
ci_lower_logistic = ci_mod[["2.5 %"]]
|
||||||
|
ci_upper_logistic = ci_mod[["97.5 %"]]
|
||||||
|
|
||||||
|
pval_logistic = summary(model_adjusted)$coefficients[2,4]
|
||||||
|
|
||||||
|
se_logistic = summary(model_adjusted)$coefficients[2,2]
|
||||||
|
|
||||||
|
zval_logistic = summary(model_adjusted)$coefficients[2,3]
|
||||||
|
|
||||||
|
ci_mod = exp(confint(model_adjusted))[2,]
|
||||||
|
|
||||||
|
ci_lower_logistic = ci_mod[["2.5 %"]]
|
||||||
|
ci_upper_logistic = ci_mod[["97.5 %"]]
|
||||||
|
|
||||||
|
df$var_name = var_name
|
||||||
|
df$number_samples = number_samples
|
||||||
|
df$beta = beta_logistic
|
||||||
|
df$odds_ratio = or_logistic
|
||||||
|
df$pvalue = pval_logistic
|
||||||
|
df$se = se_logistic
|
||||||
|
df$zvalue = zval_logistic
|
||||||
|
df$ci_lower = ci_lower_logistic
|
||||||
|
df$ci_upper = ci_upper_logistic
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue