#!/usr/bin/Rscript getwd() setwd('~/git/mosaic_2020/') getwd() ######################################################################## # TASK: Run regression analysis # clinical params and npa meds ######################################################################## #==================== # Input: source data #==================== source("data_extraction_formatting_clinical.R") # quick sanity checks table(fp_adults_ics$ia_exac_copd==1 & fp_adults_ics$asthma == 1) table(fp_adults_ics$ia_exac_copd==1 & fp_adults_ics$asthma == 1) table(fp_adults_ics_na$ia_exac_copd==1 & fp_adults_ics_na$asthma == 1) table(fp_adults_ics$asthma) if ( length(cols_to_extract) == length(clinical_cols) + length(sig_npa_cols) ){ cat("PASS: extracting clinical and sign npa cols for regression") } else{ cat("FAIL: could not find cols to extract for regression") quit() } #fp_adults_reg = fp_adults_ics[, cols_to_extract] fp_adults_reg = fp_adults_ics[, colnames(fp_adults_ics)%in%cols_to_extract] cols_to_extract[!cols_to_extract%in%colnames(fp_adults_reg)] fp_adults_reg_na = fp_adults_ics_na[, colnames(fp_adults_ics_na)%in%cols_to_extract] #-------------------- # Data reassignment #-------------------- my_data = fp_adults_reg my_data_na = fp_adults_reg_na table(my_data$ia_exac_copd==1 & my_data$asthma == 1) table(my_data_na$ia_exac_copd==1 & my_data_na$asthma == 1) cols_to_omit = c("mosaic", "flustat", "onset_2_initial", "ia_exac_copd", "onsfindeath", "o2_sat_admis") my_clinical_cols = clinical_cols[!clinical_cols%in%cols_to_omit] my_clinical_cols ######################################################################### #============================ # Identifying column types: Reg data #=========================== my_vars = colnames(my_data) lapply(my_data, class) check_int_vars = my_vars[lapply(my_data, class)%in%c("integer")] check_num_vars = my_vars[lapply(my_data, class)%in%c("numeric")] check_charac_vars = my_vars[lapply(my_data, class)%in%c("character")] check_factor_vars = my_vars[lapply(my_data, class)%in%c("factor")] cat("\nNo. of int cols:", length(check_int_vars) , "\nNo. of num cols:", length(check_num_vars) , "\nNo. of char cols:", length(check_charac_vars) , "\nNo. of factor cols:", length(check_factor_vars) ) #======================================= # changing dtypes in cols #======================================= # what I need to be numerical explicitly my_numerical_vars = c("age" , "vl_pfu_ul_npa1" , "los" , "onset2final" , "onsfindeath" , "o2_sat_admis") my_data[my_numerical_vars] <- lapply(my_data[my_numerical_vars], as.numeric) num_vars = my_vars[lapply(my_data, class)%in%c("numeric")] num_vars # what I need to be integer explicitly not_int_vars = c(my_numerical_vars, sig_npa_cols) my_int_vars = my_vars[!my_vars%in%not_int_vars] my_int_vars #my_int_vars = my_clinical_cols[int_vars_extract%in%check_int_vars] #my_int_vars # convert int cols to factor my_data[my_int_vars] <- lapply(my_data[my_int_vars], as.factor) factor_vars = my_vars[lapply(my_data, class)%in%c("factor")] factor_vars check_factor_vars # check again lapply(my_data, class) #====================================================================== my_reg_data = my_data[!colnames(my_data)%in%cols_to_omit] my_reg_params = colnames(my_reg_data) #length(factor_vars) + length(num_vars); ncol(my_reg_data) #my_reg_params = c(factor_vars, num_vars) na_count = sapply(my_reg_data, function(x) sum(is.na(x)));na_count names(na_count)[na_count>0] # check again lapply(my_reg_data, class) # all parasm for reg my_reg_params_DEL = c("age" , "age_bins" , "vl_pfu_ul_npa1" , "los" , "onset2final" #, "onsfindeath" #, "o2_sat_admis" , "death" , "obesity" , "sfluv" , "h1n1v" , "gender" , "asthma" , "ethnicity" , "smoking" , "ia_cxr" , "max_resp_score" , "T1_resp_score" , "com_noasthma" , "T2_resp_score" , "inresp_sev" , "steroid" , "o2_sat_bin" , "onset_initial_bin" , "steroid_ics" , "t1_resp_recoded" , sig_npa_cols) #================= # reg data prepare #================= pv1 = "death" pv2 = "t1_resp_recoded" #reg_params1 = factor_vars[!factor_vars%in%pv1] #reg_params_mixed = my_vars[!my_vars%in%pv1] ######################################################################## # outcome: death ######################################################################## #----------------------------- # outcome: death + obesity # data: fp adults #----------------------------- my_reg_params1 = my_reg_params[!my_reg_params%in%c("death", "obesity")] sink(file = "reg_output_out1.txt", append = T) for(i in my_reg_params1) { #print (i) p_form = as.formula(paste("death ~ obesity + ", i ,sep = "")) print(p_form) model_reg = glm(p_form , family = binomial, data = my_reg_data) print(summary(model_reg)) print(exp(cbind(OR = coef(model_reg), confint(model_reg)))) #print (PseudoR2(model_reg)) print(nobs(model_reg)) cat("=================================================================================\n") } sink() #----------------------------- # outcome: death + obesity # data: fp adults #----------------------------- my_reg_params2 = my_reg_params[!my_reg_params%in%c( "death" , "obesity" , "asthma")] sink(file = "reg_output_out1_ob_as.txt", append = T) for(i in my_reg_params2) { #print (i) p_form = as.formula(paste("death ~ obesity + asthma +", i ,sep = "")) print(p_form) model_reg = glm(p_form , family = binomial, data = my_reg_data) print(summary(model_reg)) print(exp(cbind(OR = coef(model_reg), confint(model_reg)))) #print (PseudoR2(model_reg)) print(nobs(model_reg)) cat("=================================================================================\n") } sink() #------------------- # Full model #------------------- full_mod = glm(death ~ obesity + asthma + #age + age_bins + #t1_resp_recoded + com_noasthma + eotaxin3_npa1 + il1_npa1+ il8_2_npa1+ il12p70_npa1 , family = "binomial" , data = my_reg_data) print(summary(full_mod)) print(exp(cbind(OR = coef(full_mod), confint(full_mod)))) print(nobs(full_mod)) cat("=================================================================================\n") ######################################################################## # outcome: t1_resp_recoded ######################################################################## #----------------------------- # outcome: t1_resp_recoded + obesity # data: fp adults #----------------------------- my_reg_params3 = my_reg_params[!my_reg_params%in%c("t1_resp_recoded" , "T1_resp_score" , "death" , "obesity")] sink(file = "reg_output_out2.txt", append = T) for(i in my_reg_params3) { #print (i) p_form = as.formula(paste("t1_resp_recoded ~ obesity + ", i ,sep = "")) print(p_form) model_reg = glm(p_form , family = binomial, data = my_reg_data) print(summary(model_reg)) print(exp(cbind(OR = coef(model_reg), confint(model_reg)))) #print (PseudoR2(model_reg)) print(nobs(model_reg)) cat("=================================================================================\n") } sink() #----------------------------- # outcome: t1_resp_recoded + obesity # data: fp adults #----------------------------- my_reg_params4 = my_reg_params[!my_reg_params%in%c( "t1_resp_recoded" , "T1_resp_score" , "obesity" , "death" , "asthma")] sink(file = "reg_output_out2_ob_as.txt", append = T) for(i in my_reg_params4) { #print (i) p_form = as.formula(paste("t1_resp_recoded ~ obesity + asthma +", i ,sep = "")) print(p_form) model_reg = glm(p_form , family = binomial, data = my_reg_data) print(summary(model_reg)) print(exp(cbind(OR = coef(model_reg), confint(model_reg)))) #print (PseudoR2(model_reg)) print(nobs(model_reg)) cat("=================================================================================\n") } sink() #------------------- # Full model #------------------- #full_mod2 = glm(death ~ obesity + # asthma + # #age + # #age_bins + # # # , family = "binomial" # , data = my_reg_data) # #print(summary(full_mod2)) #print(exp(cbind(OR = coef(full_mod2), confint(full_mod2)))) #print(nobs(full_mod2)) cat("=================================================================================\n") model = glm(death ~ obesity, family = binomial, data = my_reg_data) summary(model); nobs(model) model2 = glm(t1_resp_recoded ~ obesity, family = binomial, data = my_reg_data) summary(model2); nobs(model2) model3 = glm(t1_resp_recoded ~ obesity + ia_cxr+ com_noasthma , family = binomial, data = my_reg_data) summary(model3); nobs(model3) model4 = glm(death ~ obesity + ia_cxr + com_noasthma , family = binomial, data = my_reg_data) summary(model4); nobs(model4)