diff --git a/data_extraction_formatting_clinical.R b/data_extraction_formatting_clinical.R new file mode 100644 index 0000000..c6c5add --- /dev/null +++ b/data_extraction_formatting_clinical.R @@ -0,0 +1,323 @@ +#!/usr/bin/Rscript +getwd() +setwd('~/git/mosaic_2020/') +getwd() +######################################################################## +# TASK: Extract relevant columns from mosaic adults data +# npa +######################################################################## +#==================== +# Input: source data +#==================== +source("read_data.R") + +# extract the flu positive population +fp_adults = adult_df[adult_df$flustat == 1,] + +table(adult_df$ia_exac_copd) +table(adult_df$ia_exac_copd==1 & adult_df$asthma == 1) # check this is 4 + +table(fp_adults$ia_exac_copd==1 & fp_adults$asthma == 1) # check this is 3 + +# clear unnecessary variables +rm(all_df) +rm(adult_df) +######################################################################## +cols_to_extract = c("mosaic" + , "ia_exac_copd" + , "death" + #, "obese2" #inc peaeds, but once you subset data for adults, its the same! + , "obesity" + , "flustat" + , "sfluv" + , "h1n1v" + , "age" + , "gender" + , "asthma" + , "vl_pfu_ul_npa1" + , "los" + , "onset2final" + , "onsfindeath" + , "onset_2_initial" + , "o2_sat_admis" + , "o2_sat_suppl" + , "ethnicity" + , "smoking" + , "ia_cxr" + , "max_resp_score" + , "T1_resp_score" + , "com_noasthma" + , "T2_resp_score" + , "inresp_sev" + , "steroid") + +#npa_data = + +reg_data = fp_adults[, cols_to_extract] + +# sanity checks +table(reg_data$obesity) +#table(reg_data$obese2) + +table(reg_data$age>=18) +table(reg_data$death) +table(reg_data$asthma) +table(reg_data$ia_exac_copd) + +######################################################################## +# Reassign the copd and asthma status and do some checks +table(reg_data$ia_exac_copd); sum(is.na(reg_data$ia_exac_copd)) + +reg_data$ia_exac_copd[reg_data$ia_exac_copd< 1]<- 0 +reg_data$ia_exac_copd[is.na(reg_data$ia_exac_copd)] <- 0 + +table(reg_data$ia_exac_copd); sum(is.na(reg_data$ia_exac_copd)) + +# check copd and asthma status +table(reg_data$ia_exac_copd, reg_data$asthma) +check_copd_and_asthma_1<- subset(reg_data, ia_exac_copd ==1 & asthma == 1) # check this is 3 + +# reassign these 4 so these are treated as non-asthmatics as copd with asthma is not TRUE asthma +reg_data$asthma[reg_data$ia_exac_copd == 1 & reg_data$asthma == 1]= 0 +table(reg_data$ia_exac_copd, reg_data$asthma) + +foo<- subset(reg_data, asthma==1 & ia_exac_copd ==1) # check that its 0 + +rm(check_copd_and_asthma_1, foo) +#===================================================================== +# count the resp scores +max_resp_score_table<- table(reg_data$max_resp_score) +max_resp_score_table + +T1_resp_score_table<- table(reg_data$T1_resp_score) +T1_resp_score_table + +T2_resp_score_table<- table(reg_data$T2_resp_score) +T2_resp_score_table + +Inresp_sev<- table(reg_data$inresp_sev) +Inresp_sev + +# Reassign the resp score so all 4 are replace by 3 +reg_data$max_resp_score[reg_data$max_resp_score ==4 ] <- 3 +revised_resp_score_table<- table(reg_data$max_resp_score) +revised_resp_score_table + +reg_data$T1_resp_score[reg_data$T1_resp_score ==4 ] <- 3 +revised_T1_resp_score_table<- table(reg_data$T1_resp_score) +revised_T1_resp_score_table + +reg_data$T2_resp_score[reg_data$T2_resp_score == 4]<- 3 +revised_T2_resp_score_table<- table(reg_data$T2_resp_score) +revised_T2_resp_score_table + +reg_data$inresp_sev[reg_data$inresp_sev == 4]<- 3 +revised_Inresp_sev<- table(reg_data$inresp_sev) +revised_Inresp_sev +#===================================================================== +# Remove these after checking +rm(max_resp_score_table, T1_resp_score_table, T2_resp_score_table, Inresp_sev + , revised_resp_score_table, revised_T1_resp_score_table, revised_T2_resp_score_table, revised_Inresp_sev) +#===================================================================== + + +##### age +# Create categories of variables +reg_data$age = round(reg_data$age, digits = 0) +table(reg_data$age) +table(reg_data$asthma, reg_data$age) +min(reg_data$age); max(reg_data$age) + +library(plyr) +max_age_interval = round_any(max(reg_data$age), 10, f = ceiling) +max_age_interval + +#age_bins = cut(reg_data$age, c(0,18,30,40,50,60,70,80,90)) +age_bins = cut(reg_data$age, c(18, 30, 40, 50, 60, 70, max_age_interval)) +reg_data$age_bins = age_bins +dim(reg_data) # 133 27 + +#age_bins (to keep consistent with the results table) +class(reg_data$age_bins) +levels(reg_data$age_bins) +#"(18,30]" "(30,40]" "(40,50]" "(50,60]" "(60,70]" "(70,80]" +table(reg_data$asthma, reg_data$age_bins) +# (18,30] (30,40] (40,50] (50,60] (60,70] (70,80] +#0 25 17 23 14 10 1 +#1 11 8 14 5 3 2 + +sum(table(reg_data$asthma, reg_data$age_bins)) == nrow(reg_data) + +#reassign +levels(reg_data$age_bins) <- c("(18,30]","(30,40]","(40,50]","(50,80]","(50,80]","(50,80]") +table(reg_data$asthma, reg_data$age_bins) +table(reg_data$asthma, reg_data$age_bins) +#(18,30] (30,40] (40,50] (50,60] +#0 25 17 23 25 +#1 11 8 14 10 + +sum(table(reg_data$asthma, reg_data$age_bins)) == nrow(reg_data) + +##### O2 saturation binning +reg_data$o2_sat_admis = round(reg_data$o2_sat_admis, digits = 0) +table(reg_data$o2_sat_admis) +tot_o2 = sum(table(reg_data$o2_sat_admis))- table(reg_data$o2_sat_admis)[["-1"]] +tot_o2 + +o2_sat_bin = cut(reg_data$o2_sat_admis, c(0,92,100)) +reg_data$o2_sat_bin = o2_sat_bin +table(reg_data$o2_sat_bin) + +sum(table(reg_data$o2_sat_bin)) == tot_o2 + +##### Onset to initial binning = "(==not inclusive) +max_in = max(reg_data$onset_2_initial); max_in #23 +min_in = min(reg_data$onset_2_initial) - 1 ; min_in # -6 + +tot_onset2ini = sum(table(reg_data$onset_2_initial)) +tot_onset2ini + +onset_initial_bin = cut(reg_data$onset_2_initial, c(min_in, 4, max_in)) +reg_data$onset_initial_bin = onset_initial_bin + +sum(table(reg_data$onset_initial_bin)) == tot_onset2ini + +#======================= +# seasonal flu: sfluv +#======================= +# should be a factor +if (! is.factor(reg_data$sfluv)){ + reg_data$sfluv = as.factor(reg_data$sfluv) +} +class(reg_data$sfluv) #[1] "factor" + +levels(reg_data$sfluv) +table(reg_data$asthma, reg_data$sfluv) +# reassign +levels(reg_data$sfluv) <- c("0", "0", "1") +table(reg_data$asthma, reg_data$sfluv) + +#======================= +# h1n1v +#======================= +# should be a factor +if (! is.factor(reg_data$h1n1v)){ + reg_data$h1n1v = as.factor(reg_data$h1n1v) +} +class(reg_data$h1n1v) #[1] "factor" + +levels(reg_data$h1n1v) +table(reg_data$asthma, reg_data$h1n1v) +# reassign +levels(reg_data$h1n1v) <- c("0", "0", "1") +table(reg_data$asthma, reg_data$h1n1v) + +#======================= +# ethnicity +#======================= +class(reg_data$ethnicity) # integer +table(reg_data$asthma, reg_data$ethnicity) + +reg_data$ethnicity[reg_data$ethnicity == 4] <- 2 +table(reg_data$asthma, reg_data$ethnicity) + +#======================= +# pneumonia +#======================= +class(reg_data$ia_cxr) # integer +# ia_cxr 2 ---> yes pneumonia (1) +# 1 ---> no (0) +# ! 1 or 2 -- > "unkown" + +# reassign the pneumonia codes +#0: not performed +#1: normal +#2: findings consistent with pneumonia +#3: abnormal +#-1: not recorded +#-2: n/a specified by the clinician # not in the data... +#-3: unknown specified by clinician + + +table(reg_data$ia_cxr) +#-3 -1 0 1 2 3 +#5 48 13 47 17 3 + +# change these first else recoding 0 will be a problem as 0 already exists, mind you -2 categ doesn't exist +reg_data$ia_cxr[reg_data$ia_cxr == -3 | reg_data$ia_cxr == -1 | reg_data$ia_cxr == 0 | reg_data$ia_cxr == 3 ] <- "" +table(reg_data$ia_cxr) +# 1 2 +#69 47 17 + +reg_data$ia_cxr[reg_data$ia_cxr == 1] <- 0 +reg_data$ia_cxr[reg_data$ia_cxr == 2] <- 1 +table(reg_data$ia_cxr) +# 0 1 +#69 47 17 + +#======================= +# smoking [tricky one] +#======================= +class(reg_data$smoking) # integer +table(reg_data$asthma, reg_data$smoking) + +# orig +# -3 -1 1 2 3 4 +#0 15 9 22 2 15 30 +#1 4 2 13 0 4 17 + +# -3 -1 1 2 3 4 +#0 14 9 20 2 15 30 +#1 5 2 15 0 4 17 + +# never smoking, 4 and 2 -- > no (0) +#1 and 3 ---> yes (1) +#!-3 and -1 ---- > NA + +################# smoking + +#1: current daily ===> categ smoker(1) +#2: occasional =====> categ no smoker(0) +#3: ex-smoker ===> categ smoker(1) +#4: never =====> categ no smoker(0) +#-1: not recorded =====> categ blank (NA) +#-2: n/a specified by the clinician =====> categ blank (NA) +#-3: unknown specified by clinician=====> categ blank (NA) + +table(reg_data$smoking) +#-3 -1 1 2 3 4 +#19 11 35 2 19 47 + +# reassign the smoking codes +reg_data$smoking[reg_data$smoking == 4 | reg_data$smoking == 2 ] <- 0 +reg_data$smoking[reg_data$smoking == 1 | reg_data$smoking == 3 ] <- 1 +reg_data$smoking[reg_data$smoking == -1 | reg_data$smoking == -2 | reg_data$smoking == -3 ] <- "" + +table(reg_data$smoking) +# 0 1 +#30 49 54 + +table(reg_data$asthma, reg_data$smoking) + +# orig +# 0 1 +#0 24 32 37 +#1 6 17 17 + +# 0 1 +#0 23 32 35 +#1 7 17 19 + +################################################################ +#================== +# writing output file +#================== +outfile_name_reg = "reg_data_recoded_with_NA.csv" +outfile_reg = paste0(outdir, outfile_name_reg) + +cat("Writing clinical file for regression:", outfile_reg) + +#write.csv(reg_data, file = outfile_reg) +################################################################ + +rm(age_bins, max_age_interval, max_in, min_in, o2_sat_bin, onset_initial_bin, tot_o2, tot_onset2ini, meta_data_cols) diff --git a/logistic_regression.R b/logistic_regression.R new file mode 100755 index 0000000..d483858 --- /dev/null +++ b/logistic_regression.R @@ -0,0 +1,97 @@ +#!/usr/bin/Rscript +getwd() +setwd('~/git/mosaic_2020/') +getwd() +######################################################################## +# TASK: Run regression analysis +# npa +######################################################################## +#================================================================================= +# TO DO: +# Simple stats b/w obesity and non-obesity to consider including in reg analysis +# Include NPA statistically sign params +# Rerun graphs and plots without asthma? +#================================================================================= + +#==================== +# Input: source data +#==================== +source("data_extraction_formatting_clinical.R") + +rm(fp_adults, metadata_all) + +######################################################################## +my_data = reg_data +######################################################################### +# check factor of each column +lapply(my_data, class) + +character_vars <- lapply(my_data, class) == "character" +character_vars +table(character_vars) + +factor_vars <- lapply(my_data, class) == "factor" +table(factor_vars) + +my_data[, character_vars] <- lapply(my_data[, character_vars], as.factor) +factor_vars <- lapply(my_data, class) == "factor" +factor_vars +table(factor_vars) + +# check again +lapply(my_data, class) + +table(my_data$ethnicity) +my_data$ethnicity = as.factor(my_data$ethnicity) +class(my_data$ethnicity) + +colnames(my_data) +reg_param = c("age" + , "age_bins" + #, "death" # outcome + , "asthma" + , "obesity" + , "gender" + , "los" + , "o2_sat_admis" + #, "logistic_outcome" + #, "steroid_ics" + , "ethnicity" + , "smoking" + , "sfluv" + , "h1n1v" + , "ia_cxr" + , "max_resp_score" + , "T1_resp_score" + , "com_noasthma" + , "onset_initial_bin") + +for(i in reg_param) { + # print (i) + p_form = as.formula(paste("death ~ ", i ,sep = "")) + model_reg = glm(p_form , family = binomial, data = my_data) + print(summary(model_reg)) + print(exp(cbind(OR = coef(model_reg), confint(model_reg)))) + #print (PseudoR2(model_reg)) + cat("=================================================================================\n") +} + + +full_mod = glm(death ~ asthma + + gender + + age_bins + + los + + #ethnicity + + onset_initial_bin + + o2_sat_bin + + com_noasthma + + obesity + + #ia_cxr + + smoking + + #sfluv + + #h1n1v + max_resp_score + + T1_resp_score + + , family = "binomial", data = my_data) + +summary(full_mod)