From f0c0fd72d1a7169508cc0a1c3b60e7fd69221291 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 24 Nov 2020 18:46:47 +0000 Subject: [PATCH] perfomed LR analysis and tidyed up clinical formatting code --- colnames_clinical_meds.R | 12 +- data_extraction_formatting_clinical.R | 275 +++++++++++++------------- flu_stats_contingency.R | 2 +- logistic_regression.R | 270 ++++++++++++------------- read_data.R | 38 ++-- 5 files changed, 296 insertions(+), 301 deletions(-) diff --git a/colnames_clinical_meds.R b/colnames_clinical_meds.R index b59d8ed..d809174 100755 --- a/colnames_clinical_meds.R +++ b/colnames_clinical_meds.R @@ -8,7 +8,7 @@ getwd() ######################################################################## ######################################################################## -clinical_cols = c("mosaic" +clinical_cols_data = c("mosaic" , "ia_exac_copd" , "death" #, "obese2" #inc peaeds, but once you subset data for adults, its the same! @@ -17,7 +17,7 @@ clinical_cols = c("mosaic" , "sfluv" , "h1n1v" , "age" - , "gender" + , "gender" , "asthma" , "vl_pfu_ul_npa1" , "los" @@ -36,6 +36,14 @@ clinical_cols = c("mosaic" , "inresp_sev" , "steroid") +clinical_cols_added = c("age_bins" + , "o2_sat_bin" + , "onset_initial_bin" + , "steroid_ics" + , "t1_resp_recoded" ) + +clinical_cols = c(clinical_cols_data, clinical_cols_added) + sig_npa_cols = c("eotaxin_npa1" , "eotaxin3_npa1" , "eotaxin3_npa2" diff --git a/data_extraction_formatting_clinical.R b/data_extraction_formatting_clinical.R index 0347011..b0b9753 100755 --- a/data_extraction_formatting_clinical.R +++ b/data_extraction_formatting_clinical.R @@ -28,34 +28,27 @@ clinical_ics = read.csv(infile_ics) str(clinical_ics) ######################################################################## -# quick sanity checks -table(adult_df$ia_exac_copd==1 & adult_df$asthma == 1) # check this is 4 - -table(fp_adults$ia_exac_copd==1 & fp_adults$asthma == 1) # check this is 3 - -# clear unnecessary variables -rm(all_df, adult_df, metadata_all) +table(fp_adults$ia_exac_copd==1 & fp_adults$asthma == 1) ######################################################################## # Clinical_data extraction ######################################################################## -cat("\nExtracting:", length(clinical_cols), "cols from fp_adults") +#cat("\nExtracting:", length(clinical_cols), "cols from fp_adults") -clinical_df = fp_adults[, clinical_cols] +#clinical_df = fp_adults[, clinical_cols] # sanity checks -if ( sum(table(clinical_df$obesity)) & sum(table(clinical_df$age>=18)) & sum(table(clinical_df$death)) & sum(table(clinical_df$asthma)) == nrow(clinical_df) ){ - cat("\nPASS: binary data obs are complete, n =", nrow(clinical_df)) -}else{ - cat("\nFAIL: Incomplete data for binary outcomes. Please check and decide!") - quit() -} +#if ( sum(table(clinical_df$obesity)) & sum(table(clinical_df$age>=18)) & sum(table(clinical_df$death)) & sum(table(clinical_df$asthma)) == nrow(clinical_df) ){ +# cat("\nPASS: binary data obs are complete, n =", nrow(clinical_df)) +#}else{ +# cat("\nFAIL: Incomplete data for binary outcomes. Please check and decide!") +# quit() +#} -table(clinical_df$ia_exac_copd) +#table(clinical_df$ia_exac_copd) -str(clinical_df) +#str(clinical_df) #clinical_df$o2_sat_suppl - ######################################################################## #================================== # Check asthma and copd conflict @@ -80,42 +73,41 @@ if ( table(fp_adults$ia_exac_copd, fp_adults$asthma) [[2,2]] == 0){ foo<- subset(fp_adults, asthma==1 & ia_exac_copd ==1) # check that its 0 rm(check_copd_and_asthma_1, foo) cat("Check status again...") - + } - #===================================================================== #================================= # resp scores: In, max and t1 & t2 #================================= # count the resp scores -max_resp_score_table<- table(clinical_df$max_resp_score) +max_resp_score_table<- table(fp_adults$max_resp_score) max_resp_score_table -T1_resp_score_table<- table(clinical_df$T1_resp_score) +T1_resp_score_table<- table(fp_adults$T1_resp_score) T1_resp_score_table -T2_resp_score_table<- table(clinical_df$T2_resp_score) +T2_resp_score_table<- table(fp_adults$T2_resp_score) T2_resp_score_table -Inresp_sev<- table(clinical_df$inresp_sev) +Inresp_sev<- table(fp_adults$inresp_sev) Inresp_sev # Reassign the resp score so all 4 are replace by 3 -clinical_df$max_resp_score[clinical_df$max_resp_score == 4 ] <- 3 -revised_resp_score_table<- table(clinical_df$max_resp_score) +fp_adults$max_resp_score[fp_adults$max_resp_score == 4 ] <- 3 +revised_resp_score_table<- table(fp_adults$max_resp_score) revised_resp_score_table -clinical_df$T1_resp_score[clinical_df$T1_resp_score ==4 ] <- 3 -revised_T1_resp_score_table<- table(clinical_df$T1_resp_score) +fp_adults$T1_resp_score[fp_adults$T1_resp_score ==4 ] <- 3 +revised_T1_resp_score_table<- table(fp_adults$T1_resp_score) revised_T1_resp_score_table -clinical_df$T2_resp_score[clinical_df$T2_resp_score == 4]<- 3 -revised_T2_resp_score_table<- table(clinical_df$T2_resp_score) +fp_adults$T2_resp_score[fp_adults$T2_resp_score == 4]<- 3 +revised_T2_resp_score_table<- table(fp_adults$T2_resp_score) revised_T2_resp_score_table -clinical_df$inresp_sev[clinical_df$inresp_sev == 4]<- 3 -revised_Inresp_sev<- table(clinical_df$inresp_sev) +fp_adults$inresp_sev[fp_adults$inresp_sev == 4]<- 3 +revised_Inresp_sev<- table(fp_adults$inresp_sev) revised_Inresp_sev #===================================================================== # Remove these after checking @@ -130,32 +122,32 @@ rm(max_resp_score_table, T1_resp_score_table, T2_resp_score_table, Inresp_sev # age #======== # Create categories of variables -clinical_df$age_int = round(clinical_df$age, digits = 0) -table(clinical_df$age_int) -table(clinical_df$asthma, clinical_df$age_int) -min(clinical_df$age_int); max(clinical_df$age_int) +fp_adults$age_int = round(fp_adults$age, digits = 0) +table(fp_adults$age_int) +table(fp_adults$asthma, fp_adults$age_int) +min(fp_adults$age_int); max(fp_adults$age_int) -max_age_interval = round_any(max(clinical_df$age_int), 10, f = ceiling) +max_age_interval = round_any(max(fp_adults$age_int), 10, f = ceiling) max_age_interval -min_age = min(clinical_df$age_int); min_age #19 +min_age = min(fp_adults$age_int); min_age #19 min_age_interval = min_age - 1; min_age_interval -#age_bins = cut(clinical_df$age_int, c(0,18,30,40,50,60,70,80,90)) -age_bins = cut(clinical_df$age_int, c(min_age_interval, 30, 40, 50, 60, 70, max_age_interval)) -clinical_df$age_bins = age_bins -dim(clinical_df) # 133 28 +#age_bins = cut(fp_adults$age_int, c(0,18,30,40,50,60,70,80,90)) +age_bins = cut(fp_adults$age_int, c(min_age_interval, 30, 40, 50, 60, 70, max_age_interval)) +fp_adults$age_bins = age_bins +dim(fp_adults) # 133 28 # age_bins (to keep consistent with the results table) -class(clinical_df$age_bins) -levels(clinical_df$age_bins) +class(fp_adults$age_bins) +levels(fp_adults$age_bins) #"(18,30]" "(30,40]" "(40,50]" "(50,60]" "(60,70]" "(70,80]" -table(clinical_df$asthma, clinical_df$age_bins) +table(fp_adults$asthma, fp_adults$age_bins) # (18,30] (30,40] (40,50] (50,60] (60,70] (70,80] #0 25 17 25 14 11 1 #1 11 8 12 5 2 2 -if (sum(table(clinical_df$asthma, clinical_df$age_bins)) == nrow(clinical_df) ){ +if (sum(table(fp_adults$asthma, fp_adults$age_bins)) == nrow(fp_adults) ){ cat("\nPASS: age_bins assigned successfully") }else{ cat("\nFAIL: no. mismatch when assigning age_bins") @@ -163,37 +155,37 @@ if (sum(table(clinical_df$asthma, clinical_df$age_bins)) == nrow(clinical_df) ){ } # reassign levels -class(clinical_df$age_bins) -levels(clinical_df$age_bins) <- c("(18,30]","(30,40]","(40,50]","(50,80]","(50,80]","(50,80]") -table(clinical_df$asthma, clinical_df$age_bins) -table(clinical_df$asthma, clinical_df$age_bins) +class(fp_adults$age_bins) +levels(fp_adults$age_bins) <- c("(18,30]","(30,40]","(40,50]","(50,80]","(50,80]","(50,80]") +table(fp_adults$asthma, fp_adults$age_bins) +table(fp_adults$asthma, fp_adults$age_bins) # (18,30] (30,40] (40,50] (50,80] #0 25 17 25 26 #1 11 8 12 9 -sum(table(clinical_df$asthma, clinical_df$age_bins)) == nrow(clinical_df) -table(clinical_df$age_int) -clinical_df = subset(clinical_df, select = -c(age_int)) -table(clinical_df$age_int) +sum(table(fp_adults$asthma, fp_adults$age_bins)) == nrow(fp_adults) +table(fp_adults$age_int) +fp_adults = subset(fp_adults, select = -c(age_int)) +table(fp_adults$age_int) -class(clinical_df$age_bins) -clinical_df$age_bins +class(fp_adults$age_bins) +fp_adults$age_bins #=========================== # O2 saturation binning #=========================== -clinical_df$o2_sat_admis -n1 = sum(is.na(clinical_df$o2_sat_admis)) +fp_adults$o2_sat_admis +n1 = sum(is.na(fp_adults$o2_sat_admis)) -clinical_df$o2_sat_admis = round(clinical_df$o2_sat_admis, digits = 0) -table(clinical_df$o2_sat_admis) -tot_o2 = sum(table(clinical_df$o2_sat_admis))- table(clinical_df$o2_sat_admis)[["-1"]] +fp_adults$o2_sat_admis = round(fp_adults$o2_sat_admis, digits = 0) +table(fp_adults$o2_sat_admis) +tot_o2 = sum(table(fp_adults$o2_sat_admis))- table(fp_adults$o2_sat_admis)[["-1"]] tot_o2 -n_text_code = table(clinical_df$o2_sat_admis)[["-1"]] +n_text_code = table(fp_adults$o2_sat_admis)[["-1"]] -clinical_df$o2_sat_admis[clinical_df$o2_sat_admis <0] <- NA -n2 = sum(is.na(clinical_df$o2_sat_admis)) +fp_adults$o2_sat_admis[fp_adults$o2_sat_admis <0] <- NA +n2 = sum(is.na(fp_adults$o2_sat_admis)) if (n2 == n1 + n_text_code) { cat ("PASS: -1 code converted to NA") @@ -201,75 +193,75 @@ if (n2 == n1 + n_text_code) { cat("FAIL: something went wrong!") } -o2_sat_bin = cut(clinical_df$o2_sat_admis, c(0,92,100)) -clinical_df$o2_sat_bin = o2_sat_bin -table(clinical_df$o2_sat_bin) +o2_sat_bin = cut(fp_adults$o2_sat_admis, c(0,92,100)) +fp_adults$o2_sat_bin = o2_sat_bin +table(fp_adults$o2_sat_bin) -sum(table(clinical_df$o2_sat_bin)) == tot_o2 +sum(table(fp_adults$o2_sat_bin)) == tot_o2 #=========================== # Onset to initial binning #=========================== -clinical_df$onset_2_initial +fp_adults$onset_2_initial -max_in = max(clinical_df$onset_2_initial); max_in #23 -min_in = min(clinical_df$onset_2_initial) - 1 ; min_in # -6 +max_in = max(fp_adults$onset_2_initial); max_in #23 +min_in = min(fp_adults$onset_2_initial) - 1 ; min_in # -6 -tot_onset2ini = sum(table(clinical_df$onset_2_initial)) +tot_onset2ini = sum(table(fp_adults$onset_2_initial)) tot_onset2ini -onset_initial_bin = cut(clinical_df$onset_2_initial, c(min_in, 4, max_in)) -clinical_df$onset_initial_bin = onset_initial_bin -sum(table(clinical_df$onset_initial_bin)) == tot_onset2ini +onset_initial_bin = cut(fp_adults$onset_2_initial, c(min_in, 4, max_in)) +fp_adults$onset_initial_bin = onset_initial_bin +sum(table(fp_adults$onset_initial_bin)) == tot_onset2ini #======================= # seasonal flu: sfluv #======================= # reassign as 0 and 1 -table(clinical_df$sfluv) -table(clinical_df$asthma, clinical_df$sfluv) -clinical_df$sfluv = ifelse(clinical_df$sfluv == "yes", 1, 0) -table(clinical_df$sfluv) -table(clinical_df$asthma, clinical_df$sfluv) +table(fp_adults$sfluv) +table(fp_adults$asthma, fp_adults$sfluv) +fp_adults$sfluv = ifelse(fp_adults$sfluv == "yes", 1, 0) +table(fp_adults$sfluv) +table(fp_adults$asthma, fp_adults$sfluv) # convert to integer -str(clinical_df$sfluv) -clinical_df$sfluv = as.integer(clinical_df$sfluv) -str(clinical_df$sfluv) +str(fp_adults$sfluv) +fp_adults$sfluv = as.integer(fp_adults$sfluv) +str(fp_adults$sfluv) #======================= # h1n1v #======================= # reassign as 0 and 1 -table(clinical_df$h1n1v) -table(clinical_df$asthma, clinical_df$h1n1v) -clinical_df$h1n1v = ifelse(clinical_df$h1n1v == "yes", 1, 0) -table(clinical_df$h1n1v) -table(clinical_df$asthma, clinical_df$h1n1v) +table(fp_adults$h1n1v) +table(fp_adults$asthma, fp_adults$h1n1v) +fp_adults$h1n1v = ifelse(fp_adults$h1n1v == "yes", 1, 0) +table(fp_adults$h1n1v) +table(fp_adults$asthma, fp_adults$h1n1v) # convert to integer -str(clinical_df$h1n1v) -clinical_df$h1n1v = as.integer(clinical_df$h1n1v) -str(clinical_df$h1n1v) +str(fp_adults$h1n1v) +fp_adults$h1n1v = as.integer(fp_adults$h1n1v) +str(fp_adults$h1n1v) #======================= # ethnicity #======================= -class(clinical_df$ethnicity) # integer -table(clinical_df$ethnicity) -table(clinical_df$asthma, clinical_df$ethnicity) +class(fp_adults$ethnicity) # integer +table(fp_adults$ethnicity) +table(fp_adults$asthma, fp_adults$ethnicity) -clinical_df$ethnicity[clinical_df$ethnicity == 4] <- 2 -table(clinical_df$ethnicity) -table(clinical_df$asthma, clinical_df$ethnicity) +fp_adults$ethnicity[fp_adults$ethnicity == 4] <- 2 +table(fp_adults$ethnicity) +table(fp_adults$asthma, fp_adults$ethnicity) #======================= # pneumonia #======================= -table(clinical_df$ia_cxr) -class(clinical_df$ia_cxr) # integer +table(fp_adults$ia_cxr) +class(fp_adults$ia_cxr) # integer # ia_cxr 2 ---> yes pneumonia (1) # 1 ---> no (0) # ! 1 or 2 -- > "unknown" @@ -283,29 +275,29 @@ class(clinical_df$ia_cxr) # integer #-2: n/a specified by the clinician # not in the data... #-3: unknown specified by clinician -table(clinical_df$ia_cxr) +table(fp_adults$ia_cxr) #-3 -1 0 1 2 3 #5 48 13 47 17 3 # change these first else recoding 0 will be a problem as 0 already exists, mind you -2 categ doesn't exist -clinical_df$ia_cxr[clinical_df$ia_cxr == -3 | clinical_df$ia_cxr == -1 | clinical_df$ia_cxr == 0 | clinical_df$ia_cxr == 3 ] <- NA -table(clinical_df$ia_cxr) +fp_adults$ia_cxr[fp_adults$ia_cxr == -3 | fp_adults$ia_cxr == -1 | fp_adults$ia_cxr == 0 | fp_adults$ia_cxr == 3 ] <- NA +table(fp_adults$ia_cxr) # 1 2 #69 47 17 -sum(is.na(clinical_df$ia_cxr)) +sum(is.na(fp_adults$ia_cxr)) -clinical_df$ia_cxr[clinical_df$ia_cxr == 1] <- 0 -clinical_df$ia_cxr[clinical_df$ia_cxr == 2] <- 1 -table(clinical_df$ia_cxr) +fp_adults$ia_cxr[fp_adults$ia_cxr == 1] <- 0 +fp_adults$ia_cxr[fp_adults$ia_cxr == 2] <- 1 +table(fp_adults$ia_cxr) # 0 1 #69 47 17 #======================= # smoking [tricky one] #======================= -class(clinical_df$smoking) # integer -table(clinical_df$asthma, clinical_df$smoking) +class(fp_adults$smoking) # integer +table(fp_adults$asthma, fp_adults$smoking) # orig # -3 -1 1 2 3 4 @@ -330,20 +322,20 @@ table(clinical_df$asthma, clinical_df$smoking) #-2: n/a specified by the clinician =====> categ blank (NA) #-3: unknown specified by clinician=====> categ blank (NA) -table(clinical_df$smoking) +table(fp_adults$smoking) #-3 -1 1 2 3 4 #19 11 35 2 19 47 # reassign the smoking codes -clinical_df$smoking[clinical_df$smoking == 4 | clinical_df$smoking == 2 ] <- 0 -clinical_df$smoking[clinical_df$smoking == 1 | clinical_df$smoking == 3 ] <- 1 -clinical_df$smoking[clinical_df$smoking == -1 | clinical_df$smoking == -2 | clinical_df$smoking == -3 ] <- NA +fp_adults$smoking[fp_adults$smoking == 4 | fp_adults$smoking == 2 ] <- 0 +fp_adults$smoking[fp_adults$smoking == 1 | fp_adults$smoking == 3 ] <- 1 +fp_adults$smoking[fp_adults$smoking == -1 | fp_adults$smoking == -2 | fp_adults$smoking == -3 ] <- NA -table(clinical_df$smoking); sum(is.na(clinical_df$smoking)) +table(fp_adults$smoking); sum(is.na(fp_adults$smoking)) # 0 1 #30 49 54 -table(clinical_df$asthma, clinical_df$smoking) +table(fp_adults$asthma, fp_adults$smoking) # orig # 0 1 @@ -352,24 +344,24 @@ table(clinical_df$asthma, clinical_df$smoking) ################################################################ #========================= -# Merge: clinical_df and infile ics +# Merge: fp_adults and infile ics #========================= -merging_cols = intersect( names(clinical_df), names(clinical_ics) ) +merging_cols = intersect( names(fp_adults), names(clinical_ics) ) merging_cols -clinical_df_ics = merge(clinical_df, clinical_ics, by = merging_cols, all = T); clinical_df_ics +fp_adults_ics = merge(fp_adults, clinical_ics, by = merging_cols, all = T); fp_adults_ics -colnames(clinical_df_ics) +colnames(fp_adults_ics) -if (nrow(clinical_df_ics) == nrow(clinical_df) & nrow(clinical_ics)){ - cat("\nPASS: No. of rows match, nrow =", nrow(clinical_df_ics) +if (nrow(fp_adults_ics) == nrow(fp_adults) & nrow(clinical_ics)){ + cat("\nPASS: No. of rows match, nrow =", nrow(fp_adults_ics) , "\nChecking ncols...") - if ( ncol(clinical_df_ics) == ncol(clinical_df) + ncol(clinical_ics) - length(merging_cols) ){ - cat("\nPASS: No. of cols match, ncol =", ncol(clinical_df_ics)) + if ( ncol(fp_adults_ics) == ncol(fp_adults) + ncol(clinical_ics) - length(merging_cols) ){ + cat("\nPASS: No. of cols match, ncol =", ncol(fp_adults_ics)) } else { cat("\nFAIL: ncols mismatch" - , "Expected ncols:", ncol(clinical_df) + ncol(clinical_ics) - length(merging_cols) - , "\nGot:", ncol(clinical_df_ics)) + , "Expected ncols:", ncol(fp_adults) + ncol(clinical_ics) - length(merging_cols) + , "\nGot:", ncol(fp_adults_ics)) } } else { cat("\nFAIL: nrows mismatch" @@ -379,49 +371,54 @@ if (nrow(clinical_df_ics) == nrow(clinical_df) & nrow(clinical_ics)){ #========================= # add binary outcome for T1 resp score #========================= -table(clinical_df_ics$T1_resp_score) +table(fp_adults_ics$T1_resp_score) -clinical_df_ics$t1_resp_recoded = ifelse(clinical_df_ics$T1_resp_score <3, 0, 1) -table(clinical_df_ics$t1_resp_recoded) -#table(clinical_df_ics$steroid) -table(clinical_df_ics$steroid_ics) +fp_adults_ics$t1_resp_recoded = ifelse(fp_adults_ics$T1_resp_score <3, 0, 1) +table(fp_adults_ics$t1_resp_recoded) +#table(fp_adults_ics$steroid) +table(fp_adults_ics$steroid_ics) #========================= # change the factor vars to integers #========================= -#str(clinical_df_ics) -#factor_vars = lapply(clinical_df_ics, class) == "factor" +#str(fp_adults_ics) +#factor_vars = lapply(fp_adults_ics, class) == "factor" #table(factor_vars) -#clinical_df_ics[, factor_vars] <- lapply(clinical_df_ics[, factor_vars], as.integer) +#fp_adults_ics[, factor_vars] <- lapply(fp_adults_ics[, factor_vars], as.integer) #table(factor_vars) -#str(clinical_df_ics) +#str(fp_adults_ics) #========================= # remove cols #========================= -clinical_df_ics = subset(clinical_df_ics, select = -c(onset_2_initial)) +fp_adults_ics = subset(fp_adults_ics, select = -c(onset_2_initial)) #====================== # writing output file #====================== -outfile_name_reg = "clinical_df_recoded.csv" +outfile_name_reg = "fp_adults_recoded.csv" outfile_reg = paste0(outdir, outfile_name_reg) cat("\nWriting clinical file for regression:", outfile_reg) -#write.csv(clinical_df_ics, file = outfile_reg) +#write.csv(fp_adults_ics, file = outfile_reg) #========================= -# clinical_df_ics: without asthma +# fp_adults_ics: without asthma #========================= -clinical_df_ics_na = clinical_df_ics[clinical_df_ics$asthma == 0,] +fp_adults_ics_na = fp_adults_ics[fp_adults_ics$asthma == 0,] + +#========================= +# clinical_df only +#========================= +clinical_df_ics = fp_adults[, clinical_cols] ################################################################ rm(age_bins, max_age_interval, max_in, min_in , o2_sat_bin, onset_initial_bin, tot_o2 , n_text_code, n1, n2, tot_onset2ini, infile_ics , tot_onset2ini, meta_data_cols - , clinical_df, clinical_ics) + , fp_adults, clinical_ics) ################################################################ diff --git a/flu_stats_contingency.R b/flu_stats_contingency.R index b1d8648..9085643 100755 --- a/flu_stats_contingency.R +++ b/flu_stats_contingency.R @@ -214,7 +214,7 @@ comb_stats_categ_df_f = comb_stats_categ_df[order(comb_stats_categ_df$p_signif # write output file #****************** cat("Chisq and fishers test results in:", outfile_clin_categ) -write.csv(comb_stats_categ_df_f, outfile_clin_categ, row.names = FALSE) +#write.csv(comb_stats_categ_df_f, outfile_clin_categ, row.names = FALSE) #================== #0 date not recorded diff --git a/logistic_regression.R b/logistic_regression.R index 75f5470..d610cd4 100644 --- a/logistic_regression.R +++ b/logistic_regression.R @@ -13,21 +13,25 @@ getwd() source("data_extraction_formatting_clinical.R") # quick sanity checks -table(clinical_df_ics$ia_exac_copd==1 & clinical_df_ics$asthma == 1) -table(fp_adults$ia_exac_copd==1 & fp_adults$asthma == 1) -table(fp_adults_na$ia_exac_copd==1 & fp_adults_na$asthma == 1) +table(fp_adults_ics$ia_exac_copd==1 & fp_adults_ics$asthma == 1) +table(fp_adults_ics$ia_exac_copd==1 & fp_adults_ics$asthma == 1) +table(fp_adults_ics_na$ia_exac_copd==1 & fp_adults_ics_na$asthma == 1) -table(clinical_df_ics$asthma) +table(fp_adults_ics$asthma) if ( length(cols_to_extract) == length(clinical_cols) + length(sig_npa_cols) ){ - cat("PASS: extracting clinical and sign npa cols") + cat("PASS: extracting clinical and sign npa cols for regression") } else{ - cat("FAIL: could not find cols to extract") + cat("FAIL: could not find cols to extract for regression") quit() } -fp_adults_reg = fp_adults[, cols_to_extract] -fp_adults_reg_na = fp_adults_na[, cols_to_extract] + +#fp_adults_reg = fp_adults_ics[, cols_to_extract] +fp_adults_reg = fp_adults_ics[, colnames(fp_adults_ics)%in%cols_to_extract] +cols_to_extract[!cols_to_extract%in%colnames(fp_adults_reg)] + +fp_adults_reg_na = fp_adults_ics_na[, colnames(fp_adults_ics_na)%in%cols_to_extract] #-------------------- # Data reassignment @@ -38,26 +42,23 @@ my_data_na = fp_adults_reg_na table(my_data$ia_exac_copd==1 & my_data$asthma == 1) table(my_data_na$ia_exac_copd==1 & my_data_na$asthma == 1) -# clear variables -rm(fp_adults, fp_adults_na, clinical_df_ics, clinical_df_ics_na) + +cols_to_omit = c("mosaic", "flustat", "onset_2_initial", "ia_exac_copd", "onsfindeath", "o2_sat_admis") +my_clinical_cols = clinical_cols[!clinical_cols%in%cols_to_omit] +my_clinical_cols ######################################################################### #============================ # Identifying column types: Reg data #=========================== -cols_to_omit = c("mosaic", "flustat", "onset_2_initial", "ia_exac_copd") +my_vars = colnames(my_data) -my_reg_data = my_data[!colnames(my_data)%in%cols_to_omit] +lapply(my_data, class) -my_vars = colnames(my_reg_data) -my_vars - -lapply(my_reg_data, class) - -check_int_vars = my_vars[lapply(my_reg_data, class)%in%c("integer")] -check_num_vars = my_vars[lapply(my_reg_data, class)%in%c("numeric")] -check_charac_vars = my_vars[lapply(my_reg_data, class)%in%c("character")] -check_factor_vars = my_vars[lapply(my_reg_data, class)%in%c("factor")] +check_int_vars = my_vars[lapply(my_data, class)%in%c("integer")] +check_num_vars = my_vars[lapply(my_data, class)%in%c("numeric")] +check_charac_vars = my_vars[lapply(my_data, class)%in%c("character")] +check_factor_vars = my_vars[lapply(my_data, class)%in%c("factor")] cat("\nNo. of int cols:", length(check_int_vars) , "\nNo. of num cols:", length(check_num_vars) @@ -65,44 +66,56 @@ cat("\nNo. of int cols:", length(check_int_vars) , "\nNo. of factor cols:", length(check_factor_vars) ) -# convert char vals to int as these should be int -my_reg_data[,check_charac_vars] = lapply(my_reg_data[,check_charac_vars], as.integer) -str(my_reg_data$sfluv) - - -numerical_vars = c("age" +#======================================= +# changing dtypes in cols +#======================================= +# what I need to be numerical explicitly +my_numerical_vars = c("age" , "vl_pfu_ul_npa1" , "los" , "onset2final" , "onsfindeath" , "o2_sat_admis") +my_data[my_numerical_vars] <- lapply(my_data[my_numerical_vars], as.numeric) +num_vars = my_vars[lapply(my_data, class)%in%c("numeric")] +num_vars +# what I need to be integer explicitly +not_int_vars = c(my_numerical_vars, sig_npa_cols) +my_int_vars = my_vars[!my_vars%in%not_int_vars] +my_int_vars +#my_int_vars = my_clinical_cols[int_vars_extract%in%check_int_vars] +#my_int_vars -my_reg_data[numerical_vars] <- lapply(my_reg_data[numerical_vars], as.numeric) +# convert int cols to factor +my_data[my_int_vars] <- lapply(my_data[my_int_vars], as.factor) -my_reg_params = my_vars +factor_vars = my_vars[lapply(my_data, class)%in%c("factor")] +factor_vars +check_factor_vars + +# check again +lapply(my_data, class) +#====================================================================== +my_reg_data = my_data[!colnames(my_data)%in%cols_to_omit] + +my_reg_params = colnames(my_reg_data) +#length(factor_vars) + length(num_vars); ncol(my_reg_data) +#my_reg_params = c(factor_vars, num_vars) na_count = sapply(my_reg_data, function(x) sum(is.na(x)));na_count names(na_count)[na_count>0] -vars_to_factor = my_vars[!my_vars%in%numerical_vars] - -# convert to factor -lapply(my_reg_data, class) -my_reg_data[vars_to_factor] <- lapply(my_reg_data[vars_to_factor], as.factor) -factor_vars <- colnames(my_reg_data)[lapply(my_reg_data, class) == "factor"] -table(factor_vars) - # check again lapply(my_reg_data, class) - # all parasm for reg -my_reg_params = c("age" +my_reg_params_DEL = c("age" + , "age_bins" , "vl_pfu_ul_npa1" , "los" , "onset2final" @@ -123,11 +136,12 @@ my_reg_params = c("age" , "T2_resp_score" , "inresp_sev" , "steroid" - , "age_bins" + , "o2_sat_bin" , "onset_initial_bin" , "steroid_ics" - , "t1_resp_recoded") + , "t1_resp_recoded" + , sig_npa_cols) #================= # reg data prepare @@ -139,16 +153,16 @@ pv2 = "t1_resp_recoded" #reg_params_mixed = my_vars[!my_vars%in%pv1] ######################################################################## -#================= -# outcome2 -#================= +# outcome: death +######################################################################## #----------------------------- # outcome: death + obesity # data: fp adults #----------------------------- my_reg_params1 = my_reg_params[!my_reg_params%in%c("death", "obesity")] -for(i in my_reg_params1) { +sink(file = "reg_output_out1.txt", append = T) +for(i in my_reg_params1) { #print (i) p_form = as.formula(paste("death ~ obesity + ", i ,sep = "")) print(p_form) @@ -159,16 +173,20 @@ for(i in my_reg_params1) { print(nobs(model_reg)) cat("=================================================================================\n") } +sink() #----------------------------- -# outcome: death +# outcome: death + obesity # data: fp adults #----------------------------- -my_reg_params1v2 = my_reg_params[!my_reg_params%in%c("death")] +my_reg_params2 = my_reg_params[!my_reg_params%in%c( "death" + , "obesity" + , "asthma")] -for(i in my_reg_params1v2) { +sink(file = "reg_output_out1_ob_as.txt", append = T) +for(i in my_reg_params2) { #print (i) - p_form = as.formula(paste("death ~ ", i ,sep = "")) + p_form = as.formula(paste("death ~ obesity + asthma +", i ,sep = "")) print(p_form) model_reg = glm(p_form , family = binomial, data = my_reg_data) print(summary(model_reg)) @@ -177,20 +195,45 @@ for(i in my_reg_params1v2) { print(nobs(model_reg)) cat("=================================================================================\n") } +sink() + + +#------------------- +# Full model +#------------------- +full_mod = glm(death ~ obesity + + asthma + + #age + + age_bins + + #t1_resp_recoded + + com_noasthma + + eotaxin3_npa1 + + il1_npa1+ + il8_2_npa1+ + il12p70_npa1 + , family = "binomial" + , data = my_reg_data) + +print(summary(full_mod)) +print(exp(cbind(OR = coef(full_mod), confint(full_mod)))) +print(nobs(full_mod)) + +cat("=================================================================================\n") + +######################################################################## +# outcome: t1_resp_recoded ######################################################################## -#================= -# outcome2 -#================= #----------------------------- # outcome: t1_resp_recoded + obesity # data: fp adults #----------------------------- -my_reg_params2 = my_reg_params[!my_reg_params%in%c("death" - , "obesity" - , "t1_resp_recoded" - , "T1_resp_score")] +my_reg_params3 = my_reg_params[!my_reg_params%in%c("t1_resp_recoded" + , "T1_resp_score" + , "death" + , "obesity")] -for(i in my_reg_params2) { +sink(file = "reg_output_out2.txt", append = T) +for(i in my_reg_params3) { #print (i) p_form = as.formula(paste("t1_resp_recoded ~ obesity + ", i ,sep = "")) print(p_form) @@ -201,20 +244,22 @@ for(i in my_reg_params2) { print(nobs(model_reg)) cat("=================================================================================\n") } - +sink() #----------------------------- -# outcome: t1_resp_recoded +# outcome: t1_resp_recoded + obesity # data: fp adults #----------------------------- -my_reg_params2v2 = my_reg_params[!my_reg_params%in%c("death" - #, "obesity" - , "t1_resp_recoded" - , "T1_resp_score")] +my_reg_params4 = my_reg_params[!my_reg_params%in%c( "t1_resp_recoded" + , "T1_resp_score" + , "obesity" + , "death" + , "asthma")] -for(i in my_reg_params2v2) { +sink(file = "reg_output_out2_ob_as.txt", append = T) +for(i in my_reg_params4) { #print (i) - p_form = as.formula(paste("t1_resp_recoded ~ ", i ,sep = "")) + p_form = as.formula(paste("t1_resp_recoded ~ obesity + asthma +", i ,sep = "")) print(p_form) model_reg = glm(p_form , family = binomial, data = my_reg_data) print(summary(model_reg)) @@ -223,86 +268,23 @@ for(i in my_reg_params2v2) { print(nobs(model_reg)) cat("=================================================================================\n") } +sink() -######################################################################## +#------------------- # Full model -######################################################################## +#------------------- +#full_mod2 = glm(death ~ obesity + +# asthma + +# #age + +# #age_bins + +# +# +# , family = "binomial" +# , data = my_reg_data) +# +#print(summary(full_mod2)) +#print(exp(cbind(OR = coef(full_mod2), confint(full_mod2)))) +#print(nobs(full_mod2)) -full_mod = glm(death ~ obesity + - age + - #age_bins + - obesity + - asthma + - t1_resp_recoded + - #ia_cxr - , family = "binomial", data = my_reg_data) +cat("=================================================================================\n") -summary(full_mod) - - - -######################################################################## -# mediators -######################################################################## -sig_npa_cols = c("mosaic", sig_npa_cols) - -my_med_sig = fp_adults[, sig_npa_cols] - -my_reg_data_med = merge(clinical_df_ics, my_med_sig - , by = intersect(names(clinical_df_ics), names(my_med_sig)) -) - -#----------------------------- -# outcome: death + obesity -# data: fp adults -#----------------------------- -#my_reg_params_meds = c(my_reg_params, sig_npa_cols) -my_reg_params_meds = colnames(my_reg_data_med) -my_reg_params_meds1 = my_reg_params_meds[!my_reg_params_meds%in%c("mosaic", "flustat" - , "onset_2_initial" - , "onsfindeath" - , "ia_exac_copd" - , "death" - , "obesity")] - - - -for(i in my_reg_params_meds1) { - #print (i) - p_form = as.formula(paste("death ~ obesity + ", i ,sep = "")) - print(p_form) - model_reg = glm(p_form , family = binomial, data = my_reg_data_med) - print(summary(model_reg)) - print(exp(cbind(OR = coef(model_reg), confint(model_reg)))) - #print (PseudoR2(model_reg)) - print(nobs(model_reg)) - cat("=================================================================================\n") -} - -#----------------------------- -# outcome: t1_resp_recoded ~ obesity + asthma -# data: fp adults -#----------------------------- -my_reg_params_meds2v2 = my_reg_params_meds[!my_reg_params_meds%in%c("mosaic" - , "flustat" - , "onset_2_initial" - , "onsfindeath" - , "ia_exac_copd" - , "death" - , "t1_resp_recoded" - , "T1_resp_score" - , "asthma")] - - - -for(i in my_reg_params_meds2v2) { - #print (i) - p_form = as.formula(paste("t1_resp_recoded ~ obesity + asthma + ", i ,sep = "")) - print(p_form) - model_reg = glm(p_form , family = binomial, data = my_reg_data_med) - print(summary(model_reg)) - print(exp(cbind(OR = coef(model_reg), confint(model_reg)))) - #print (PseudoR2(model_reg)) - print(nobs(model_reg)) - cat("=================================================================================\n") -} \ No newline at end of file diff --git a/read_data.R b/read_data.R index e8686e3..19c8292 100755 --- a/read_data.R +++ b/read_data.R @@ -66,10 +66,6 @@ if (table(fp_adults$flustat == 1)[[1]] == nrow(fp_adults) ){ } else{ cat ("\nFAIL: adult df number mismatch!") } - -#============================================= -# FLU positive adult patients: without asthma -#============================================= #----------------------------------- # asthma and copd status correction # for conflicting field! @@ -101,18 +97,21 @@ if ( table(fp_adults$ia_exac_copd, fp_adults$asthma) [[2,2]] == 0 ){ quit() } -cat("\nExtracting flu positive without asthma") -table(fp_adults$asthma) -cat("\nNo. of asthmatics:", table(fp_adults$asthma)[[2]] - , "\nNo. of non-asthmatics:", table(fp_adults$asthma)[[1]]) -str(fp_adults$asthma) +#============================================= +# FLU positive adult patients: without asthma +#============================================= +#cat("\nExtracting flu positive without asthma") +#table(fp_adults$asthma) +#cat("\nNo. of asthmatics:", table(fp_adults$asthma)[[2]] +# , "\nNo. of non-asthmatics:", table(fp_adults$asthma)[[1]]) +#str(fp_adults$asthma) -table(fp_adults$obesity) -table(fp_adults$obesity, fp_adults$asthma) +#table(fp_adults$obesity) +#table(fp_adults$obesity, fp_adults$asthma) -fp_adults_na = fp_adults[fp_adults$asthma == 0,] -table(fp_adults_na$obesity) -table(fp_adults_na$obesity, fp_adults_na$asthma) +#fp_adults_na = fp_adults[fp_adults$asthma == 0,] +#table(fp_adults_na$obesity) +#table(fp_adults_na$obesity, fp_adults_na$asthma) #============ # hc @@ -120,4 +119,13 @@ table(fp_adults_na$obesity, fp_adults_na$asthma) #hc_data<- read.csv("/home/backup/MOSAIC/MEDIATOR_Data/master_file/Mediators_for_HC.csv") #str(hc_data) #table(hc_data$Timepoint, hc_data$Sample) -######################################################################## \ No newline at end of file +######################################################################## + +# quick sanity checks +table(adult_df$ia_exac_copd==1 & adult_df$asthma == 1) # check this is 4 + +table(fp_adults$ia_exac_copd==1 & fp_adults$asthma == 1) # check this is 3 + +# clear unnecessary variables +rm(metadata_all) +rm(all_df, adult_df)