diff --git a/data_extraction_formatting_clinical.R b/data_extraction_formatting_clinical.R index cff9075..8ba6f7a 100644 --- a/data_extraction_formatting_clinical.R +++ b/data_extraction_formatting_clinical.R @@ -12,196 +12,207 @@ getwd() source("read_data.R") source("reg_cols_extraction.R") ######################################################################## -#========== -# -#========== -# extract the flu positive population -fp_adults = adult_df[adult_df$flustat == 1,] - -######################################################################## -table(adult_df$ia_exac_copd) +# quick sanity checks table(adult_df$ia_exac_copd==1 & adult_df$asthma == 1) # check this is 4 table(fp_adults$ia_exac_copd==1 & fp_adults$asthma == 1) # check this is 3 # clear unnecessary variables -rm(all_df) -rm(adult_df) - +rm(all_df, adult_df, metadata_all) ######################################################################## -reg_data = fp_adults[, cols_to_extract] +# Clinical_data extraction +######################################################################## +cat("\nExtracting:", length(clinical_cols), "cols from fp_adults") + +clinical_df = fp_adults[, clinical_cols] # sanity checks -table(reg_data$obesity) -#table(reg_data$obese2) +if ( sum(table(clinical_df$obesity)) & sum(table(clinical_df$age>=18)) & sum(table(clinical_df$death)) & sum(table(clinical_df$asthma)) == nrow(clinical_df) ){ + cat("\nPASS: binary data obs are complete, n =", nrow(clinical_df)) +}else{ + cat("\nFAIL: Incomplete data for binary outcomes. Please check and decide!") + quit() +} -table(reg_data$age>=18) -table(reg_data$death) -table(reg_data$asthma) -table(reg_data$ia_exac_copd) +table(clinical_df$ia_exac_copd) ######################################################################## # Reassign the copd and asthma status and do some checks -table(reg_data$ia_exac_copd); sum(is.na(reg_data$ia_exac_copd)) +table(clinical_df$ia_exac_copd); sum(is.na(clinical_df$ia_exac_copd)) -reg_data$ia_exac_copd[reg_data$ia_exac_copd< 1]<- 0 -reg_data$ia_exac_copd[is.na(reg_data$ia_exac_copd)] <- 0 +clinical_df$ia_exac_copd[clinical_df$ia_exac_copd< 1]<- 0 +clinical_df$ia_exac_copd[is.na(clinical_df$ia_exac_copd)] <- 0 -table(reg_data$ia_exac_copd); sum(is.na(reg_data$ia_exac_copd)) +table(clinical_df$ia_exac_copd); sum(is.na(clinical_df$ia_exac_copd)) # check copd and asthma status -table(reg_data$ia_exac_copd, reg_data$asthma) -check_copd_and_asthma_1<- subset(reg_data, ia_exac_copd ==1 & asthma == 1) # check this is 3 +table(clinical_df$ia_exac_copd, clinical_df$asthma) +check_copd_and_asthma_1<- subset(clinical_df, ia_exac_copd ==1 & asthma == 1) # check this is 3 -# reassign these 4 so these are treated as non-asthmatics as copd with asthma is not TRUE asthma -reg_data$asthma[reg_data$ia_exac_copd == 1 & reg_data$asthma == 1]= 0 -table(reg_data$ia_exac_copd, reg_data$asthma) +# reassign these 3 so these are treated as non-asthmatics as copd with asthma is NOT TRUE asthma +clinical_df$asthma[clinical_df$ia_exac_copd == 1 & clinical_df$asthma == 1]= 0 +table(clinical_df$ia_exac_copd, clinical_df$asthma) -foo<- subset(reg_data, asthma==1 & ia_exac_copd ==1) # check that its 0 +foo<- subset(clinical_df, asthma==1 & ia_exac_copd ==1) # check that its 0 rm(check_copd_and_asthma_1, foo) #===================================================================== # count the resp scores -max_resp_score_table<- table(reg_data$max_resp_score) +max_resp_score_table<- table(clinical_df$max_resp_score) max_resp_score_table -T1_resp_score_table<- table(reg_data$T1_resp_score) +T1_resp_score_table<- table(clinical_df$T1_resp_score) T1_resp_score_table -T2_resp_score_table<- table(reg_data$T2_resp_score) +T2_resp_score_table<- table(clinical_df$T2_resp_score) T2_resp_score_table -Inresp_sev<- table(reg_data$inresp_sev) +Inresp_sev<- table(clinical_df$inresp_sev) Inresp_sev # Reassign the resp score so all 4 are replace by 3 -reg_data$max_resp_score[reg_data$max_resp_score ==4 ] <- 3 -revised_resp_score_table<- table(reg_data$max_resp_score) +clinical_df$max_resp_score[clinical_df$max_resp_score ==4 ] <- 3 +revised_resp_score_table<- table(clinical_df$max_resp_score) revised_resp_score_table -reg_data$T1_resp_score[reg_data$T1_resp_score ==4 ] <- 3 -revised_T1_resp_score_table<- table(reg_data$T1_resp_score) +clinical_df$T1_resp_score[clinical_df$T1_resp_score ==4 ] <- 3 +revised_T1_resp_score_table<- table(clinical_df$T1_resp_score) revised_T1_resp_score_table -reg_data$T2_resp_score[reg_data$T2_resp_score == 4]<- 3 -revised_T2_resp_score_table<- table(reg_data$T2_resp_score) +clinical_df$T2_resp_score[clinical_df$T2_resp_score == 4]<- 3 +revised_T2_resp_score_table<- table(clinical_df$T2_resp_score) revised_T2_resp_score_table -reg_data$inresp_sev[reg_data$inresp_sev == 4]<- 3 -revised_Inresp_sev<- table(reg_data$inresp_sev) +clinical_df$inresp_sev[clinical_df$inresp_sev == 4]<- 3 +revised_Inresp_sev<- table(clinical_df$inresp_sev) revised_Inresp_sev #===================================================================== # Remove these after checking rm(max_resp_score_table, T1_resp_score_table, T2_resp_score_table, Inresp_sev , revised_resp_score_table, revised_T1_resp_score_table, revised_T2_resp_score_table, revised_Inresp_sev) #===================================================================== +# Binning +# "(": not inclusive +# "]": inclusive - -##### age +#======== +# age +#======== # Create categories of variables -reg_data$age = round(reg_data$age, digits = 0) -table(reg_data$age) -table(reg_data$asthma, reg_data$age) -min(reg_data$age); max(reg_data$age) +clinical_df$age = round(clinical_df$age, digits = 0) +table(clinical_df$age) +table(clinical_df$asthma, clinical_df$age) +min(clinical_df$age); max(clinical_df$age) -library(plyr) -max_age_interval = round_any(max(reg_data$age), 10, f = ceiling) +max_age_interval = round_any(max(clinical_df$age), 10, f = ceiling) max_age_interval +min_age = min(clinical_df$age); min_age #19 +min_age_interval = min_age - 1; min_age_interval -#age_bins = cut(reg_data$age, c(0,18,30,40,50,60,70,80,90)) -age_bins = cut(reg_data$age, c(18, 30, 40, 50, 60, 70, max_age_interval)) -reg_data$age_bins = age_bins -dim(reg_data) # 133 27 +#age_bins = cut(clinical_df$age, c(0,18,30,40,50,60,70,80,90)) +age_bins = cut(clinical_df$age, c(min_age_interval, 30, 40, 50, 60, 70, max_age_interval)) +clinical_df$age_bins = age_bins +dim(clinical_df) # 133 27 -#age_bins (to keep consistent with the results table) -class(reg_data$age_bins) -levels(reg_data$age_bins) +# age_bins (to keep consistent with the results table) +class(clinical_df$age_bins) +levels(clinical_df$age_bins) #"(18,30]" "(30,40]" "(40,50]" "(50,60]" "(60,70]" "(70,80]" -table(reg_data$asthma, reg_data$age_bins) +table(clinical_df$asthma, clinical_df$age_bins) # (18,30] (30,40] (40,50] (50,60] (60,70] (70,80] -#0 25 17 23 14 10 1 -#1 11 8 14 5 3 2 +#0 25 17 25 14 11 1 +#1 11 8 12 5 3 2 -sum(table(reg_data$asthma, reg_data$age_bins)) == nrow(reg_data) +if (sum(table(clinical_df$asthma, clinical_df$age_bins)) == nrow(clinical_df) ){ + cat("\nPASS: age_bins assigned successfully") +}else{ + cat("\nFAIL: no. mismatch when assigning age_bins") + quit() +} -#reassign -levels(reg_data$age_bins) <- c("(18,30]","(30,40]","(40,50]","(50,80]","(50,80]","(50,80]") -table(reg_data$asthma, reg_data$age_bins) -table(reg_data$asthma, reg_data$age_bins) -#(18,30] (30,40] (40,50] (50,60] -#0 25 17 23 25 -#1 11 8 14 10 +# reassign +class(clinical_df$age_bins) +levels(clinical_df$age_bins) <- c("(18,30]","(30,40]","(40,50]","(50,80]","(50,80]","(50,80]") +table(clinical_df$asthma, clinical_df$age_bins) +table(clinical_df$asthma, clinical_df$age_bins) +# (18,30] (30,40] (40,50] (50,80] +#0 25 17 25 26 +#1 11 8 12 9 -sum(table(reg_data$asthma, reg_data$age_bins)) == nrow(reg_data) +sum(table(clinical_df$asthma, clinical_df$age_bins)) == nrow(clinical_df) -##### O2 saturation binning -reg_data$o2_sat_admis = round(reg_data$o2_sat_admis, digits = 0) -table(reg_data$o2_sat_admis) -tot_o2 = sum(table(reg_data$o2_sat_admis))- table(reg_data$o2_sat_admis)[["-1"]] +#=========================== +# O2 saturation binning +#=========================== +clinical_df$o2_sat_admis = round(clinical_df$o2_sat_admis, digits = 0) +table(clinical_df$o2_sat_admis) +tot_o2 = sum(table(clinical_df$o2_sat_admis))- table(clinical_df$o2_sat_admis)[["-1"]] tot_o2 -o2_sat_bin = cut(reg_data$o2_sat_admis, c(0,92,100)) -reg_data$o2_sat_bin = o2_sat_bin -table(reg_data$o2_sat_bin) +o2_sat_bin = cut(clinical_df$o2_sat_admis, c(0,92,100)) +clinical_df$o2_sat_bin = o2_sat_bin +table(clinical_df$o2_sat_bin) -sum(table(reg_data$o2_sat_bin)) == tot_o2 +sum(table(clinical_df$o2_sat_bin)) == tot_o2 -##### Onset to initial binning = "(==not inclusive) -max_in = max(reg_data$onset_2_initial); max_in #23 -min_in = min(reg_data$onset_2_initial) - 1 ; min_in # -6 +#=========================== +# Onset to initial binning +#=========================== +max_in = max(clinical_df$onset_2_initial); max_in #23 +min_in = min(clinical_df$onset_2_initial) - 1 ; min_in # -6 -tot_onset2ini = sum(table(reg_data$onset_2_initial)) +tot_onset2ini = sum(table(clinical_df$onset_2_initial)) tot_onset2ini -onset_initial_bin = cut(reg_data$onset_2_initial, c(min_in, 4, max_in)) -reg_data$onset_initial_bin = onset_initial_bin +onset_initial_bin = cut(clinical_df$onset_2_initial, c(min_in, 4, max_in)) +clinical_df$onset_initial_bin = onset_initial_bin -sum(table(reg_data$onset_initial_bin)) == tot_onset2ini +sum(table(clinical_df$onset_initial_bin)) == tot_onset2ini #======================= # seasonal flu: sfluv #======================= # should be a factor -if (! is.factor(reg_data$sfluv)){ - reg_data$sfluv = as.factor(reg_data$sfluv) +if (! is.factor(clinical_df$sfluv)){ + clinical_df$sfluv = as.factor(clinical_df$sfluv) } -class(reg_data$sfluv) #[1] "factor" +class(clinical_df$sfluv) #[1] "factor" -levels(reg_data$sfluv) -table(reg_data$asthma, reg_data$sfluv) +levels(clinical_df$sfluv) +table(clinical_df$asthma, clinical_df$sfluv) # reassign -levels(reg_data$sfluv) <- c("0", "0", "1") -table(reg_data$asthma, reg_data$sfluv) +levels(clinical_df$sfluv) <- c("0", "0", "1") +table(clinical_df$asthma, clinical_df$sfluv) #======================= # h1n1v #======================= # should be a factor -if (! is.factor(reg_data$h1n1v)){ - reg_data$h1n1v = as.factor(reg_data$h1n1v) +if (! is.factor(clinical_df$h1n1v)){ + clinical_df$h1n1v = as.factor(clinical_df$h1n1v) } -class(reg_data$h1n1v) #[1] "factor" +class(clinical_df$h1n1v) #[1] "factor" -levels(reg_data$h1n1v) -table(reg_data$asthma, reg_data$h1n1v) +levels(clinical_df$h1n1v) +table(clinical_df$asthma, clinical_df$h1n1v) # reassign -levels(reg_data$h1n1v) <- c("0", "0", "1") -table(reg_data$asthma, reg_data$h1n1v) +levels(clinical_df$h1n1v) <- c("0", "0", "1") +table(clinical_df$asthma, clinical_df$h1n1v) #======================= # ethnicity #======================= -class(reg_data$ethnicity) # integer -table(reg_data$asthma, reg_data$ethnicity) +class(clinical_df$ethnicity) # integer +table(clinical_df$asthma, clinical_df$ethnicity) -reg_data$ethnicity[reg_data$ethnicity == 4] <- 2 -table(reg_data$asthma, reg_data$ethnicity) +clinical_df$ethnicity[clinical_df$ethnicity == 4] <- 2 +table(clinical_df$asthma, clinical_df$ethnicity) #======================= # pneumonia #======================= -class(reg_data$ia_cxr) # integer +class(clinical_df$ia_cxr) # integer # ia_cxr 2 ---> yes pneumonia (1) # 1 ---> no (0) # ! 1 or 2 -- > "unkown" @@ -216,27 +227,27 @@ class(reg_data$ia_cxr) # integer #-3: unknown specified by clinician -table(reg_data$ia_cxr) +table(clinical_df$ia_cxr) #-3 -1 0 1 2 3 #5 48 13 47 17 3 # change these first else recoding 0 will be a problem as 0 already exists, mind you -2 categ doesn't exist -reg_data$ia_cxr[reg_data$ia_cxr == -3 | reg_data$ia_cxr == -1 | reg_data$ia_cxr == 0 | reg_data$ia_cxr == 3 ] <- "" -table(reg_data$ia_cxr) +clinical_df$ia_cxr[clinical_df$ia_cxr == -3 | clinical_df$ia_cxr == -1 | clinical_df$ia_cxr == 0 | clinical_df$ia_cxr == 3 ] <- "" +table(clinical_df$ia_cxr) # 1 2 #69 47 17 -reg_data$ia_cxr[reg_data$ia_cxr == 1] <- 0 -reg_data$ia_cxr[reg_data$ia_cxr == 2] <- 1 -table(reg_data$ia_cxr) +clinical_df$ia_cxr[clinical_df$ia_cxr == 1] <- 0 +clinical_df$ia_cxr[clinical_df$ia_cxr == 2] <- 1 +table(clinical_df$ia_cxr) # 0 1 #69 47 17 #======================= # smoking [tricky one] #======================= -class(reg_data$smoking) # integer -table(reg_data$asthma, reg_data$smoking) +class(clinical_df$smoking) # integer +table(clinical_df$asthma, clinical_df$smoking) # orig # -3 -1 1 2 3 4 @@ -261,20 +272,20 @@ table(reg_data$asthma, reg_data$smoking) #-2: n/a specified by the clinician =====> categ blank (NA) #-3: unknown specified by clinician=====> categ blank (NA) -table(reg_data$smoking) +table(clinical_df$smoking) #-3 -1 1 2 3 4 #19 11 35 2 19 47 # reassign the smoking codes -reg_data$smoking[reg_data$smoking == 4 | reg_data$smoking == 2 ] <- 0 -reg_data$smoking[reg_data$smoking == 1 | reg_data$smoking == 3 ] <- 1 -reg_data$smoking[reg_data$smoking == -1 | reg_data$smoking == -2 | reg_data$smoking == -3 ] <- "" +clinical_df$smoking[clinical_df$smoking == 4 | clinical_df$smoking == 2 ] <- 0 +clinical_df$smoking[clinical_df$smoking == 1 | clinical_df$smoking == 3 ] <- 1 +clinical_df$smoking[clinical_df$smoking == -1 | clinical_df$smoking == -2 | clinical_df$smoking == -3 ] <- "" -table(reg_data$smoking) +table(clinical_df$smoking) # 0 1 #30 49 54 -table(reg_data$asthma, reg_data$smoking) +table(clinical_df$asthma, clinical_df$smoking) # orig # 0 1 @@ -289,12 +300,12 @@ table(reg_data$asthma, reg_data$smoking) #================== # writing output file #================== -outfile_name_reg = "reg_data_recoded_with_NA.csv" +outfile_name_reg = "clinical_df_recoded.csv" outfile_reg = paste0(outdir, outfile_name_reg) -cat("Writing clinical file for regression:", outfile_reg) +cat("\nWriting clinical file for regression:", outfile_reg) -#write.csv(reg_data, file = outfile_reg) +#write.csv(clinical_df, file = outfile_reg) ################################################################ rm(age_bins, max_age_interval, max_in, min_in, o2_sat_bin, onset_initial_bin, tot_o2, tot_onset2ini, meta_data_cols) diff --git a/read_data.R b/read_data.R index 803d051..614ed87 100755 --- a/read_data.R +++ b/read_data.R @@ -50,9 +50,9 @@ metadata_all = all_df[, meta_data_cols] adult_df = all_df[all_df$age>=18,] if (table(adult_df$adult == 1)[[1]] == nrow(adult_df) ){ - cat ("PASS: adult df extracted successfully") + cat ("\nPASS: adult df extracted successfully") } else{ - cat ("FAIL: adult df number mismatch!") + cat ("\nFAIL: adult df number mismatch!") } #============== @@ -62,9 +62,9 @@ if (table(adult_df$adult == 1)[[1]] == nrow(adult_df) ){ fp_adults = adult_df[adult_df$flustat == 1,] if (table(fp_adults$flustat == 1)[[1]] == nrow(fp_adults) ){ - cat ("PASS: adult df extracted successfully") + cat ("\nPASS: adult df extracted successfully") } else{ - cat ("FAIL: adult df number mismatch!") + cat ("\nFAIL: adult df number mismatch!") } #============