separated clinical data processing from reg

2020-11-18 16:00:26 +00:00 · 2020-11-18 16:00:26 +00:00 · bb6e92fa0f
commit bb6e92fa0f
parent 9dbad32504
2 changed files with 133 additions and 122 deletions
--- a/data_extraction_formatting_clinical.R
+++ b/data_extraction_formatting_clinical.R
@ -12,196 +12,207 @@ getwd()
 source("read_data.R")
 source("reg_cols_extraction.R")
 ########################################################################
-#==========
-# 
-#==========
-# extract the flu positive population
-fp_adults = adult_df[adult_df$flustat == 1,]
-
-########################################################################
-table(adult_df$ia_exac_copd)
+# quick sanity checks
 table(adult_df$ia_exac_copd==1 & adult_df$asthma == 1) # check this is 4 

 table(fp_adults$ia_exac_copd==1 & fp_adults$asthma == 1) # check this is 3

 # clear unnecessary variables
-rm(all_df)
-rm(adult_df)
-
+rm(all_df, adult_df, metadata_all)

 ########################################################################
-reg_data = fp_adults[, cols_to_extract]
+# Clinical_data extraction
+########################################################################
+cat("\nExtracting:", length(clinical_cols), "cols from fp_adults")
+
+clinical_df = fp_adults[, clinical_cols]

 # sanity checks
-table(reg_data$obesity)
-#table(reg_data$obese2)
+if ( all(c(sum(table(clinical_df$obesity)), sum(table(clinical_df$age>=18)), sum(table(clinical_df$death)), sum(table(clinical_df$asthma))) == nrow(clinical_df)) ){ # table() drops NAs, so each count must equal nrow; compare ALL four (`==` binds tighter than `&`)
+  cat("\nPASS: binary data obs are complete, n =", nrow(clinical_df))
+}else{
+  cat("\nFAIL: Incomplete data for binary outcomes. Please check and decide!")
+  quit()
+}

-table(reg_data$age>=18)
-table(reg_data$death)
-table(reg_data$asthma)
-table(reg_data$ia_exac_copd)
+table(clinical_df$ia_exac_copd)

 ########################################################################
 # Reassign the copd and asthma status and do some checks 
-table(reg_data$ia_exac_copd); sum(is.na(reg_data$ia_exac_copd))
+table(clinical_df$ia_exac_copd); sum(is.na(clinical_df$ia_exac_copd))

-reg_data$ia_exac_copd[reg_data$ia_exac_copd< 1]<- 0
-reg_data$ia_exac_copd[is.na(reg_data$ia_exac_copd)] <- 0
+clinical_df$ia_exac_copd[clinical_df$ia_exac_copd< 1]<- 0
+clinical_df$ia_exac_copd[is.na(clinical_df$ia_exac_copd)] <- 0

-table(reg_data$ia_exac_copd); sum(is.na(reg_data$ia_exac_copd))
+table(clinical_df$ia_exac_copd); sum(is.na(clinical_df$ia_exac_copd))

 # check copd and asthma status
-table(reg_data$ia_exac_copd, reg_data$asthma)
-check_copd_and_asthma_1<- subset(reg_data, ia_exac_copd ==1 & asthma == 1) # check this is 3 
+table(clinical_df$ia_exac_copd, clinical_df$asthma)
+check_copd_and_asthma_1<- subset(clinical_df, ia_exac_copd ==1 & asthma == 1) # check this is 3 

-# reassign these 4 so these are treated as non-asthmatics as copd  with asthma is not TRUE asthma
-reg_data$asthma[reg_data$ia_exac_copd == 1 & reg_data$asthma == 1]= 0 
-table(reg_data$ia_exac_copd, reg_data$asthma)
+# reassign these 3 so these are treated as non-asthmatics as copd with asthma is NOT TRUE asthma
+clinical_df$asthma[clinical_df$ia_exac_copd == 1 & clinical_df$asthma == 1]= 0 
+table(clinical_df$ia_exac_copd, clinical_df$asthma)

-foo<- subset(reg_data, asthma==1 & ia_exac_copd ==1) # check that its 0
+foo<- subset(clinical_df, asthma==1 & ia_exac_copd ==1) # check that its 0

 rm(check_copd_and_asthma_1, foo)
 #=====================================================================
 # count the resp scores 
-max_resp_score_table<- table(reg_data$max_resp_score)
+max_resp_score_table<- table(clinical_df$max_resp_score)
 max_resp_score_table

-T1_resp_score_table<- table(reg_data$T1_resp_score)
+T1_resp_score_table<- table(clinical_df$T1_resp_score)
 T1_resp_score_table

-T2_resp_score_table<- table(reg_data$T2_resp_score)
+T2_resp_score_table<- table(clinical_df$T2_resp_score)
 T2_resp_score_table

-Inresp_sev<- table(reg_data$inresp_sev)
+Inresp_sev<- table(clinical_df$inresp_sev)
 Inresp_sev

 # Reassign the resp score so all 4 are replaced by 3
-reg_data$max_resp_score[reg_data$max_resp_score ==4 ] <- 3
-revised_resp_score_table<- table(reg_data$max_resp_score)
+clinical_df$max_resp_score[clinical_df$max_resp_score ==4 ] <- 3
+revised_resp_score_table<- table(clinical_df$max_resp_score)
 revised_resp_score_table

-reg_data$T1_resp_score[reg_data$T1_resp_score ==4 ] <- 3
-revised_T1_resp_score_table<- table(reg_data$T1_resp_score)
+clinical_df$T1_resp_score[clinical_df$T1_resp_score ==4 ] <- 3
+revised_T1_resp_score_table<- table(clinical_df$T1_resp_score)
 revised_T1_resp_score_table

-reg_data$T2_resp_score[reg_data$T2_resp_score == 4]<- 3
-revised_T2_resp_score_table<- table(reg_data$T2_resp_score)
+clinical_df$T2_resp_score[clinical_df$T2_resp_score == 4]<- 3
+revised_T2_resp_score_table<- table(clinical_df$T2_resp_score)
 revised_T2_resp_score_table

-reg_data$inresp_sev[reg_data$inresp_sev == 4]<- 3
-revised_Inresp_sev<- table(reg_data$inresp_sev)
+clinical_df$inresp_sev[clinical_df$inresp_sev == 4]<- 3
+revised_Inresp_sev<- table(clinical_df$inresp_sev)
 revised_Inresp_sev
 #=====================================================================
 # Remove these after checking
 rm(max_resp_score_table, T1_resp_score_table, T2_resp_score_table, Inresp_sev
   , revised_resp_score_table, revised_T1_resp_score_table, revised_T2_resp_score_table, revised_Inresp_sev)
 #=====================================================================
+# Binning
+# "(": not inclusive
+# "]": inclusive

-
-##### age
+#========
+# age
+#========
 # Create categories of variables
-reg_data$age = round(reg_data$age, digits = 0)
-table(reg_data$age)
-table(reg_data$asthma, reg_data$age)
-min(reg_data$age); max(reg_data$age)
+clinical_df$age = round(clinical_df$age, digits = 0)
+table(clinical_df$age)
+table(clinical_df$asthma, clinical_df$age)
+min(clinical_df$age); max(clinical_df$age)

-library(plyr)
-max_age_interval = round_any(max(reg_data$age), 10, f = ceiling)  
+max_age_interval = round_any(max(clinical_df$age), 10, f = ceiling)  
 max_age_interval 
+min_age = min(clinical_df$age); min_age #19
+min_age_interval = min_age - 1; min_age_interval

-#age_bins = cut(reg_data$age, c(0,18,30,40,50,60,70,80,90))
-age_bins = cut(reg_data$age, c(18, 30, 40, 50, 60, 70, max_age_interval))
-reg_data$age_bins = age_bins
-dim(reg_data) # 133 27
+#age_bins = cut(clinical_df$age, c(0,18,30,40,50,60,70,80,90))
+age_bins = cut(clinical_df$age, c(min_age_interval, 30, 40, 50, 60, 70, max_age_interval))
+clinical_df$age_bins = age_bins
+dim(clinical_df) # 133 27

-#age_bins (to keep consistent with the results table)
-class(reg_data$age_bins)
-levels(reg_data$age_bins)
+# age_bins (to keep consistent with the results table)
+class(clinical_df$age_bins)
+levels(clinical_df$age_bins)
 #"(18,30]" "(30,40]" "(40,50]" "(50,60]" "(60,70]" "(70,80]"
-table(reg_data$asthma, reg_data$age_bins)
+table(clinical_df$asthma, clinical_df$age_bins)
 #     (18,30] (30,40] (40,50] (50,60] (60,70] (70,80]
-#0      25      17      23      14      10       1
-#1      11       8      14       5       3      2
+#0      25      17      25      14      11      1
+#1      11       8      12       5       3      2

-sum(table(reg_data$asthma, reg_data$age_bins)) == nrow(reg_data)
+if (sum(table(clinical_df$asthma, clinical_df$age_bins)) == nrow(clinical_df) ){
+  cat("\nPASS: age_bins assigned successfully")
+}else{
+  cat("\nFAIL: no. mismatch when assigning age_bins")
+  quit()
+}

-#reassign
-levels(reg_data$age_bins) <- c("(18,30]","(30,40]","(40,50]","(50,80]","(50,80]","(50,80]")
-table(reg_data$asthma, reg_data$age_bins)
-table(reg_data$asthma, reg_data$age_bins)
-#(18,30] (30,40] (40,50] (50,60]
-#0      25      17      23      25
-#1      11       8      14      10
+# reassign
+class(clinical_df$age_bins)
+levels(clinical_df$age_bins) <- c("(18,30]","(30,40]","(40,50]","(50,80]","(50,80]","(50,80]")
+table(clinical_df$asthma, clinical_df$age_bins)
+table(clinical_df$asthma, clinical_df$age_bins)
+#     (18,30] (30,40] (40,50] (50,80]
+#0      25      17      25      26
+#1      11       8      12      9

-sum(table(reg_data$asthma, reg_data$age_bins)) == nrow(reg_data)
+sum(table(clinical_df$asthma, clinical_df$age_bins)) == nrow(clinical_df)

-##### O2 saturation binning
-reg_data$o2_sat_admis = round(reg_data$o2_sat_admis, digits = 0)
-table(reg_data$o2_sat_admis)
-tot_o2 = sum(table(reg_data$o2_sat_admis))- table(reg_data$o2_sat_admis)[["-1"]]
+#===========================
+# O2 saturation binning
+#===========================
+clinical_df$o2_sat_admis = round(clinical_df$o2_sat_admis, digits = 0)
+table(clinical_df$o2_sat_admis)
+tot_o2 = sum(table(clinical_df$o2_sat_admis))- table(clinical_df$o2_sat_admis)[["-1"]]
 tot_o2

-o2_sat_bin = cut(reg_data$o2_sat_admis, c(0,92,100))
-reg_data$o2_sat_bin = o2_sat_bin
-table(reg_data$o2_sat_bin)
+o2_sat_bin = cut(clinical_df$o2_sat_admis, c(0,92,100))
+clinical_df$o2_sat_bin = o2_sat_bin
+table(clinical_df$o2_sat_bin)

-sum(table(reg_data$o2_sat_bin)) == tot_o2
+sum(table(clinical_df$o2_sat_bin)) == tot_o2

-##### Onset to initial binning = "(==not inclusive)
-max_in = max(reg_data$onset_2_initial); max_in #23
-min_in = min(reg_data$onset_2_initial) - 1 ; min_in  # -6
+#===========================
+# Onset to initial binning
+#===========================
+max_in = max(clinical_df$onset_2_initial); max_in #23
+min_in = min(clinical_df$onset_2_initial) - 1 ; min_in  # -6

-tot_onset2ini = sum(table(reg_data$onset_2_initial))
+tot_onset2ini = sum(table(clinical_df$onset_2_initial))
 tot_onset2ini

-onset_initial_bin = cut(reg_data$onset_2_initial, c(min_in, 4, max_in))
-reg_data$onset_initial_bin = onset_initial_bin
+onset_initial_bin = cut(clinical_df$onset_2_initial, c(min_in, 4, max_in))
+clinical_df$onset_initial_bin = onset_initial_bin

-sum(table(reg_data$onset_initial_bin)) == tot_onset2ini
+sum(table(clinical_df$onset_initial_bin)) == tot_onset2ini

 #=======================
 # seasonal flu: sfluv
 #=======================
 # should be a factor
-if (! is.factor(reg_data$sfluv)){
-  reg_data$sfluv  = as.factor(reg_data$sfluv)
+if (! is.factor(clinical_df$sfluv)){
+  clinical_df$sfluv  = as.factor(clinical_df$sfluv)
 }
-class(reg_data$sfluv) #[1] "factor"
+class(clinical_df$sfluv) #[1] "factor"

-levels(reg_data$sfluv)
-table(reg_data$asthma, reg_data$sfluv)
+levels(clinical_df$sfluv)
+table(clinical_df$asthma, clinical_df$sfluv)
 # reassign
-levels(reg_data$sfluv) <- c("0", "0", "1")
-table(reg_data$asthma, reg_data$sfluv)
+levels(clinical_df$sfluv) <- c("0", "0", "1")
+table(clinical_df$asthma, clinical_df$sfluv)

 #=======================
 # h1n1v
 #=======================
 # should be a factor
-if (! is.factor(reg_data$h1n1v)){
-  reg_data$h1n1v  = as.factor(reg_data$h1n1v)
+if (! is.factor(clinical_df$h1n1v)){
+  clinical_df$h1n1v  = as.factor(clinical_df$h1n1v)
 }
-class(reg_data$h1n1v) #[1] "factor"
+class(clinical_df$h1n1v) #[1] "factor"

-levels(reg_data$h1n1v)
-table(reg_data$asthma, reg_data$h1n1v)
+levels(clinical_df$h1n1v)
+table(clinical_df$asthma, clinical_df$h1n1v)
 # reassign
-levels(reg_data$h1n1v) <- c("0", "0", "1")
-table(reg_data$asthma, reg_data$h1n1v)
+levels(clinical_df$h1n1v) <- c("0", "0", "1")
+table(clinical_df$asthma, clinical_df$h1n1v)

 #=======================
 # ethnicity
 #=======================
-class(reg_data$ethnicity) # integer
-table(reg_data$asthma, reg_data$ethnicity)
+class(clinical_df$ethnicity) # integer
+table(clinical_df$asthma, clinical_df$ethnicity)

-reg_data$ethnicity[reg_data$ethnicity == 4] <- 2
-table(reg_data$asthma, reg_data$ethnicity)
+clinical_df$ethnicity[clinical_df$ethnicity == 4] <- 2
+table(clinical_df$asthma, clinical_df$ethnicity)

 #=======================
 # pneumonia
 #=======================
-class(reg_data$ia_cxr) # integer
+class(clinical_df$ia_cxr) # integer
 # ia_cxr 2 ---> yes pneumonia (1)
 # 1 ---> no (0)
 # ! 1 or 2 -- > "unknown"
@ -216,27 +227,27 @@ class(reg_data$ia_cxr) # integer
 #-3: unknown specified by clinician


-table(reg_data$ia_cxr)
+table(clinical_df$ia_cxr)
 #-3 -1  0  1  2  3 
 #5 48 13 47 17  3 

 # change these first else recoding 0 will be a problem as 0 already exists, mind you -2 categ doesn't exist
-reg_data$ia_cxr[reg_data$ia_cxr == -3 | reg_data$ia_cxr == -1 | reg_data$ia_cxr == 0 | reg_data$ia_cxr == 3 ] <- ""
-table(reg_data$ia_cxr)
+clinical_df$ia_cxr[clinical_df$ia_cxr == -3 | clinical_df$ia_cxr == -1 | clinical_df$ia_cxr == 0 | clinical_df$ia_cxr == 3 ] <- ""
+table(clinical_df$ia_cxr)
 #    1  2 
 #69 47 17

-reg_data$ia_cxr[reg_data$ia_cxr == 1] <- 0
-reg_data$ia_cxr[reg_data$ia_cxr == 2] <- 1
-table(reg_data$ia_cxr)
+clinical_df$ia_cxr[clinical_df$ia_cxr == 1] <- 0
+clinical_df$ia_cxr[clinical_df$ia_cxr == 2] <- 1
+table(clinical_df$ia_cxr)
 #    0  1 
 #69 47 17 

 #=======================
 # smoking [tricky one]
 #=======================
-class(reg_data$smoking) # integer
-table(reg_data$asthma, reg_data$smoking)
+class(clinical_df$smoking) # integer
+table(clinical_df$asthma, clinical_df$smoking)

 # orig
 #   -3 -1  1  2  3   4
@ -261,20 +272,20 @@ table(reg_data$asthma, reg_data$smoking)
 #-2: n/a specified by the clinician =====> categ blank (NA)
 #-3: unknown specified by clinician=====> categ blank (NA)

-table(reg_data$smoking)
+table(clinical_df$smoking)
 #-3 -1  1  2  3  4 
 #19 11 35  2 19 47 

 # reassign the smoking codes
-reg_data$smoking[reg_data$smoking == 4 | reg_data$smoking == 2 ] <- 0
-reg_data$smoking[reg_data$smoking == 1 | reg_data$smoking == 3 ] <- 1
-reg_data$smoking[reg_data$smoking == -1 | reg_data$smoking == -2 | reg_data$smoking == -3 ] <- ""
+clinical_df$smoking[clinical_df$smoking == 4 | clinical_df$smoking == 2 ] <- 0
+clinical_df$smoking[clinical_df$smoking == 1 | clinical_df$smoking == 3 ] <- 1
+clinical_df$smoking[clinical_df$smoking == -1 | clinical_df$smoking == -2 | clinical_df$smoking == -3 ] <- ""

-table(reg_data$smoking)
+table(clinical_df$smoking)
 #    0  1 
 #30 49 54  

-table(reg_data$asthma, reg_data$smoking)
+table(clinical_df$asthma, clinical_df$smoking)

 # orig
 #      0  1
@ -289,12 +300,12 @@ table(reg_data$asthma, reg_data$smoking)
 #==================
 # writing output file
 #==================
-outfile_name_reg = "reg_data_recoded_with_NA.csv"
+outfile_name_reg = "clinical_df_recoded.csv"
 outfile_reg = paste0(outdir, outfile_name_reg)

-cat("Writing clinical file for regression:", outfile_reg)
+cat("\nWriting clinical file for regression:", outfile_reg)

-#write.csv(reg_data, file = outfile_reg)
+#write.csv(clinical_df, file = outfile_reg)
 ################################################################

 rm(age_bins, max_age_interval, max_in, min_in, o2_sat_bin, onset_initial_bin, tot_o2, tot_onset2ini, meta_data_cols)
--- a/read_data.R
+++ b/read_data.R
@ -50,9 +50,9 @@ metadata_all = all_df[, meta_data_cols]
 adult_df = all_df[all_df$age>=18,]

 if (table(adult_df$adult == 1)[[1]] == nrow(adult_df) ){
-  cat ("PASS: adult df extracted successfully")
+  cat ("\nPASS: adult df extracted successfully")
 } else{
-  cat ("FAIL: adult df number mismatch!")
+  cat ("\nFAIL: adult df number mismatch!")
 }

 #==============
@ -62,9 +62,9 @@ if (table(adult_df$adult == 1)[[1]] == nrow(adult_df) ){
 fp_adults = adult_df[adult_df$flustat == 1,]

 if (table(fp_adults$flustat == 1)[[1]] == nrow(fp_adults) ){
-  cat ("PASS: adult df extracted successfully")
+  cat ("\nPASS: flu positive adult df extracted successfully")
 } else{
-  cat ("FAIL: adult df number mismatch!")
+  cat ("\nFAIL: flu positive adult df number mismatch!")
 }

 #============