separated clinical data processing from reg
This commit is contained in:
parent
9dbad32504
commit
bb6e92fa0f
2 changed files with 133 additions and 122 deletions
|
@ -12,196 +12,207 @@ getwd()
|
|||
source("read_data.R")
|
||||
source("reg_cols_extraction.R")
|
||||
########################################################################
|
||||
#==========
|
||||
#
|
||||
#==========
|
||||
# extract the flu positive population
|
||||
fp_adults = adult_df[adult_df$flustat == 1,]
|
||||
|
||||
########################################################################
|
||||
table(adult_df$ia_exac_copd)
|
||||
# quick sanity checks
|
||||
table(adult_df$ia_exac_copd==1 & adult_df$asthma == 1) # check this is 4
|
||||
|
||||
table(fp_adults$ia_exac_copd==1 & fp_adults$asthma == 1) # check this is 3
|
||||
|
||||
# clear unnecessary variables
|
||||
rm(all_df)
|
||||
rm(adult_df)
|
||||
|
||||
rm(all_df, adult_df, metadata_all)
|
||||
|
||||
########################################################################
|
||||
reg_data = fp_adults[, cols_to_extract]
|
||||
# Clinical_data extraction
|
||||
########################################################################
|
||||
cat("\nExtracting:", length(clinical_cols), "cols from fp_adults")
|
||||
|
||||
clinical_df = fp_adults[, clinical_cols]
|
||||
|
||||
# sanity checks
|
||||
table(reg_data$obesity)
|
||||
#table(reg_data$obese2)
|
||||
# Completeness check for the binary variables obesity, age >= 18, death and
# asthma. table() leaves NA out of its counts, so sum(table(x)) is the number
# of non-missing observations of x; each count must equal nrow(clinical_df).
# Every count is compared explicitly because `a & b & c & d == n` parses as
# `a & b & c & (d == n)`, which only compares the last variable against n.
binary_obs_counts <- c(
  obesity = sum(table(clinical_df$obesity)),
  adult = sum(table(clinical_df$age >= 18)),
  death = sum(table(clinical_df$death)),
  asthma = sum(table(clinical_df$asthma))
)

if (all(binary_obs_counts == nrow(clinical_df))) {
  cat("\nPASS: binary data obs are complete, n =", nrow(clinical_df))
} else {
  cat("\nFAIL: Incomplete data for binary outcomes. Please check and decide!")
  # Terminate the R session so the incomplete data is not processed further.
  quit()
}

# Drop the temporary counts once the check has passed.
rm(binary_obs_counts)
|
||||
|
||||
table(reg_data$age>=18)
|
||||
table(reg_data$death)
|
||||
table(reg_data$asthma)
|
||||
table(reg_data$ia_exac_copd)
|
||||
table(clinical_df$ia_exac_copd)
|
||||
|
||||
########################################################################
|
||||
# Reassign the copd and asthma status and do some checks
|
||||
table(reg_data$ia_exac_copd); sum(is.na(reg_data$ia_exac_copd))
|
||||
table(clinical_df$ia_exac_copd); sum(is.na(clinical_df$ia_exac_copd))
|
||||
|
||||
reg_data$ia_exac_copd[reg_data$ia_exac_copd< 1]<- 0
|
||||
reg_data$ia_exac_copd[is.na(reg_data$ia_exac_copd)] <- 0
|
||||
clinical_df$ia_exac_copd[clinical_df$ia_exac_copd< 1]<- 0
|
||||
clinical_df$ia_exac_copd[is.na(clinical_df$ia_exac_copd)] <- 0
|
||||
|
||||
table(reg_data$ia_exac_copd); sum(is.na(reg_data$ia_exac_copd))
|
||||
table(clinical_df$ia_exac_copd); sum(is.na(clinical_df$ia_exac_copd))
|
||||
|
||||
# check copd and asthma status
|
||||
table(reg_data$ia_exac_copd, reg_data$asthma)
|
||||
check_copd_and_asthma_1<- subset(reg_data, ia_exac_copd ==1 & asthma == 1) # check this is 3
|
||||
table(clinical_df$ia_exac_copd, clinical_df$asthma)
|
||||
check_copd_and_asthma_1<- subset(clinical_df, ia_exac_copd ==1 & asthma == 1) # check this is 3
|
||||
|
||||
# reassign these 4 so these are treated as non-asthmatics as copd with asthma is not TRUE asthma
|
||||
reg_data$asthma[reg_data$ia_exac_copd == 1 & reg_data$asthma == 1]= 0
|
||||
table(reg_data$ia_exac_copd, reg_data$asthma)
|
||||
# reassign these 3 so these are treated as non-asthmatics as copd with asthma is NOT TRUE asthma
|
||||
clinical_df$asthma[clinical_df$ia_exac_copd == 1 & clinical_df$asthma == 1]= 0
|
||||
table(clinical_df$ia_exac_copd, clinical_df$asthma)
|
||||
|
||||
foo<- subset(reg_data, asthma==1 & ia_exac_copd ==1) # check that its 0
|
||||
foo<- subset(clinical_df, asthma==1 & ia_exac_copd ==1) # check that its 0
|
||||
|
||||
rm(check_copd_and_asthma_1, foo)
|
||||
#=====================================================================
|
||||
# count the resp scores
|
||||
max_resp_score_table<- table(reg_data$max_resp_score)
|
||||
max_resp_score_table<- table(clinical_df$max_resp_score)
|
||||
max_resp_score_table
|
||||
|
||||
T1_resp_score_table<- table(reg_data$T1_resp_score)
|
||||
T1_resp_score_table<- table(clinical_df$T1_resp_score)
|
||||
T1_resp_score_table
|
||||
|
||||
T2_resp_score_table<- table(reg_data$T2_resp_score)
|
||||
T2_resp_score_table<- table(clinical_df$T2_resp_score)
|
||||
T2_resp_score_table
|
||||
|
||||
Inresp_sev<- table(reg_data$inresp_sev)
|
||||
Inresp_sev<- table(clinical_df$inresp_sev)
|
||||
Inresp_sev
|
||||
|
||||
# Reassign the resp score so all 4 are replaced by 3
|
||||
reg_data$max_resp_score[reg_data$max_resp_score ==4 ] <- 3
|
||||
revised_resp_score_table<- table(reg_data$max_resp_score)
|
||||
clinical_df$max_resp_score[clinical_df$max_resp_score ==4 ] <- 3
|
||||
revised_resp_score_table<- table(clinical_df$max_resp_score)
|
||||
revised_resp_score_table
|
||||
|
||||
reg_data$T1_resp_score[reg_data$T1_resp_score ==4 ] <- 3
|
||||
revised_T1_resp_score_table<- table(reg_data$T1_resp_score)
|
||||
clinical_df$T1_resp_score[clinical_df$T1_resp_score ==4 ] <- 3
|
||||
revised_T1_resp_score_table<- table(clinical_df$T1_resp_score)
|
||||
revised_T1_resp_score_table
|
||||
|
||||
reg_data$T2_resp_score[reg_data$T2_resp_score == 4]<- 3
|
||||
revised_T2_resp_score_table<- table(reg_data$T2_resp_score)
|
||||
clinical_df$T2_resp_score[clinical_df$T2_resp_score == 4]<- 3
|
||||
revised_T2_resp_score_table<- table(clinical_df$T2_resp_score)
|
||||
revised_T2_resp_score_table
|
||||
|
||||
reg_data$inresp_sev[reg_data$inresp_sev == 4]<- 3
|
||||
revised_Inresp_sev<- table(reg_data$inresp_sev)
|
||||
clinical_df$inresp_sev[clinical_df$inresp_sev == 4]<- 3
|
||||
revised_Inresp_sev<- table(clinical_df$inresp_sev)
|
||||
revised_Inresp_sev
|
||||
#=====================================================================
|
||||
# Remove these after checking
|
||||
rm(max_resp_score_table, T1_resp_score_table, T2_resp_score_table, Inresp_sev
|
||||
, revised_resp_score_table, revised_T1_resp_score_table, revised_T2_resp_score_table, revised_Inresp_sev)
|
||||
#=====================================================================
|
||||
# Binning
|
||||
# "(": not inclusive
|
||||
# "]": inclusive
|
||||
|
||||
|
||||
##### age
|
||||
#========
|
||||
# age
|
||||
#========
|
||||
# Create categories of variables
|
||||
reg_data$age = round(reg_data$age, digits = 0)
|
||||
table(reg_data$age)
|
||||
table(reg_data$asthma, reg_data$age)
|
||||
min(reg_data$age); max(reg_data$age)
|
||||
clinical_df$age = round(clinical_df$age, digits = 0)
|
||||
table(clinical_df$age)
|
||||
table(clinical_df$asthma, clinical_df$age)
|
||||
min(clinical_df$age); max(clinical_df$age)
|
||||
|
||||
library(plyr)
|
||||
max_age_interval = round_any(max(reg_data$age), 10, f = ceiling)
|
||||
max_age_interval = round_any(max(clinical_df$age), 10, f = ceiling)
|
||||
max_age_interval
|
||||
min_age = min(clinical_df$age); min_age #19
|
||||
min_age_interval = min_age - 1; min_age_interval
|
||||
|
||||
#age_bins = cut(reg_data$age, c(0,18,30,40,50,60,70,80,90))
|
||||
age_bins = cut(reg_data$age, c(18, 30, 40, 50, 60, 70, max_age_interval))
|
||||
reg_data$age_bins = age_bins
|
||||
dim(reg_data) # 133 27
|
||||
#age_bins = cut(clinical_df$age, c(0,18,30,40,50,60,70,80,90))
|
||||
age_bins = cut(clinical_df$age, c(min_age_interval, 30, 40, 50, 60, 70, max_age_interval))
|
||||
clinical_df$age_bins = age_bins
|
||||
dim(clinical_df) # 133 27
|
||||
|
||||
#age_bins (to keep consistent with the results table)
|
||||
class(reg_data$age_bins)
|
||||
levels(reg_data$age_bins)
|
||||
# age_bins (to keep consistent with the results table)
|
||||
class(clinical_df$age_bins)
|
||||
levels(clinical_df$age_bins)
|
||||
#"(18,30]" "(30,40]" "(40,50]" "(50,60]" "(60,70]" "(70,80]"
|
||||
table(reg_data$asthma, reg_data$age_bins)
|
||||
table(clinical_df$asthma, clinical_df$age_bins)
|
||||
# (18,30] (30,40] (40,50] (50,60] (60,70] (70,80]
|
||||
#0 25 17 23 14 10 1
|
||||
#1 11 8 14 5 3 2
|
||||
#0 25 17 25 14 11 1
|
||||
#1 11 8 12 5 3 2
|
||||
|
||||
sum(table(reg_data$asthma, reg_data$age_bins)) == nrow(reg_data)
|
||||
# Confirm the age binning covered every row: the asthma x age_bins
# cross-tabulation leaves out NA cells, so its grand total only equals
# nrow(clinical_df) when no row is missing a bin (or an asthma value).
if (nrow(clinical_df) != sum(table(clinical_df$asthma, clinical_df$age_bins))) {
  cat("\nFAIL: no. mismatch when assigning age_bins")
  # Terminate the R session on a mismatch.
  quit()
} else {
  cat("\nPASS: age_bins assigned successfully")
}
|
||||
|
||||
#reassign
|
||||
levels(reg_data$age_bins) <- c("(18,30]","(30,40]","(40,50]","(50,80]","(50,80]","(50,80]")
|
||||
table(reg_data$asthma, reg_data$age_bins)
|
||||
table(reg_data$asthma, reg_data$age_bins)
|
||||
#(18,30] (30,40] (40,50] (50,60]
|
||||
#0 25 17 23 25
|
||||
#1 11 8 14 10
|
||||
# reassign
|
||||
class(clinical_df$age_bins)
|
||||
levels(clinical_df$age_bins) <- c("(18,30]","(30,40]","(40,50]","(50,80]","(50,80]","(50,80]")
|
||||
table(clinical_df$asthma, clinical_df$age_bins)
|
||||
table(clinical_df$asthma, clinical_df$age_bins)
|
||||
# (18,30] (30,40] (40,50] (50,80]
|
||||
#0 25 17 25 26
|
||||
#1 11 8 12 9
|
||||
|
||||
sum(table(reg_data$asthma, reg_data$age_bins)) == nrow(reg_data)
|
||||
sum(table(clinical_df$asthma, clinical_df$age_bins)) == nrow(clinical_df)
|
||||
|
||||
##### O2 saturation binning
|
||||
reg_data$o2_sat_admis = round(reg_data$o2_sat_admis, digits = 0)
|
||||
table(reg_data$o2_sat_admis)
|
||||
tot_o2 = sum(table(reg_data$o2_sat_admis))- table(reg_data$o2_sat_admis)[["-1"]]
|
||||
#===========================
|
||||
# O2 saturation binning
|
||||
#===========================
|
||||
# Round the admission O2 saturation to whole numbers and show its distribution.
clinical_df$o2_sat_admis = round(clinical_df$o2_sat_admis, digits = 0)
table(clinical_df$o2_sat_admis)

# Number of readings expected to land in a bin: non-missing values other than
# the -1 code. Counted directly instead of via table(...)[["-1"]], which stops
# with "subscript out of bounds" when no -1 value is present in the column.
tot_o2 = sum(!is.na(clinical_df$o2_sat_admis) & clinical_df$o2_sat_admis != -1)
tot_o2
|
||||
|
||||
o2_sat_bin = cut(reg_data$o2_sat_admis, c(0,92,100))
|
||||
reg_data$o2_sat_bin = o2_sat_bin
|
||||
table(reg_data$o2_sat_bin)
|
||||
o2_sat_bin = cut(clinical_df$o2_sat_admis, c(0,92,100))
|
||||
clinical_df$o2_sat_bin = o2_sat_bin
|
||||
table(clinical_df$o2_sat_bin)
|
||||
|
||||
sum(table(reg_data$o2_sat_bin)) == tot_o2
|
||||
sum(table(clinical_df$o2_sat_bin)) == tot_o2
|
||||
|
||||
##### Onset to initial binning = "(==not inclusive)
|
||||
max_in = max(reg_data$onset_2_initial); max_in #23
|
||||
min_in = min(reg_data$onset_2_initial) - 1 ; min_in # -6
|
||||
#===========================
|
||||
# Onset to initial binning
|
||||
#===========================
|
||||
max_in = max(clinical_df$onset_2_initial); max_in #23
|
||||
min_in = min(clinical_df$onset_2_initial) - 1 ; min_in # -6
|
||||
|
||||
tot_onset2ini = sum(table(reg_data$onset_2_initial))
|
||||
tot_onset2ini = sum(table(clinical_df$onset_2_initial))
|
||||
tot_onset2ini
|
||||
|
||||
onset_initial_bin = cut(reg_data$onset_2_initial, c(min_in, 4, max_in))
|
||||
reg_data$onset_initial_bin = onset_initial_bin
|
||||
onset_initial_bin = cut(clinical_df$onset_2_initial, c(min_in, 4, max_in))
|
||||
clinical_df$onset_initial_bin = onset_initial_bin
|
||||
|
||||
sum(table(reg_data$onset_initial_bin)) == tot_onset2ini
|
||||
sum(table(clinical_df$onset_initial_bin)) == tot_onset2ini
|
||||
|
||||
#=======================
|
||||
# seasonal flu: sfluv
|
||||
#=======================
|
||||
# should be a factor
|
||||
if (! is.factor(reg_data$sfluv)){
|
||||
reg_data$sfluv = as.factor(reg_data$sfluv)
|
||||
if (! is.factor(clinical_df$sfluv)){
|
||||
clinical_df$sfluv = as.factor(clinical_df$sfluv)
|
||||
}
|
||||
class(reg_data$sfluv) #[1] "factor"
|
||||
class(clinical_df$sfluv) #[1] "factor"
|
||||
|
||||
levels(reg_data$sfluv)
|
||||
table(reg_data$asthma, reg_data$sfluv)
|
||||
levels(clinical_df$sfluv)
|
||||
table(clinical_df$asthma, clinical_df$sfluv)
|
||||
# reassign
|
||||
levels(reg_data$sfluv) <- c("0", "0", "1")
|
||||
table(reg_data$asthma, reg_data$sfluv)
|
||||
levels(clinical_df$sfluv) <- c("0", "0", "1")
|
||||
table(clinical_df$asthma, clinical_df$sfluv)
|
||||
|
||||
#=======================
|
||||
# h1n1v
|
||||
#=======================
|
||||
# should be a factor
|
||||
if (! is.factor(reg_data$h1n1v)){
|
||||
reg_data$h1n1v = as.factor(reg_data$h1n1v)
|
||||
if (! is.factor(clinical_df$h1n1v)){
|
||||
clinical_df$h1n1v = as.factor(clinical_df$h1n1v)
|
||||
}
|
||||
class(reg_data$h1n1v) #[1] "factor"
|
||||
class(clinical_df$h1n1v) #[1] "factor"
|
||||
|
||||
levels(reg_data$h1n1v)
|
||||
table(reg_data$asthma, reg_data$h1n1v)
|
||||
levels(clinical_df$h1n1v)
|
||||
table(clinical_df$asthma, clinical_df$h1n1v)
|
||||
# reassign
|
||||
levels(reg_data$h1n1v) <- c("0", "0", "1")
|
||||
table(reg_data$asthma, reg_data$h1n1v)
|
||||
levels(clinical_df$h1n1v) <- c("0", "0", "1")
|
||||
table(clinical_df$asthma, clinical_df$h1n1v)
|
||||
|
||||
#=======================
|
||||
# ethnicity
|
||||
#=======================
|
||||
class(reg_data$ethnicity) # integer
|
||||
table(reg_data$asthma, reg_data$ethnicity)
|
||||
class(clinical_df$ethnicity) # integer
|
||||
table(clinical_df$asthma, clinical_df$ethnicity)
|
||||
|
||||
reg_data$ethnicity[reg_data$ethnicity == 4] <- 2
|
||||
table(reg_data$asthma, reg_data$ethnicity)
|
||||
clinical_df$ethnicity[clinical_df$ethnicity == 4] <- 2
|
||||
table(clinical_df$asthma, clinical_df$ethnicity)
|
||||
|
||||
#=======================
|
||||
# pneumonia
|
||||
#=======================
|
||||
class(reg_data$ia_cxr) # integer
|
||||
class(clinical_df$ia_cxr) # integer
|
||||
# ia_cxr 2 ---> yes pneumonia (1)
|
||||
# 1 ---> no (0)
|
||||
# ! 1 or 2 -- > "unknown"
|
||||
|
@ -216,27 +227,27 @@ class(reg_data$ia_cxr) # integer
|
|||
#-3: unknown specified by clinician
|
||||
|
||||
|
||||
table(reg_data$ia_cxr)
|
||||
table(clinical_df$ia_cxr)
|
||||
#-3 -1 0 1 2 3
|
||||
#5 48 13 47 17 3
|
||||
|
||||
# change these first else recoding 0 will be a problem as 0 already exists, mind you -2 categ doesn't exist
|
||||
reg_data$ia_cxr[reg_data$ia_cxr == -3 | reg_data$ia_cxr == -1 | reg_data$ia_cxr == 0 | reg_data$ia_cxr == 3 ] <- ""
|
||||
table(reg_data$ia_cxr)
|
||||
# Blank out the chest X-ray codes -3, -1, 0 and 3 before 1 and 2 are recoded.
# NOTE(review): assigning "" into a numeric column coerces the whole column to
# character, so the == 1 / == 2 recodes that follow compare against strings;
# confirm a blank string (rather than NA) is the intended "unknown" marker.
clinical_df$ia_cxr[clinical_df$ia_cxr == -3 | clinical_df$ia_cxr == -1 | clinical_df$ia_cxr == 0 | clinical_df$ia_cxr == 3 ] <- ""
|
||||
table(clinical_df$ia_cxr)
|
||||
# 1 2
|
||||
#69 47 17
|
||||
|
||||
reg_data$ia_cxr[reg_data$ia_cxr == 1] <- 0
|
||||
reg_data$ia_cxr[reg_data$ia_cxr == 2] <- 1
|
||||
table(reg_data$ia_cxr)
|
||||
clinical_df$ia_cxr[clinical_df$ia_cxr == 1] <- 0
|
||||
clinical_df$ia_cxr[clinical_df$ia_cxr == 2] <- 1
|
||||
table(clinical_df$ia_cxr)
|
||||
# 0 1
|
||||
#69 47 17
|
||||
|
||||
#=======================
|
||||
# smoking [tricky one]
|
||||
#=======================
|
||||
class(reg_data$smoking) # integer
|
||||
table(reg_data$asthma, reg_data$smoking)
|
||||
class(clinical_df$smoking) # integer
|
||||
table(clinical_df$asthma, clinical_df$smoking)
|
||||
|
||||
# orig
|
||||
# -3 -1 1 2 3 4
|
||||
|
@ -261,20 +272,20 @@ table(reg_data$asthma, reg_data$smoking)
|
|||
#-2: n/a specified by the clinician =====> categ blank (NA)
|
||||
#-3: unknown specified by clinician=====> categ blank (NA)
|
||||
|
||||
table(reg_data$smoking)
|
||||
table(clinical_df$smoking)
|
||||
#-3 -1 1 2 3 4
|
||||
#19 11 35 2 19 47
|
||||
|
||||
# reassign the smoking codes
|
||||
reg_data$smoking[reg_data$smoking == 4 | reg_data$smoking == 2 ] <- 0
|
||||
reg_data$smoking[reg_data$smoking == 1 | reg_data$smoking == 3 ] <- 1
|
||||
reg_data$smoking[reg_data$smoking == -1 | reg_data$smoking == -2 | reg_data$smoking == -3 ] <- ""
|
||||
clinical_df$smoking[clinical_df$smoking == 4 | clinical_df$smoking == 2 ] <- 0
|
||||
clinical_df$smoking[clinical_df$smoking == 1 | clinical_df$smoking == 3 ] <- 1
|
||||
clinical_df$smoking[clinical_df$smoking == -1 | clinical_df$smoking == -2 | clinical_df$smoking == -3 ] <- ""
|
||||
|
||||
table(reg_data$smoking)
|
||||
table(clinical_df$smoking)
|
||||
# 0 1
|
||||
#30 49 54
|
||||
|
||||
table(reg_data$asthma, reg_data$smoking)
|
||||
table(clinical_df$asthma, clinical_df$smoking)
|
||||
|
||||
# orig
|
||||
# 0 1
|
||||
|
@ -289,12 +300,12 @@ table(reg_data$asthma, reg_data$smoking)
|
|||
#==================
|
||||
# writing output file
|
||||
#==================
|
||||
outfile_name_reg = "reg_data_recoded_with_NA.csv"
|
||||
outfile_name_reg = "clinical_df_recoded.csv"
|
||||
outfile_reg = paste0(outdir, outfile_name_reg)
|
||||
|
||||
cat("Writing clinical file for regression:", outfile_reg)
|
||||
cat("\nWriting clinical file for regression:", outfile_reg)
|
||||
|
||||
#write.csv(reg_data, file = outfile_reg)
|
||||
#write.csv(clinical_df, file = outfile_reg)
|
||||
################################################################
|
||||
|
||||
rm(age_bins, max_age_interval, max_in, min_in, o2_sat_bin, onset_initial_bin, tot_o2, tot_onset2ini, meta_data_cols)
|
||||
|
|
|
@ -50,9 +50,9 @@ metadata_all = all_df[, meta_data_cols]
|
|||
adult_df = all_df[all_df$age>=18,]
|
||||
|
||||
# Sanity check on the adult subset: every row must carry adult == 1.
# Matching rows are counted with sum() rather than table(...)[[1]]: the first
# table cell is the FALSE count whenever any row fails the test, so a subset in
# which no row is flagged adult would pass, and an empty table would stop with
# "subscript out of bounds". An empty subset is reported as a failure.
if (nrow(adult_df) > 0 && sum(adult_df$adult == 1, na.rm = TRUE) == nrow(adult_df)) {
  cat ("\nPASS: adult df extracted successfully")
} else {
  cat ("\nFAIL: adult df number mismatch!")
}
|
||||
|
||||
#==============
|
||||
|
@ -62,9 +62,9 @@ if (table(adult_df$adult == 1)[[1]] == nrow(adult_df) ){
|
|||
fp_adults = adult_df[adult_df$flustat == 1,]
|
||||
|
||||
# Sanity check on the flu positive subset: every row must carry flustat == 1.
# Matching rows are counted with sum() rather than table(...)[[1]]: the first
# table cell is the FALSE count whenever any row fails the test, so a subset
# with no flu positive rows would pass, and an empty table would stop with
# "subscript out of bounds". Rows with a missing flustat count as mismatches,
# and an empty subset is reported as a failure.
# The messages name the flu positive subset; they previously repeated the
# "adult df" wording of the adult_df check.
if (nrow(fp_adults) > 0 && sum(fp_adults$flustat == 1, na.rm = TRUE) == nrow(fp_adults)) {
  cat ("\nPASS: flu positive adult df extracted successfully")
} else {
  cat ("\nFAIL: flu positive adult df number mismatch!")
}
|
||||
|
||||
#============
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue