performed LR analysis and tidied up clinical formatting code

This commit is contained in:
Tanushree Tunstall 2020-11-24 18:46:47 +00:00
parent 08e01abfb5
commit f0c0fd72d1
5 changed files with 296 additions and 301 deletions

View file

@ -28,34 +28,27 @@ clinical_ics = read.csv(infile_ics)
str(clinical_ics)
########################################################################
# quick sanity checks
table(adult_df$ia_exac_copd==1 & adult_df$asthma == 1) # check this is 4
table(fp_adults$ia_exac_copd==1 & fp_adults$asthma == 1) # check this is 3
# clear unnecessary variables
rm(all_df, adult_df, metadata_all)
table(fp_adults$ia_exac_copd==1 & fp_adults$asthma == 1)
########################################################################
# Clinical_data extraction
########################################################################
cat("\nExtracting:", length(clinical_cols), "cols from fp_adults")
#cat("\nExtracting:", length(clinical_cols), "cols from fp_adults")
clinical_df = fp_adults[, clinical_cols]
#clinical_df = fp_adults[, clinical_cols]
# sanity checks
if ( sum(table(clinical_df$obesity)) & sum(table(clinical_df$age>=18)) & sum(table(clinical_df$death)) & sum(table(clinical_df$asthma)) == nrow(clinical_df) ){
cat("\nPASS: binary data obs are complete, n =", nrow(clinical_df))
}else{
cat("\nFAIL: Incomplete data for binary outcomes. Please check and decide!")
quit()
}
#if ( sum(table(clinical_df$obesity)) & sum(table(clinical_df$age>=18)) & sum(table(clinical_df$death)) & sum(table(clinical_df$asthma)) == nrow(clinical_df) ){
# cat("\nPASS: binary data obs are complete, n =", nrow(clinical_df))
#}else{
# cat("\nFAIL: Incomplete data for binary outcomes. Please check and decide!")
# quit()
#}
table(clinical_df$ia_exac_copd)
#table(clinical_df$ia_exac_copd)
str(clinical_df)
#str(clinical_df)
#clinical_df$o2_sat_suppl
########################################################################
#==================================
# Check asthma and copd conflict
@ -80,42 +73,41 @@ if ( table(fp_adults$ia_exac_copd, fp_adults$asthma) [[2,2]] == 0){
foo<- subset(fp_adults, asthma==1 & ia_exac_copd ==1) # check that its 0
rm(check_copd_and_asthma_1, foo)
cat("Check status again...")
}
#=====================================================================
#=================================
# resp scores: In, max and t1 & t2
#=================================
# count the resp scores
max_resp_score_table<- table(clinical_df$max_resp_score)
max_resp_score_table<- table(fp_adults$max_resp_score)
max_resp_score_table
T1_resp_score_table<- table(clinical_df$T1_resp_score)
T1_resp_score_table<- table(fp_adults$T1_resp_score)
T1_resp_score_table
T2_resp_score_table<- table(clinical_df$T2_resp_score)
T2_resp_score_table<- table(fp_adults$T2_resp_score)
T2_resp_score_table
Inresp_sev<- table(clinical_df$inresp_sev)
Inresp_sev<- table(fp_adults$inresp_sev)
Inresp_sev
# Reassign the resp scores so that every 4 is replaced by 3
clinical_df$max_resp_score[clinical_df$max_resp_score == 4 ] <- 3
revised_resp_score_table<- table(clinical_df$max_resp_score)
fp_adults$max_resp_score[fp_adults$max_resp_score == 4 ] <- 3
revised_resp_score_table<- table(fp_adults$max_resp_score)
revised_resp_score_table
clinical_df$T1_resp_score[clinical_df$T1_resp_score ==4 ] <- 3
revised_T1_resp_score_table<- table(clinical_df$T1_resp_score)
fp_adults$T1_resp_score[fp_adults$T1_resp_score ==4 ] <- 3
revised_T1_resp_score_table<- table(fp_adults$T1_resp_score)
revised_T1_resp_score_table
clinical_df$T2_resp_score[clinical_df$T2_resp_score == 4]<- 3
revised_T2_resp_score_table<- table(clinical_df$T2_resp_score)
fp_adults$T2_resp_score[fp_adults$T2_resp_score == 4]<- 3
revised_T2_resp_score_table<- table(fp_adults$T2_resp_score)
revised_T2_resp_score_table
clinical_df$inresp_sev[clinical_df$inresp_sev == 4]<- 3
revised_Inresp_sev<- table(clinical_df$inresp_sev)
fp_adults$inresp_sev[fp_adults$inresp_sev == 4]<- 3
revised_Inresp_sev<- table(fp_adults$inresp_sev)
revised_Inresp_sev
#=====================================================================
# Remove these after checking
@ -130,32 +122,32 @@ rm(max_resp_score_table, T1_resp_score_table, T2_resp_score_table, Inresp_sev
# age
#========
# Create categories of variables
clinical_df$age_int = round(clinical_df$age, digits = 0)
table(clinical_df$age_int)
table(clinical_df$asthma, clinical_df$age_int)
min(clinical_df$age_int); max(clinical_df$age_int)
fp_adults$age_int = round(fp_adults$age, digits = 0)
table(fp_adults$age_int)
table(fp_adults$asthma, fp_adults$age_int)
min(fp_adults$age_int); max(fp_adults$age_int)
max_age_interval = round_any(max(clinical_df$age_int), 10, f = ceiling)
max_age_interval = round_any(max(fp_adults$age_int), 10, f = ceiling)
max_age_interval
min_age = min(clinical_df$age_int); min_age #19
min_age = min(fp_adults$age_int); min_age #19
min_age_interval = min_age - 1; min_age_interval
#age_bins = cut(clinical_df$age_int, c(0,18,30,40,50,60,70,80,90))
age_bins = cut(clinical_df$age_int, c(min_age_interval, 30, 40, 50, 60, 70, max_age_interval))
clinical_df$age_bins = age_bins
dim(clinical_df) # 133 28
#age_bins = cut(fp_adults$age_int, c(0,18,30,40,50,60,70,80,90))
age_bins = cut(fp_adults$age_int, c(min_age_interval, 30, 40, 50, 60, 70, max_age_interval))
fp_adults$age_bins = age_bins
dim(fp_adults) # 133 28
# age_bins (to keep consistent with the results table)
class(clinical_df$age_bins)
levels(clinical_df$age_bins)
class(fp_adults$age_bins)
levels(fp_adults$age_bins)
#"(18,30]" "(30,40]" "(40,50]" "(50,60]" "(60,70]" "(70,80]"
table(clinical_df$asthma, clinical_df$age_bins)
table(fp_adults$asthma, fp_adults$age_bins)
# (18,30] (30,40] (40,50] (50,60] (60,70] (70,80]
#0 25 17 25 14 11 1
#1 11 8 12 5 2 2
if (sum(table(clinical_df$asthma, clinical_df$age_bins)) == nrow(clinical_df) ){
if (sum(table(fp_adults$asthma, fp_adults$age_bins)) == nrow(fp_adults) ){
cat("\nPASS: age_bins assigned successfully")
}else{
cat("\nFAIL: no. mismatch when assigning age_bins")
@ -163,37 +155,37 @@ if (sum(table(clinical_df$asthma, clinical_df$age_bins)) == nrow(clinical_df) ){
}
# reassign levels
class(clinical_df$age_bins)
levels(clinical_df$age_bins) <- c("(18,30]","(30,40]","(40,50]","(50,80]","(50,80]","(50,80]")
table(clinical_df$asthma, clinical_df$age_bins)
table(clinical_df$asthma, clinical_df$age_bins)
class(fp_adults$age_bins)
levels(fp_adults$age_bins) <- c("(18,30]","(30,40]","(40,50]","(50,80]","(50,80]","(50,80]")
table(fp_adults$asthma, fp_adults$age_bins)
table(fp_adults$asthma, fp_adults$age_bins)
# (18,30] (30,40] (40,50] (50,80]
#0 25 17 25 26
#1 11 8 12 9
sum(table(clinical_df$asthma, clinical_df$age_bins)) == nrow(clinical_df)
table(clinical_df$age_int)
clinical_df = subset(clinical_df, select = -c(age_int))
table(clinical_df$age_int)
sum(table(fp_adults$asthma, fp_adults$age_bins)) == nrow(fp_adults)
table(fp_adults$age_int)
fp_adults = subset(fp_adults, select = -c(age_int))
table(fp_adults$age_int)
class(clinical_df$age_bins)
clinical_df$age_bins
class(fp_adults$age_bins)
fp_adults$age_bins
#===========================
# O2 saturation binning
#===========================
clinical_df$o2_sat_admis
n1 = sum(is.na(clinical_df$o2_sat_admis))
fp_adults$o2_sat_admis
n1 = sum(is.na(fp_adults$o2_sat_admis))
clinical_df$o2_sat_admis = round(clinical_df$o2_sat_admis, digits = 0)
table(clinical_df$o2_sat_admis)
tot_o2 = sum(table(clinical_df$o2_sat_admis))- table(clinical_df$o2_sat_admis)[["-1"]]
fp_adults$o2_sat_admis = round(fp_adults$o2_sat_admis, digits = 0)
table(fp_adults$o2_sat_admis)
tot_o2 = sum(table(fp_adults$o2_sat_admis))- table(fp_adults$o2_sat_admis)[["-1"]]
tot_o2
n_text_code = table(clinical_df$o2_sat_admis)[["-1"]]
n_text_code = table(fp_adults$o2_sat_admis)[["-1"]]
clinical_df$o2_sat_admis[clinical_df$o2_sat_admis <0] <- NA
n2 = sum(is.na(clinical_df$o2_sat_admis))
fp_adults$o2_sat_admis[fp_adults$o2_sat_admis <0] <- NA
n2 = sum(is.na(fp_adults$o2_sat_admis))
if (n2 == n1 + n_text_code) {
cat ("PASS: -1 code converted to NA")
@ -201,75 +193,75 @@ if (n2 == n1 + n_text_code) {
cat("FAIL: something went wrong!")
}
o2_sat_bin = cut(clinical_df$o2_sat_admis, c(0,92,100))
clinical_df$o2_sat_bin = o2_sat_bin
table(clinical_df$o2_sat_bin)
o2_sat_bin = cut(fp_adults$o2_sat_admis, c(0,92,100))
fp_adults$o2_sat_bin = o2_sat_bin
table(fp_adults$o2_sat_bin)
sum(table(clinical_df$o2_sat_bin)) == tot_o2
sum(table(fp_adults$o2_sat_bin)) == tot_o2
#===========================
# Onset to initial binning
#===========================
clinical_df$onset_2_initial
fp_adults$onset_2_initial
max_in = max(clinical_df$onset_2_initial); max_in #23
min_in = min(clinical_df$onset_2_initial) - 1 ; min_in # -6
max_in = max(fp_adults$onset_2_initial); max_in #23
min_in = min(fp_adults$onset_2_initial) - 1 ; min_in # -6
tot_onset2ini = sum(table(clinical_df$onset_2_initial))
tot_onset2ini = sum(table(fp_adults$onset_2_initial))
tot_onset2ini
onset_initial_bin = cut(clinical_df$onset_2_initial, c(min_in, 4, max_in))
clinical_df$onset_initial_bin = onset_initial_bin
sum(table(clinical_df$onset_initial_bin)) == tot_onset2ini
onset_initial_bin = cut(fp_adults$onset_2_initial, c(min_in, 4, max_in))
fp_adults$onset_initial_bin = onset_initial_bin
sum(table(fp_adults$onset_initial_bin)) == tot_onset2ini
#=======================
# seasonal flu: sfluv
#=======================
# reassign as 0 and 1
table(clinical_df$sfluv)
table(clinical_df$asthma, clinical_df$sfluv)
clinical_df$sfluv = ifelse(clinical_df$sfluv == "yes", 1, 0)
table(clinical_df$sfluv)
table(clinical_df$asthma, clinical_df$sfluv)
table(fp_adults$sfluv)
table(fp_adults$asthma, fp_adults$sfluv)
fp_adults$sfluv = ifelse(fp_adults$sfluv == "yes", 1, 0)
table(fp_adults$sfluv)
table(fp_adults$asthma, fp_adults$sfluv)
# convert to integer
str(clinical_df$sfluv)
clinical_df$sfluv = as.integer(clinical_df$sfluv)
str(clinical_df$sfluv)
str(fp_adults$sfluv)
fp_adults$sfluv = as.integer(fp_adults$sfluv)
str(fp_adults$sfluv)
#=======================
# h1n1v
#=======================
# reassign as 0 and 1
table(clinical_df$h1n1v)
table(clinical_df$asthma, clinical_df$h1n1v)
clinical_df$h1n1v = ifelse(clinical_df$h1n1v == "yes", 1, 0)
table(clinical_df$h1n1v)
table(clinical_df$asthma, clinical_df$h1n1v)
table(fp_adults$h1n1v)
table(fp_adults$asthma, fp_adults$h1n1v)
fp_adults$h1n1v = ifelse(fp_adults$h1n1v == "yes", 1, 0)
table(fp_adults$h1n1v)
table(fp_adults$asthma, fp_adults$h1n1v)
# convert to integer
str(clinical_df$h1n1v)
clinical_df$h1n1v = as.integer(clinical_df$h1n1v)
str(clinical_df$h1n1v)
str(fp_adults$h1n1v)
fp_adults$h1n1v = as.integer(fp_adults$h1n1v)
str(fp_adults$h1n1v)
#=======================
# ethnicity
#=======================
class(clinical_df$ethnicity) # integer
table(clinical_df$ethnicity)
table(clinical_df$asthma, clinical_df$ethnicity)
class(fp_adults$ethnicity) # integer
table(fp_adults$ethnicity)
table(fp_adults$asthma, fp_adults$ethnicity)
clinical_df$ethnicity[clinical_df$ethnicity == 4] <- 2
table(clinical_df$ethnicity)
table(clinical_df$asthma, clinical_df$ethnicity)
fp_adults$ethnicity[fp_adults$ethnicity == 4] <- 2
table(fp_adults$ethnicity)
table(fp_adults$asthma, fp_adults$ethnicity)
#=======================
# pneumonia
#=======================
table(clinical_df$ia_cxr)
class(clinical_df$ia_cxr) # integer
table(fp_adults$ia_cxr)
class(fp_adults$ia_cxr) # integer
# ia_cxr 2 ---> yes pneumonia (1)
# 1 ---> no (0)
# ! 1 or 2 -- > "unknown"
@ -283,29 +275,29 @@ class(clinical_df$ia_cxr) # integer
#-2: n/a specified by the clinician # not in the data...
#-3: unknown specified by clinician
table(clinical_df$ia_cxr)
table(fp_adults$ia_cxr)
#-3 -1 0 1 2 3
#5 48 13 47 17 3
# change these first else recoding 0 will be a problem as 0 already exists, mind you -2 categ doesn't exist
clinical_df$ia_cxr[clinical_df$ia_cxr == -3 | clinical_df$ia_cxr == -1 | clinical_df$ia_cxr == 0 | clinical_df$ia_cxr == 3 ] <- NA
table(clinical_df$ia_cxr)
fp_adults$ia_cxr[fp_adults$ia_cxr == -3 | fp_adults$ia_cxr == -1 | fp_adults$ia_cxr == 0 | fp_adults$ia_cxr == 3 ] <- NA
table(fp_adults$ia_cxr)
# 1 2
#69 47 17
sum(is.na(clinical_df$ia_cxr))
sum(is.na(fp_adults$ia_cxr))
clinical_df$ia_cxr[clinical_df$ia_cxr == 1] <- 0
clinical_df$ia_cxr[clinical_df$ia_cxr == 2] <- 1
table(clinical_df$ia_cxr)
fp_adults$ia_cxr[fp_adults$ia_cxr == 1] <- 0
fp_adults$ia_cxr[fp_adults$ia_cxr == 2] <- 1
table(fp_adults$ia_cxr)
# 0 1
#69 47 17
#=======================
# smoking [tricky one]
#=======================
class(clinical_df$smoking) # integer
table(clinical_df$asthma, clinical_df$smoking)
class(fp_adults$smoking) # integer
table(fp_adults$asthma, fp_adults$smoking)
# orig
# -3 -1 1 2 3 4
@ -330,20 +322,20 @@ table(clinical_df$asthma, clinical_df$smoking)
#-2: n/a specified by the clinician =====> categ blank (NA)
#-3: unknown specified by clinician=====> categ blank (NA)
table(clinical_df$smoking)
table(fp_adults$smoking)
#-3 -1 1 2 3 4
#19 11 35 2 19 47
# reassign the smoking codes
clinical_df$smoking[clinical_df$smoking == 4 | clinical_df$smoking == 2 ] <- 0
clinical_df$smoking[clinical_df$smoking == 1 | clinical_df$smoking == 3 ] <- 1
clinical_df$smoking[clinical_df$smoking == -1 | clinical_df$smoking == -2 | clinical_df$smoking == -3 ] <- NA
fp_adults$smoking[fp_adults$smoking == 4 | fp_adults$smoking == 2 ] <- 0
fp_adults$smoking[fp_adults$smoking == 1 | fp_adults$smoking == 3 ] <- 1
fp_adults$smoking[fp_adults$smoking == -1 | fp_adults$smoking == -2 | fp_adults$smoking == -3 ] <- NA
table(clinical_df$smoking); sum(is.na(clinical_df$smoking))
table(fp_adults$smoking); sum(is.na(fp_adults$smoking))
# 0 1
#30 49 54
table(clinical_df$asthma, clinical_df$smoking)
table(fp_adults$asthma, fp_adults$smoking)
# orig
# 0 1
@ -352,24 +344,24 @@ table(clinical_df$asthma, clinical_df$smoking)
################################################################
#=========================
# Merge: clinical_df and infile ics
# Merge: fp_adults and infile ics
#=========================
merging_cols = intersect( names(clinical_df), names(clinical_ics) )
merging_cols = intersect( names(fp_adults), names(clinical_ics) )
merging_cols
clinical_df_ics = merge(clinical_df, clinical_ics, by = merging_cols, all = T); clinical_df_ics
fp_adults_ics = merge(fp_adults, clinical_ics, by = merging_cols, all = T); fp_adults_ics
colnames(clinical_df_ics)
colnames(fp_adults_ics)
if (nrow(clinical_df_ics) == nrow(clinical_df) & nrow(clinical_ics)){
cat("\nPASS: No. of rows match, nrow =", nrow(clinical_df_ics)
if (nrow(fp_adults_ics) == nrow(fp_adults) & nrow(clinical_ics)){
cat("\nPASS: No. of rows match, nrow =", nrow(fp_adults_ics)
, "\nChecking ncols...")
if ( ncol(clinical_df_ics) == ncol(clinical_df) + ncol(clinical_ics) - length(merging_cols) ){
cat("\nPASS: No. of cols match, ncol =", ncol(clinical_df_ics))
if ( ncol(fp_adults_ics) == ncol(fp_adults) + ncol(clinical_ics) - length(merging_cols) ){
cat("\nPASS: No. of cols match, ncol =", ncol(fp_adults_ics))
} else {
cat("\nFAIL: ncols mismatch"
, "Expected ncols:", ncol(clinical_df) + ncol(clinical_ics) - length(merging_cols)
, "\nGot:", ncol(clinical_df_ics))
, "Expected ncols:", ncol(fp_adults) + ncol(clinical_ics) - length(merging_cols)
, "\nGot:", ncol(fp_adults_ics))
}
} else {
cat("\nFAIL: nrows mismatch"
@ -379,49 +371,54 @@ if (nrow(clinical_df_ics) == nrow(clinical_df) & nrow(clinical_ics)){
#=========================
# add binary outcome for T1 resp score
#=========================
table(clinical_df_ics$T1_resp_score)
table(fp_adults_ics$T1_resp_score)
clinical_df_ics$t1_resp_recoded = ifelse(clinical_df_ics$T1_resp_score <3, 0, 1)
table(clinical_df_ics$t1_resp_recoded)
#table(clinical_df_ics$steroid)
table(clinical_df_ics$steroid_ics)
fp_adults_ics$t1_resp_recoded = ifelse(fp_adults_ics$T1_resp_score <3, 0, 1)
table(fp_adults_ics$t1_resp_recoded)
#table(fp_adults_ics$steroid)
table(fp_adults_ics$steroid_ics)
#=========================
# change the factor vars to integers
#=========================
#str(clinical_df_ics)
#factor_vars = lapply(clinical_df_ics, class) == "factor"
#str(fp_adults_ics)
#factor_vars = lapply(fp_adults_ics, class) == "factor"
#table(factor_vars)
#clinical_df_ics[, factor_vars] <- lapply(clinical_df_ics[, factor_vars], as.integer)
#fp_adults_ics[, factor_vars] <- lapply(fp_adults_ics[, factor_vars], as.integer)
#table(factor_vars)
#str(clinical_df_ics)
#str(fp_adults_ics)
#=========================
# remove cols
#=========================
clinical_df_ics = subset(clinical_df_ics, select = -c(onset_2_initial))
fp_adults_ics = subset(fp_adults_ics, select = -c(onset_2_initial))
#======================
# writing output file
#======================
outfile_name_reg = "clinical_df_recoded.csv"
outfile_name_reg = "fp_adults_recoded.csv"
outfile_reg = paste0(outdir, outfile_name_reg)
cat("\nWriting clinical file for regression:", outfile_reg)
#write.csv(clinical_df_ics, file = outfile_reg)
#write.csv(fp_adults_ics, file = outfile_reg)
#=========================
# clinical_df_ics: without asthma
# fp_adults_ics: without asthma
#=========================
clinical_df_ics_na = clinical_df_ics[clinical_df_ics$asthma == 0,]
fp_adults_ics_na = fp_adults_ics[fp_adults_ics$asthma == 0,]
#=========================
# clinical_df only
#=========================
clinical_df_ics = fp_adults[, clinical_cols]
################################################################
rm(age_bins, max_age_interval, max_in, min_in
, o2_sat_bin, onset_initial_bin, tot_o2
, n_text_code, n1, n2, tot_onset2ini, infile_ics
, tot_onset2ini, meta_data_cols
, clinical_df, clinical_ics)
, fp_adults, clinical_ics)
################################################################