reformatting code to select needed df for analysis

This commit is contained in:
Tanushree Tunstall 2020-11-20 11:43:03 +00:00
parent a6cbaab40a
commit b72c4df796
7 changed files with 243 additions and 102 deletions

View file

@ -53,31 +53,36 @@ if ( sum(table(clinical_df$obesity)) & sum(table(clinical_df$age>=18)) & sum(tab
# Quick look at the COPD-exacerbation flag and the overall structure
table(clinical_df$ia_exac_copd)
str(clinical_df)
#clinical_df$o2_sat_suppl
########################################################################
#==================================
# asthma and copd status correction
# for conflicting field!
# Check asthma and copd conflict
#=================================
# The test reads one cell (row 2, column 2) of the copd x asthma cross-tab.
# NOTE(review): [[2,2]] is positional. The else branch recodes values < 1,
# so if ia_exac_copd still holds such codes here, row 2 is not guaranteed
# to be the ia_exac_copd == 1 row -- confirm against the data.
if ( table(fp_adults$ia_exac_copd, fp_adults$asthma) [[2,2]] == 0){
cat("PASS: asthma and copd do not conflict")
}else{
cat("Conflict detected in asthm and copd filed, attempting to resolve...")
# Reassign the copd and asthma status and do some checks
table(fp_adults$ia_exac_copd); sum(is.na(fp_adults$ia_exac_copd))
# Anything below 1, and any NA, is recoded to 0 so the flag is strictly 0/1
fp_adults$ia_exac_copd[fp_adults$ia_exac_copd< 1]<- 0
fp_adults$ia_exac_copd[is.na(fp_adults$ia_exac_copd)] <- 0
table(fp_adults$ia_exac_copd); sum(is.na(fp_adults$ia_exac_copd))
# check copd and asthma status
table(fp_adults$ia_exac_copd, fp_adults$asthma)
check_copd_and_asthma_1<- subset(fp_adults, ia_exac_copd ==1 & asthma == 1) # check this is 3
# reassign these 3 so these are treated as non-asthmatics as copd with asthma is NOT TRUE asthma
fp_adults$asthma[fp_adults$ia_exac_copd == 1 & fp_adults$asthma == 1]= 0
table(fp_adults$ia_exac_copd, fp_adults$asthma)
foo<- subset(fp_adults, asthma==1 & ia_exac_copd ==1) # check that it is 0
# Drop the temporary inspection subsets
rm(check_copd_and_asthma_1, foo)
cat("Check status again...")
# Reassign the copd and asthma status and do some checks
table(clinical_df$ia_exac_copd); sum(is.na(clinical_df$ia_exac_copd))
}
# Same correction applied to clinical_df.
# NOTE(review): this listing shows the fp_adults and clinical_df variants of
# the recode side by side; confirm which data frame the script is meant to fix.
clinical_df$ia_exac_copd[clinical_df$ia_exac_copd< 1]<- 0
clinical_df$ia_exac_copd[is.na(clinical_df$ia_exac_copd)] <- 0
table(clinical_df$ia_exac_copd); sum(is.na(clinical_df$ia_exac_copd))
# check copd and asthma status
table(clinical_df$ia_exac_copd, clinical_df$asthma)
check_copd_and_asthma_1<- subset(clinical_df, ia_exac_copd ==1 & asthma == 1) # check this is 3
# reassign these 3 so these are treated as non-asthmatics as copd with asthma is NOT TRUE asthma
clinical_df$asthma[clinical_df$ia_exac_copd == 1 & clinical_df$asthma == 1]= 0
table(clinical_df$ia_exac_copd, clinical_df$asthma)
foo<- subset(clinical_df, asthma==1 & ia_exac_copd ==1) # check that it is 0
# Drop the temporary inspection subsets
rm(check_copd_and_asthma_1, foo)
#=====================================================================
#=================================
# resp scores: In, max and t1 & t2
@ -97,7 +102,7 @@ Inresp_sev<- table(clinical_df$inresp_sev)
Inresp_sev
# Reassign the resp score so that every 4 is replaced by 3.
# The two assignments below are the same statement with different spacing;
# once the first has run, the second has nothing left to change.
clinical_df$max_resp_score[clinical_df$max_resp_score ==4 ] <- 3
clinical_df$max_resp_score[clinical_df$max_resp_score == 4 ] <- 3
# Tabulate the recoded scores for inspection
revised_resp_score_table<- table(clinical_df$max_resp_score)
revised_resp_score_table
@ -125,29 +130,30 @@ rm(max_resp_score_table, T1_resp_score_table, T2_resp_score_table, Inresp_sev
# age
#========
# Create categories of variables
# Age is rounded to whole years and then cut into bins. Each step below
# appears twice: once operating on clinical_df$age directly (which overwrites
# the original column) and once on a separate clinical_df$age_int column.
clinical_df$age = round(clinical_df$age, digits = 0)
table(clinical_df$age)
table(clinical_df$asthma, clinical_df$age)
min(clinical_df$age); max(clinical_df$age)
clinical_df$age_int = round(clinical_df$age, digits = 0)
table(clinical_df$age_int)
table(clinical_df$asthma, clinical_df$age_int)
min(clinical_df$age_int); max(clinical_df$age_int)
# Upper break for the bins.
# NOTE(review): round_any() is not base R -- presumably plyr::round_any,
# rounding the maximum up to a multiple of 10; confirm the package is loaded.
max_age_interval = round_any(max(clinical_df$age), 10, f = ceiling)
max_age_interval = round_any(max(clinical_df$age_int), 10, f = ceiling)
max_age_interval
min_age = min(clinical_df$age); min_age #19
min_age = min(clinical_df$age_int); min_age #19
# Lower break is one below the minimum: cut() intervals are open on the left
# by default, so the youngest participant still lands in the first bin.
min_age_interval = min_age - 1; min_age_interval
#age_bins = cut(clinical_df$age, c(0,18,30,40,50,60,70,80,90))
age_bins = cut(clinical_df$age, c(min_age_interval, 30, 40, 50, 60, 70, max_age_interval))
#age_bins = cut(clinical_df$age_int, c(0,18,30,40,50,60,70,80,90))
age_bins = cut(clinical_df$age_int, c(min_age_interval, 30, 40, 50, 60, 70, max_age_interval))
# Attach the bins as a new column
clinical_df$age_bins = age_bins
dim(clinical_df) # 133 27
dim(clinical_df) # 133 28
# age_bins (to keep consistent with the results table)
class(clinical_df$age_bins)
levels(clinical_df$age_bins)
#"(18,30]" "(30,40]" "(40,50]" "(50,60]" "(60,70]" "(70,80]"
# Asthma status by age bin; the comment rows below record the counts seen
# when the script was written (two versions of the asthma == 1 row appear).
table(clinical_df$asthma, clinical_df$age_bins)
# (18,30] (30,40] (40,50] (50,60] (60,70] (70,80]
#0 25 17 25 14 11 1
#1 11 8 12 5 3 2
#1 11 8 12 5 2 2
if (sum(table(clinical_df$asthma, clinical_df$age_bins)) == nrow(clinical_df) ){
cat("\nPASS: age_bins assigned successfully")
@ -156,7 +162,7 @@ if (sum(table(clinical_df$asthma, clinical_df$age_bins)) == nrow(clinical_df) ){
quit()
}
# reassign
# reassign levels
# The replacement vector is positional: the first three bins keep their
# labels and the last three are all renamed "(50,80]", which merges them
# into a single level.
# NOTE(review): this assumes age_bins has exactly six levels in the order
# listed earlier in the script; confirm if the age range of the data changes.
class(clinical_df$age_bins)
levels(clinical_df$age_bins) <- c("(18,30]","(30,40]","(40,50]","(50,80]","(50,80]","(50,80]")
table(clinical_df$asthma, clinical_df$age_bins)
@ -170,11 +176,25 @@ sum(table(clinical_df$asthma, clinical_df$age_bins)) == nrow(clinical_df)
#===========================
# O2 saturation binning
#===========================
# Rounds the admission O2 saturation, converts the -1 placeholder code to NA,
# checks that the NA count grew by exactly the number of -1 codes, and bins
# the remaining values into (0,92] and (92,100].
clinical_df$o2_sat_admis

# NAs present before any recoding
n1 <- sum(is.na(clinical_df$o2_sat_admis))

clinical_df$o2_sat_admis <- round(clinical_df$o2_sat_admis, digits = 0)
table(clinical_df$o2_sat_admis)

# Count the -1 code directly. Indexing table(...)[["-1"]] stops with
# "subscript out of bounds" whenever the data holds no -1 at all.
n_text_code <- sum(clinical_df$o2_sat_admis == -1, na.rm = TRUE)

# Number of usable (non-missing, non-placeholder) measurements
tot_o2 <- sum(table(clinical_df$o2_sat_admis)) - n_text_code
tot_o2

# Every negative value is treated as missing
clinical_df$o2_sat_admis[clinical_df$o2_sat_admis < 0] <- NA
n2 <- sum(is.na(clinical_df$o2_sat_admis))

if (n2 == n1 + n_text_code) {
  cat("PASS: -1 code converted to NA")
} else {
  cat("FAIL: something went wrong!")
}

o2_sat_bin <- cut(clinical_df$o2_sat_admis, c(0, 92, 100))
clinical_df$o2_sat_bin <- o2_sat_bin
table(clinical_df$o2_sat_bin)
@ -184,6 +204,8 @@ sum(table(clinical_df$o2_sat_bin)) == tot_o2
#===========================
# Onset to initial binning
#===========================
# Limits used for the onset-to-initial bins: the observed maximum, and one
# below the observed minimum.
clinical_df$onset_2_initial

max_in <- max(clinical_df$onset_2_initial)
max_in # 23

min_in <- min(clinical_df$onset_2_initial) - 1
min_in # -6
@ -198,14 +220,15 @@ sum(table(clinical_df$onset_initial_bin)) == tot_onset2ini
#=======================
# seasonal flu: sfluv
#=======================
# should be a factor
# Coerce only when needed so an existing factor keeps its levels
if (! is.factor(clinical_df$sfluv)){
clinical_df$sfluv = as.factor(clinical_df$sfluv)
}
class(clinical_df$sfluv) #[1] "factor"
class(clinical_df$sfluv)
levels(clinical_df$sfluv)
table(clinical_df$sfluv)
table(clinical_df$asthma, clinical_df$sfluv)
# reassign
# Positional relabelling: the first two existing levels both become "0"
# (and are merged), the third becomes "1".
# NOTE(review): this assumes sfluv has exactly three levels in the expected
# order -- check the levels() output above before relying on it.
levels(clinical_df$sfluv) <- c("0", "0", "1")
table(clinical_df$asthma, clinical_df$sfluv)
@ -213,14 +236,16 @@ table(clinical_df$asthma, clinical_df$sfluv)
#=======================
# h1n1v
#=======================
# should be a factor
# Coerce only when needed so an existing factor keeps its levels
if (! is.factor(clinical_df$h1n1v)){
clinical_df$h1n1v = as.factor(clinical_df$h1n1v)
}
class(clinical_df$h1n1v) #[1] "factor"
class(clinical_df$h1n1v)
levels(clinical_df$h1n1v)
table(clinical_df$h1n1v)
table(clinical_df$asthma, clinical_df$h1n1v)
# reassign
# Positional relabelling: the first two existing levels both become "0"
# (and are merged), the third becomes "1".
# NOTE(review): this assumes h1n1v has exactly three levels in the expected
# order -- check the levels() output above before relying on it.
levels(clinical_df$h1n1v) <- c("0", "0", "1")
table(clinical_df$asthma, clinical_df$h1n1v)
@ -229,18 +254,21 @@ table(clinical_df$asthma, clinical_df$h1n1v)
# ethnicity
#=======================
# Fold ethnicity code 4 into code 2, tabulating before and after the change.
class(clinical_df$ethnicity) # integer
table(clinical_df$ethnicity)
table(clinical_df$asthma, clinical_df$ethnicity)

clinical_df$ethnicity <- replace(
  clinical_df$ethnicity,
  clinical_df$ethnicity == 4,
  2
)

table(clinical_df$ethnicity)
table(clinical_df$asthma, clinical_df$ethnicity)
#=======================
# pneumonia
#=======================
# Inspect the raw ia_cxr codes and their storage type before recoding
table(clinical_df$ia_cxr)
class(clinical_df$ia_cxr) # integer
# Intended mapping of ia_cxr to the pneumonia indicator:
# ia_cxr 2 ---> yes pneumonia (1)
# 1 ---> no (0)
# ! 1 or 2 -- > "unkown"
# ! 1 or 2 -- > "unknown"
# reassign the pneumonia codes
# Code legend (continues below):
#0: not performed
@ -251,7 +279,6 @@ class(clinical_df$ia_cxr) # integer
#-2: n/a specified by the clinician # not in the data...
#-3: unknown specified by clinician
# Counts per ia_cxr code; the two comment rows below record the codes and
# their counts as seen when the script was written.
table(clinical_df$ia_cxr)
#-3 -1 0 1 2 3
#5 48 13 47 17 3
@ -262,6 +289,8 @@ table(clinical_df$ia_cxr)
# 1 2
#69 47 17
# Number of missing ia_cxr values going into the final recode
sum(is.na(clinical_df$ia_cxr))

# Order matters: 1 -> 0 must run before 2 -> 1, otherwise the former 2s
# would be zeroed as well.
clinical_df$ia_cxr[which(clinical_df$ia_cxr == 1)] <- 0
clinical_df$ia_cxr[which(clinical_df$ia_cxr == 2)] <- 1

table(clinical_df$ia_cxr)
@ -306,7 +335,7 @@ clinical_df$smoking[clinical_df$smoking == 4 | clinical_df$smoking == 2 ] <- 0
# Smoking codes 1 and 3 are collapsed into 1
clinical_df$smoking[clinical_df$smoking == 1 | clinical_df$smoking == 3 ] <- 1
# The negative codes -1, -2 and -3 are treated as missing
clinical_df$smoking[clinical_df$smoking == -1 | clinical_df$smoking == -2 | clinical_df$smoking == -3 ] <- NA
# Tabulate the recoded values; the second form also reports the NA count,
# which table() omits by default.
table(clinical_df$smoking)
table(clinical_df$smoking); sum(is.na(clinical_df$smoking))
# 0 1
#30 49 54
@ -316,17 +345,13 @@ table(clinical_df$asthma, clinical_df$smoking)
# 0 1
#0 24 32 37
#1 6 17 17
# 0 1
#0 23 32 35
#1 7 17 19
################################################################
#=========================
# Merge: clinical_df and infile ics
#=========================
# Full outer merge of clinical_df and clinical_ics on every column name the
# two data frames share.
merging_cols <- intersect(names(clinical_df), names(clinical_ics))
merging_cols

# With an empty `by`, merge() returns the Cartesian product of the two data
# frames instead of failing, so stop early if nothing is shared.
if (length(merging_cols) == 0) {
  stop("clinical_df and clinical_ics share no column names to merge on",
       call. = FALSE)
}

# `TRUE` rather than `T`: `T` is an ordinary variable that can be reassigned.
clinical_df_ics <- merge(clinical_df, clinical_ics, by = merging_cols, all = TRUE)
clinical_df_ics
@ -351,6 +376,15 @@ if (nrow(clinical_df_ics) == nrow(clinical_df) & nrow(clinical_ics)){
, "\nExpected nrows:", nrow(fp_adults))
}
# change the factor vars to integers
# NOTE(review): as.integer() on a factor returns the internal level codes
# (1, 2, ...), not the level labels -- e.g. a factor with levels "0"/"1"
# becomes 1/2. Confirm this is the coding wanted downstream.
str(clinical_df_ics)

# is.factor() also catches ordered factors, whose class vector has two
# entries and so never compares equal to the single string "factor".
factor_vars <- vapply(clinical_df_ics, is.factor, logical(1))
table(factor_vars)

# List-style indexing (no comma) keeps a data frame even when exactly one
# column is selected, so lapply() always iterates over whole columns.
if (any(factor_vars)) {
  clinical_df_ics[factor_vars] <- lapply(clinical_df_ics[factor_vars], as.integer)
}

table(factor_vars)
str(clinical_df_ics)
#======================
# writing output file
#======================
@ -359,9 +393,8 @@ outfile_reg = paste0(outdir, outfile_name_reg)
# Announce the output path, then write the merged data frame as CSV.
# NOTE(review): an active and a commented-out copy of the same write.csv()
# call both appear here; the message above is printed either way, so confirm
# whether the file is actually meant to be written.
cat("\nWriting clinical file for regression:", outfile_reg)
write.csv(clinical_df_ics, file = outfile_reg)
#write.csv(clinical_df_ics, file = outfile_reg)
################################################################
rm(age_bins, max_age_interval, max_in, min_in
, o2_sat_bin, onset_initial_bin, tot_o2
, tot_onset2ini, meta_data_cols