perfomed LR analysis and tidyed up clinical formatting code

This commit is contained in:
Tanushree Tunstall 2020-11-24 18:46:47 +00:00
parent 08e01abfb5
commit f0c0fd72d1
5 changed files with 296 additions and 301 deletions

View file

@ -8,7 +8,7 @@ getwd()
######################################################################## ########################################################################
######################################################################## ########################################################################
clinical_cols = c("mosaic" clinical_cols_data = c("mosaic"
, "ia_exac_copd" , "ia_exac_copd"
, "death" , "death"
#, "obese2" #inc peaeds, but once you subset data for adults, its the same! #, "obese2" #inc peaeds, but once you subset data for adults, its the same!
@ -17,7 +17,7 @@ clinical_cols = c("mosaic"
, "sfluv" , "sfluv"
, "h1n1v" , "h1n1v"
, "age" , "age"
, "gender" , "gender"
, "asthma" , "asthma"
, "vl_pfu_ul_npa1" , "vl_pfu_ul_npa1"
, "los" , "los"
@ -36,6 +36,14 @@ clinical_cols = c("mosaic"
, "inresp_sev" , "inresp_sev"
, "steroid") , "steroid")
clinical_cols_added = c("age_bins"
, "o2_sat_bin"
, "onset_initial_bin"
, "steroid_ics"
, "t1_resp_recoded" )
clinical_cols = c(clinical_cols_data, clinical_cols_added)
sig_npa_cols = c("eotaxin_npa1" sig_npa_cols = c("eotaxin_npa1"
, "eotaxin3_npa1" , "eotaxin3_npa1"
, "eotaxin3_npa2" , "eotaxin3_npa2"

View file

@ -28,34 +28,27 @@ clinical_ics = read.csv(infile_ics)
str(clinical_ics) str(clinical_ics)
######################################################################## ########################################################################
# quick sanity checks table(fp_adults$ia_exac_copd==1 & fp_adults$asthma == 1)
table(adult_df$ia_exac_copd==1 & adult_df$asthma == 1) # check this is 4
table(fp_adults$ia_exac_copd==1 & fp_adults$asthma == 1) # check this is 3
# clear unnecessary variables
rm(all_df, adult_df, metadata_all)
######################################################################## ########################################################################
# Clinical_data extraction # Clinical_data extraction
######################################################################## ########################################################################
cat("\nExtracting:", length(clinical_cols), "cols from fp_adults") #cat("\nExtracting:", length(clinical_cols), "cols from fp_adults")
clinical_df = fp_adults[, clinical_cols] #clinical_df = fp_adults[, clinical_cols]
# sanity checks # sanity checks
if ( sum(table(clinical_df$obesity)) & sum(table(clinical_df$age>=18)) & sum(table(clinical_df$death)) & sum(table(clinical_df$asthma)) == nrow(clinical_df) ){ #if ( sum(table(clinical_df$obesity)) & sum(table(clinical_df$age>=18)) & sum(table(clinical_df$death)) & sum(table(clinical_df$asthma)) == nrow(clinical_df) ){
cat("\nPASS: binary data obs are complete, n =", nrow(clinical_df)) # cat("\nPASS: binary data obs are complete, n =", nrow(clinical_df))
}else{ #}else{
cat("\nFAIL: Incomplete data for binary outcomes. Please check and decide!") # cat("\nFAIL: Incomplete data for binary outcomes. Please check and decide!")
quit() # quit()
} #}
table(clinical_df$ia_exac_copd) #table(clinical_df$ia_exac_copd)
str(clinical_df) #str(clinical_df)
#clinical_df$o2_sat_suppl #clinical_df$o2_sat_suppl
######################################################################## ########################################################################
#================================== #==================================
# Check asthma and copd conflict # Check asthma and copd conflict
@ -80,42 +73,41 @@ if ( table(fp_adults$ia_exac_copd, fp_adults$asthma) [[2,2]] == 0){
foo<- subset(fp_adults, asthma==1 & ia_exac_copd ==1) # check that its 0 foo<- subset(fp_adults, asthma==1 & ia_exac_copd ==1) # check that its 0
rm(check_copd_and_asthma_1, foo) rm(check_copd_and_asthma_1, foo)
cat("Check status again...") cat("Check status again...")
} }
#===================================================================== #=====================================================================
#================================= #=================================
# resp scores: In, max and t1 & t2 # resp scores: In, max and t1 & t2
#================================= #=================================
# count the resp scores # count the resp scores
max_resp_score_table<- table(clinical_df$max_resp_score) max_resp_score_table<- table(fp_adults$max_resp_score)
max_resp_score_table max_resp_score_table
T1_resp_score_table<- table(clinical_df$T1_resp_score) T1_resp_score_table<- table(fp_adults$T1_resp_score)
T1_resp_score_table T1_resp_score_table
T2_resp_score_table<- table(clinical_df$T2_resp_score) T2_resp_score_table<- table(fp_adults$T2_resp_score)
T2_resp_score_table T2_resp_score_table
Inresp_sev<- table(clinical_df$inresp_sev) Inresp_sev<- table(fp_adults$inresp_sev)
Inresp_sev Inresp_sev
# Reassign the resp score so all 4 are replace by 3 # Reassign the resp score so all 4 are replace by 3
clinical_df$max_resp_score[clinical_df$max_resp_score == 4 ] <- 3 fp_adults$max_resp_score[fp_adults$max_resp_score == 4 ] <- 3
revised_resp_score_table<- table(clinical_df$max_resp_score) revised_resp_score_table<- table(fp_adults$max_resp_score)
revised_resp_score_table revised_resp_score_table
clinical_df$T1_resp_score[clinical_df$T1_resp_score ==4 ] <- 3 fp_adults$T1_resp_score[fp_adults$T1_resp_score ==4 ] <- 3
revised_T1_resp_score_table<- table(clinical_df$T1_resp_score) revised_T1_resp_score_table<- table(fp_adults$T1_resp_score)
revised_T1_resp_score_table revised_T1_resp_score_table
clinical_df$T2_resp_score[clinical_df$T2_resp_score == 4]<- 3 fp_adults$T2_resp_score[fp_adults$T2_resp_score == 4]<- 3
revised_T2_resp_score_table<- table(clinical_df$T2_resp_score) revised_T2_resp_score_table<- table(fp_adults$T2_resp_score)
revised_T2_resp_score_table revised_T2_resp_score_table
clinical_df$inresp_sev[clinical_df$inresp_sev == 4]<- 3 fp_adults$inresp_sev[fp_adults$inresp_sev == 4]<- 3
revised_Inresp_sev<- table(clinical_df$inresp_sev) revised_Inresp_sev<- table(fp_adults$inresp_sev)
revised_Inresp_sev revised_Inresp_sev
#===================================================================== #=====================================================================
# Remove these after checking # Remove these after checking
@ -130,32 +122,32 @@ rm(max_resp_score_table, T1_resp_score_table, T2_resp_score_table, Inresp_sev
# age # age
#======== #========
# Create categories of variables # Create categories of variables
clinical_df$age_int = round(clinical_df$age, digits = 0) fp_adults$age_int = round(fp_adults$age, digits = 0)
table(clinical_df$age_int) table(fp_adults$age_int)
table(clinical_df$asthma, clinical_df$age_int) table(fp_adults$asthma, fp_adults$age_int)
min(clinical_df$age_int); max(clinical_df$age_int) min(fp_adults$age_int); max(fp_adults$age_int)
max_age_interval = round_any(max(clinical_df$age_int), 10, f = ceiling) max_age_interval = round_any(max(fp_adults$age_int), 10, f = ceiling)
max_age_interval max_age_interval
min_age = min(clinical_df$age_int); min_age #19 min_age = min(fp_adults$age_int); min_age #19
min_age_interval = min_age - 1; min_age_interval min_age_interval = min_age - 1; min_age_interval
#age_bins = cut(clinical_df$age_int, c(0,18,30,40,50,60,70,80,90)) #age_bins = cut(fp_adults$age_int, c(0,18,30,40,50,60,70,80,90))
age_bins = cut(clinical_df$age_int, c(min_age_interval, 30, 40, 50, 60, 70, max_age_interval)) age_bins = cut(fp_adults$age_int, c(min_age_interval, 30, 40, 50, 60, 70, max_age_interval))
clinical_df$age_bins = age_bins fp_adults$age_bins = age_bins
dim(clinical_df) # 133 28 dim(fp_adults) # 133 28
# age_bins (to keep consistent with the results table) # age_bins (to keep consistent with the results table)
class(clinical_df$age_bins) class(fp_adults$age_bins)
levels(clinical_df$age_bins) levels(fp_adults$age_bins)
#"(18,30]" "(30,40]" "(40,50]" "(50,60]" "(60,70]" "(70,80]" #"(18,30]" "(30,40]" "(40,50]" "(50,60]" "(60,70]" "(70,80]"
table(clinical_df$asthma, clinical_df$age_bins) table(fp_adults$asthma, fp_adults$age_bins)
# (18,30] (30,40] (40,50] (50,60] (60,70] (70,80] # (18,30] (30,40] (40,50] (50,60] (60,70] (70,80]
#0 25 17 25 14 11 1 #0 25 17 25 14 11 1
#1 11 8 12 5 2 2 #1 11 8 12 5 2 2
if (sum(table(clinical_df$asthma, clinical_df$age_bins)) == nrow(clinical_df) ){ if (sum(table(fp_adults$asthma, fp_adults$age_bins)) == nrow(fp_adults) ){
cat("\nPASS: age_bins assigned successfully") cat("\nPASS: age_bins assigned successfully")
}else{ }else{
cat("\nFAIL: no. mismatch when assigning age_bins") cat("\nFAIL: no. mismatch when assigning age_bins")
@ -163,37 +155,37 @@ if (sum(table(clinical_df$asthma, clinical_df$age_bins)) == nrow(clinical_df) ){
} }
# reassign levels # reassign levels
class(clinical_df$age_bins) class(fp_adults$age_bins)
levels(clinical_df$age_bins) <- c("(18,30]","(30,40]","(40,50]","(50,80]","(50,80]","(50,80]") levels(fp_adults$age_bins) <- c("(18,30]","(30,40]","(40,50]","(50,80]","(50,80]","(50,80]")
table(clinical_df$asthma, clinical_df$age_bins) table(fp_adults$asthma, fp_adults$age_bins)
table(clinical_df$asthma, clinical_df$age_bins) table(fp_adults$asthma, fp_adults$age_bins)
# (18,30] (30,40] (40,50] (50,80] # (18,30] (30,40] (40,50] (50,80]
#0 25 17 25 26 #0 25 17 25 26
#1 11 8 12 9 #1 11 8 12 9
sum(table(clinical_df$asthma, clinical_df$age_bins)) == nrow(clinical_df) sum(table(fp_adults$asthma, fp_adults$age_bins)) == nrow(fp_adults)
table(clinical_df$age_int) table(fp_adults$age_int)
clinical_df = subset(clinical_df, select = -c(age_int)) fp_adults = subset(fp_adults, select = -c(age_int))
table(clinical_df$age_int) table(fp_adults$age_int)
class(clinical_df$age_bins) class(fp_adults$age_bins)
clinical_df$age_bins fp_adults$age_bins
#=========================== #===========================
# O2 saturation binning # O2 saturation binning
#=========================== #===========================
clinical_df$o2_sat_admis fp_adults$o2_sat_admis
n1 = sum(is.na(clinical_df$o2_sat_admis)) n1 = sum(is.na(fp_adults$o2_sat_admis))
clinical_df$o2_sat_admis = round(clinical_df$o2_sat_admis, digits = 0) fp_adults$o2_sat_admis = round(fp_adults$o2_sat_admis, digits = 0)
table(clinical_df$o2_sat_admis) table(fp_adults$o2_sat_admis)
tot_o2 = sum(table(clinical_df$o2_sat_admis))- table(clinical_df$o2_sat_admis)[["-1"]] tot_o2 = sum(table(fp_adults$o2_sat_admis))- table(fp_adults$o2_sat_admis)[["-1"]]
tot_o2 tot_o2
n_text_code = table(clinical_df$o2_sat_admis)[["-1"]] n_text_code = table(fp_adults$o2_sat_admis)[["-1"]]
clinical_df$o2_sat_admis[clinical_df$o2_sat_admis <0] <- NA fp_adults$o2_sat_admis[fp_adults$o2_sat_admis <0] <- NA
n2 = sum(is.na(clinical_df$o2_sat_admis)) n2 = sum(is.na(fp_adults$o2_sat_admis))
if (n2 == n1 + n_text_code) { if (n2 == n1 + n_text_code) {
cat ("PASS: -1 code converted to NA") cat ("PASS: -1 code converted to NA")
@ -201,75 +193,75 @@ if (n2 == n1 + n_text_code) {
cat("FAIL: something went wrong!") cat("FAIL: something went wrong!")
} }
o2_sat_bin = cut(clinical_df$o2_sat_admis, c(0,92,100)) o2_sat_bin = cut(fp_adults$o2_sat_admis, c(0,92,100))
clinical_df$o2_sat_bin = o2_sat_bin fp_adults$o2_sat_bin = o2_sat_bin
table(clinical_df$o2_sat_bin) table(fp_adults$o2_sat_bin)
sum(table(clinical_df$o2_sat_bin)) == tot_o2 sum(table(fp_adults$o2_sat_bin)) == tot_o2
#=========================== #===========================
# Onset to initial binning # Onset to initial binning
#=========================== #===========================
clinical_df$onset_2_initial fp_adults$onset_2_initial
max_in = max(clinical_df$onset_2_initial); max_in #23 max_in = max(fp_adults$onset_2_initial); max_in #23
min_in = min(clinical_df$onset_2_initial) - 1 ; min_in # -6 min_in = min(fp_adults$onset_2_initial) - 1 ; min_in # -6
tot_onset2ini = sum(table(clinical_df$onset_2_initial)) tot_onset2ini = sum(table(fp_adults$onset_2_initial))
tot_onset2ini tot_onset2ini
onset_initial_bin = cut(clinical_df$onset_2_initial, c(min_in, 4, max_in)) onset_initial_bin = cut(fp_adults$onset_2_initial, c(min_in, 4, max_in))
clinical_df$onset_initial_bin = onset_initial_bin fp_adults$onset_initial_bin = onset_initial_bin
sum(table(clinical_df$onset_initial_bin)) == tot_onset2ini sum(table(fp_adults$onset_initial_bin)) == tot_onset2ini
#======================= #=======================
# seasonal flu: sfluv # seasonal flu: sfluv
#======================= #=======================
# reassign as 0 and 1 # reassign as 0 and 1
table(clinical_df$sfluv) table(fp_adults$sfluv)
table(clinical_df$asthma, clinical_df$sfluv) table(fp_adults$asthma, fp_adults$sfluv)
clinical_df$sfluv = ifelse(clinical_df$sfluv == "yes", 1, 0) fp_adults$sfluv = ifelse(fp_adults$sfluv == "yes", 1, 0)
table(clinical_df$sfluv) table(fp_adults$sfluv)
table(clinical_df$asthma, clinical_df$sfluv) table(fp_adults$asthma, fp_adults$sfluv)
# convert to integer # convert to integer
str(clinical_df$sfluv) str(fp_adults$sfluv)
clinical_df$sfluv = as.integer(clinical_df$sfluv) fp_adults$sfluv = as.integer(fp_adults$sfluv)
str(clinical_df$sfluv) str(fp_adults$sfluv)
#======================= #=======================
# h1n1v # h1n1v
#======================= #=======================
# reassign as 0 and 1 # reassign as 0 and 1
table(clinical_df$h1n1v) table(fp_adults$h1n1v)
table(clinical_df$asthma, clinical_df$h1n1v) table(fp_adults$asthma, fp_adults$h1n1v)
clinical_df$h1n1v = ifelse(clinical_df$h1n1v == "yes", 1, 0) fp_adults$h1n1v = ifelse(fp_adults$h1n1v == "yes", 1, 0)
table(clinical_df$h1n1v) table(fp_adults$h1n1v)
table(clinical_df$asthma, clinical_df$h1n1v) table(fp_adults$asthma, fp_adults$h1n1v)
# convert to integer # convert to integer
str(clinical_df$h1n1v) str(fp_adults$h1n1v)
clinical_df$h1n1v = as.integer(clinical_df$h1n1v) fp_adults$h1n1v = as.integer(fp_adults$h1n1v)
str(clinical_df$h1n1v) str(fp_adults$h1n1v)
#======================= #=======================
# ethnicity # ethnicity
#======================= #=======================
class(clinical_df$ethnicity) # integer class(fp_adults$ethnicity) # integer
table(clinical_df$ethnicity) table(fp_adults$ethnicity)
table(clinical_df$asthma, clinical_df$ethnicity) table(fp_adults$asthma, fp_adults$ethnicity)
clinical_df$ethnicity[clinical_df$ethnicity == 4] <- 2 fp_adults$ethnicity[fp_adults$ethnicity == 4] <- 2
table(clinical_df$ethnicity) table(fp_adults$ethnicity)
table(clinical_df$asthma, clinical_df$ethnicity) table(fp_adults$asthma, fp_adults$ethnicity)
#======================= #=======================
# pneumonia # pneumonia
#======================= #=======================
table(clinical_df$ia_cxr) table(fp_adults$ia_cxr)
class(clinical_df$ia_cxr) # integer class(fp_adults$ia_cxr) # integer
# ia_cxr 2 ---> yes pneumonia (1) # ia_cxr 2 ---> yes pneumonia (1)
# 1 ---> no (0) # 1 ---> no (0)
# ! 1 or 2 -- > "unknown" # ! 1 or 2 -- > "unknown"
@ -283,29 +275,29 @@ class(clinical_df$ia_cxr) # integer
#-2: n/a specified by the clinician # not in the data... #-2: n/a specified by the clinician # not in the data...
#-3: unknown specified by clinician #-3: unknown specified by clinician
table(clinical_df$ia_cxr) table(fp_adults$ia_cxr)
#-3 -1 0 1 2 3 #-3 -1 0 1 2 3
#5 48 13 47 17 3 #5 48 13 47 17 3
# change these first else recoding 0 will be a problem as 0 already exists, mind you -2 categ doesn't exist # change these first else recoding 0 will be a problem as 0 already exists, mind you -2 categ doesn't exist
clinical_df$ia_cxr[clinical_df$ia_cxr == -3 | clinical_df$ia_cxr == -1 | clinical_df$ia_cxr == 0 | clinical_df$ia_cxr == 3 ] <- NA fp_adults$ia_cxr[fp_adults$ia_cxr == -3 | fp_adults$ia_cxr == -1 | fp_adults$ia_cxr == 0 | fp_adults$ia_cxr == 3 ] <- NA
table(clinical_df$ia_cxr) table(fp_adults$ia_cxr)
# 1 2 # 1 2
#69 47 17 #69 47 17
sum(is.na(clinical_df$ia_cxr)) sum(is.na(fp_adults$ia_cxr))
clinical_df$ia_cxr[clinical_df$ia_cxr == 1] <- 0 fp_adults$ia_cxr[fp_adults$ia_cxr == 1] <- 0
clinical_df$ia_cxr[clinical_df$ia_cxr == 2] <- 1 fp_adults$ia_cxr[fp_adults$ia_cxr == 2] <- 1
table(clinical_df$ia_cxr) table(fp_adults$ia_cxr)
# 0 1 # 0 1
#69 47 17 #69 47 17
#======================= #=======================
# smoking [tricky one] # smoking [tricky one]
#======================= #=======================
class(clinical_df$smoking) # integer class(fp_adults$smoking) # integer
table(clinical_df$asthma, clinical_df$smoking) table(fp_adults$asthma, fp_adults$smoking)
# orig # orig
# -3 -1 1 2 3 4 # -3 -1 1 2 3 4
@ -330,20 +322,20 @@ table(clinical_df$asthma, clinical_df$smoking)
#-2: n/a specified by the clinician =====> categ blank (NA) #-2: n/a specified by the clinician =====> categ blank (NA)
#-3: unknown specified by clinician=====> categ blank (NA) #-3: unknown specified by clinician=====> categ blank (NA)
table(clinical_df$smoking) table(fp_adults$smoking)
#-3 -1 1 2 3 4 #-3 -1 1 2 3 4
#19 11 35 2 19 47 #19 11 35 2 19 47
# reassign the smoking codes # reassign the smoking codes
clinical_df$smoking[clinical_df$smoking == 4 | clinical_df$smoking == 2 ] <- 0 fp_adults$smoking[fp_adults$smoking == 4 | fp_adults$smoking == 2 ] <- 0
clinical_df$smoking[clinical_df$smoking == 1 | clinical_df$smoking == 3 ] <- 1 fp_adults$smoking[fp_adults$smoking == 1 | fp_adults$smoking == 3 ] <- 1
clinical_df$smoking[clinical_df$smoking == -1 | clinical_df$smoking == -2 | clinical_df$smoking == -3 ] <- NA fp_adults$smoking[fp_adults$smoking == -1 | fp_adults$smoking == -2 | fp_adults$smoking == -3 ] <- NA
table(clinical_df$smoking); sum(is.na(clinical_df$smoking)) table(fp_adults$smoking); sum(is.na(fp_adults$smoking))
# 0 1 # 0 1
#30 49 54 #30 49 54
table(clinical_df$asthma, clinical_df$smoking) table(fp_adults$asthma, fp_adults$smoking)
# orig # orig
# 0 1 # 0 1
@ -352,24 +344,24 @@ table(clinical_df$asthma, clinical_df$smoking)
################################################################ ################################################################
#========================= #=========================
# Merge: clinical_df and infile ics # Merge: fp_adults and infile ics
#========================= #=========================
merging_cols = intersect( names(clinical_df), names(clinical_ics) ) merging_cols = intersect( names(fp_adults), names(clinical_ics) )
merging_cols merging_cols
clinical_df_ics = merge(clinical_df, clinical_ics, by = merging_cols, all = T); clinical_df_ics fp_adults_ics = merge(fp_adults, clinical_ics, by = merging_cols, all = T); fp_adults_ics
colnames(clinical_df_ics) colnames(fp_adults_ics)
if (nrow(clinical_df_ics) == nrow(clinical_df) & nrow(clinical_ics)){ if (nrow(fp_adults_ics) == nrow(fp_adults) & nrow(clinical_ics)){
cat("\nPASS: No. of rows match, nrow =", nrow(clinical_df_ics) cat("\nPASS: No. of rows match, nrow =", nrow(fp_adults_ics)
, "\nChecking ncols...") , "\nChecking ncols...")
if ( ncol(clinical_df_ics) == ncol(clinical_df) + ncol(clinical_ics) - length(merging_cols) ){ if ( ncol(fp_adults_ics) == ncol(fp_adults) + ncol(clinical_ics) - length(merging_cols) ){
cat("\nPASS: No. of cols match, ncol =", ncol(clinical_df_ics)) cat("\nPASS: No. of cols match, ncol =", ncol(fp_adults_ics))
} else { } else {
cat("\nFAIL: ncols mismatch" cat("\nFAIL: ncols mismatch"
, "Expected ncols:", ncol(clinical_df) + ncol(clinical_ics) - length(merging_cols) , "Expected ncols:", ncol(fp_adults) + ncol(clinical_ics) - length(merging_cols)
, "\nGot:", ncol(clinical_df_ics)) , "\nGot:", ncol(fp_adults_ics))
} }
} else { } else {
cat("\nFAIL: nrows mismatch" cat("\nFAIL: nrows mismatch"
@ -379,49 +371,54 @@ if (nrow(clinical_df_ics) == nrow(clinical_df) & nrow(clinical_ics)){
#========================= #=========================
# add binary outcome for T1 resp score # add binary outcome for T1 resp score
#========================= #=========================
table(clinical_df_ics$T1_resp_score) table(fp_adults_ics$T1_resp_score)
clinical_df_ics$t1_resp_recoded = ifelse(clinical_df_ics$T1_resp_score <3, 0, 1) fp_adults_ics$t1_resp_recoded = ifelse(fp_adults_ics$T1_resp_score <3, 0, 1)
table(clinical_df_ics$t1_resp_recoded) table(fp_adults_ics$t1_resp_recoded)
#table(clinical_df_ics$steroid) #table(fp_adults_ics$steroid)
table(clinical_df_ics$steroid_ics) table(fp_adults_ics$steroid_ics)
#========================= #=========================
# change the factor vars to integers # change the factor vars to integers
#========================= #=========================
#str(clinical_df_ics) #str(fp_adults_ics)
#factor_vars = lapply(clinical_df_ics, class) == "factor" #factor_vars = lapply(fp_adults_ics, class) == "factor"
#table(factor_vars) #table(factor_vars)
#clinical_df_ics[, factor_vars] <- lapply(clinical_df_ics[, factor_vars], as.integer) #fp_adults_ics[, factor_vars] <- lapply(fp_adults_ics[, factor_vars], as.integer)
#table(factor_vars) #table(factor_vars)
#str(clinical_df_ics) #str(fp_adults_ics)
#========================= #=========================
# remove cols # remove cols
#========================= #=========================
clinical_df_ics = subset(clinical_df_ics, select = -c(onset_2_initial)) fp_adults_ics = subset(fp_adults_ics, select = -c(onset_2_initial))
#====================== #======================
# writing output file # writing output file
#====================== #======================
outfile_name_reg = "clinical_df_recoded.csv" outfile_name_reg = "fp_adults_recoded.csv"
outfile_reg = paste0(outdir, outfile_name_reg) outfile_reg = paste0(outdir, outfile_name_reg)
cat("\nWriting clinical file for regression:", outfile_reg) cat("\nWriting clinical file for regression:", outfile_reg)
#write.csv(clinical_df_ics, file = outfile_reg) #write.csv(fp_adults_ics, file = outfile_reg)
#========================= #=========================
# clinical_df_ics: without asthma # fp_adults_ics: without asthma
#========================= #=========================
clinical_df_ics_na = clinical_df_ics[clinical_df_ics$asthma == 0,] fp_adults_ics_na = fp_adults_ics[fp_adults_ics$asthma == 0,]
#=========================
# clinical_df only
#=========================
clinical_df_ics = fp_adults[, clinical_cols]
################################################################ ################################################################
rm(age_bins, max_age_interval, max_in, min_in rm(age_bins, max_age_interval, max_in, min_in
, o2_sat_bin, onset_initial_bin, tot_o2 , o2_sat_bin, onset_initial_bin, tot_o2
, n_text_code, n1, n2, tot_onset2ini, infile_ics , n_text_code, n1, n2, tot_onset2ini, infile_ics
, tot_onset2ini, meta_data_cols , tot_onset2ini, meta_data_cols
, clinical_df, clinical_ics) , fp_adults, clinical_ics)
################################################################ ################################################################

View file

@ -214,7 +214,7 @@ comb_stats_categ_df_f = comb_stats_categ_df[order(comb_stats_categ_df$p_signif
# write output file # write output file
#****************** #******************
cat("Chisq and fishers test results in:", outfile_clin_categ) cat("Chisq and fishers test results in:", outfile_clin_categ)
write.csv(comb_stats_categ_df_f, outfile_clin_categ, row.names = FALSE) #write.csv(comb_stats_categ_df_f, outfile_clin_categ, row.names = FALSE)
#================== #==================
#0 date not recorded #0 date not recorded

View file

@ -13,21 +13,25 @@ getwd()
source("data_extraction_formatting_clinical.R") source("data_extraction_formatting_clinical.R")
# quick sanity checks # quick sanity checks
table(clinical_df_ics$ia_exac_copd==1 & clinical_df_ics$asthma == 1) table(fp_adults_ics$ia_exac_copd==1 & fp_adults_ics$asthma == 1)
table(fp_adults$ia_exac_copd==1 & fp_adults$asthma == 1) table(fp_adults_ics$ia_exac_copd==1 & fp_adults_ics$asthma == 1)
table(fp_adults_na$ia_exac_copd==1 & fp_adults_na$asthma == 1) table(fp_adults_ics_na$ia_exac_copd==1 & fp_adults_ics_na$asthma == 1)
table(clinical_df_ics$asthma) table(fp_adults_ics$asthma)
if ( length(cols_to_extract) == length(clinical_cols) + length(sig_npa_cols) ){ if ( length(cols_to_extract) == length(clinical_cols) + length(sig_npa_cols) ){
cat("PASS: extracting clinical and sign npa cols") cat("PASS: extracting clinical and sign npa cols for regression")
} else{ } else{
cat("FAIL: could not find cols to extract") cat("FAIL: could not find cols to extract for regression")
quit() quit()
} }
fp_adults_reg = fp_adults[, cols_to_extract]
fp_adults_reg_na = fp_adults_na[, cols_to_extract] #fp_adults_reg = fp_adults_ics[, cols_to_extract]
fp_adults_reg = fp_adults_ics[, colnames(fp_adults_ics)%in%cols_to_extract]
cols_to_extract[!cols_to_extract%in%colnames(fp_adults_reg)]
fp_adults_reg_na = fp_adults_ics_na[, colnames(fp_adults_ics_na)%in%cols_to_extract]
#-------------------- #--------------------
# Data reassignment # Data reassignment
@ -38,26 +42,23 @@ my_data_na = fp_adults_reg_na
table(my_data$ia_exac_copd==1 & my_data$asthma == 1) table(my_data$ia_exac_copd==1 & my_data$asthma == 1)
table(my_data_na$ia_exac_copd==1 & my_data_na$asthma == 1) table(my_data_na$ia_exac_copd==1 & my_data_na$asthma == 1)
# clear variables
rm(fp_adults, fp_adults_na, clinical_df_ics, clinical_df_ics_na) cols_to_omit = c("mosaic", "flustat", "onset_2_initial", "ia_exac_copd", "onsfindeath", "o2_sat_admis")
my_clinical_cols = clinical_cols[!clinical_cols%in%cols_to_omit]
my_clinical_cols
######################################################################### #########################################################################
#============================ #============================
# Identifying column types: Reg data # Identifying column types: Reg data
#=========================== #===========================
cols_to_omit = c("mosaic", "flustat", "onset_2_initial", "ia_exac_copd") my_vars = colnames(my_data)
my_reg_data = my_data[!colnames(my_data)%in%cols_to_omit] lapply(my_data, class)
my_vars = colnames(my_reg_data) check_int_vars = my_vars[lapply(my_data, class)%in%c("integer")]
my_vars check_num_vars = my_vars[lapply(my_data, class)%in%c("numeric")]
check_charac_vars = my_vars[lapply(my_data, class)%in%c("character")]
lapply(my_reg_data, class) check_factor_vars = my_vars[lapply(my_data, class)%in%c("factor")]
check_int_vars = my_vars[lapply(my_reg_data, class)%in%c("integer")]
check_num_vars = my_vars[lapply(my_reg_data, class)%in%c("numeric")]
check_charac_vars = my_vars[lapply(my_reg_data, class)%in%c("character")]
check_factor_vars = my_vars[lapply(my_reg_data, class)%in%c("factor")]
cat("\nNo. of int cols:", length(check_int_vars) cat("\nNo. of int cols:", length(check_int_vars)
, "\nNo. of num cols:", length(check_num_vars) , "\nNo. of num cols:", length(check_num_vars)
@ -65,44 +66,56 @@ cat("\nNo. of int cols:", length(check_int_vars)
, "\nNo. of factor cols:", length(check_factor_vars) , "\nNo. of factor cols:", length(check_factor_vars)
) )
# convert char vals to int as these should be int #=======================================
my_reg_data[,check_charac_vars] = lapply(my_reg_data[,check_charac_vars], as.integer) # changing dtypes in cols
str(my_reg_data$sfluv) #=======================================
# what I need to be numerical explicitly
my_numerical_vars = c("age"
numerical_vars = c("age"
, "vl_pfu_ul_npa1" , "vl_pfu_ul_npa1"
, "los" , "los"
, "onset2final" , "onset2final"
, "onsfindeath" , "onsfindeath"
, "o2_sat_admis") , "o2_sat_admis")
my_data[my_numerical_vars] <- lapply(my_data[my_numerical_vars], as.numeric)
num_vars = my_vars[lapply(my_data, class)%in%c("numeric")]
num_vars
# what I need to be integer explicitly
not_int_vars = c(my_numerical_vars, sig_npa_cols)
my_int_vars = my_vars[!my_vars%in%not_int_vars]
my_int_vars
#my_int_vars = my_clinical_cols[int_vars_extract%in%check_int_vars]
#my_int_vars
my_reg_data[numerical_vars] <- lapply(my_reg_data[numerical_vars], as.numeric) # convert int cols to factor
my_data[my_int_vars] <- lapply(my_data[my_int_vars], as.factor)
my_reg_params = my_vars factor_vars = my_vars[lapply(my_data, class)%in%c("factor")]
factor_vars
check_factor_vars
# check again
lapply(my_data, class)
#======================================================================
my_reg_data = my_data[!colnames(my_data)%in%cols_to_omit]
my_reg_params = colnames(my_reg_data)
#length(factor_vars) + length(num_vars); ncol(my_reg_data)
#my_reg_params = c(factor_vars, num_vars)
na_count = sapply(my_reg_data, function(x) sum(is.na(x)));na_count na_count = sapply(my_reg_data, function(x) sum(is.na(x)));na_count
names(na_count)[na_count>0] names(na_count)[na_count>0]
vars_to_factor = my_vars[!my_vars%in%numerical_vars]
# convert to factor
lapply(my_reg_data, class)
my_reg_data[vars_to_factor] <- lapply(my_reg_data[vars_to_factor], as.factor)
factor_vars <- colnames(my_reg_data)[lapply(my_reg_data, class) == "factor"]
table(factor_vars)
# check again # check again
lapply(my_reg_data, class) lapply(my_reg_data, class)
# all parasm for reg # all parasm for reg
my_reg_params = c("age" my_reg_params_DEL = c("age"
, "age_bins"
, "vl_pfu_ul_npa1" , "vl_pfu_ul_npa1"
, "los" , "los"
, "onset2final" , "onset2final"
@ -123,11 +136,12 @@ my_reg_params = c("age"
, "T2_resp_score" , "T2_resp_score"
, "inresp_sev" , "inresp_sev"
, "steroid" , "steroid"
, "age_bins"
, "o2_sat_bin" , "o2_sat_bin"
, "onset_initial_bin" , "onset_initial_bin"
, "steroid_ics" , "steroid_ics"
, "t1_resp_recoded") , "t1_resp_recoded"
, sig_npa_cols)
#================= #=================
# reg data prepare # reg data prepare
@ -139,16 +153,16 @@ pv2 = "t1_resp_recoded"
#reg_params_mixed = my_vars[!my_vars%in%pv1] #reg_params_mixed = my_vars[!my_vars%in%pv1]
######################################################################## ########################################################################
#================= # outcome: death
# outcome2 ########################################################################
#=================
#----------------------------- #-----------------------------
# outcome: death + obesity # outcome: death + obesity
# data: fp adults # data: fp adults
#----------------------------- #-----------------------------
my_reg_params1 = my_reg_params[!my_reg_params%in%c("death", "obesity")] my_reg_params1 = my_reg_params[!my_reg_params%in%c("death", "obesity")]
for(i in my_reg_params1) { sink(file = "reg_output_out1.txt", append = T)
for(i in my_reg_params1) {
#print (i) #print (i)
p_form = as.formula(paste("death ~ obesity + ", i ,sep = "")) p_form = as.formula(paste("death ~ obesity + ", i ,sep = ""))
print(p_form) print(p_form)
@ -159,16 +173,20 @@ for(i in my_reg_params1) {
print(nobs(model_reg)) print(nobs(model_reg))
cat("=================================================================================\n") cat("=================================================================================\n")
} }
sink()
#----------------------------- #-----------------------------
# outcome: death # outcome: death + obesity
# data: fp adults # data: fp adults
#----------------------------- #-----------------------------
my_reg_params1v2 = my_reg_params[!my_reg_params%in%c("death")] my_reg_params2 = my_reg_params[!my_reg_params%in%c( "death"
, "obesity"
, "asthma")]
for(i in my_reg_params1v2) { sink(file = "reg_output_out1_ob_as.txt", append = T)
for(i in my_reg_params2) {
#print (i) #print (i)
p_form = as.formula(paste("death ~ ", i ,sep = "")) p_form = as.formula(paste("death ~ obesity + asthma +", i ,sep = ""))
print(p_form) print(p_form)
model_reg = glm(p_form , family = binomial, data = my_reg_data) model_reg = glm(p_form , family = binomial, data = my_reg_data)
print(summary(model_reg)) print(summary(model_reg))
@ -177,20 +195,45 @@ for(i in my_reg_params1v2) {
print(nobs(model_reg)) print(nobs(model_reg))
cat("=================================================================================\n") cat("=================================================================================\n")
} }
sink()
#-------------------
# Full model
#-------------------
full_mod = glm(death ~ obesity +
asthma +
#age +
age_bins +
#t1_resp_recoded +
com_noasthma +
eotaxin3_npa1 +
il1_npa1+
il8_2_npa1+
il12p70_npa1
, family = "binomial"
, data = my_reg_data)
print(summary(full_mod))
print(exp(cbind(OR = coef(full_mod), confint(full_mod))))
print(nobs(full_mod))
cat("=================================================================================\n")
########################################################################
# outcome: t1_resp_recoded
######################################################################## ########################################################################
#=================
# outcome2
#=================
#----------------------------- #-----------------------------
# outcome: t1_resp_recoded + obesity # outcome: t1_resp_recoded + obesity
# data: fp adults # data: fp adults
#----------------------------- #-----------------------------
my_reg_params2 = my_reg_params[!my_reg_params%in%c("death" my_reg_params3 = my_reg_params[!my_reg_params%in%c("t1_resp_recoded"
, "obesity" , "T1_resp_score"
, "t1_resp_recoded" , "death"
, "T1_resp_score")] , "obesity")]
for(i in my_reg_params2) { sink(file = "reg_output_out2.txt", append = T)
for(i in my_reg_params3) {
#print (i) #print (i)
p_form = as.formula(paste("t1_resp_recoded ~ obesity + ", i ,sep = "")) p_form = as.formula(paste("t1_resp_recoded ~ obesity + ", i ,sep = ""))
print(p_form) print(p_form)
@ -201,20 +244,22 @@ for(i in my_reg_params2) {
print(nobs(model_reg)) print(nobs(model_reg))
cat("=================================================================================\n") cat("=================================================================================\n")
} }
sink()
#----------------------------- #-----------------------------
# outcome: t1_resp_recoded # outcome: t1_resp_recoded + obesity
# data: fp adults # data: fp adults
#----------------------------- #-----------------------------
my_reg_params2v2 = my_reg_params[!my_reg_params%in%c("death" my_reg_params4 = my_reg_params[!my_reg_params%in%c( "t1_resp_recoded"
#, "obesity" , "T1_resp_score"
, "t1_resp_recoded" , "obesity"
, "T1_resp_score")] , "death"
, "asthma")]
for(i in my_reg_params2v2) { sink(file = "reg_output_out2_ob_as.txt", append = T)
for(i in my_reg_params4) {
#print (i) #print (i)
p_form = as.formula(paste("t1_resp_recoded ~ ", i ,sep = "")) p_form = as.formula(paste("t1_resp_recoded ~ obesity + asthma +", i ,sep = ""))
print(p_form) print(p_form)
model_reg = glm(p_form , family = binomial, data = my_reg_data) model_reg = glm(p_form , family = binomial, data = my_reg_data)
print(summary(model_reg)) print(summary(model_reg))
@ -223,86 +268,23 @@ for(i in my_reg_params2v2) {
print(nobs(model_reg)) print(nobs(model_reg))
cat("=================================================================================\n") cat("=================================================================================\n")
} }
sink()
######################################################################## #-------------------
# Full model # Full model
######################################################################## #-------------------
#full_mod2 = glm(death ~ obesity +
# asthma +
# #age +
# #age_bins +
#
#
# , family = "binomial"
# , data = my_reg_data)
#
#print(summary(full_mod2))
#print(exp(cbind(OR = coef(full_mod2), confint(full_mod2))))
#print(nobs(full_mod2))
full_mod = glm(death ~ obesity + cat("=================================================================================\n")
age +
#age_bins +
obesity +
asthma +
t1_resp_recoded +
#ia_cxr
, family = "binomial", data = my_reg_data)
summary(full_mod)
########################################################################
# mediators
########################################################################
sig_npa_cols = c("mosaic", sig_npa_cols)
my_med_sig = fp_adults[, sig_npa_cols]
my_reg_data_med = merge(clinical_df_ics, my_med_sig
, by = intersect(names(clinical_df_ics), names(my_med_sig))
)
#-----------------------------
# outcome: death + obesity
# data: fp adults
#-----------------------------
#my_reg_params_meds = c(my_reg_params, sig_npa_cols)
my_reg_params_meds = colnames(my_reg_data_med)
my_reg_params_meds1 = my_reg_params_meds[!my_reg_params_meds%in%c("mosaic", "flustat"
, "onset_2_initial"
, "onsfindeath"
, "ia_exac_copd"
, "death"
, "obesity")]
for(i in my_reg_params_meds1) {
#print (i)
p_form = as.formula(paste("death ~ obesity + ", i ,sep = ""))
print(p_form)
model_reg = glm(p_form , family = binomial, data = my_reg_data_med)
print(summary(model_reg))
print(exp(cbind(OR = coef(model_reg), confint(model_reg))))
#print (PseudoR2(model_reg))
print(nobs(model_reg))
cat("=================================================================================\n")
}
#-----------------------------
# outcome: t1_resp_recoded ~ obesity + asthma
# data: fp adults
#-----------------------------
my_reg_params_meds2v2 = my_reg_params_meds[!my_reg_params_meds%in%c("mosaic"
, "flustat"
, "onset_2_initial"
, "onsfindeath"
, "ia_exac_copd"
, "death"
, "t1_resp_recoded"
, "T1_resp_score"
, "asthma")]
for(i in my_reg_params_meds2v2) {
#print (i)
p_form = as.formula(paste("t1_resp_recoded ~ obesity + asthma + ", i ,sep = ""))
print(p_form)
model_reg = glm(p_form , family = binomial, data = my_reg_data_med)
print(summary(model_reg))
print(exp(cbind(OR = coef(model_reg), confint(model_reg))))
#print (PseudoR2(model_reg))
print(nobs(model_reg))
cat("=================================================================================\n")
}

View file

@ -66,10 +66,6 @@ if (table(fp_adults$flustat == 1)[[1]] == nrow(fp_adults) ){
} else{ } else{
cat ("\nFAIL: adult df number mismatch!") cat ("\nFAIL: adult df number mismatch!")
} }
#=============================================
# FLU positive adult patients: without asthma
#=============================================
#----------------------------------- #-----------------------------------
# asthma and copd status correction # asthma and copd status correction
# for conflicting field! # for conflicting field!
@ -101,18 +97,21 @@ if ( table(fp_adults$ia_exac_copd, fp_adults$asthma) [[2,2]] == 0 ){
quit() quit()
} }
cat("\nExtracting flu positive without asthma") #=============================================
table(fp_adults$asthma) # FLU positive adult patients: without asthma
cat("\nNo. of asthmatics:", table(fp_adults$asthma)[[2]] #=============================================
, "\nNo. of non-asthmatics:", table(fp_adults$asthma)[[1]]) #cat("\nExtracting flu positive without asthma")
str(fp_adults$asthma) #table(fp_adults$asthma)
#cat("\nNo. of asthmatics:", table(fp_adults$asthma)[[2]]
# , "\nNo. of non-asthmatics:", table(fp_adults$asthma)[[1]])
#str(fp_adults$asthma)
table(fp_adults$obesity) #table(fp_adults$obesity)
table(fp_adults$obesity, fp_adults$asthma) #table(fp_adults$obesity, fp_adults$asthma)
fp_adults_na = fp_adults[fp_adults$asthma == 0,] #fp_adults_na = fp_adults[fp_adults$asthma == 0,]
table(fp_adults_na$obesity) #table(fp_adults_na$obesity)
table(fp_adults_na$obesity, fp_adults_na$asthma) #table(fp_adults_na$obesity, fp_adults_na$asthma)
#============ #============
# hc # hc
@ -120,4 +119,13 @@ table(fp_adults_na$obesity, fp_adults_na$asthma)
#hc_data<- read.csv("/home/backup/MOSAIC/MEDIATOR_Data/master_file/Mediators_for_HC.csv") #hc_data<- read.csv("/home/backup/MOSAIC/MEDIATOR_Data/master_file/Mediators_for_HC.csv")
#str(hc_data) #str(hc_data)
#table(hc_data$Timepoint, hc_data$Sample) #table(hc_data$Timepoint, hc_data$Sample)
######################################################################## ########################################################################
# quick sanity checks
table(adult_df$ia_exac_copd==1 & adult_df$asthma == 1) # check this is 4
table(fp_adults$ia_exac_copd==1 & fp_adults$asthma == 1) # check this is 3
# clear unnecessary variables
rm(metadata_all)
rm(all_df, adult_df)