#!/usr/bin/Rscript
getwd()
setwd("~/git/mosaic_2020/")
getwd()
########################################################################
# TASK: Extract clinical data columns and recode as required for analysis
# corrects the asthma and copd status for patients
# creates age_bins and other intervals for clinical params
# merges steroid ics data and outcome var based on T1 resp score
# The steroid_ics data file is read from outdir and has been manually sourced
# TODO: for extra caution add and run checks on the steroid_ics file
########################################################################

#====================
# Input: source data
# and steroid ics file
# This file contains steroid_ics data
# and another outcome variable based on T1_resp score
#====================
# The sourced scripts are expected to create the objects used below
# (all_df, adult_df, fp_adults, metadata_all, clinical_cols, outdir).
source("read_data.R")
source("colnames_clinical_meds.R")

# read: steroid_ics file
infile_ics <- paste0(outdir, "data_ics.csv")
infile_ics

clinical_ics <- read.csv(infile_ics)
str(clinical_ics)

########################################################################
# quick sanity checks
table(adult_df$ia_exac_copd == 1 & adult_df$asthma == 1) # check this is 4
table(fp_adults$ia_exac_copd == 1 & fp_adults$asthma == 1) # check this is 3

# clear unnecessary variables
rm(all_df, adult_df, metadata_all)

########################################################################
# Clinical_data extraction
########################################################################
cat("\nExtracting:", length(clinical_cols), "cols from fp_adults")

clinical_df <- fp_adults[, clinical_cols]

# sanity checks
# Every binary variable must have a non-missing value for every row.
# Each count is compared with nrow() individually: chaining the counts with
# `&` and a single trailing `==` only compares the last count, because `==`
# binds more tightly than `&`.
if (all(c(sum(table(clinical_df$obesity)),
          sum(table(clinical_df$age >= 18)),
          sum(table(clinical_df$death)),
          sum(table(clinical_df$asthma))) == nrow(clinical_df))) {
  cat("\nPASS: binary data obs are complete, n =", nrow(clinical_df))
} else {
  cat("\nFAIL: Incomplete data for binary outcomes. Please check and decide!")
  quit()
}

table(clinical_df$ia_exac_copd)
str(clinical_df)
#clinical_df$o2_sat_suppl

########################################################################
#==================================
# Check asthma and copd conflict
#=================================
# NOTE(review): clinical_df was copied from fp_adults above, before this
# correction runs, so the reassignment below changes fp_adults only.
# Confirm whether clinical_df should receive the same correction.
#
# The conflict is counted directly (copd == 1 and asthma == 1) instead of
# reading cell [[2,2]] of the cross-table, whose position depends on which
# codes happen to be present in ia_exac_copd.
if (sum(fp_adults$ia_exac_copd == 1 & fp_adults$asthma == 1, na.rm = TRUE) == 0) {
  cat("PASS: asthma and copd do not conflict")
} else {
  cat("Conflict detected in asthm and copd filed, attempting to resolve...")

  # Reassign the copd and asthma status and do some checks
  # (print() is required: values are not auto-printed inside braces)
  print(table(fp_adults$ia_exac_copd))
  print(sum(is.na(fp_adults$ia_exac_copd)))

  # codes below 1 and missing values are both treated as "no copd"
  fp_adults$ia_exac_copd[fp_adults$ia_exac_copd < 1] <- 0
  fp_adults$ia_exac_copd[is.na(fp_adults$ia_exac_copd)] <- 0

  print(table(fp_adults$ia_exac_copd))
  print(sum(is.na(fp_adults$ia_exac_copd)))

  # check copd and asthma status
  print(table(fp_adults$ia_exac_copd, fp_adults$asthma))
  check_copd_and_asthma_1 <- subset(fp_adults, ia_exac_copd == 1 & asthma == 1) # check this is 3

  # reassign these 3 so these are treated as non-asthmatics as copd with asthma is NOT TRUE asthma
  fp_adults$asthma[fp_adults$ia_exac_copd == 1 & fp_adults$asthma == 1] <- 0

  print(table(fp_adults$ia_exac_copd, fp_adults$asthma))
  foo <- subset(fp_adults, asthma == 1 & ia_exac_copd == 1) # check that its 0

  rm(check_copd_and_asthma_1, foo)
  cat("Check status again...")
}

#=====================================================================
#=================================
# resp scores: In, max and t1 & t2
#=================================
# count the resp scores
max_resp_score_table <- table(clinical_df$max_resp_score)
max_resp_score_table

T1_resp_score_table <- table(clinical_df$T1_resp_score)
T1_resp_score_table

T2_resp_score_table <- table(clinical_df$T2_resp_score)
T2_resp_score_table

Inresp_sev <- table(clinical_df$inresp_sev)
Inresp_sev

# Reassign the resp score so all 4 are replaced by 3
clinical_df$max_resp_score[clinical_df$max_resp_score == 4] <- 3
revised_resp_score_table <- table(clinical_df$max_resp_score)
revised_resp_score_table
# Remaining resp scores: replace score 4 by 3, as for max_resp_score
clinical_df$T1_resp_score[clinical_df$T1_resp_score == 4] <- 3
revised_T1_resp_score_table <- table(clinical_df$T1_resp_score)
revised_T1_resp_score_table

clinical_df$T2_resp_score[clinical_df$T2_resp_score == 4] <- 3
revised_T2_resp_score_table <- table(clinical_df$T2_resp_score)
revised_T2_resp_score_table

clinical_df$inresp_sev[clinical_df$inresp_sev == 4] <- 3
revised_Inresp_sev <- table(clinical_df$inresp_sev)
revised_Inresp_sev

#=====================================================================
# Remove these after checking
rm(max_resp_score_table, T1_resp_score_table, T2_resp_score_table, Inresp_sev
   , revised_resp_score_table, revised_T1_resp_score_table
   , revised_T2_resp_score_table, revised_Inresp_sev)

#=====================================================================
# Binning
# "(": not inclusive
# "]": inclusive

#========
# age
#========
# Create categories of variables
clinical_df$age_int <- round(clinical_df$age, digits = 0)
table(clinical_df$age_int)
table(clinical_df$asthma, clinical_df$age_int)
min(clinical_df$age_int); max(clinical_df$age_int)

# round_any() is not base R; presumably provided by a package loaded in
# read_data.R (e.g. plyr) -- TODO confirm
max_age_interval <- round_any(max(clinical_df$age_int), 10, f = ceiling)
max_age_interval
min_age <- min(clinical_df$age_int); min_age #19
# lower break is one below the minimum so the youngest age falls inside
# the first (left-open) interval
min_age_interval <- min_age - 1; min_age_interval

#age_bins = cut(clinical_df$age_int, c(0,18,30,40,50,60,70,80,90))
age_bins <- cut(clinical_df$age_int
                , c(min_age_interval, 30, 40, 50, 60, 70, max_age_interval))
clinical_df$age_bins <- age_bins
dim(clinical_df) # 133 28

# age_bins (to keep consistent with the results table)
class(clinical_df$age_bins)
levels(clinical_df$age_bins)
#"(18,30]" "(30,40]" "(40,50]" "(50,60]" "(60,70]" "(70,80]"

table(clinical_df$asthma, clinical_df$age_bins)
#  (18,30] (30,40] (40,50] (50,60] (60,70] (70,80]
#0 25 17 25 14 11 1
#1 11 8 12 5 2 2

if (sum(table(clinical_df$asthma, clinical_df$age_bins)) == nrow(clinical_df)) {
  cat("\nPASS: age_bins assigned successfully")
} else {
  cat("\nFAIL: no. mismatch when assigning age_bins")
  quit()
}

# reassign levels
# The six intervals are collapsed positionally into four; the labels are
# hard-coded, so this assumes the six levels listed above.
class(clinical_df$age_bins)
levels(clinical_df$age_bins) <- c("(18,30]", "(30,40]", "(40,50]"
                                  , "(50,80]", "(50,80]", "(50,80]")
table(clinical_df$asthma, clinical_df$age_bins)
#  (18,30] (30,40] (40,50] (50,80]
#0 25 17 25 26
#1 11 8 12 9

sum(table(clinical_df$asthma, clinical_df$age_bins)) == nrow(clinical_df)

#===========================
# O2 saturation binning
#===========================
clinical_df$o2_sat_admis
n1 <- sum(is.na(clinical_df$o2_sat_admis))

clinical_df$o2_sat_admis <- round(clinical_df$o2_sat_admis, digits = 0)
table(clinical_df$o2_sat_admis)

# Count the -1 codes directly: indexing table(...)[["-1"]] fails with
# "subscript out of bounds" when no -1 is present in the data.
n_text_code <- sum(clinical_df$o2_sat_admis == -1, na.rm = TRUE)

# number of usable (non-missing, non -1) observations
tot_o2 <- sum(!is.na(clinical_df$o2_sat_admis)) - n_text_code
tot_o2

# all negative codes become missing
clinical_df$o2_sat_admis[clinical_df$o2_sat_admis < 0] <- NA
n2 <- sum(is.na(clinical_df$o2_sat_admis))

# FAIL here also flags negative codes other than -1, since those were
# converted to NA but not counted in n_text_code
if (n2 == n1 + n_text_code) {
  cat("PASS: -1 code converted to NA")
} else {
  cat("FAIL: something went wrong!")
}

o2_sat_bin <- cut(clinical_df$o2_sat_admis, c(0, 92, 100))
clinical_df$o2_sat_bin <- o2_sat_bin
table(clinical_df$o2_sat_bin)
sum(table(clinical_df$o2_sat_bin)) == tot_o2

#===========================
# Onset to initial binning
#===========================
clinical_df$onset_2_initial

max_in <- max(clinical_df$onset_2_initial); max_in #23
min_in <- min(clinical_df$onset_2_initial) - 1; min_in # -6

tot_onset2ini <- sum(table(clinical_df$onset_2_initial))
tot_onset2ini

# two bins: (min_in, 4] and (4, max_in]
onset_initial_bin <- cut(clinical_df$onset_2_initial, c(min_in, 4, max_in))
clinical_df$onset_initial_bin <- onset_initial_bin

sum(table(clinical_df$onset_initial_bin)) == tot_onset2ini

#=======================
# seasonal flu: sfluv
#=======================
if (!is.factor(clinical_df$sfluv)) {
  clinical_df$sfluv <- as.factor(clinical_df$sfluv)
}
class(clinical_df$sfluv)
levels(clinical_df$sfluv)
table(clinical_df$sfluv)
table(clinical_df$asthma, clinical_df$sfluv)

# reassign
# positional relabelling: the first two existing levels become "0",
# the third becomes "1" (assumes exactly three levels)
levels(clinical_df$sfluv) <- c("0", "0", "1")
table(clinical_df$asthma, clinical_df$sfluv)

#=======================
# h1n1v
#=======================
if (!is.factor(clinical_df$h1n1v)) {
  clinical_df$h1n1v <- as.factor(clinical_df$h1n1v)
}
class(clinical_df$h1n1v)
levels(clinical_df$h1n1v)
table(clinical_df$h1n1v)
table(clinical_df$asthma, clinical_df$h1n1v)

# reassign (same positional relabelling as for sfluv)
levels(clinical_df$h1n1v) <- c("0", "0", "1")
table(clinical_df$asthma, clinical_df$h1n1v)

#=======================
# ethnicity
#=======================
class(clinical_df$ethnicity) # integer
table(clinical_df$ethnicity)
table(clinical_df$asthma, clinical_df$ethnicity)

# code 4 is merged into code 2
clinical_df$ethnicity[clinical_df$ethnicity == 4] <- 2
table(clinical_df$ethnicity)
table(clinical_df$asthma, clinical_df$ethnicity)

#=======================
# pneumonia
#=======================
table(clinical_df$ia_cxr)
class(clinical_df$ia_cxr) # integer
# ia_cxr 2 ---> yes pneumonia (1)
#        1 ---> no (0)
#        ! 1 or 2 -- > "unknown"

# reassign the pneumonia codes
#0: not performed
#1: normal
#2: findings consistent with pneumonia
#3: abnormal
#-1: not recorded
#-2: n/a specified by the clinician # not in the data...
#-3: unknown specified by clinician
table(clinical_df$ia_cxr)
#-3 -1 0 1 2 3
#5 48 13 47 17 3

# Blank out the non-informative codes first: 0 is an existing code, so it
# has to be cleared before "normal" (1) is recoded to 0. The -2 category
# does not occur in the data.
clinical_df$ia_cxr[clinical_df$ia_cxr %in% c(-3, -1, 0, 3)] <- NA
table(clinical_df$ia_cxr)
# 1 2
#69 47 17
sum(is.na(clinical_df$ia_cxr))

# Order matters: 1 -> 0 must run before 2 -> 1
clinical_df$ia_cxr[clinical_df$ia_cxr %in% 1] <- 0
clinical_df$ia_cxr[clinical_df$ia_cxr %in% 2] <- 1
table(clinical_df$ia_cxr)
# 0 1
#69 47 17

#=======================
# smoking [tricky one]
#=======================
class(clinical_df$smoking) # integer
table(clinical_df$asthma, clinical_df$smoking)
# orig
# -3 -1 1 2 3 4
#0 15 9 22 2 15 30
#1 4 2 13 0 4 17

# -3 -1 1 2 3 4
#0 14 9 20 2 15 30
#1 5 2 15 0 4 17

# never smoking, 4 and 2 -- > no (0)
#1 and 3 ---> yes (1)
#!-3 and -1 ---- > NA

################# smoking
#1: current daily ===> categ smoker(1)
#2: occasional =====> categ no smoker(0)
#3: ex-smoker ===> categ smoker(1)
#4: never =====> categ no smoker(0)
#-1: not recorded =====> categ blank (NA)
#-2: n/a specified by the clinician =====> categ blank (NA)
#-3: unknown specified by clinician=====> categ blank (NA)

table(clinical_df$smoking)
#-3 -1 1 2 3 4
#19 11 35 2 19 47

# Recode smoking: non-smokers -> 0, smokers -> 1, unknown codes -> NA
clinical_df$smoking[clinical_df$smoking %in% c(4, 2)] <- 0
clinical_df$smoking[clinical_df$smoking %in% c(1, 3)] <- 1
clinical_df$smoking[clinical_df$smoking %in% c(-1, -2, -3)] <- NA

table(clinical_df$smoking)
sum(is.na(clinical_df$smoking))
# 0 1
#30 49 54

table(clinical_df$asthma, clinical_df$smoking)
# orig
# 0 1
#0 24 32 37
#1 6 17 17

################################################################
#=========================
# Merge: clinical_df and infile ics
#=========================
# the merge keys are all column names the two data frames share
merging_cols <- intersect(names(clinical_df), names(clinical_ics))
merging_cols
# Full outer merge on every shared column; unmatched rows from either side
# are kept, which is why the row count is verified below.
clinical_df_ics <- merge(clinical_df, clinical_ics, by = merging_cols, all = TRUE)
clinical_df_ics
colnames(clinical_df_ics)

# change colname of logistic_outcome
c1 <- which(colnames(clinical_df_ics) == "logistic_outcome")
colnames(clinical_df_ics)[c1] <- "t1_resp_recoded"

# The merged row count must equal the row count of BOTH inputs. Each
# comparison is written out: `a == b & c` only tests that c is non-zero,
# because `==` binds more tightly than `&`.
if (nrow(clinical_df_ics) == nrow(clinical_df) &&
    nrow(clinical_df_ics) == nrow(clinical_ics)) {
  cat("\nPASS: No. of rows match, nrow =", nrow(clinical_df_ics)
      , "\nChecking ncols...")
  # shared key columns appear once in the merged result
  if (ncol(clinical_df_ics) == ncol(clinical_df) + ncol(clinical_ics) - length(merging_cols)) {
    cat("\nPASS: No. of cols match, ncol =", ncol(clinical_df_ics))
  } else {
    cat("\nFAIL: ncols mismatch"
        , "Expected ncols:", ncol(clinical_df) + ncol(clinical_ics) - length(merging_cols)
        , "\nGot:", ncol(clinical_df_ics))
  }
} else {
  cat("\nFAIL: nrows mismatch"
      , "\nExpected nrows:", nrow(fp_adults))
}

# change the factor vars to integers
# NOTE(review): as.integer() on a factor returns the internal level codes
# (1, 2, ...), not the numeric value of the labels -- a factor with levels
# "0"/"1" becomes 1/2. Confirm this is what the regression expects.
str(clinical_df_ics)

# is.factor() also recognises ordered factors, and vapply() guarantees a
# plain logical vector with one entry per column
factor_vars <- vapply(clinical_df_ics, is.factor, logical(1))
table(factor_vars)

# List-style indexing (df[cols]) keeps a data frame even when exactly one
# column is selected; df[, cols] would drop to a bare vector and lapply()
# would then iterate over its elements instead of over columns.
if (any(factor_vars)) {
  clinical_df_ics[factor_vars] <- lapply(clinical_df_ics[factor_vars], as.integer)
}
table(factor_vars)
str(clinical_df_ics)

#======================
# writing output file
#======================
outfile_name_reg <- "clinical_df_recoded.csv"
outfile_reg <- paste0(outdir, outfile_name_reg)

cat("\nWriting clinical file for regression:", outfile_reg)
#write.csv(clinical_df_ics, file = outfile_reg)

################################################################
# remove intermediate objects; clinical_df_ics is the object left for
# downstream use
rm(age_bins, max_age_interval, max_in, min_in
   , o2_sat_bin, onset_initial_bin, tot_o2
   , tot_onset2ini, meta_data_cols
   , clinical_df)