#!/usr/bin/Rscript getwd() setwd('~/git/mosaic_2020/') getwd() ######################################################################## # TASK: Extract clinical data columns and recode as required for analysis # corrects the asthma and copd status for patients # creates age_bins and other intervals for clinical params # merges steroid ics data and outcome var based on T1 resp score # The steroid_ics data file is read from outdir and has been manually sourced # TODO: for extra caution add and run checks on the steroid_ics file ######################################################################## #==================== # Input: source data # and steroid ics file # This file contains steroid_ics data # and another outcome variable based on T1_resp score #==================== source("read_data.R") source("colnames_clinical_meds.R") # read: steroid_ics file infile_ics = paste0(outdir, "data_ics.csv") infile_ics clinical_ics = read.csv(infile_ics) str(clinical_ics) ######################################################################## table(fp_adults$ia_exac_copd==1 & fp_adults$asthma == 1) ######################################################################## # Clinical_data extraction ######################################################################## #cat("\nExtracting:", length(meta_clinical_cols), "cols from fp_adults") #clinical_df = fp_adults[, meta_clinical_cols] # sanity checks #if ( sum(table(clinical_df$obesity)) & sum(table(clinical_df$age>=18)) & sum(table(clinical_df$death)) & sum(table(clinical_df$asthma)) == nrow(clinical_df) ){ # cat("\nPASS: binary data obs are complete, n =", nrow(clinical_df)) #}else{ # cat("\nFAIL: Incomplete data for binary outcomes. Please check and decide!") # quit() #} #table(clinical_df$ia_exac_copd) #str(clinical_df) #clinical_df$o2_sat_suppl ######################################################################## #================================== # Check asthma and copd conflict #================================= if ( table(fp_adults$ia_exac_copd, fp_adults$asthma) [[2,2]] == 0){ cat("PASS: asthma and copd do not conflict") }else{ cat("Conflict detected in asthma and copd filed. Check script: read_data.R") quit() } #===================================================================== #================================= # resp scores: In, max and t1 & t2 #================================= # count the resp scores max_resp_score_table<- table(fp_adults$max_resp_score) max_resp_score_table T1_resp_score_table<- table(fp_adults$T1_resp_score) T1_resp_score_table T2_resp_score_table<- table(fp_adults$T2_resp_score) T2_resp_score_table Inresp_sev<- table(fp_adults$inresp_sev) Inresp_sev # Reassign the resp score so all 4 are replace by 3 fp_adults$max_resp_score[fp_adults$max_resp_score == 4 ] <- 3 revised_resp_score_table<- table(fp_adults$max_resp_score) revised_resp_score_table fp_adults$T1_resp_score[fp_adults$T1_resp_score ==4 ] <- 3 revised_T1_resp_score_table<- table(fp_adults$T1_resp_score) revised_T1_resp_score_table fp_adults$T2_resp_score[fp_adults$T2_resp_score == 4]<- 3 revised_T2_resp_score_table<- table(fp_adults$T2_resp_score) revised_T2_resp_score_table fp_adults$inresp_sev[fp_adults$inresp_sev == 4]<- 3 revised_Inresp_sev<- table(fp_adults$inresp_sev) revised_Inresp_sev #===================================================================== # Remove these after checking rm(max_resp_score_table, T1_resp_score_table, T2_resp_score_table, Inresp_sev , revised_resp_score_table, revised_T1_resp_score_table, revised_T2_resp_score_table, revised_Inresp_sev) #===================================================================== # Binning # "(": not inclusive # "]": inclusive #======== # age #======== # Create categories of variables fp_adults$age_int = round(fp_adults$age, digits = 0) table(fp_adults$age_int) table(fp_adults$asthma, fp_adults$age_int) min(fp_adults$age_int); max(fp_adults$age_int) max_age_interval = round_any(max(fp_adults$age_int), 10, f = ceiling) max_age_interval min_age = min(fp_adults$age_int); min_age #19 min_age_interval = min_age - 1; min_age_interval #age_bins = cut(fp_adults$age_int, c(0,18,30,40,50,60,70,80,90)) age_bins = cut(fp_adults$age_int, c(min_age_interval, 30, 40, 50, 60, 70, max_age_interval)) fp_adults$age_bins = age_bins dim(fp_adults) # 133 28 # age_bins (to keep consistent with the results table) class(fp_adults$age_bins) levels(fp_adults$age_bins) #"(18,30]" "(30,40]" "(40,50]" "(50,60]" "(60,70]" "(70,80]" table(fp_adults$asthma, fp_adults$age_bins) # (18,30] (30,40] (40,50] (50,60] (60,70] (70,80] #0 25 17 25 14 11 1 #1 11 8 12 5 2 2 if (sum(table(fp_adults$asthma, fp_adults$age_bins)) == nrow(fp_adults) ){ cat("\nPASS: age_bins assigned successfully") }else{ cat("\nFAIL: no. mismatch when assigning age_bins") quit() } # reassign levels class(fp_adults$age_bins) levels(fp_adults$age_bins) <- c("(18,30]","(30,40]","(40,50]","(50,80]","(50,80]","(50,80]") table(fp_adults$asthma, fp_adults$age_bins) table(fp_adults$asthma, fp_adults$age_bins) # (18,30] (30,40] (40,50] (50,80] #0 25 17 25 26 #1 11 8 12 9 sum(table(fp_adults$asthma, fp_adults$age_bins)) == nrow(fp_adults) table(fp_adults$age_int) fp_adults = subset(fp_adults, select = -c(age_int)) table(fp_adults$age_int) class(fp_adults$age_bins) fp_adults$age_bins #=========================== # O2 saturation binning #=========================== fp_adults$o2_sat_admis n1 = sum(is.na(fp_adults$o2_sat_admis)) fp_adults$o2_sat_admis = round(fp_adults$o2_sat_admis, digits = 0) table(fp_adults$o2_sat_admis) tot_o2 = sum(table(fp_adults$o2_sat_admis))- table(fp_adults$o2_sat_admis)[["-1"]] tot_o2 n_text_code = table(fp_adults$o2_sat_admis)[["-1"]] fp_adults$o2_sat_admis[fp_adults$o2_sat_admis <0] <- NA n2 = sum(is.na(fp_adults$o2_sat_admis)) if (n2 == n1 + n_text_code) { cat ("PASS: -1 code converted to NA") } else{ cat("FAIL: something went wrong!") } o2_sat_bin = cut(fp_adults$o2_sat_admis, c(0,92,100)) fp_adults$o2_sat_bin = o2_sat_bin table(fp_adults$o2_sat_bin) sum(table(fp_adults$o2_sat_bin)) == tot_o2 #=========================== # Onset to initial binning #=========================== fp_adults$onset_2_initial max_in = max(fp_adults$onset_2_initial); max_in #23 min_in = min(fp_adults$onset_2_initial) - 1 ; min_in # -6 tot_onset2ini = sum(table(fp_adults$onset_2_initial)) tot_onset2ini onset_initial_bin = cut(fp_adults$onset_2_initial, c(min_in, 4, max_in)) fp_adults$onset_initial_bin = onset_initial_bin sum(table(fp_adults$onset_initial_bin)) == tot_onset2ini #======================= # seasonal flu: sfluv #======================= # reassign as 0 and 1 table(fp_adults$sfluv) table(fp_adults$asthma, fp_adults$sfluv) fp_adults$sfluv = ifelse(fp_adults$sfluv == "yes", 1, 0) table(fp_adults$sfluv) table(fp_adults$asthma, fp_adults$sfluv) # convert to integer str(fp_adults$sfluv) fp_adults$sfluv = as.integer(fp_adults$sfluv) str(fp_adults$sfluv) #======================= # h1n1v #======================= # reassign as 0 and 1 table(fp_adults$h1n1v) table(fp_adults$asthma, fp_adults$h1n1v) fp_adults$h1n1v = ifelse(fp_adults$h1n1v == "yes", 1, 0) table(fp_adults$h1n1v) table(fp_adults$asthma, fp_adults$h1n1v) # convert to integer str(fp_adults$h1n1v) fp_adults$h1n1v = as.integer(fp_adults$h1n1v) str(fp_adults$h1n1v) #======================= # ethnicity #======================= class(fp_adults$ethnicity) # integer table(fp_adults$ethnicity) table(fp_adults$asthma, fp_adults$ethnicity) fp_adults$ethnicity[fp_adults$ethnicity == 4] <- 2 table(fp_adults$ethnicity) table(fp_adults$asthma, fp_adults$ethnicity) #======================= # pneumonia #======================= table(fp_adults$ia_cxr) class(fp_adults$ia_cxr) # integer # ia_cxr 2 ---> yes pneumonia (1) # 1 ---> no (0) # ! 1 or 2 -- > "unknown" # reassign the pneumonia codes #0: not performed #1: normal #2: findings consistent with pneumonia #3: abnormal #-1: not recorded #-2: n/a specified by the clinician # not in the data... #-3: unknown specified by clinician table(fp_adults$ia_cxr) #-3 -1 0 1 2 3 #5 48 13 47 17 3 # change these first else recoding 0 will be a problem as 0 already exists, mind you -2 categ doesn't exist fp_adults$ia_cxr[fp_adults$ia_cxr == -3 | fp_adults$ia_cxr == -1 | fp_adults$ia_cxr == 0 | fp_adults$ia_cxr == 3 ] <- NA table(fp_adults$ia_cxr) # 1 2 #69 47 17 sum(is.na(fp_adults$ia_cxr)) fp_adults$ia_cxr[fp_adults$ia_cxr == 1] <- 0 fp_adults$ia_cxr[fp_adults$ia_cxr == 2] <- 1 table(fp_adults$ia_cxr) # 0 1 #69 47 17 #======================= # smoking [tricky one] #======================= class(fp_adults$smoking) # integer table(fp_adults$asthma, fp_adults$smoking) # orig # -3 -1 1 2 3 4 #0 15 9 22 2 15 30 #1 4 2 13 0 4 17 # -3 -1 1 2 3 4 #0 14 9 20 2 15 30 #1 5 2 15 0 4 17 # never smoking, 4 and 2 -- > no (0) #1 and 3 ---> yes (1) #!-3 and -1 ---- > NA ################# smoking #1: current daily ===> categ smoker(1) #2: occasional =====> categ no smoker(0) #3: ex-smoker ===> categ smoker(1) #4: never =====> categ no smoker(0) #-1: not recorded =====> categ blank (NA) #-2: n/a specified by the clinician =====> categ blank (NA) #-3: unknown specified by clinician=====> categ blank (NA) table(fp_adults$smoking) #-3 -1 1 2 3 4 #19 11 35 2 19 47 # reassign the smoking codes fp_adults$smoking[fp_adults$smoking == 4 | fp_adults$smoking == 2 ] <- 0 fp_adults$smoking[fp_adults$smoking == 1 | fp_adults$smoking == 3 ] <- 1 fp_adults$smoking[fp_adults$smoking == -1 | fp_adults$smoking == -2 | fp_adults$smoking == -3 ] <- NA table(fp_adults$smoking); sum(is.na(fp_adults$smoking)) # 0 1 #30 49 54 table(fp_adults$asthma, fp_adults$smoking) # orig # 0 1 #0 24 32 37 #1 6 17 17 ################################################################ #========================= # Merge: fp_adults and infile ics #========================= merging_cols = intersect( names(fp_adults), names(clinical_ics) ) merging_cols fp_adults_ics = merge(fp_adults, clinical_ics, by = merging_cols, all = T); fp_adults_ics colnames(fp_adults_ics) if (nrow(fp_adults_ics) == nrow(fp_adults) & nrow(clinical_ics)){ cat("\nPASS: No. of rows match, nrow =", nrow(fp_adults_ics) , "\nChecking ncols...") if ( ncol(fp_adults_ics) == ncol(fp_adults) + ncol(clinical_ics) - length(merging_cols) ){ cat("\nPASS: No. of cols match, ncol =", ncol(fp_adults_ics)) } else { cat("\nFAIL: ncols mismatch" , "Expected ncols:", ncol(fp_adults) + ncol(clinical_ics) - length(merging_cols) , "\nGot:", ncol(fp_adults_ics)) } } else { cat("\nFAIL: nrows mismatch" , "\nExpected nrows:", nrow(fp_adults)) } #========================= # add binary outcome for T1 resp score #========================= table(fp_adults_ics$T1_resp_score) fp_adults_ics$t1_resp_recoded = ifelse(fp_adults_ics$T1_resp_score <3, 0, 1) table(fp_adults_ics$t1_resp_recoded) #table(fp_adults_ics$steroid) table(fp_adults_ics$steroid_ics) #========================= # change the factor vars to integers #========================= #str(fp_adults_ics) #factor_vars = lapply(fp_adults_ics, class) == "factor" #table(factor_vars) #fp_adults_ics[, factor_vars] <- lapply(fp_adults_ics[, factor_vars], as.integer) #table(factor_vars) #str(fp_adults_ics) #========================= # clinical_df only #========================= clinical_df_ics = fp_adults_ics[, c(meta_clinical_cols, "steroid_ics")] #========================= # FIXME: decide! remove cols #========================= #fp_adults_ics = subset(fp_adults_ics, select = -c(onset_2_initial)) #========================= # fp_adults_ics: without asthma #========================= #fp_adults_ics_na = fp_adults_ics[fp_adults_ics$asthma == 0,] #========================= # fp_adults_ics: without severity 3 #========================= #table(fp_adults_ics$T1_resp_score) #table(fp_adults_ics$T1_resp_score!=3)# #fp_adults_ics_ns = fp_adults_ics[fp_adults_ics$T1_resp_score!=3,] #table(fp_adults_ics_ns$T1_resp_score) #========================= # cols_added #========================= clinical_cols_added = c("age_bins" , "o2_sat_bin" , "onset_initial_bin" , "steroid_ics" , "t1_resp_recoded") #====================== # writing output file #====================== outfile_name_reg = "fp_adults_recoded.csv" outfile_reg = paste0(outdir, outfile_name_reg) cat("\nWriting clinical file for regression:", outfile_reg) #write.csv(fp_adults_ics, file = outfile_reg) ################################################################ rm(age_bins, max_age_interval, max_in, min_in , min_age, min_age_interval , o2_sat_bin, onset_initial_bin, tot_o2 , n_text_code, n1, n2, tot_onset2ini, infile_ics , fp_adults, clinical_ics) ################################################################