#!/usr/bin/Rscript getwd() setwd('~/git/mosaic_2020/') getwd() ######################################################################## # TASK: Extract relevant columns from mosaic adults data # npa ######################################################################## #==================== # Input: source data for clinical #==================== source("data_extraction_formatting_clinical.R") #source("colnames_clinical_meds.R") #======================================= # Data for mediator to include in regression #======================================= cat("Extracting", length(sig_npa_cols), "mediator cols from fp_adults") med_df = fp_adults[, c("mosaic", sig_npa_cols)] # sanity checks if ( sum(table(clinical_df$obesity)) & sum(table(clinical_df$age>=18)) & sum(table(clinical_df$death)) & sum(table(clinical_df$asthma)) == nrow(clinical_df) ){ cat("PASS: binary data obs are complete, n =", nrow(clinical_df)) }else{ cat("FAIL: Incomplete data for binary outcomes. Please check and decide!") quit() } table(clinical_df$ia_exac_copd) ######################################################################## # Data extraction for regression ######################################################################## common_cols = names(clinical_df)[names(clinical_df)%in%names(med_df)] cat("\nMerging clinical and mediator data for regression" ,"\nMerging on column:", common_cols) reg_data = merge(clinical_df, med_df , by = common_cols) if (nrow(reg_data) == nrow(clinical_df) & nrow(med_df)){ cat("\nNo. of rows match, nrow =", nrow(clinical_df) , "\nChecking ncols...") if ( ncol(reg_data) == ncol(clinical_df) + ncol(med_df) - length(common_cols) ){ cat("\nNo. of cols match, ncol =", ncol(reg_data)) } else { cat("FAIL: ncols mismatch" , "Expected ncols:", ncol(clinical_df) + ncol(med_df) - length(common_cols) , "\nGot:", ncol(reg_data)) } } else { cat("FAIL: nrows mismatch" , "\nExpected nrows:", nrow(fp_adults)) } ######################################################################## # Reassign the copd and asthma status and do some checks table(reg_data$ia_exac_copd); sum(is.na(reg_data$ia_exac_copd)) reg_data$ia_exac_copd[reg_data$ia_exac_copd< 1]<- 0 reg_data$ia_exac_copd[is.na(reg_data$ia_exac_copd)] <- 0 table(reg_data$ia_exac_copd); sum(is.na(reg_data$ia_exac_copd)) # check copd and asthma status table(reg_data$ia_exac_copd, reg_data$asthma) check_copd_and_asthma_1<- subset(reg_data, ia_exac_copd ==1 & asthma == 1) # check this is 3 # reassign these 3 so these are treated as non-asthmatics as copd with asthma is NOT TRUE asthma reg_data$asthma[reg_data$ia_exac_copd == 1 & reg_data$asthma == 1]= 0 table(reg_data$ia_exac_copd, reg_data$asthma) foo<- subset(reg_data, asthma==1 & ia_exac_copd ==1) # check that its 0 rm(check_copd_and_asthma_1, foo) #===================================================================== # count the resp scores max_resp_score_table<- table(reg_data$max_resp_score) max_resp_score_table T1_resp_score_table<- table(reg_data$T1_resp_score) T1_resp_score_table T2_resp_score_table<- table(reg_data$T2_resp_score) T2_resp_score_table Inresp_sev<- table(reg_data$inresp_sev) Inresp_sev # Reassign the resp score so all 4 are replace by 3 reg_data$max_resp_score[reg_data$max_resp_score ==4 ] <- 3 revised_resp_score_table<- table(reg_data$max_resp_score) revised_resp_score_table reg_data$T1_resp_score[reg_data$T1_resp_score ==4 ] <- 3 revised_T1_resp_score_table<- table(reg_data$T1_resp_score) revised_T1_resp_score_table reg_data$T2_resp_score[reg_data$T2_resp_score == 4]<- 3 revised_T2_resp_score_table<- table(reg_data$T2_resp_score) revised_T2_resp_score_table reg_data$inresp_sev[reg_data$inresp_sev == 4]<- 3 revised_Inresp_sev<- table(reg_data$inresp_sev) revised_Inresp_sev #===================================================================== # Remove these after checking rm(max_resp_score_table, T1_resp_score_table, T2_resp_score_table, Inresp_sev , revised_resp_score_table, revised_T1_resp_score_table, revised_T2_resp_score_table, revised_Inresp_sev) #===================================================================== # Binning # "(": not inclusive # "]": inclusive #======== # age #======== # Create categories of variables reg_data$age = round(reg_data$age, digits = 0) table(reg_data$age) table(reg_data$asthma, reg_data$age) min(reg_data$age); max(reg_data$age) max_age_interval = round_any(max(reg_data$age), 10, f = ceiling) max_age_interval min_age = min(reg_data$age); min_age #19 min_age_interval = min_age - 1; min_age_interval #age_bins = cut(reg_data$age, c(0,18,30,40,50,60,70,80,90)) age_bins = cut(reg_data$age, c(min_age_interval, 30, 40, 50, 60, 70, max_age_interval)) reg_data$age_bins = age_bins dim(reg_data) # 133 27 # age_bins (to keep consistent with the results table) class(reg_data$age_bins) levels(reg_data$age_bins) #"(18,30]" "(30,40]" "(40,50]" "(50,60]" "(60,70]" "(70,80]" table(reg_data$asthma, reg_data$age_bins) # (18,30] (30,40] (40,50] (50,60] (60,70] (70,80] #0 25 17 25 14 11 1 #1 11 8 12 5 3 2 if (sum(table(reg_data$asthma, reg_data$age_bins)) == nrow(reg_data) ){ cat("PASS: age_bins assigned successfully") }else{ cat("FAIL: no. mismatch when assigning age_bins") quit() } # reassign class(reg_data$age_bins) levels(reg_data$age_bins) <- c("(18,30]","(30,40]","(40,50]","(50,80]","(50,80]","(50,80]") table(reg_data$asthma, reg_data$age_bins) table(reg_data$asthma, reg_data$age_bins) # (18,30] (30,40] (40,50] (50,80] #0 25 17 25 26 #1 11 8 12 9 sum(table(reg_data$asthma, reg_data$age_bins)) == nrow(reg_data) #=========================== # O2 saturation binning #=========================== reg_data$o2_sat_admis = round(reg_data$o2_sat_admis, digits = 0) table(reg_data$o2_sat_admis) tot_o2 = sum(table(reg_data$o2_sat_admis))- table(reg_data$o2_sat_admis)[["-1"]] tot_o2 o2_sat_bin = cut(reg_data$o2_sat_admis, c(0,92,100)) reg_data$o2_sat_bin = o2_sat_bin table(reg_data$o2_sat_bin) sum(table(reg_data$o2_sat_bin)) == tot_o2 #=========================== # Onset to initial binning #=========================== max_in = max(reg_data$onset_2_initial); max_in #23 min_in = min(reg_data$onset_2_initial) - 1 ; min_in # -6 tot_onset2ini = sum(table(reg_data$onset_2_initial)) tot_onset2ini onset_initial_bin = cut(reg_data$onset_2_initial, c(min_in, 4, max_in)) reg_data$onset_initial_bin = onset_initial_bin sum(table(reg_data$onset_initial_bin)) == tot_onset2ini #======================= # seasonal flu: sfluv #======================= # should be a factor if (! is.factor(reg_data$sfluv)){ reg_data$sfluv = as.factor(reg_data$sfluv) } class(reg_data$sfluv) #[1] "factor" levels(reg_data$sfluv) table(reg_data$asthma, reg_data$sfluv) # reassign levels(reg_data$sfluv) <- c("0", "0", "1") table(reg_data$asthma, reg_data$sfluv) #======================= # h1n1v #======================= # should be a factor if (! is.factor(reg_data$h1n1v)){ reg_data$h1n1v = as.factor(reg_data$h1n1v) } class(reg_data$h1n1v) #[1] "factor" levels(reg_data$h1n1v) table(reg_data$asthma, reg_data$h1n1v) # reassign levels(reg_data$h1n1v) <- c("0", "0", "1") table(reg_data$asthma, reg_data$h1n1v) #======================= # ethnicity #======================= class(reg_data$ethnicity) # integer table(reg_data$asthma, reg_data$ethnicity) reg_data$ethnicity[reg_data$ethnicity == 4] <- 2 table(reg_data$asthma, reg_data$ethnicity) #======================= # pneumonia #======================= class(reg_data$ia_cxr) # integer # ia_cxr 2 ---> yes pneumonia (1) # 1 ---> no (0) # ! 1 or 2 -- > "unkown" # reassign the pneumonia codes #0: not performed #1: normal #2: findings consistent with pneumonia #3: abnormal #-1: not recorded #-2: n/a specified by the clinician # not in the data... #-3: unknown specified by clinician table(reg_data$ia_cxr) #-3 -1 0 1 2 3 #5 48 13 47 17 3 # change these first else recoding 0 will be a problem as 0 already exists, mind you -2 categ doesn't exist reg_data$ia_cxr[reg_data$ia_cxr == -3 | reg_data$ia_cxr == -1 | reg_data$ia_cxr == 0 | reg_data$ia_cxr == 3 ] <- "" table(reg_data$ia_cxr) # 1 2 #69 47 17 reg_data$ia_cxr[reg_data$ia_cxr == 1] <- 0 reg_data$ia_cxr[reg_data$ia_cxr == 2] <- 1 table(reg_data$ia_cxr) # 0 1 #69 47 17 #======================= # smoking [tricky one] #======================= class(reg_data$smoking) # integer table(reg_data$asthma, reg_data$smoking) # orig # -3 -1 1 2 3 4 #0 15 9 22 2 15 30 #1 4 2 13 0 4 17 # -3 -1 1 2 3 4 #0 14 9 20 2 15 30 #1 5 2 15 0 4 17 # never smoking, 4 and 2 -- > no (0) #1 and 3 ---> yes (1) #!-3 and -1 ---- > NA ################# smoking #1: current daily ===> categ smoker(1) #2: occasional =====> categ no smoker(0) #3: ex-smoker ===> categ smoker(1) #4: never =====> categ no smoker(0) #-1: not recorded =====> categ blank (NA) #-2: n/a specified by the clinician =====> categ blank (NA) #-3: unknown specified by clinician=====> categ blank (NA) table(reg_data$smoking) #-3 -1 1 2 3 4 #19 11 35 2 19 47 # reassign the smoking codes reg_data$smoking[reg_data$smoking == 4 | reg_data$smoking == 2 ] <- 0 reg_data$smoking[reg_data$smoking == 1 | reg_data$smoking == 3 ] <- 1 reg_data$smoking[reg_data$smoking == -1 | reg_data$smoking == -2 | reg_data$smoking == -3 ] <- "" table(reg_data$smoking) # 0 1 #30 49 54 table(reg_data$asthma, reg_data$smoking) # orig # 0 1 #0 24 32 37 #1 6 17 17 # 0 1 #0 23 32 35 #1 7 17 19 ################################################################ #================== # writing output file #================== outfile_name_reg = "reg_data_recoded_with_NA.csv" outfile_reg = paste0(outdir, outfile_name_reg) cat("Writing clinical file for regression:", outfile_reg) #write.csv(reg_data, file = outfile_reg) ################################################################ rm(age_bins, max_age_interval, max_in, min_in, o2_sat_bin, onset_initial_bin, tot_o2, tot_onset2ini, meta_data_cols)