diff --git a/data_extraction_formatting_clinical.R b/data_extraction_formatting_clinical.R
index b6ff2c3..8ee3f08 100644
--- a/data_extraction_formatting_clinical.R
+++ b/data_extraction_formatting_clinical.R
@@ -3,14 +3,30 @@ getwd()
 setwd('~/git/mosaic_2020/')
 getwd()
 ########################################################################
-# TASK: Extract relevant columns from mosaic adults data
-# npa
+# TASK: Extract clinical data columns and recode as required for analysis
+# corrects the asthma and copd status for patients
+# creates age_bins and other intervals for clinical params
+# merges steroid ics data and outcome var based on T1 resp score
+# The steroid_ics data file is read from outdir and has been manually sourced
+
+# TODO: for extra caution add and run checks on the steroid_ics file
 ########################################################################
 #====================
-# Input: source data
+# Input: source data
+# and steroid ics file
+# This file contains steroid_ics data
+# and another outcome variable based on T1_resp score
 #====================
 source("read_data.R")
 source("colnames_clinical_meds.R")
+
+# read: steroid_ics file
+infile_ics = paste0(outdir, "data_ics.csv")
+infile_ics
+
+clinical_ics = read.csv(infile_ics)
+str(clinical_ics)
+
 ########################################################################
 # quick sanity checks
 table(adult_df$ia_exac_copd==1 & adult_df$asthma == 1) # check this is 4
@@ -21,7 +37,7 @@ table(fp_adults$ia_exac_copd==1 & fp_adults$asthma == 1) # check this is 3
 
 rm(all_df, adult_df, metadata_all)
 ########################################################################
-# Clinical_data extraction
+# Clinical_data extraction
 ########################################################################
 
 cat("\nExtracting:", length(clinical_cols), "cols from fp_adults")
@@ -38,6 +54,11 @@ if ( sum(table(clinical_df$obesity)) & sum(table(clinical_df$age>=18)) & sum(tab
 table(clinical_df$ia_exac_copd)
 
 ########################################################################
+#==================================
+# asthma and copd status correction
+# for conflicting field!
+#=================================
+
 # Reassign the copd and asthma status and do some checks
 table(clinical_df$ia_exac_copd); sum(is.na(clinical_df$ia_exac_copd))
 
@@ -58,6 +79,10 @@ foo<- subset(clinical_df, asthma==1 & ia_exac_copd ==1) # check that its 0
 rm(check_copd_and_asthma_1, foo)
 
 #=====================================================================
+#=================================
+# resp scores: In, max and t1 & t2
+#=================================
+
 # count the resp scores
 max_resp_score_table<- table(clinical_df$max_resp_score)
 max_resp_score_table
@@ -297,15 +322,46 @@ table(clinical_df$asthma, clinical_df$smoking)
 #1 7 17 19
 
 ################################################################
-#==================
+
+#=========================
+# Merge: clinical_df and infile ics
+#=========================
+# join on every column name that the two tables share
+merging_cols = intersect( names(clinical_df), names(clinical_ics) )
+
+clinical_df_ics = merge(clinical_df, clinical_ics, by = merging_cols, all = TRUE); clinical_df_ics
+
+# each shared column appears only once in the merged table
+expected_ncols = ncol(clinical_df) + ncol(clinical_ics) - length(merging_cols)
+
+# all = TRUE keeps unmatched rows from both tables, so the merged nrow
+# must be compared against EACH input (a bare nrow() is only a non-zero test)
+if ( nrow(clinical_df_ics) == nrow(clinical_df) && nrow(clinical_df_ics) == nrow(clinical_ics) ){
+  cat("\nPASS: No. of rows match, nrow =", nrow(clinical_df_ics)
+      , "\nChecking ncols...")
+  if ( ncol(clinical_df_ics) == expected_ncols ){
+    cat("\nPASS: No. of cols match, ncol =", ncol(clinical_df_ics))
+  } else {
+    cat("\nFAIL: ncols mismatch"
+        , "\nExpected ncols:", expected_ncols
+        , "\nGot:", ncol(clinical_df_ics))
+  }
+} else {
+  cat("\nFAIL: nrows mismatch"
+      , "\nExpected nrows:", nrow(clinical_df)
+      , "\nnrows in clinical_ics:", nrow(clinical_ics)
+      , "\nGot:", nrow(clinical_df_ics))
+}
+
+#======================
 # writing output file
-#==================
+#======================
 outfile_name_reg = "clinical_df_recoded.csv"
 outfile_reg = paste0(outdir, outfile_name_reg)
 
 cat("\nWriting clinical file for regression:", outfile_reg)
 
-#write.csv(clinical_df, file = outfile_reg)
+#write.csv(clinical_df_ics, file = outfile_reg)
 ################################################################
 
 rm(age_bins, max_age_interval, max_in, min_in, o2_sat_bin, onset_initial_bin, tot_o2, tot_onset2ini, meta_data_cols)