From 08e01abfb5a730599198015fdb4215164c502222 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 24 Nov 2020 13:48:53 +0000 Subject: [PATCH] correcting dtype for sfluv and h1n1v for data formatting clinical --- data_extraction_formatting_clinical.R | 33 ++- data_extraction_formatting_regression.R | 329 ------------------------ logistic_regression.R | 54 ++-- logistic_regression_TEMPLATE.R | 97 ------- 4 files changed, 52 insertions(+), 461 deletions(-) delete mode 100755 data_extraction_formatting_regression.R delete mode 100644 logistic_regression_TEMPLATE.R diff --git a/data_extraction_formatting_clinical.R b/data_extraction_formatting_clinical.R index 0aef946..0347011 100755 --- a/data_extraction_formatting_clinical.R +++ b/data_extraction_formatting_clinical.R @@ -225,36 +225,35 @@ sum(table(clinical_df$onset_initial_bin)) == tot_onset2ini #======================= # seasonal flu: sfluv #======================= -if (! is.factor(clinical_df$sfluv)){ - clinical_df$sfluv = as.factor(clinical_df$sfluv) -} -class(clinical_df$sfluv) -levels(clinical_df$sfluv) +# reassign as 0 and 1 table(clinical_df$sfluv) table(clinical_df$asthma, clinical_df$sfluv) - -# reassign -levels(clinical_df$sfluv) <- c("0", "0", "1") +clinical_df$sfluv = ifelse(clinical_df$sfluv == "yes", 1, 0) +table(clinical_df$sfluv) table(clinical_df$asthma, clinical_df$sfluv) +# convert to integer +str(clinical_df$sfluv) +clinical_df$sfluv = as.integer(clinical_df$sfluv) +str(clinical_df$sfluv) + #======================= # h1n1v #======================= -if (! is.factor(clinical_df$h1n1v)){ - clinical_df$h1n1v = as.factor(clinical_df$h1n1v) -} - -class(clinical_df$h1n1v) -levels(clinical_df$h1n1v) +# reassign as 0 and 1 table(clinical_df$h1n1v) table(clinical_df$asthma, clinical_df$h1n1v) - -# reassign -levels(clinical_df$h1n1v) <- c("0", "0", "1") +clinical_df$h1n1v = ifelse(clinical_df$h1n1v == "yes", 1, 0) +table(clinical_df$h1n1v) table(clinical_df$asthma, clinical_df$h1n1v) +# convert to integer +str(clinical_df$h1n1v) +clinical_df$h1n1v = as.integer(clinical_df$h1n1v) +str(clinical_df$h1n1v) + #======================= # ethnicity #======================= diff --git a/data_extraction_formatting_regression.R b/data_extraction_formatting_regression.R deleted file mode 100755 index 14dc360..0000000 --- a/data_extraction_formatting_regression.R +++ /dev/null @@ -1,329 +0,0 @@ -#!/usr/bin/Rscript -getwd() -setwd('~/git/mosaic_2020/') -getwd() -######################################################################## -# TASK: Extract relevant columns from mosaic adults data -# npa -######################################################################## -#==================== -# Input: source data for clinical -#==================== -source("data_extraction_formatting_clinical.R") -#source("colnames_clinical_meds.R") -#======================================= -# Data for mediator to include in regression -#======================================= -cat("Extracting", length(sig_npa_cols), "mediator cols from fp_adults") - -med_df = fp_adults[, c("mosaic", sig_npa_cols)] - -# sanity checks -if ( sum(table(clinical_df$obesity)) & sum(table(clinical_df$age>=18)) & sum(table(clinical_df$death)) & sum(table(clinical_df$asthma)) == nrow(clinical_df) ){ - cat("PASS: binary data obs are complete, n =", nrow(clinical_df)) -}else{ - cat("FAIL: Incomplete data for binary outcomes. Please check and decide!") - quit() -} - -table(clinical_df$ia_exac_copd) - -######################################################################## -# Data extraction for regression -######################################################################## - -common_cols = names(clinical_df)[names(clinical_df)%in%names(med_df)] - -cat("\nMerging clinical and mediator data for regression" - ,"\nMerging on column:", common_cols) - -reg_data = merge(clinical_df, med_df - , by = common_cols) - -if (nrow(reg_data) == nrow(clinical_df) & nrow(med_df)){ - cat("\nNo. of rows match, nrow =", nrow(clinical_df) - , "\nChecking ncols...") - if ( ncol(reg_data) == ncol(clinical_df) + ncol(med_df) - length(common_cols) ){ - cat("\nNo. of cols match, ncol =", ncol(reg_data)) -} else { - cat("FAIL: ncols mismatch" - , "Expected ncols:", ncol(clinical_df) + ncol(med_df) - length(common_cols) - , "\nGot:", ncol(reg_data)) - } -} else { - cat("FAIL: nrows mismatch" - , "\nExpected nrows:", nrow(fp_adults)) -} - -######################################################################## -# Reassign the copd and asthma status and do some checks -table(reg_data$ia_exac_copd); sum(is.na(reg_data$ia_exac_copd)) - -reg_data$ia_exac_copd[reg_data$ia_exac_copd< 1]<- 0 -reg_data$ia_exac_copd[is.na(reg_data$ia_exac_copd)] <- 0 - -table(reg_data$ia_exac_copd); sum(is.na(reg_data$ia_exac_copd)) - -# check copd and asthma status -table(reg_data$ia_exac_copd, reg_data$asthma) -check_copd_and_asthma_1<- subset(reg_data, ia_exac_copd ==1 & asthma == 1) # check this is 3 - -# reassign these 3 so these are treated as non-asthmatics as copd with asthma is NOT TRUE asthma -reg_data$asthma[reg_data$ia_exac_copd == 1 & reg_data$asthma == 1]= 0 -table(reg_data$ia_exac_copd, reg_data$asthma) - -foo<- subset(reg_data, asthma==1 & ia_exac_copd ==1) # check that its 0 - -rm(check_copd_and_asthma_1, foo) -#===================================================================== -# count the resp scores -max_resp_score_table<- table(reg_data$max_resp_score) -max_resp_score_table - -T1_resp_score_table<- table(reg_data$T1_resp_score) -T1_resp_score_table - -T2_resp_score_table<- table(reg_data$T2_resp_score) -T2_resp_score_table - -Inresp_sev<- table(reg_data$inresp_sev) -Inresp_sev - -# Reassign the resp score so all 4 are replace by 3 -reg_data$max_resp_score[reg_data$max_resp_score ==4 ] <- 3 -revised_resp_score_table<- table(reg_data$max_resp_score) -revised_resp_score_table - -reg_data$T1_resp_score[reg_data$T1_resp_score ==4 ] <- 3 -revised_T1_resp_score_table<- table(reg_data$T1_resp_score) -revised_T1_resp_score_table - -reg_data$T2_resp_score[reg_data$T2_resp_score == 4]<- 3 -revised_T2_resp_score_table<- table(reg_data$T2_resp_score) -revised_T2_resp_score_table - -reg_data$inresp_sev[reg_data$inresp_sev == 4]<- 3 -revised_Inresp_sev<- table(reg_data$inresp_sev) -revised_Inresp_sev -#===================================================================== -# Remove these after checking -rm(max_resp_score_table, T1_resp_score_table, T2_resp_score_table, Inresp_sev - , revised_resp_score_table, revised_T1_resp_score_table, revised_T2_resp_score_table, revised_Inresp_sev) -#===================================================================== -# Binning -# "(": not inclusive -# "]": inclusive - -#======== -# age -#======== -# Create categories of variables -reg_data$age = round(reg_data$age, digits = 0) -table(reg_data$age) -table(reg_data$asthma, reg_data$age) -min(reg_data$age); max(reg_data$age) - -max_age_interval = round_any(max(reg_data$age), 10, f = ceiling) -max_age_interval -min_age = min(reg_data$age); min_age #19 -min_age_interval = min_age - 1; min_age_interval - -#age_bins = cut(reg_data$age, c(0,18,30,40,50,60,70,80,90)) -age_bins = cut(reg_data$age, c(min_age_interval, 30, 40, 50, 60, 70, max_age_interval)) -reg_data$age_bins = age_bins -dim(reg_data) # 133 27 - -# age_bins (to keep consistent with the results table) -class(reg_data$age_bins) -levels(reg_data$age_bins) -#"(18,30]" "(30,40]" "(40,50]" "(50,60]" "(60,70]" "(70,80]" -table(reg_data$asthma, reg_data$age_bins) -# (18,30] (30,40] (40,50] (50,60] (60,70] (70,80] -#0 25 17 25 14 11 1 -#1 11 8 12 5 3 2 - -if (sum(table(reg_data$asthma, reg_data$age_bins)) == nrow(reg_data) ){ - cat("PASS: age_bins assigned successfully") -}else{ - cat("FAIL: no. mismatch when assigning age_bins") - quit() -} - -# reassign -class(reg_data$age_bins) -levels(reg_data$age_bins) <- c("(18,30]","(30,40]","(40,50]","(50,80]","(50,80]","(50,80]") -table(reg_data$asthma, reg_data$age_bins) -table(reg_data$asthma, reg_data$age_bins) -# (18,30] (30,40] (40,50] (50,80] -#0 25 17 25 26 -#1 11 8 12 9 - -sum(table(reg_data$asthma, reg_data$age_bins)) == nrow(reg_data) - -#=========================== -# O2 saturation binning -#=========================== -reg_data$o2_sat_admis = round(reg_data$o2_sat_admis, digits = 0) -table(reg_data$o2_sat_admis) -tot_o2 = sum(table(reg_data$o2_sat_admis))- table(reg_data$o2_sat_admis)[["-1"]] -tot_o2 - -o2_sat_bin = cut(reg_data$o2_sat_admis, c(0,92,100)) -reg_data$o2_sat_bin = o2_sat_bin -table(reg_data$o2_sat_bin) - -sum(table(reg_data$o2_sat_bin)) == tot_o2 - -#=========================== -# Onset to initial binning -#=========================== -max_in = max(reg_data$onset_2_initial); max_in #23 -min_in = min(reg_data$onset_2_initial) - 1 ; min_in # -6 - -tot_onset2ini = sum(table(reg_data$onset_2_initial)) -tot_onset2ini - -onset_initial_bin = cut(reg_data$onset_2_initial, c(min_in, 4, max_in)) -reg_data$onset_initial_bin = onset_initial_bin - -sum(table(reg_data$onset_initial_bin)) == tot_onset2ini - -#======================= -# seasonal flu: sfluv -#======================= -# should be a factor -if (! is.factor(reg_data$sfluv)){ - reg_data$sfluv = as.factor(reg_data$sfluv) -} -class(reg_data$sfluv) #[1] "factor" - -levels(reg_data$sfluv) -table(reg_data$asthma, reg_data$sfluv) -# reassign -levels(reg_data$sfluv) <- c("0", "0", "1") -table(reg_data$asthma, reg_data$sfluv) - -#======================= -# h1n1v -#======================= -# should be a factor -if (! is.factor(reg_data$h1n1v)){ - reg_data$h1n1v = as.factor(reg_data$h1n1v) -} -class(reg_data$h1n1v) #[1] "factor" - -levels(reg_data$h1n1v) -table(reg_data$asthma, reg_data$h1n1v) -# reassign -levels(reg_data$h1n1v) <- c("0", "0", "1") -table(reg_data$asthma, reg_data$h1n1v) - -#======================= -# ethnicity -#======================= -class(reg_data$ethnicity) # integer -table(reg_data$asthma, reg_data$ethnicity) - -reg_data$ethnicity[reg_data$ethnicity == 4] <- 2 -table(reg_data$asthma, reg_data$ethnicity) - -#======================= -# pneumonia -#======================= -class(reg_data$ia_cxr) # integer -# ia_cxr 2 ---> yes pneumonia (1) -# 1 ---> no (0) -# ! 1 or 2 -- > "unkown" - -# reassign the pneumonia codes -#0: not performed -#1: normal -#2: findings consistent with pneumonia -#3: abnormal -#-1: not recorded -#-2: n/a specified by the clinician # not in the data... -#-3: unknown specified by clinician - - -table(reg_data$ia_cxr) -#-3 -1 0 1 2 3 -#5 48 13 47 17 3 - -# change these first else recoding 0 will be a problem as 0 already exists, mind you -2 categ doesn't exist -reg_data$ia_cxr[reg_data$ia_cxr == -3 | reg_data$ia_cxr == -1 | reg_data$ia_cxr == 0 | reg_data$ia_cxr == 3 ] <- "" -table(reg_data$ia_cxr) -# 1 2 -#69 47 17 - -reg_data$ia_cxr[reg_data$ia_cxr == 1] <- 0 -reg_data$ia_cxr[reg_data$ia_cxr == 2] <- 1 -table(reg_data$ia_cxr) -# 0 1 -#69 47 17 - -#======================= -# smoking [tricky one] -#======================= -class(reg_data$smoking) # integer -table(reg_data$asthma, reg_data$smoking) - -# orig -# -3 -1 1 2 3 4 -#0 15 9 22 2 15 30 -#1 4 2 13 0 4 17 - -# -3 -1 1 2 3 4 -#0 14 9 20 2 15 30 -#1 5 2 15 0 4 17 - -# never smoking, 4 and 2 -- > no (0) -#1 and 3 ---> yes (1) -#!-3 and -1 ---- > NA - -################# smoking - -#1: current daily ===> categ smoker(1) -#2: occasional =====> categ no smoker(0) -#3: ex-smoker ===> categ smoker(1) -#4: never =====> categ no smoker(0) -#-1: not recorded =====> categ blank (NA) -#-2: n/a specified by the clinician =====> categ blank (NA) -#-3: unknown specified by clinician=====> categ blank (NA) - -table(reg_data$smoking) -#-3 -1 1 2 3 4 -#19 11 35 2 19 47 - -# reassign the smoking codes -reg_data$smoking[reg_data$smoking == 4 | reg_data$smoking == 2 ] <- 0 -reg_data$smoking[reg_data$smoking == 1 | reg_data$smoking == 3 ] <- 1 -reg_data$smoking[reg_data$smoking == -1 | reg_data$smoking == -2 | reg_data$smoking == -3 ] <- "" - -table(reg_data$smoking) -# 0 1 -#30 49 54 - -table(reg_data$asthma, reg_data$smoking) - -# orig -# 0 1 -#0 24 32 37 -#1 6 17 17 - -# 0 1 -#0 23 32 35 -#1 7 17 19 - -################################################################ -#================== -# writing output file -#================== -outfile_name_reg = "reg_data_recoded_with_NA.csv" -outfile_reg = paste0(outdir, outfile_name_reg) - -cat("Writing clinical file for regression:", outfile_reg) - -#write.csv(reg_data, file = outfile_reg) -################################################################ - -rm(age_bins, max_age_interval, max_in, min_in, o2_sat_bin, onset_initial_bin, tot_o2, tot_onset2ini, meta_data_cols) diff --git a/logistic_regression.R b/logistic_regression.R index e0d7a17..75f5470 100644 --- a/logistic_regression.R +++ b/logistic_regression.R @@ -4,14 +4,8 @@ setwd('~/git/mosaic_2020/') getwd() ######################################################################## # TASK: Run regression analysis -# npa +# clinical params and npa meds ######################################################################## -#================================================================================= -# TO DO: -# Simple stats b/w obesity and non-obesity to consider including in reg analysis -# Include NPA statistically sign params -# Rerun graphs and plots without asthma? -#================================================================================= #==================== # Input: source data @@ -25,27 +19,29 @@ table(fp_adults_na$ia_exac_copd==1 & fp_adults_na$asthma == 1) table(clinical_df_ics$asthma) +if ( length(cols_to_extract) == length(clinical_cols) + length(sig_npa_cols) ){ + cat("PASS: extracting clinical and sign npa cols") +} else{ + cat("FAIL: could not find cols to extract") + quit() +} + +fp_adults_reg = fp_adults[, cols_to_extract] +fp_adults_reg_na = fp_adults_na[, cols_to_extract] + #-------------------- # Data reassignment #-------------------- -my_data = clinical_df_ics -my_data_na = clinical_df_ics_na +my_data = fp_adults_reg +my_data_na = fp_adults_reg_na table(my_data$ia_exac_copd==1 & my_data$asthma == 1) table(my_data_na$ia_exac_copd==1 & my_data_na$asthma == 1) # clear variables -#rm(fp_adults, fp_adults_na) +rm(fp_adults, fp_adults_na, clinical_df_ics, clinical_df_ics_na) ######################################################################### - -if ( names(which(lapply(my_data, class) == "character")) == "mosaic" ){ - cat("Character class for 1 column only:", "mosaic") -}else{ - cat("More than one character class detected: Resolve!") - quit() -} - #============================ # Identifying column types: Reg data #=========================== @@ -57,6 +53,23 @@ my_vars = colnames(my_reg_data) my_vars lapply(my_reg_data, class) + +check_int_vars = my_vars[lapply(my_reg_data, class)%in%c("integer")] +check_num_vars = my_vars[lapply(my_reg_data, class)%in%c("numeric")] +check_charac_vars = my_vars[lapply(my_reg_data, class)%in%c("character")] +check_factor_vars = my_vars[lapply(my_reg_data, class)%in%c("factor")] + +cat("\nNo. of int cols:", length(check_int_vars) + , "\nNo. of num cols:", length(check_num_vars) + , "\nNo. of char cols:", length(check_charac_vars) + , "\nNo. of factor cols:", length(check_factor_vars) + ) + +# convert char vals to int as these should be int +my_reg_data[,check_charac_vars] = lapply(my_reg_data[,check_charac_vars], as.integer) +str(my_reg_data$sfluv) + + numerical_vars = c("age" , "vl_pfu_ul_npa1" , "los" @@ -64,6 +77,11 @@ numerical_vars = c("age" , "onsfindeath" , "o2_sat_admis") + + + + + my_reg_data[numerical_vars] <- lapply(my_reg_data[numerical_vars], as.numeric) my_reg_params = my_vars diff --git a/logistic_regression_TEMPLATE.R b/logistic_regression_TEMPLATE.R deleted file mode 100644 index d483858..0000000 --- a/logistic_regression_TEMPLATE.R +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/Rscript -getwd() -setwd('~/git/mosaic_2020/') -getwd() -######################################################################## -# TASK: Run regression analysis -# npa -######################################################################## -#================================================================================= -# TO DO: -# Simple stats b/w obesity and non-obesity to consider including in reg analysis -# Include NPA statistically sign params -# Rerun graphs and plots without asthma? -#================================================================================= - -#==================== -# Input: source data -#==================== -source("data_extraction_formatting_clinical.R") - -rm(fp_adults, metadata_all) - -######################################################################## -my_data = reg_data -######################################################################### -# check factor of each column -lapply(my_data, class) - -character_vars <- lapply(my_data, class) == "character" -character_vars -table(character_vars) - -factor_vars <- lapply(my_data, class) == "factor" -table(factor_vars) - -my_data[, character_vars] <- lapply(my_data[, character_vars], as.factor) -factor_vars <- lapply(my_data, class) == "factor" -factor_vars -table(factor_vars) - -# check again -lapply(my_data, class) - -table(my_data$ethnicity) -my_data$ethnicity = as.factor(my_data$ethnicity) -class(my_data$ethnicity) - -colnames(my_data) -reg_param = c("age" - , "age_bins" - #, "death" # outcome - , "asthma" - , "obesity" - , "gender" - , "los" - , "o2_sat_admis" - #, "logistic_outcome" - #, "steroid_ics" - , "ethnicity" - , "smoking" - , "sfluv" - , "h1n1v" - , "ia_cxr" - , "max_resp_score" - , "T1_resp_score" - , "com_noasthma" - , "onset_initial_bin") - -for(i in reg_param) { - # print (i) - p_form = as.formula(paste("death ~ ", i ,sep = "")) - model_reg = glm(p_form , family = binomial, data = my_data) - print(summary(model_reg)) - print(exp(cbind(OR = coef(model_reg), confint(model_reg)))) - #print (PseudoR2(model_reg)) - cat("=================================================================================\n") -} - - -full_mod = glm(death ~ asthma + - gender + - age_bins + - los + - #ethnicity + - onset_initial_bin + - o2_sat_bin + - com_noasthma + - obesity + - #ia_cxr + - smoking + - #sfluv + - #h1n1v - max_resp_score + - T1_resp_score + - , family = "binomial", data = my_data) - -summary(full_mod)