correcting dtype for sfluv and h1n1v for data formatting clinical

2020-11-24 13:48:53 +00:00 · 2020-11-24 13:48:53 +00:00 · 08e01abfb5
commit 08e01abfb5
parent 7529549bfc
4 changed files with 52 additions and 461 deletions
--- a/data_extraction_formatting_clinical.R
+++ b/data_extraction_formatting_clinical.R
@ -225,36 +225,35 @@ sum(table(clinical_df$onset_initial_bin)) == tot_onset2ini
 #=======================
 # seasonal flu: sfluv
 #=======================
-if (! is.factor(clinical_df$sfluv)){
+# reassign as 0 and 1
  clinical_df$sfluv  = as.factor(clinical_df$sfluv)
 }
 class(clinical_df$sfluv)
 levels(clinical_df$sfluv)
 table(clinical_df$sfluv)
 table(clinical_df$asthma, clinical_df$sfluv)
-
+clinical_df$sfluv = ifelse(clinical_df$sfluv ==  "yes", 1, 0)
-# reassign
+table(clinical_df$sfluv)
 levels(clinical_df$sfluv) <- c("0", "0", "1")
 table(clinical_df$asthma, clinical_df$sfluv)
 # convert to integer
 str(clinical_df$sfluv)
 clinical_df$sfluv = as.integer(clinical_df$sfluv)
 str(clinical_df$sfluv)
 #=======================
 # h1n1v
 #=======================
-if (! is.factor(clinical_df$h1n1v)){
+# reassign as 0 and 1
  clinical_df$h1n1v  = as.factor(clinical_df$h1n1v)
 }
 class(clinical_df$h1n1v)
 levels(clinical_df$h1n1v)
 table(clinical_df$h1n1v)
 table(clinical_df$asthma, clinical_df$h1n1v)
-
+clinical_df$h1n1v = ifelse(clinical_df$h1n1v ==  "yes", 1, 0)
-# reassign
+table(clinical_df$h1n1v)
 levels(clinical_df$h1n1v) <- c("0", "0", "1")
 table(clinical_df$asthma, clinical_df$h1n1v)
 # convert to integer
 str(clinical_df$h1n1v)
 clinical_df$h1n1v = as.integer(clinical_df$h1n1v)
 str(clinical_df$h1n1v)
 #=======================
 # ethnicity
 #=======================
--- a/data_extraction_formatting_regression.R
+++ b/data_extraction_formatting_regression.R
@ -1,329 +0,0 @@
 #!/usr/bin/Rscript   
 getwd()
 setwd('~/git/mosaic_2020/')
 getwd()
 ########################################################################
 # TASK: Extract relevant columns from mosaic adults data
 # npa
 ########################################################################
 #====================
 # Input: source data for clinical
 #====================
 source("data_extraction_formatting_clinical.R")
 #source("colnames_clinical_meds.R")
 #=======================================
 # Data for mediator to include in regression
 #=======================================
 cat("Extracting", length(sig_npa_cols), "mediator cols from fp_adults")
 med_df = fp_adults[, c("mosaic", sig_npa_cols)]
 # sanity checks
 if ( sum(table(clinical_df$obesity)) & sum(table(clinical_df$age>=18)) & sum(table(clinical_df$death)) & sum(table(clinical_df$asthma)) ==  nrow(clinical_df) ){
  cat("PASS: binary data obs are complete, n =", nrow(clinical_df))
 }else{
  cat("FAIL: Incomplete data for binary outcomes. Please check and decide!")
  quit()
 }
 table(clinical_df$ia_exac_copd)
 ########################################################################
 # Data extraction for regression
 ########################################################################
 common_cols = names(clinical_df)[names(clinical_df)%in%names(med_df)]
 cat("\nMerging clinical and mediator data for regression"
    ,"\nMerging on column:", common_cols)
 reg_data = merge(clinical_df, med_df
                 , by = common_cols)
 if (nrow(reg_data) == nrow(clinical_df) & nrow(med_df)){
  cat("\nNo. of rows match, nrow =", nrow(clinical_df)
      , "\nChecking ncols...")
  if ( ncol(reg_data) == ncol(clinical_df) + ncol(med_df)  - length(common_cols) ){
    cat("\nNo. of cols match, ncol =", ncol(reg_data))
 } else {
  cat("FAIL: ncols mismatch"
      , "Expected ncols:", ncol(clinical_df) + ncol(med_df)  - length(common_cols)
      , "\nGot:", ncol(reg_data))
  }
 } else {
  cat("FAIL: nrows mismatch"
           , "\nExpected nrows:", nrow(fp_adults))
 }
 ########################################################################
 # Reassign the copd and asthma status and do some checks 
 table(reg_data$ia_exac_copd); sum(is.na(reg_data$ia_exac_copd))
 reg_data$ia_exac_copd[reg_data$ia_exac_copd< 1]<- 0
 reg_data$ia_exac_copd[is.na(reg_data$ia_exac_copd)] <- 0
 table(reg_data$ia_exac_copd); sum(is.na(reg_data$ia_exac_copd))
 # check copd and asthma status
 table(reg_data$ia_exac_copd, reg_data$asthma)
 check_copd_and_asthma_1<- subset(reg_data, ia_exac_copd ==1 & asthma == 1) # check this is 3 
 # reassign these 3 so these are treated as non-asthmatics as copd with asthma is NOT TRUE asthma
 reg_data$asthma[reg_data$ia_exac_copd == 1 & reg_data$asthma == 1]= 0 
 table(reg_data$ia_exac_copd, reg_data$asthma)
 foo<- subset(reg_data, asthma==1 & ia_exac_copd ==1) # check that its 0
 rm(check_copd_and_asthma_1, foo)
 #=====================================================================
 # count the resp scores 
 max_resp_score_table<- table(reg_data$max_resp_score)
 max_resp_score_table
 T1_resp_score_table<- table(reg_data$T1_resp_score)
 T1_resp_score_table
 T2_resp_score_table<- table(reg_data$T2_resp_score)
 T2_resp_score_table
 Inresp_sev<- table(reg_data$inresp_sev)
 Inresp_sev
 # Reassign the resp score so all 4 are replace by 3
 reg_data$max_resp_score[reg_data$max_resp_score ==4 ] <- 3
 revised_resp_score_table<- table(reg_data$max_resp_score)
 revised_resp_score_table
 reg_data$T1_resp_score[reg_data$T1_resp_score ==4 ] <- 3
 revised_T1_resp_score_table<- table(reg_data$T1_resp_score)
 revised_T1_resp_score_table
 reg_data$T2_resp_score[reg_data$T2_resp_score == 4]<- 3
 revised_T2_resp_score_table<- table(reg_data$T2_resp_score)
 revised_T2_resp_score_table
 reg_data$inresp_sev[reg_data$inresp_sev == 4]<- 3
 revised_Inresp_sev<- table(reg_data$inresp_sev)
 revised_Inresp_sev
 #=====================================================================
 # Remove these after checking
 rm(max_resp_score_table, T1_resp_score_table, T2_resp_score_table, Inresp_sev
   , revised_resp_score_table, revised_T1_resp_score_table, revised_T2_resp_score_table, revised_Inresp_sev)
 #=====================================================================
 # Binning
 # "(": not inclusive
 # "]": inclusive
 #========
 # age
 #========
 # Create categories of variables
 reg_data$age = round(reg_data$age, digits = 0)
 table(reg_data$age)
 table(reg_data$asthma, reg_data$age)
 min(reg_data$age); max(reg_data$age)
 max_age_interval = round_any(max(reg_data$age), 10, f = ceiling)  
 max_age_interval 
 min_age = min(reg_data$age); min_age #19
 min_age_interval = min_age - 1; min_age_interval
 #age_bins = cut(reg_data$age, c(0,18,30,40,50,60,70,80,90))
 age_bins = cut(reg_data$age, c(min_age_interval, 30, 40, 50, 60, 70, max_age_interval))
 reg_data$age_bins = age_bins
 dim(reg_data) # 133 27
 # age_bins (to keep consistent with the results table)
 class(reg_data$age_bins)
 levels(reg_data$age_bins)
 #"(18,30]" "(30,40]" "(40,50]" "(50,60]" "(60,70]" "(70,80]"
 table(reg_data$asthma, reg_data$age_bins)
 #     (18,30] (30,40] (40,50] (50,60] (60,70] (70,80]
 #0      25      17      25      14      11      1
 #1      11       8      12       5       3      2
 if (sum(table(reg_data$asthma, reg_data$age_bins)) == nrow(reg_data) ){
  cat("PASS: age_bins assigned successfully")
 }else{
  cat("FAIL: no. mismatch when assigning age_bins")
  quit()
 }
 # reassign
 class(reg_data$age_bins)
 levels(reg_data$age_bins) <- c("(18,30]","(30,40]","(40,50]","(50,80]","(50,80]","(50,80]")
 table(reg_data$asthma, reg_data$age_bins)
 table(reg_data$asthma, reg_data$age_bins)
 #     (18,30] (30,40] (40,50] (50,80]
 #0      25      17      25      26
 #1      11       8      12      9
 sum(table(reg_data$asthma, reg_data$age_bins)) == nrow(reg_data)
 #===========================
 # O2 saturation binning
 #===========================
 reg_data$o2_sat_admis = round(reg_data$o2_sat_admis, digits = 0)
 table(reg_data$o2_sat_admis)
 tot_o2 = sum(table(reg_data$o2_sat_admis))- table(reg_data$o2_sat_admis)[["-1"]]
 tot_o2
 o2_sat_bin = cut(reg_data$o2_sat_admis, c(0,92,100))
 reg_data$o2_sat_bin = o2_sat_bin
 table(reg_data$o2_sat_bin)
 sum(table(reg_data$o2_sat_bin)) == tot_o2
 #===========================
 # Onset to initial binning
 #===========================
 max_in = max(reg_data$onset_2_initial); max_in #23
 min_in = min(reg_data$onset_2_initial) - 1 ; min_in  # -6
 tot_onset2ini = sum(table(reg_data$onset_2_initial))
 tot_onset2ini
 onset_initial_bin = cut(reg_data$onset_2_initial, c(min_in, 4, max_in))
 reg_data$onset_initial_bin = onset_initial_bin
 sum(table(reg_data$onset_initial_bin)) == tot_onset2ini
 #=======================
 # seasonal flu: sfluv
 #=======================
 # should be a factor
 if (! is.factor(reg_data$sfluv)){
  reg_data$sfluv  = as.factor(reg_data$sfluv)
 }
 class(reg_data$sfluv) #[1] "factor"
 levels(reg_data$sfluv)
 table(reg_data$asthma, reg_data$sfluv)
 # reassign
 levels(reg_data$sfluv) <- c("0", "0", "1")
 table(reg_data$asthma, reg_data$sfluv)
 #=======================
 # h1n1v
 #=======================
 # should be a factor
 if (! is.factor(reg_data$h1n1v)){
  reg_data$h1n1v  = as.factor(reg_data$h1n1v)
 }
 class(reg_data$h1n1v) #[1] "factor"
 levels(reg_data$h1n1v)
 table(reg_data$asthma, reg_data$h1n1v)
 # reassign
 levels(reg_data$h1n1v) <- c("0", "0", "1")
 table(reg_data$asthma, reg_data$h1n1v)
 #=======================
 # ethnicity
 #=======================
 class(reg_data$ethnicity) # integer
 table(reg_data$asthma, reg_data$ethnicity)
 reg_data$ethnicity[reg_data$ethnicity == 4] <- 2
 table(reg_data$asthma, reg_data$ethnicity)
 #=======================
 # pneumonia
 #=======================
 class(reg_data$ia_cxr) # integer
 # ia_cxr 2 ---> yes pneumonia (1)
 # 1 ---> no (0)
 # ! 1 or 2 -- > "unkown"
 # reassign the pneumonia codes
 #0: not performed 
 #1: normal
 #2: findings consistent with pneumonia
 #3: abnormal
 #-1: not recorded
 #-2: n/a specified by the clinician # not in the data...
 #-3: unknown specified by clinician
 table(reg_data$ia_cxr)
 #-3 -1  0  1  2  3 
 #5 48 13 47 17  3 
 # change these first else recoding 0 will be a problem as 0 already exists, mind you -2 categ doesn't exist
 reg_data$ia_cxr[reg_data$ia_cxr == -3 | reg_data$ia_cxr == -1 | reg_data$ia_cxr == 0 | reg_data$ia_cxr == 3 ] <- ""
 table(reg_data$ia_cxr)
 #    1  2 
 #69 47 17
 reg_data$ia_cxr[reg_data$ia_cxr == 1] <- 0
 reg_data$ia_cxr[reg_data$ia_cxr == 2] <- 1
 table(reg_data$ia_cxr)
 #    0  1 
 #69 47 17 
 #=======================
 # smoking [tricky one]
 #=======================
 class(reg_data$smoking) # integer
 table(reg_data$asthma, reg_data$smoking)
 # orig
 #   -3 -1  1  2  3   4
 #0  15 9  22  2  15  30
 #1  4  2  13  0  4   17
 #  -3 -1  1  2  3  4
 #0 14  9 20  2 15 30
 #1  5  2 15  0  4 17
 # never smoking, 4 and 2 -- > no (0)
 #1 and 3 ---> yes (1)
 #!-3 and -1 ---- > NA
 ################# smoking
 #1: current daily ===> categ smoker(1)
 #2: occasional =====> categ no smoker(0)
 #3: ex-smoker ===> categ smoker(1)
 #4: never =====> categ no smoker(0)
 #-1: not recorded =====> categ blank (NA)
 #-2: n/a specified by the clinician =====> categ blank (NA)
 #-3: unknown specified by clinician=====> categ blank (NA)
 table(reg_data$smoking)
 #-3 -1  1  2  3  4 
 #19 11 35  2 19 47 
 # reassign the smoking codes
 reg_data$smoking[reg_data$smoking == 4 | reg_data$smoking == 2 ] <- 0
 reg_data$smoking[reg_data$smoking == 1 | reg_data$smoking == 3 ] <- 1
 reg_data$smoking[reg_data$smoking == -1 | reg_data$smoking == -2 | reg_data$smoking == -3 ] <- ""
 table(reg_data$smoking)
 #    0  1 
 #30 49 54  
 table(reg_data$asthma, reg_data$smoking)
 # orig
 #      0  1
 #0 24 32 37
 #1  6 17 17
 #      0  1
 #0 23 32 35
 #1  7 17 19
 ################################################################
 #==================
 # writing output file
 #==================
 outfile_name_reg = "reg_data_recoded_with_NA.csv"
 outfile_reg = paste0(outdir, outfile_name_reg)
 cat("Writing clinical file for regression:", outfile_reg)
 #write.csv(reg_data, file = outfile_reg)
 ################################################################
 rm(age_bins, max_age_interval, max_in, min_in, o2_sat_bin, onset_initial_bin, tot_o2, tot_onset2ini, meta_data_cols)
--- a/logistic_regression.R
+++ b/logistic_regression.R
@ -4,14 +4,8 @@ setwd('~/git/mosaic_2020/')
 getwd()
 ########################################################################
 # TASK: Run regression analysis
-# npa
+# clinical params and npa meds
 ########################################################################
 #=================================================================================
 # TO DO:
 # Simple stats b/w obesity and non-obesity to consider including in reg analysis
 # Include NPA statistically sign params
 # Rerun graphs and plots without asthma?
 #=================================================================================
 #====================
 # Input: source data
@ -25,27 +19,29 @@ table(fp_adults_na$ia_exac_copd==1 & fp_adults_na$asthma == 1)
 table(clinical_df_ics$asthma)
 if ( length(cols_to_extract) == length(clinical_cols) + length(sig_npa_cols) ){
  cat("PASS: extracting clinical and sign npa cols")
 } else{
  cat("FAIL: could not find cols to extract")
  quit()
 }
 fp_adults_reg = fp_adults[, cols_to_extract]
 fp_adults_reg_na = fp_adults_na[, cols_to_extract]
 #--------------------
 # Data reassignment
 #--------------------
-my_data = clinical_df_ics
+my_data = fp_adults_reg
-my_data_na = clinical_df_ics_na
+my_data_na = fp_adults_reg_na
 table(my_data$ia_exac_copd==1 & my_data$asthma == 1)
 table(my_data_na$ia_exac_copd==1 & my_data_na$asthma == 1)
 # clear variables
-#rm(fp_adults, fp_adults_na)
+rm(fp_adults, fp_adults_na, clinical_df_ics, clinical_df_ics_na)
 #########################################################################
 if ( names(which(lapply(my_data, class) == "character")) == "mosaic" ){
  cat("Character class for 1 column only:", "mosaic")
 }else{
  cat("More than one character class detected: Resolve!")
  quit()
 }
 #============================
 # Identifying column types: Reg data
 #===========================
@ -57,6 +53,23 @@ my_vars = colnames(my_reg_data)
 my_vars
 lapply(my_reg_data, class)
 check_int_vars = my_vars[lapply(my_reg_data, class)%in%c("integer")]
 check_num_vars = my_vars[lapply(my_reg_data, class)%in%c("numeric")]
 check_charac_vars =  my_vars[lapply(my_reg_data, class)%in%c("character")]
 check_factor_vars =  my_vars[lapply(my_reg_data, class)%in%c("factor")]
 cat("\nNo. of int cols:", length(check_int_vars)
    , "\nNo. of num cols:", length(check_num_vars)
    , "\nNo. of char cols:", length(check_charac_vars)
    , "\nNo. of factor cols:", length(check_factor_vars)
    )
 # convert char vals to int as these should be int
 my_reg_data[,check_charac_vars] =  lapply(my_reg_data[,check_charac_vars], as.integer)
 str(my_reg_data$sfluv)
 numerical_vars = c("age"
                   , "vl_pfu_ul_npa1"
                   , "los"
@ -64,6 +77,11 @@ numerical_vars = c("age"
                   , "onsfindeath"
                   , "o2_sat_admis")
 my_reg_data[numerical_vars] <- lapply(my_reg_data[numerical_vars], as.numeric)
 my_reg_params = my_vars
--- a/logistic_regression_TEMPLATE.R
+++ b/logistic_regression_TEMPLATE.R
@ -1,97 +0,0 @@
 #!/usr/bin/Rscript   
 getwd()
 setwd('~/git/mosaic_2020/')
 getwd()
 ########################################################################
 # TASK: Run regression analysis
 # npa
 ########################################################################
 #=================================================================================
 # TO DO:
 # Simple stats b/w obesity and non-obesity to consider including in reg analysis
 # Include NPA statistically sign params
 # Rerun graphs and plots without asthma?
 #=================================================================================
 #====================
 # Input: source data
 #====================
 source("data_extraction_formatting_clinical.R")
 rm(fp_adults, metadata_all)
 ########################################################################
 my_data = reg_data
 #########################################################################
 # check factor of each column
 lapply(my_data, class)
 character_vars <- lapply(my_data, class) == "character"
 character_vars
 table(character_vars)
 factor_vars <- lapply(my_data, class) == "factor"
 table(factor_vars)
 my_data[, character_vars] <- lapply(my_data[, character_vars], as.factor)
 factor_vars <- lapply(my_data, class) == "factor"
 factor_vars
 table(factor_vars)
 # check again
 lapply(my_data, class)
 table(my_data$ethnicity)
 my_data$ethnicity = as.factor(my_data$ethnicity)
 class(my_data$ethnicity)
 colnames(my_data)
 reg_param = c("age"
              , "age_bins"
              #, "death" # outcome
              , "asthma"
              , "obesity"
              , "gender"
              , "los"
              , "o2_sat_admis"
              #, "logistic_outcome"
              #, "steroid_ics"
              , "ethnicity"
              , "smoking"
              , "sfluv"
              , "h1n1v"
              , "ia_cxr"
              , "max_resp_score"
              , "T1_resp_score"
              , "com_noasthma"
              , "onset_initial_bin")
 for(i in reg_param) { 
  # print (i)
  p_form = as.formula(paste("death ~ ", i ,sep = ""))
  model_reg = glm(p_form , family = binomial, data = my_data)
  print(summary(model_reg))
  print(exp(cbind(OR = coef(model_reg), confint(model_reg))))
  #print (PseudoR2(model_reg))
  cat("=================================================================================\n")
 }
 full_mod = glm(death ~ asthma +
                 gender + 
                 age_bins + 
                 los + 
                 #ethnicity +
                 onset_initial_bin +
                 o2_sat_bin +
                 com_noasthma +
                 obesity +
                 #ia_cxr +
                 smoking +
                 #sfluv +
                 #h1n1v
                 max_resp_score +
                 T1_resp_score +
               , family = "binomial", data = my_data)
 summary(full_mod)