From 08e01abfb5a730599198015fdb4215164c502222 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 24 Nov 2020 13:48:53 +0000
Subject: [PATCH] correcting dtype for sfluv and h1n1v for data formatting
 clinical

---
 data_extraction_formatting_clinical.R   |  33 ++-
 data_extraction_formatting_regression.R | 329 ------------------------
 logistic_regression.R                   |  54 ++--
 logistic_regression_TEMPLATE.R          |  97 -------
 4 files changed, 52 insertions(+), 461 deletions(-)
 delete mode 100755 data_extraction_formatting_regression.R
 delete mode 100644 logistic_regression_TEMPLATE.R

diff --git a/data_extraction_formatting_clinical.R b/data_extraction_formatting_clinical.R
index 0aef946..0347011 100755
--- a/data_extraction_formatting_clinical.R
+++ b/data_extraction_formatting_clinical.R
@@ -225,36 +225,35 @@ sum(table(clinical_df$onset_initial_bin)) == tot_onset2ini
 #=======================
 # seasonal flu: sfluv
 #=======================
-if (! is.factor(clinical_df$sfluv)){
-  clinical_df$sfluv  = as.factor(clinical_df$sfluv)
-}
-class(clinical_df$sfluv)
-levels(clinical_df$sfluv)
+# reassign as 0 and 1
 
 table(clinical_df$sfluv)
 table(clinical_df$asthma, clinical_df$sfluv)
-
-# reassign
-levels(clinical_df$sfluv) <- c("0", "0", "1")
+clinical_df$sfluv = ifelse(clinical_df$sfluv ==  "yes", 1, 0)
+table(clinical_df$sfluv)
 table(clinical_df$asthma, clinical_df$sfluv)
 
+# convert to integer
+str(clinical_df$sfluv)
+clinical_df$sfluv = as.integer(clinical_df$sfluv)
+str(clinical_df$sfluv)
+
 #=======================
 # h1n1v
 #=======================
-if (! is.factor(clinical_df$h1n1v)){
-  clinical_df$h1n1v  = as.factor(clinical_df$h1n1v)
-}
-
-class(clinical_df$h1n1v)
-levels(clinical_df$h1n1v)
+# reassign as 0 and 1
 
 table(clinical_df$h1n1v)
 table(clinical_df$asthma, clinical_df$h1n1v)
-
-# reassign
-levels(clinical_df$h1n1v) <- c("0", "0", "1")
+clinical_df$h1n1v = ifelse(clinical_df$h1n1v ==  "yes", 1, 0)
+table(clinical_df$h1n1v)
 table(clinical_df$asthma, clinical_df$h1n1v)
 
+# convert to integer
+str(clinical_df$h1n1v)
+clinical_df$h1n1v = as.integer(clinical_df$h1n1v)
+str(clinical_df$h1n1v)
+
 #=======================
 # ethnicity
 #=======================
diff --git a/data_extraction_formatting_regression.R b/data_extraction_formatting_regression.R
deleted file mode 100755
index 14dc360..0000000
--- a/data_extraction_formatting_regression.R
+++ /dev/null
@@ -1,329 +0,0 @@
-#!/usr/bin/Rscript   
-getwd()
-setwd('~/git/mosaic_2020/')
-getwd()
-########################################################################
-# TASK: Extract relevant columns from mosaic adults data
-# npa
-########################################################################
-#====================
-# Input: source data for clinical
-#====================
-source("data_extraction_formatting_clinical.R")
-#source("colnames_clinical_meds.R")
-#=======================================
-# Data for mediator to include in regression
-#=======================================
-cat("Extracting", length(sig_npa_cols), "mediator cols from fp_adults")
-
-med_df = fp_adults[, c("mosaic", sig_npa_cols)]
-
-# sanity checks
-if ( sum(table(clinical_df$obesity)) & sum(table(clinical_df$age>=18)) & sum(table(clinical_df$death)) & sum(table(clinical_df$asthma)) ==  nrow(clinical_df) ){
-  cat("PASS: binary data obs are complete, n =", nrow(clinical_df))
-}else{
-  cat("FAIL: Incomplete data for binary outcomes. Please check and decide!")
-  quit()
-}
-
-table(clinical_df$ia_exac_copd)
-
-########################################################################
-# Data extraction for regression
-########################################################################
-
-common_cols = names(clinical_df)[names(clinical_df)%in%names(med_df)]
-
-cat("\nMerging clinical and mediator data for regression"
-    ,"\nMerging on column:", common_cols)
-
-reg_data = merge(clinical_df, med_df
-                 , by = common_cols)
-
-if (nrow(reg_data) == nrow(clinical_df) & nrow(med_df)){
-  cat("\nNo. of rows match, nrow =", nrow(clinical_df)
-      , "\nChecking ncols...")
-  if ( ncol(reg_data) == ncol(clinical_df) + ncol(med_df)  - length(common_cols) ){
-    cat("\nNo. of cols match, ncol =", ncol(reg_data))
-} else {
-  cat("FAIL: ncols mismatch"
-      , "Expected ncols:", ncol(clinical_df) + ncol(med_df)  - length(common_cols)
-      , "\nGot:", ncol(reg_data))
-  }
-} else {
-  cat("FAIL: nrows mismatch"
-           , "\nExpected nrows:", nrow(fp_adults))
-}
-
-########################################################################
-# Reassign the copd and asthma status and do some checks 
-table(reg_data$ia_exac_copd); sum(is.na(reg_data$ia_exac_copd))
-
-reg_data$ia_exac_copd[reg_data$ia_exac_copd< 1]<- 0
-reg_data$ia_exac_copd[is.na(reg_data$ia_exac_copd)] <- 0
-
-table(reg_data$ia_exac_copd); sum(is.na(reg_data$ia_exac_copd))
-
-# check copd and asthma status
-table(reg_data$ia_exac_copd, reg_data$asthma)
-check_copd_and_asthma_1<- subset(reg_data, ia_exac_copd ==1 & asthma == 1) # check this is 3 
-
-# reassign these 3 so these are treated as non-asthmatics as copd with asthma is NOT TRUE asthma
-reg_data$asthma[reg_data$ia_exac_copd == 1 & reg_data$asthma == 1]= 0 
-table(reg_data$ia_exac_copd, reg_data$asthma)
-
-foo<- subset(reg_data, asthma==1 & ia_exac_copd ==1) # check that its 0
-
-rm(check_copd_and_asthma_1, foo)
-#=====================================================================
-# count the resp scores 
-max_resp_score_table<- table(reg_data$max_resp_score)
-max_resp_score_table
-
-T1_resp_score_table<- table(reg_data$T1_resp_score)
-T1_resp_score_table
-
-T2_resp_score_table<- table(reg_data$T2_resp_score)
-T2_resp_score_table
-
-Inresp_sev<- table(reg_data$inresp_sev)
-Inresp_sev
-
-# Reassign the resp score so all 4 are replace by 3
-reg_data$max_resp_score[reg_data$max_resp_score ==4 ] <- 3
-revised_resp_score_table<- table(reg_data$max_resp_score)
-revised_resp_score_table
-
-reg_data$T1_resp_score[reg_data$T1_resp_score ==4 ] <- 3
-revised_T1_resp_score_table<- table(reg_data$T1_resp_score)
-revised_T1_resp_score_table
-
-reg_data$T2_resp_score[reg_data$T2_resp_score == 4]<- 3
-revised_T2_resp_score_table<- table(reg_data$T2_resp_score)
-revised_T2_resp_score_table
-
-reg_data$inresp_sev[reg_data$inresp_sev == 4]<- 3
-revised_Inresp_sev<- table(reg_data$inresp_sev)
-revised_Inresp_sev
-#=====================================================================
-# Remove these after checking
-rm(max_resp_score_table, T1_resp_score_table, T2_resp_score_table, Inresp_sev
-   , revised_resp_score_table, revised_T1_resp_score_table, revised_T2_resp_score_table, revised_Inresp_sev)
-#=====================================================================
-# Binning
-# "(": not inclusive
-# "]": inclusive
-
-#========
-# age
-#========
-# Create categories of variables
-reg_data$age = round(reg_data$age, digits = 0)
-table(reg_data$age)
-table(reg_data$asthma, reg_data$age)
-min(reg_data$age); max(reg_data$age)
-
-max_age_interval = round_any(max(reg_data$age), 10, f = ceiling)  
-max_age_interval 
-min_age = min(reg_data$age); min_age #19
-min_age_interval = min_age - 1; min_age_interval
-
-#age_bins = cut(reg_data$age, c(0,18,30,40,50,60,70,80,90))
-age_bins = cut(reg_data$age, c(min_age_interval, 30, 40, 50, 60, 70, max_age_interval))
-reg_data$age_bins = age_bins
-dim(reg_data) # 133 27
-
-# age_bins (to keep consistent with the results table)
-class(reg_data$age_bins)
-levels(reg_data$age_bins)
-#"(18,30]" "(30,40]" "(40,50]" "(50,60]" "(60,70]" "(70,80]"
-table(reg_data$asthma, reg_data$age_bins)
-#     (18,30] (30,40] (40,50] (50,60] (60,70] (70,80]
-#0      25      17      25      14      11      1
-#1      11       8      12       5       3      2
-
-if (sum(table(reg_data$asthma, reg_data$age_bins)) == nrow(reg_data) ){
-  cat("PASS: age_bins assigned successfully")
-}else{
-  cat("FAIL: no. mismatch when assigning age_bins")
-  quit()
-}
-
-# reassign
-class(reg_data$age_bins)
-levels(reg_data$age_bins) <- c("(18,30]","(30,40]","(40,50]","(50,80]","(50,80]","(50,80]")
-table(reg_data$asthma, reg_data$age_bins)
-table(reg_data$asthma, reg_data$age_bins)
-#     (18,30] (30,40] (40,50] (50,80]
-#0      25      17      25      26
-#1      11       8      12      9
-
-sum(table(reg_data$asthma, reg_data$age_bins)) == nrow(reg_data)
-
-#===========================
-# O2 saturation binning
-#===========================
-reg_data$o2_sat_admis = round(reg_data$o2_sat_admis, digits = 0)
-table(reg_data$o2_sat_admis)
-tot_o2 = sum(table(reg_data$o2_sat_admis))- table(reg_data$o2_sat_admis)[["-1"]]
-tot_o2
-
-o2_sat_bin = cut(reg_data$o2_sat_admis, c(0,92,100))
-reg_data$o2_sat_bin = o2_sat_bin
-table(reg_data$o2_sat_bin)
-
-sum(table(reg_data$o2_sat_bin)) == tot_o2
-
-#===========================
-# Onset to initial binning
-#===========================
-max_in = max(reg_data$onset_2_initial); max_in #23
-min_in = min(reg_data$onset_2_initial) - 1 ; min_in  # -6
-
-tot_onset2ini = sum(table(reg_data$onset_2_initial))
-tot_onset2ini
-
-onset_initial_bin = cut(reg_data$onset_2_initial, c(min_in, 4, max_in))
-reg_data$onset_initial_bin = onset_initial_bin
-
-sum(table(reg_data$onset_initial_bin)) == tot_onset2ini
-
-#=======================
-# seasonal flu: sfluv
-#=======================
-# should be a factor
-if (! is.factor(reg_data$sfluv)){
-  reg_data$sfluv  = as.factor(reg_data$sfluv)
-}
-class(reg_data$sfluv) #[1] "factor"
-
-levels(reg_data$sfluv)
-table(reg_data$asthma, reg_data$sfluv)
-# reassign
-levels(reg_data$sfluv) <- c("0", "0", "1")
-table(reg_data$asthma, reg_data$sfluv)
-
-#=======================
-# h1n1v
-#=======================
-# should be a factor
-if (! is.factor(reg_data$h1n1v)){
-  reg_data$h1n1v  = as.factor(reg_data$h1n1v)
-}
-class(reg_data$h1n1v) #[1] "factor"
-
-levels(reg_data$h1n1v)
-table(reg_data$asthma, reg_data$h1n1v)
-# reassign
-levels(reg_data$h1n1v) <- c("0", "0", "1")
-table(reg_data$asthma, reg_data$h1n1v)
-
-#=======================
-# ethnicity
-#=======================
-class(reg_data$ethnicity) # integer
-table(reg_data$asthma, reg_data$ethnicity)
-
-reg_data$ethnicity[reg_data$ethnicity == 4] <- 2
-table(reg_data$asthma, reg_data$ethnicity)
-
-#=======================
-# pneumonia
-#=======================
-class(reg_data$ia_cxr) # integer
-# ia_cxr 2 ---> yes pneumonia (1)
-# 1 ---> no (0)
-# ! 1 or 2 -- > "unkown"
-
-# reassign the pneumonia codes
-#0: not performed 
-#1: normal
-#2: findings consistent with pneumonia
-#3: abnormal
-#-1: not recorded
-#-2: n/a specified by the clinician # not in the data...
-#-3: unknown specified by clinician
-
-
-table(reg_data$ia_cxr)
-#-3 -1  0  1  2  3 
-#5 48 13 47 17  3 
-
-# change these first else recoding 0 will be a problem as 0 already exists, mind you -2 categ doesn't exist
-reg_data$ia_cxr[reg_data$ia_cxr == -3 | reg_data$ia_cxr == -1 | reg_data$ia_cxr == 0 | reg_data$ia_cxr == 3 ] <- ""
-table(reg_data$ia_cxr)
-#    1  2 
-#69 47 17
-
-reg_data$ia_cxr[reg_data$ia_cxr == 1] <- 0
-reg_data$ia_cxr[reg_data$ia_cxr == 2] <- 1
-table(reg_data$ia_cxr)
-#    0  1 
-#69 47 17 
-
-#=======================
-# smoking [tricky one]
-#=======================
-class(reg_data$smoking) # integer
-table(reg_data$asthma, reg_data$smoking)
-
-# orig
-#   -3 -1  1  2  3   4
-#0  15 9  22  2  15  30
-#1  4  2  13  0  4   17
-
-#  -3 -1  1  2  3  4
-#0 14  9 20  2 15 30
-#1  5  2 15  0  4 17
-
-# never smoking, 4 and 2 -- > no (0)
-#1 and 3 ---> yes (1)
-#!-3 and -1 ---- > NA
-
-################# smoking
-
-#1: current daily ===> categ smoker(1)
-#2: occasional =====> categ no smoker(0)
-#3: ex-smoker ===> categ smoker(1)
-#4: never =====> categ no smoker(0)
-#-1: not recorded =====> categ blank (NA)
-#-2: n/a specified by the clinician =====> categ blank (NA)
-#-3: unknown specified by clinician=====> categ blank (NA)
-
-table(reg_data$smoking)
-#-3 -1  1  2  3  4 
-#19 11 35  2 19 47 
-
-# reassign the smoking codes
-reg_data$smoking[reg_data$smoking == 4 | reg_data$smoking == 2 ] <- 0
-reg_data$smoking[reg_data$smoking == 1 | reg_data$smoking == 3 ] <- 1
-reg_data$smoking[reg_data$smoking == -1 | reg_data$smoking == -2 | reg_data$smoking == -3 ] <- ""
-
-table(reg_data$smoking)
-#    0  1 
-#30 49 54  
-
-table(reg_data$asthma, reg_data$smoking)
-
-# orig
-#      0  1
-#0 24 32 37
-#1  6 17 17
-
-#      0  1
-#0 23 32 35
-#1  7 17 19
-
-################################################################
-#==================
-# writing output file
-#==================
-outfile_name_reg = "reg_data_recoded_with_NA.csv"
-outfile_reg = paste0(outdir, outfile_name_reg)
-
-cat("Writing clinical file for regression:", outfile_reg)
-
-#write.csv(reg_data, file = outfile_reg)
-################################################################
-
-rm(age_bins, max_age_interval, max_in, min_in, o2_sat_bin, onset_initial_bin, tot_o2, tot_onset2ini, meta_data_cols)
diff --git a/logistic_regression.R b/logistic_regression.R
index e0d7a17..75f5470 100644
--- a/logistic_regression.R
+++ b/logistic_regression.R
@@ -4,14 +4,8 @@ setwd('~/git/mosaic_2020/')
 getwd()
 ########################################################################
 # TASK: Run regression analysis
-# npa
+# clinical params and npa meds
 ########################################################################
-#=================================================================================
-# TO DO:
-# Simple stats b/w obesity and non-obesity to consider including in reg analysis
-# Include NPA statistically sign params
-# Rerun graphs and plots without asthma?
-#=================================================================================
 
 #====================
 # Input: source data
@@ -25,27 +19,29 @@ table(fp_adults_na$ia_exac_copd==1 & fp_adults_na$asthma == 1)
 
 table(clinical_df_ics$asthma)
 
+if ( length(cols_to_extract) == length(clinical_cols) + length(sig_npa_cols) ){
+  cat("PASS: extracting clinical and sign npa cols")
+} else{
+  cat("FAIL: could not find cols to extract")
+  quit()
+}
+
+fp_adults_reg = fp_adults[, cols_to_extract]
+fp_adults_reg_na = fp_adults_na[, cols_to_extract]
+
 #--------------------
 # Data reassignment
 #--------------------
-my_data = clinical_df_ics
-my_data_na = clinical_df_ics_na
+my_data = fp_adults_reg
+my_data_na = fp_adults_reg_na
 
 table(my_data$ia_exac_copd==1 & my_data$asthma == 1)
 table(my_data_na$ia_exac_copd==1 & my_data_na$asthma == 1)
 
 # clear variables
-#rm(fp_adults, fp_adults_na)
+rm(fp_adults, fp_adults_na, clinical_df_ics, clinical_df_ics_na)
 
 #########################################################################
-
-if ( names(which(lapply(my_data, class) == "character")) == "mosaic" ){
-  cat("Character class for 1 column only:", "mosaic")
-}else{
-  cat("More than one character class detected: Resolve!")
-  quit()
-}
-
 #============================
 # Identifying column types: Reg data
 #===========================
@@ -57,6 +53,23 @@ my_vars = colnames(my_reg_data)
 my_vars
 
 lapply(my_reg_data, class)
+
+check_int_vars = my_vars[lapply(my_reg_data, class)%in%c("integer")]
+check_num_vars = my_vars[lapply(my_reg_data, class)%in%c("numeric")]
+check_charac_vars =  my_vars[lapply(my_reg_data, class)%in%c("character")]
+check_factor_vars =  my_vars[lapply(my_reg_data, class)%in%c("factor")]
+
+cat("\nNo. of int cols:", length(check_int_vars)
+    , "\nNo. of num cols:", length(check_num_vars)
+    , "\nNo. of char cols:", length(check_charac_vars)
+    , "\nNo. of factor cols:", length(check_factor_vars)
+    )
+
+# convert char vals to int as these should be int
+my_reg_data[,check_charac_vars] =  lapply(my_reg_data[,check_charac_vars], as.integer)
+str(my_reg_data$sfluv)
+
+
 numerical_vars = c("age"
                    , "vl_pfu_ul_npa1"
                    , "los"
@@ -64,6 +77,11 @@ numerical_vars = c("age"
                    , "onsfindeath"
                    , "o2_sat_admis")
 
+
+
+
+
+
 my_reg_data[numerical_vars] <- lapply(my_reg_data[numerical_vars], as.numeric)
 
 my_reg_params = my_vars
diff --git a/logistic_regression_TEMPLATE.R b/logistic_regression_TEMPLATE.R
deleted file mode 100644
index d483858..0000000
--- a/logistic_regression_TEMPLATE.R
+++ /dev/null
@@ -1,97 +0,0 @@
-#!/usr/bin/Rscript   
-getwd()
-setwd('~/git/mosaic_2020/')
-getwd()
-########################################################################
-# TASK: Run regression analysis
-# npa
-########################################################################
-#=================================================================================
-# TO DO:
-# Simple stats b/w obesity and non-obesity to consider including in reg analysis
-# Include NPA statistically sign params
-# Rerun graphs and plots without asthma?
-#=================================================================================
-
-#====================
-# Input: source data
-#====================
-source("data_extraction_formatting_clinical.R")
-
-rm(fp_adults, metadata_all)
-
-########################################################################
-my_data = reg_data
-#########################################################################
-# check factor of each column
-lapply(my_data, class)
-
-character_vars <- lapply(my_data, class) == "character"
-character_vars
-table(character_vars)
-
-factor_vars <- lapply(my_data, class) == "factor"
-table(factor_vars)
-
-my_data[, character_vars] <- lapply(my_data[, character_vars], as.factor)
-factor_vars <- lapply(my_data, class) == "factor"
-factor_vars
-table(factor_vars)
-
-# check again
-lapply(my_data, class)
-
-table(my_data$ethnicity)
-my_data$ethnicity = as.factor(my_data$ethnicity)
-class(my_data$ethnicity)
-
-colnames(my_data)
-reg_param = c("age"
-              , "age_bins"
-              #, "death" # outcome
-              , "asthma"
-              , "obesity"
-              , "gender"
-              , "los"
-              , "o2_sat_admis"
-              #, "logistic_outcome"
-              #, "steroid_ics"
-              , "ethnicity"
-              , "smoking"
-              , "sfluv"
-              , "h1n1v"
-              , "ia_cxr"
-              , "max_resp_score"
-              , "T1_resp_score"
-              , "com_noasthma"
-              , "onset_initial_bin")
-
-for(i in reg_param) { 
-  # print (i)
-  p_form = as.formula(paste("death ~ ", i ,sep = ""))
-  model_reg = glm(p_form , family = binomial, data = my_data)
-  print(summary(model_reg))
-  print(exp(cbind(OR = coef(model_reg), confint(model_reg))))
-  #print (PseudoR2(model_reg))
-  cat("=================================================================================\n")
-}
-
-
-full_mod = glm(death ~ asthma +
-                 gender + 
-                 age_bins + 
-                 los + 
-                 #ethnicity +
-                 onset_initial_bin +
-                 o2_sat_bin +
-                 com_noasthma +
-                 obesity +
-                 #ia_cxr +
-                 smoking +
-                 #sfluv +
-                 #h1n1v
-                 max_resp_score +
-                 T1_resp_score +
-               , family = "binomial", data = my_data)
-
-summary(full_mod)