diff --git a/data_extraction_formatting_clinical.R b/data_extraction_formatting_clinical.R
new file mode 100644
index 0000000..c6c5add
--- /dev/null
+++ b/data_extraction_formatting_clinical.R
@@ -0,0 +1,323 @@
+#!/usr/bin/Rscript   
+getwd()
+setwd('~/git/mosaic_2020/')
+getwd()
+########################################################################
+# TASK: Extract relevant columns from mosaic adults data
+# npa
+########################################################################
+#====================
+# Input: source data
+#====================
+source("read_data.R")
+
+# extract the flu positive population
+fp_adults = adult_df[adult_df$flustat == 1,]
+
+table(adult_df$ia_exac_copd)
+table(adult_df$ia_exac_copd==1 & adult_df$asthma == 1) # check this is 4 
+
+table(fp_adults$ia_exac_copd==1 & fp_adults$asthma == 1) # check this is 3
+
+# clear unnecessary variables
+rm(all_df)
+rm(adult_df)
+########################################################################
+cols_to_extract = c("mosaic"
+                    , "ia_exac_copd"
+                    , "death"
+                    #, "obese2" #inc peaeds, but once you subset data for adults, its the same!
+                    , "obesity"
+                    , "flustat"
+                    , "sfluv"
+                    , "h1n1v"
+                    , "age"
+                    , "gender"
+                    , "asthma"
+                    , "vl_pfu_ul_npa1"
+                    , "los"
+                    , "onset2final"
+                    , "onsfindeath"
+                    , "onset_2_initial"
+                    , "o2_sat_admis"
+                    , "o2_sat_suppl"
+                    , "ethnicity"
+                    , "smoking"
+                    , "ia_cxr"
+                    , "max_resp_score"
+                    , "T1_resp_score"
+                    , "com_noasthma"
+                    , "T2_resp_score"
+                    , "inresp_sev"
+                    , "steroid")
+
+#npa_data = 
+
+reg_data = fp_adults[, cols_to_extract]
+
+# sanity checks
+table(reg_data$obesity)
+#table(reg_data$obese2)
+
+table(reg_data$age>=18)
+table(reg_data$death)
+table(reg_data$asthma)
+table(reg_data$ia_exac_copd)
+
+########################################################################
+# Reassign the copd and asthma status and do some checks 
+table(reg_data$ia_exac_copd); sum(is.na(reg_data$ia_exac_copd))
+
+reg_data$ia_exac_copd[reg_data$ia_exac_copd< 1]<- 0
+reg_data$ia_exac_copd[is.na(reg_data$ia_exac_copd)] <- 0
+
+table(reg_data$ia_exac_copd); sum(is.na(reg_data$ia_exac_copd))
+
+# check copd and asthma status
+table(reg_data$ia_exac_copd, reg_data$asthma)
+check_copd_and_asthma_1<- subset(reg_data, ia_exac_copd ==1 & asthma == 1) # check this is 3 
+
+# reassign these 4 so these are treated as non-asthmatics as copd  with asthma is not TRUE asthma
+reg_data$asthma[reg_data$ia_exac_copd == 1 & reg_data$asthma == 1]= 0 
+table(reg_data$ia_exac_copd, reg_data$asthma)
+
+foo<- subset(reg_data, asthma==1 & ia_exac_copd ==1) # check that its 0
+
+rm(check_copd_and_asthma_1, foo)
+#=====================================================================
+# count the resp scores 
+max_resp_score_table<- table(reg_data$max_resp_score)
+max_resp_score_table
+
+T1_resp_score_table<- table(reg_data$T1_resp_score)
+T1_resp_score_table
+
+T2_resp_score_table<- table(reg_data$T2_resp_score)
+T2_resp_score_table
+
+Inresp_sev<- table(reg_data$inresp_sev)
+Inresp_sev
+
+# Reassign the resp score so all 4 are replace by 3
+reg_data$max_resp_score[reg_data$max_resp_score ==4 ] <- 3
+revised_resp_score_table<- table(reg_data$max_resp_score)
+revised_resp_score_table
+
+reg_data$T1_resp_score[reg_data$T1_resp_score ==4 ] <- 3
+revised_T1_resp_score_table<- table(reg_data$T1_resp_score)
+revised_T1_resp_score_table
+
+reg_data$T2_resp_score[reg_data$T2_resp_score == 4]<- 3
+revised_T2_resp_score_table<- table(reg_data$T2_resp_score)
+revised_T2_resp_score_table
+
+reg_data$inresp_sev[reg_data$inresp_sev == 4]<- 3
+revised_Inresp_sev<- table(reg_data$inresp_sev)
+revised_Inresp_sev
+#=====================================================================
+# Remove these after checking
+rm(max_resp_score_table, T1_resp_score_table, T2_resp_score_table, Inresp_sev
+   , revised_resp_score_table, revised_T1_resp_score_table, revised_T2_resp_score_table, revised_Inresp_sev)
+#=====================================================================
+
+
+##### age
+# Create categories of variables
+reg_data$age = round(reg_data$age, digits = 0)
+table(reg_data$age)
+table(reg_data$asthma, reg_data$age)
+min(reg_data$age); max(reg_data$age)
+
+library(plyr)
+max_age_interval = round_any(max(reg_data$age), 10, f = ceiling)  
+max_age_interval 
+
+#age_bins = cut(reg_data$age, c(0,18,30,40,50,60,70,80,90))
+age_bins = cut(reg_data$age, c(18, 30, 40, 50, 60, 70, max_age_interval))
+reg_data$age_bins = age_bins
+dim(reg_data) # 133 27
+
+#age_bins (to keep consistent with the results table)
+class(reg_data$age_bins)
+levels(reg_data$age_bins)
+#"(18,30]" "(30,40]" "(40,50]" "(50,60]" "(60,70]" "(70,80]"
+table(reg_data$asthma, reg_data$age_bins)
+#     (18,30] (30,40] (40,50] (50,60] (60,70] (70,80]
+#0      25      17      23      14      10       1
+#1      11       8      14       5       3      2
+
+sum(table(reg_data$asthma, reg_data$age_bins)) == nrow(reg_data)
+
+#reassign
+levels(reg_data$age_bins) <- c("(18,30]","(30,40]","(40,50]","(50,80]","(50,80]","(50,80]")
+table(reg_data$asthma, reg_data$age_bins)
+table(reg_data$asthma, reg_data$age_bins)
+#(18,30] (30,40] (40,50] (50,60]
+#0      25      17      23      25
+#1      11       8      14      10
+
+sum(table(reg_data$asthma, reg_data$age_bins)) == nrow(reg_data)
+
+##### O2 saturation binning
+reg_data$o2_sat_admis = round(reg_data$o2_sat_admis, digits = 0)
+table(reg_data$o2_sat_admis)
+tot_o2 = sum(table(reg_data$o2_sat_admis))- table(reg_data$o2_sat_admis)[["-1"]]
+tot_o2
+
+o2_sat_bin = cut(reg_data$o2_sat_admis, c(0,92,100))
+reg_data$o2_sat_bin = o2_sat_bin
+table(reg_data$o2_sat_bin)
+
+sum(table(reg_data$o2_sat_bin)) == tot_o2
+
+##### Onset to initial binning = "(==not inclusive)
+max_in = max(reg_data$onset_2_initial); max_in #23
+min_in = min(reg_data$onset_2_initial) - 1 ; min_in  # -6
+
+tot_onset2ini = sum(table(reg_data$onset_2_initial))
+tot_onset2ini
+
+onset_initial_bin = cut(reg_data$onset_2_initial, c(min_in, 4, max_in))
+reg_data$onset_initial_bin = onset_initial_bin
+
+sum(table(reg_data$onset_initial_bin)) == tot_onset2ini
+
+#=======================
+# seasonal flu: sfluv
+#=======================
+# should be a factor
+if (! is.factor(reg_data$sfluv)){
+  reg_data$sfluv  = as.factor(reg_data$sfluv)
+}
+class(reg_data$sfluv) #[1] "factor"
+
+levels(reg_data$sfluv)
+table(reg_data$asthma, reg_data$sfluv)
+# reassign
+levels(reg_data$sfluv) <- c("0", "0", "1")
+table(reg_data$asthma, reg_data$sfluv)
+
+#=======================
+# h1n1v
+#=======================
+# should be a factor
+if (! is.factor(reg_data$h1n1v)){
+  reg_data$h1n1v  = as.factor(reg_data$h1n1v)
+}
+class(reg_data$h1n1v) #[1] "factor"
+
+levels(reg_data$h1n1v)
+table(reg_data$asthma, reg_data$h1n1v)
+# reassign
+levels(reg_data$h1n1v) <- c("0", "0", "1")
+table(reg_data$asthma, reg_data$h1n1v)
+
+#=======================
+# ethnicity
+#=======================
+class(reg_data$ethnicity) # integer
+table(reg_data$asthma, reg_data$ethnicity)
+
+reg_data$ethnicity[reg_data$ethnicity == 4] <- 2
+table(reg_data$asthma, reg_data$ethnicity)
+
+#=======================
+# pneumonia
+#=======================
+class(reg_data$ia_cxr) # integer
+# ia_cxr 2 ---> yes pneumonia (1)
+# 1 ---> no (0)
+# ! 1 or 2 -- > "unkown"
+
+# reassign the pneumonia codes
+#0: not performed 
+#1: normal
+#2: findings consistent with pneumonia
+#3: abnormal
+#-1: not recorded
+#-2: n/a specified by the clinician # not in the data...
+#-3: unknown specified by clinician
+
+
+table(reg_data$ia_cxr)
+#-3 -1  0  1  2  3 
+#5 48 13 47 17  3 
+
+# change these first else recoding 0 will be a problem as 0 already exists, mind you -2 categ doesn't exist
+reg_data$ia_cxr[reg_data$ia_cxr == -3 | reg_data$ia_cxr == -1 | reg_data$ia_cxr == 0 | reg_data$ia_cxr == 3 ] <- ""
+table(reg_data$ia_cxr)
+#    1  2 
+#69 47 17
+
+reg_data$ia_cxr[reg_data$ia_cxr == 1] <- 0
+reg_data$ia_cxr[reg_data$ia_cxr == 2] <- 1
+table(reg_data$ia_cxr)
+#    0  1 
+#69 47 17 
+
+#=======================
+# smoking [tricky one]
+#=======================
+class(reg_data$smoking) # integer
+table(reg_data$asthma, reg_data$smoking)
+
+# orig
+#   -3 -1  1  2  3   4
+#0  15 9  22  2  15  30
+#1  4  2  13  0  4   17
+
+#  -3 -1  1  2  3  4
+#0 14  9 20  2 15 30
+#1  5  2 15  0  4 17
+
+# never smoking, 4 and 2 -- > no (0)
+#1 and 3 ---> yes (1)
+#!-3 and -1 ---- > NA
+
+################# smoking
+
+#1: current daily ===> categ smoker(1)
+#2: occasional =====> categ no smoker(0)
+#3: ex-smoker ===> categ smoker(1)
+#4: never =====> categ no smoker(0)
+#-1: not recorded =====> categ blank (NA)
+#-2: n/a specified by the clinician =====> categ blank (NA)
+#-3: unknown specified by clinician=====> categ blank (NA)
+
+table(reg_data$smoking)
+#-3 -1  1  2  3  4 
+#19 11 35  2 19 47 
+
+# reassign the smoking codes
+reg_data$smoking[reg_data$smoking == 4 | reg_data$smoking == 2 ] <- 0
+reg_data$smoking[reg_data$smoking == 1 | reg_data$smoking == 3 ] <- 1
+reg_data$smoking[reg_data$smoking == -1 | reg_data$smoking == -2 | reg_data$smoking == -3 ] <- ""
+
+table(reg_data$smoking)
+#    0  1 
+#30 49 54  
+
+table(reg_data$asthma, reg_data$smoking)
+
+# orig
+#      0  1
+#0 24 32 37
+#1  6 17 17
+
+#      0  1
+#0 23 32 35
+#1  7 17 19
+
+################################################################
+#==================
+# writing output file
+#==================
+outfile_name_reg = "reg_data_recoded_with_NA.csv"
+outfile_reg = paste0(outdir, outfile_name_reg)
+
+cat("Writing clinical file for regression:", outfile_reg)
+
+#write.csv(reg_data, file = outfile_reg)
+################################################################
+
+rm(age_bins, max_age_interval, max_in, min_in, o2_sat_bin, onset_initial_bin, tot_o2, tot_onset2ini, meta_data_cols)
diff --git a/logistic_regression.R b/logistic_regression.R
new file mode 100755
index 0000000..d483858
--- /dev/null
+++ b/logistic_regression.R
@@ -0,0 +1,97 @@
+#!/usr/bin/Rscript   
+getwd()
+setwd('~/git/mosaic_2020/')
+getwd()
+########################################################################
+# TASK: Run regression analysis
+# npa
+########################################################################
+#=================================================================================
+# TO DO:
+# Simple stats b/w obesity and non-obesity to consider including in reg analysis
+# Include NPA statistically sign params
+# Rerun graphs and plots without asthma?
+#=================================================================================
+
+#====================
+# Input: source data
+#====================
+source("data_extraction_formatting_clinical.R")
+
+rm(fp_adults, metadata_all)
+
+########################################################################
+my_data = reg_data
+#########################################################################
+# check factor of each column
+lapply(my_data, class)
+
+character_vars <- lapply(my_data, class) == "character"
+character_vars
+table(character_vars)
+
+factor_vars <- lapply(my_data, class) == "factor"
+table(factor_vars)
+
+my_data[, character_vars] <- lapply(my_data[, character_vars], as.factor)
+factor_vars <- lapply(my_data, class) == "factor"
+factor_vars
+table(factor_vars)
+
+# check again
+lapply(my_data, class)
+
+table(my_data$ethnicity)
+my_data$ethnicity = as.factor(my_data$ethnicity)
+class(my_data$ethnicity)
+
+colnames(my_data)
+reg_param = c("age"
+              , "age_bins"
+              #, "death" # outcome
+              , "asthma"
+              , "obesity"
+              , "gender"
+              , "los"
+              , "o2_sat_admis"
+              #, "logistic_outcome"
+              #, "steroid_ics"
+              , "ethnicity"
+              , "smoking"
+              , "sfluv"
+              , "h1n1v"
+              , "ia_cxr"
+              , "max_resp_score"
+              , "T1_resp_score"
+              , "com_noasthma"
+              , "onset_initial_bin")
+
+for(i in reg_param) { 
+  # print (i)
+  p_form = as.formula(paste("death ~ ", i ,sep = ""))
+  model_reg = glm(p_form , family = binomial, data = my_data)
+  print(summary(model_reg))
+  print(exp(cbind(OR = coef(model_reg), confint(model_reg))))
+  #print (PseudoR2(model_reg))
+  cat("=================================================================================\n")
+}
+
+
+full_mod = glm(death ~ asthma +
+                 gender + 
+                 age_bins + 
+                 los + 
+                 #ethnicity +
+                 onset_initial_bin +
+                 o2_sat_bin +
+                 com_noasthma +
+                 obesity +
+                 #ia_cxr +
+                 smoking +
+                 #sfluv +
+                 #h1n1v
+                 max_resp_score +
+                 T1_resp_score +
+               , family = "binomial", data = my_data)
+
+summary(full_mod)