#!/usr/bin/Rscript   
# Show the working directory before and after switching to the project root.
# The source() calls further down use relative paths, so they resolve against
# the directory set here.
# NOTE(review): the project path is hard-coded to one user's home layout;
# the script will fail at setwd() on a machine without this directory.
getwd()
setwd('~/git/mosaic_2020/')
getwd()
########################################################################
# TASK: Extract relevant columns from mosaic adults data
# npa
########################################################################
#====================
# Input: source data
#====================
source("read_data.R")
source("reg_cols_extraction.R")
########################################################################
#==========
# Flu positive adults
#==========
# extract the flu positive population
# which() keeps only rows where the comparison is TRUE. A bare logical index
# would additionally return one all-NA row for every missing flustat value,
# and those phantom rows would then flow into every count below.
fp_adults <- adult_df[which(adult_df$flustat == 1), ]

########################################################################
# overlap of copd exacerbation and asthma, in all adults and in the
# flu positive subset
table(adult_df$ia_exac_copd)
table(adult_df$ia_exac_copd == 1 & adult_df$asthma == 1) # check this is 4

table(fp_adults$ia_exac_copd == 1 & fp_adults$asthma == 1) # check this is 3

# clear unnecessary variables
rm(all_df)
rm(adult_df)


########################################################################
# keep only the regression columns of the flu positive adults
reg_data <- fp_adults[, cols_to_extract]

# sanity checks: print the distribution of the key variables
table(reg_data$obesity)
#table(reg_data$obese2)

# adults only, so every non-missing entry is expected to be TRUE
table(reg_data$age >= 18)

# outcome and respiratory comorbidity flags
table(reg_data$death)
table(reg_data$asthma)
table(reg_data$ia_exac_copd)

########################################################################
# Reassign the copd and asthma status and do some checks
table(reg_data$ia_exac_copd)
sum(is.na(reg_data$ia_exac_copd))

# values below 1 and missing values are both recoded to 0
copd_to_zero <- is.na(reg_data$ia_exac_copd) | reg_data$ia_exac_copd < 1
reg_data$ia_exac_copd[copd_to_zero] <- 0

table(reg_data$ia_exac_copd)
sum(is.na(reg_data$ia_exac_copd))

# check copd and asthma status
table(reg_data$ia_exac_copd, reg_data$asthma)
copd_asthma_overlap <- reg_data$ia_exac_copd == 1 & reg_data$asthma == 1
# rows flagged for both conditions, kept only for interactive inspection
check_copd_and_asthma_1 <- reg_data[which(copd_asthma_overlap), , drop = FALSE] # check this is 3

# reassign the overlapping cases (3 expected here, 4 in all adults) so these
# are treated as non-asthmatics, as copd with asthma is not TRUE asthma
reg_data$asthma[copd_asthma_overlap] <- 0
table(reg_data$ia_exac_copd, reg_data$asthma)

# same selection after the reassignment
foo <- reg_data[which(reg_data$asthma == 1 & reg_data$ia_exac_copd == 1), , drop = FALSE] # check that its 0

rm(check_copd_and_asthma_1, foo, copd_to_zero, copd_asthma_overlap)
#=====================================================================
# count the resp scores
resp_score_cols <- c("max_resp_score", "T1_resp_score", "T2_resp_score", "inresp_sev")

# distribution of every score before recoding
for (score_col in resp_score_cols) {
  print(table(reg_data[[score_col]]))
}

# Reassign the resp score so all 4 are replaced by 3, printing the revised
# distribution of each score straight after it is recoded
for (score_col in resp_score_cols) {
  reg_data[[score_col]][reg_data[[score_col]] == 4] <- 3
  print(table(reg_data[[score_col]]))
}
#=====================================================================
# Remove the loop helpers after checking
rm(resp_score_cols, score_col)
#=====================================================================


##### age
# Create categories of variables
reg_data$age <- round(reg_data$age, digits = 0)
table(reg_data$age)
table(reg_data$asthma, reg_data$age)
min(reg_data$age); max(reg_data$age)

library(plyr)
# upper break: the oldest age rounded up to the next multiple of 10
# (missing ages are ignored here; they stay NA in the bins and are flagged
# by the row-count checks below)
# NOTE(review): the breaks assume the oldest patient is over 70, and the
# merged "(50,80]" label assumes nobody is over 80 -- confirm against the data
max_age_interval <- round_any(max(reg_data$age, na.rm = TRUE), 10, f = ceiling)
max_age_interval

#age_bins = cut(reg_data$age, c(0,18,30,40,50,60,70,80,90))
# include.lowest = TRUE closes the first interval on the left, so patients
# aged exactly 18 are binned instead of silently becoming NA
age_bins <- cut(reg_data$age, c(18, 30, 40, 50, 60, 70, max_age_interval),
                include.lowest = TRUE)
reg_data$age_bins <- age_bins
dim(reg_data) # 133 27

#age_bins (to keep consistent with the results table)
class(reg_data$age_bins)
levels(reg_data$age_bins)
#"[18,30]" "(30,40]" "(40,50]" "(50,60]" "(60,70]" "(70,80]"
table(reg_data$asthma, reg_data$age_bins)
#     [18,30] (30,40] (40,50] (50,60] (60,70] (70,80]
#0      25      17      23      14      10       1
#1      11       8      14       5       3      2

sum(table(reg_data$asthma, reg_data$age_bins)) == nrow(reg_data)

#reassign: keep the results-table labels and merge the three oldest bins
levels(reg_data$age_bins) <- c("(18,30]","(30,40]","(40,50]","(50,80]","(50,80]","(50,80]")
table(reg_data$asthma, reg_data$age_bins)
#(18,30] (30,40] (40,50] (50,80]
#0      25      17      23      25
#1      11       8      14      10

sum(table(reg_data$asthma, reg_data$age_bins)) == nrow(reg_data)

##### O2 saturation binning
reg_data$o2_sat_admis <- round(reg_data$o2_sat_admis, digits = 0)
table(reg_data$o2_sat_admis)
# number of usable readings: non-missing and not coded -1
# (presumably the "not recorded" code used for other variables in this file)
# Counted directly rather than via table(...)[["-1"]], which stops with
# "subscript out of bounds" when the data contain no -1 at all.
tot_o2 <- sum(reg_data$o2_sat_admis != -1, na.rm = TRUE)
tot_o2

# two bins: (0,92] and (92,100]; anything outside becomes NA
o2_sat_bin <- cut(reg_data$o2_sat_admis, c(0, 92, 100))
reg_data$o2_sat_bin <- o2_sat_bin
table(reg_data$o2_sat_bin)

# TRUE when every usable reading landed in one of the two bins
sum(table(reg_data$o2_sat_bin)) == tot_o2

##### Onset to initial binning; "(" means the bound is not inclusive
# na.rm = TRUE: a single missing value would otherwise turn both outer
# breaks into NA, and cut() would then return NA for every row
max_in <- max(reg_data$onset_2_initial, na.rm = TRUE); max_in #23
# one below the minimum, so the smallest value falls inside the first bin
min_in <- min(reg_data$onset_2_initial, na.rm = TRUE) - 1 ; min_in  # -6

# number of non-missing values, used to verify the binning below
tot_onset2ini <- sum(table(reg_data$onset_2_initial))
tot_onset2ini

# two bins: (min_in, 4] and (4, max_in]
onset_initial_bin <- cut(reg_data$onset_2_initial, c(min_in, 4, max_in))
reg_data$onset_initial_bin <- onset_initial_bin

# TRUE when every non-missing value was assigned to a bin
sum(table(reg_data$onset_initial_bin)) == tot_onset2ini

#=======================
# seasonal flu (sfluv) and h1n1v
#=======================
# both columns get the same treatment, so they are handled in one loop
for (flu_col in c("sfluv", "h1n1v")) {
  # should be a factor
  if (!is.factor(reg_data[[flu_col]])) {
    reg_data[[flu_col]] <- as.factor(reg_data[[flu_col]])
  }
  print(class(reg_data[[flu_col]])) #[1] "factor"

  print(levels(reg_data[[flu_col]]))
  print(table(reg_data$asthma, reg_data[[flu_col]]))
  # reassign: the first two levels collapse into "0", the third becomes "1"
  # (this requires the factor to have exactly three levels)
  levels(reg_data[[flu_col]]) <- c("0", "0", "1")
  print(table(reg_data$asthma, reg_data[[flu_col]]))
}
rm(flu_col)

#=======================
# ethnicity
#=======================
class(reg_data$ethnicity) # integer
table(reg_data$asthma, reg_data$ethnicity)

# fold code 4 into code 2
reg_data$ethnicity <- replace(reg_data$ethnicity, reg_data$ethnicity == 4, 2)
table(reg_data$asthma, reg_data$ethnicity)

#=======================
# pneumonia
#=======================
class(reg_data$ia_cxr) # integer
# target coding:
# ia_cxr 2 ---> yes pneumonia (1)
# ia_cxr 1 ---> no (0)
# anything other than 1 or 2 ---> "unknown" (blank)

# original pneumonia codes
#0: not performed
#1: normal
#2: findings consistent with pneumonia
#3: abnormal
#-1: not recorded
#-2: n/a specified by the clinician # not in the data...
#-3: unknown specified by clinician


table(reg_data$ia_cxr)
#-3 -1  0  1  2  3
#5 48 13 47 17  3

# blank out the uninformative codes first, otherwise recoding 1 to 0 would
# collide with the existing 0 code; mind you, the -2 category doesn't exist
# Assigning "" turns the whole column into character from here on.
reg_data$ia_cxr[reg_data$ia_cxr %in% c(-3, -1, 0, 3)] <- ""
table(reg_data$ia_cxr)
#    1  2
#69 47 17

# the column is character now, so compare and assign as strings
reg_data$ia_cxr[reg_data$ia_cxr == "1"] <- "0"
reg_data$ia_cxr[reg_data$ia_cxr == "2"] <- "1"
table(reg_data$ia_cxr)
#    0  1
#69 47 17

#=======================
# smoking [tricky one]
#=======================
class(reg_data$smoking) # integer
table(reg_data$asthma, reg_data$smoking)

# orig
#   -3 -1  1  2  3   4
#0  15 9  22  2  15  30
#1  4  2  13  0  4   17

#  -3 -1  1  2  3  4
#0 14  9 20  2 15 30
#1  5  2 15  0  4 17

# never smoking, 4 and 2 -- > no (0)
#1 and 3 ---> yes (1)
#-3 and -1 ---- > NA

################# smoking

#1: current daily ===> categ smoker(1)
#2: occasional =====> categ no smoker(0)
#3: ex-smoker ===> categ smoker(1)
#4: never =====> categ no smoker(0)
#-1: not recorded =====> categ blank (NA)
#-2: n/a specified by the clinician =====> categ blank (NA)
#-3: unknown specified by clinician=====> categ blank (NA)

table(reg_data$smoking)
#-3 -1  1  2  3  4
#19 11 35  2 19 47

# reassign the smoking codes
# non-smokers first, then smokers; the two steps cannot collide because
# 0 is not one of the smoker codes
reg_data$smoking[reg_data$smoking %in% c(2, 4)] <- 0
reg_data$smoking[reg_data$smoking %in% c(1, 3)] <- 1
# blank out the unknown codes; assigning "" turns the column into character
reg_data$smoking[reg_data$smoking %in% c(-1, -2, -3)] <- ""

table(reg_data$smoking)
#    0  1
#30 49 54

table(reg_data$asthma, reg_data$smoking)

# orig
#      0  1
#0 24 32 37
#1  6 17 17

#      0  1
#0 23 32 35
#1  7 17 19

################################################################
#==================
# writing output file
#==================
# output path is the output directory followed directly by the file name
outfile_name_reg <- "reg_data_recoded_with_NA.csv"
outfile_reg <- paste0(outdir, outfile_name_reg)

cat("Writing clinical file for regression:", outfile_reg)

# NOTE(review): the write below is commented out, so the message above is
# printed but no file is produced
#write.csv(reg_data, file = outfile_reg)
################################################################

# drop the helper objects, leaving the recoded reg_data behind
rm(
  age_bins, max_age_interval,
  o2_sat_bin, tot_o2,
  max_in, min_in, onset_initial_bin, tot_onset2ini,
  meta_data_cols
)