#!/usr/bin/Rscript   
getwd()
setwd("~/git/mosaic_2020/")
getwd()
############################################################
# TASK: unpaired (time) analysis of clinical data
# data: clincial data of flu positive adult patients
# group: obesity
############################################################
#my_sample_type = "npa"

#=============
# Input
#=============
source("data_extraction_formatting_clinical.R")
str(clinical_df)

clinical_df$sfluv = as.integer(clinical_df$sfluv)
clinical_df$h1n1v  = as.integer(clinical_df$h1n1v)

#=============
# Output
#=============
outfile_name_clin_categ = paste0("flu_stats_clin_categ.csv")
outfile_clin_categ = paste0(outdir_stats, outfile_name_clin_categ)
outfile_clin_categ

########################################################################
# Chisq or fisher's test for clinical data b/w obseity groups
########################################################################
categorical_cols = c( "death"
                   #, "obesity"
                   #, "flustat"
                   , "sfluv"
                   , "h1n1v"
                   , "gender"
                   , "asthma"
                   #, "o2_sat_suppl" ---> not recoded!?
                   , "ethnicity"
                   , "smoking"
                   , "ia_cxr"
                   , "max_resp_score"
                   , "T1_resp_score"
                   , "com_noasthma"
                   , "T2_resp_score"
                   , "inresp_sev"
                   , "steroid")


metadata_cols = c("mosaic", "obesity")

categ_df = clinical_df[, c(metadata_cols, categorical_cols)]

# quick test
tab = table(categ_df$obesity, categ_df$death)
tab
chisq.test(tab)

pivot_cols = metadata_cols
#pivot_cols = metadata_cols[!meta_data_cols%in%cols_to_omit];pivot_cols 
expected_rows_categ_lf = nrow(categ_df) * ( length(categ_df) - length(pivot_cols) ); expected_rows_categ_lf

colnames(categ_df)

keycol <- "clinical_categ_params"
valuecol <- "value"
gathercols <- c("death"
                #, "flustat"
                , "sfluv", "h1n1v"
                , "gender", "asthma"
                #, "o2_sat_suppl"
                , "ethnicity"
                , "smoking", "ia_cxr", "max_resp_score", "T1_resp_score"
                , "com_noasthma", "T2_resp_score", "inresp_sev", "steroid")

categ_lf = gather_(categ_df, keycol, valuecol, gathercols)
#categ_lf = gather(categ_df, clinical_categ_params, value, death:T2_resp_score, factor_key = F)

if( nrow(categ_lf) == expected_rows_categ_lf){
  cat("PASS: long format data created successfully"
      , "\nnrow:", nrow(categ_lf)
      , "\nncol:", ncol(categ_lf))
}

#=============================================
# Chisq test: clinical categorical vars
#https://www.google.com/search?q=chisq+test+on+long+format+data+in+R+using+group+by&source=lmns&bih=828&biw=1280&client=firefox-b-d&hl=en-GB&sa=X&ved=2ahUKEwjItpL7xI7tAhUGTBoKHXlSBa8Q_AUoAHoECAEQAA
#=============================================


#=============================================
# wf data
#=============================================
stats_df = subset(categ_df, select = -c(mosaic))
drop_cols = c("mosaic", "obesity")

my_categ_cols = colnames(categ_df)[!colnames(categ_df)%in%drop_cols]

if ( length(my_categ_cols) == ncol(categ_df) - length(drop_cols) ){
  cat("PASS: variables for chisq test successfully extracted")
}else{
  cat("FAIL: length mismatch when extracting variables for chisq")
  quit()
}

# quick test
tab = table(stats_df$obesity, stats_df$smoking)
tab
test1 = chisq.test(tab)

chisq_df = data.frame()
for (i in my_categ_cols){
  #print(i)
  df = data.frame(clinical_categ_params = NA, n_obs = NA, method = NA, test_statistic = NA, p = NA)
  my_param_val = (get(i, stats_df))
  tab = table(stats_df$obesity, my_param_val)
  print(tab)
  my_chi_test = chisq.test(tab)
  print(my_chi_test)
  
  # extracting results
  my_param_name = i
  my_n_obs = sum(my_chi_test$observed)
  my_method = my_chi_test$method
  my_test_statistic = my_chi_test$statistic[[1]]
  my_pval = my_chi_test$p.value
  
  # assiging to loop df
  df$clinical_categ_params = my_param_name
  df$n_obs = my_n_obs
  df$method = my_method 
  df$test_statistic = my_test_statistic
  df$p = my_pval
  
  print(df)
  chisq_df = rbind(chisq_df, df)
}

chisq_df$p_format = format.pval(chisq_df$p)
chisq_df$p_signif = chisq_df$p

chisq_df = dplyr::mutate(chisq_df, p_signif = case_when(p_signif == 0.05 ~ "."
                                                      , p_signif <=0.0001 ~ '****'
                                                      , p_signif <=0.001 ~ '***'
                                                      , p_signif <=0.01 ~ '**'
                                                      , p_signif <0.05 ~ '*'
                                                      , p_signif == 0.1 ~ '.'
                                                      , TRUE ~ 'ns'))
#=============================================
# Fishers test
#=============================================
# quick test
tab2 = table(stats_df$obesity, stats_df$smoking)
tab2
#test2 = fisher_test(tab2); test2
test3 = fisher.test(stats_df$obesity, stats_df$com_noasthma); test3

stats_df_copy = stats_df
int_vars <- lapply(stats_df_copy, class)%in%c("integer", "numeric")
int_vars
stats_df_copy[, int_vars] <- lapply(stats_df_copy[, int_vars], as.factor)
str(stats_df_copy)

two_lev = lapply(stats_df_copy, nlevels) == 2
fisher_cols_df1  = names(two_lev)[two_lev == TRUE]
fisher_cols_df1

drop_cols = c("mosaic", "obesity")
fisher_cols_df1 = fisher_cols_df1[!fisher_cols_df1%in%drop_cols]
fisher_cols_df1

fisher_df1 = data.frame()

for (i in fisher_cols_df1){
  cat(i, "\n===============\n")
  df = data.frame(clinical_categ_params = NA, n_obs = NA, method = NA, test_statistic = NA, p = NA, ci_low = NA, ci_high = NA)
  my_param_val = (get(i, stats_df))
  tab = table(stats_df$obesity, my_param_val)
  print(tab)
  my_fisher_test = fisher.test(tab)
  print(my_fisher_test)
  
  # extracting results
  my_param_name = i
  my_n_obs = sum(tab)
  my_method = my_fisher_test$method
  my_test_statistic = my_fisher_test$statistic[[1]] # FIXME|
  my_pval = my_fisher_test$p.value
  my_ci_low = my_fisher_test$conf.int[[1]]
  my_ci_hi = my_fisher_test$conf.int[[2]]
  
  # assiging to loop df
  df$clinical_categ_params = my_param_name
  df$n_obs = my_n_obs
  df$method = my_method 
  df$test_statistic = my_test_statistic
  df$p = my_pval
  df$ci_low = my_ci_low
  df$ci_high = my_ci_hi
  
  print(df)
  fisher_df1 = rbind(fisher_df1, df)
}

#=============

three_lev = lapply(stats_df_copy, nlevels) == 3
fisher_cols_df2  = names(three_lev)[three_lev == TRUE]
fisher_cols_df2

#drop_cols = c("mosaic", "obesity")
#fisher_cols_df2 = fisher_cols_df2[!fisher_cols_df2%in%drop_cols]
#fisher_cols_df2

fisher_df2 = data.frame()

for (i in fisher_cols_df2){
  cat(i, "\n===============\n")
  df = data.frame(clinical_categ_params = NA, n_obs = NA, method = NA, test_statistic = NA , p = NA)
  my_param_val = (get(i, stats_df))
  tab = table(stats_df$obesity, my_param_val)
  print(tab)
  my_fisher_test = fisher.test(tab)
  print(my_fisher_test)
  
  # extracting results
  my_param_name = i
  my_n_obs = sum(tab)
  my_method = my_fisher_test$method
  my_test_statistic = ">2 categories"
  my_pval = my_fisher_test$p.value
  
  # assiging to loop df
  df$clinical_categ_params = my_param_name
  df$n_obs = my_n_obs
  df$method = my_method 
  df$test_statistic = my_test_statistic
  df$p = my_pval
  
  print(df)
  fisher_df2 = rbind(fisher_df2, df)
}


########################################################################
#******************
# write output file
#******************
cat("Chisq and fishers test results in:", outfile_clin_categ)
#write.csv(stats_categ_df_f, outfile_clin_categ, row.names = FALSE)