mosaic_2020/flu_stats_contingency.R

#!/usr/bin/Rscript
getwd()
setwd("~/git/mosaic_2020/")
getwd()
############################################################
# TASK: Contingency table analysis i.e chisq and fishers
# data: clincial data of flu positive adult patients
# group: obesity

# Chisq test
#https://www.google.com/search?q=chisq+test+on+long+format+data+in+R+using+group+by&source=lmns&bih=828&biw=1280&client=firefox-b-d&hl=en-GB&sa=X&ved=2ahUKEwjItpL7xI7tAhUGTBoKHXlSBa8Q_AUoAHoECAEQAA
############################################################

#=============
# Input
#=============
source("data_extraction_formatting_clinical.R")
str(clinical_df_ics)

clinical_df_ics$sfluv = as.integer(clinical_df_ics$sfluv)
clinical_df_ics$h1n1v  = as.integer(clinical_df_ics$h1n1v)

#=============
# Output
#=============
outfile_name_clin_categ = paste0("flu_stats_clin_categ.csv")
outfile_clin_categ = paste0(outdir_stats, outfile_name_clin_categ)
outfile_clin_categ

########################################################################
# Chisq or fisher's test for clinical data b/w obseity groups
########################################################################
categorical_cols = c( "death"
                   #, "obesity"
                   #, "flustat"
                   , "sfluv"
                   , "h1n1v"
                   , "gender"
                   , "asthma"
                   #, "o2_sat_suppl" ---> not recoded!?
                   , "ethnicity"
                   , "smoking"
                   , "ia_cxr"
                   , "max_resp_score"
                   , "T1_resp_score"
                   , "com_noasthma"
                   , "T2_resp_score"
                   , "inresp_sev"
                   , "steroid"
                   , "age_bins"
                   , "o2_sat_bin"
                   , "onset_initial_bin"
                   , "t1_resp_recoded"
                   , "steroid_ics")

metadata_cols = c("mosaic", "obesity")

categ_df = clinical_df_ics[, c(metadata_cols, categorical_cols)]

# quick test
tab = table(categ_df$obesity, categ_df$death)
tab
chisq.test(tab)

#========================================================
# Data: for chisq and fisher
#========================================================
stats_df = subset(categ_df, select = -c(mosaic))

my_categ_cols = colnames(categ_df)[!colnames(categ_df)%in%metadata_cols]

if ( length(my_categ_cols) == ncol(categ_df) - length(metadata_cols) ){
  cat("PASS: variables for chisq test successfully extracted")
}else{
  cat("FAIL: length mismatch when extracting variables for chisq")
  quit()
}
########################################################################
#--------------------
# chisq test
#--------------------
chisq_df = data.frame()
for (i in my_categ_cols){
  #print(i)
  df = data.frame(clinical_categ_params = NA
                  , n_obs = NA
                  , method = NA
                  #, test_statistic = NA
                  , p = NA)

  my_param_val = (get(i, stats_df))
  tab = table(stats_df$obesity, my_param_val)
  print(tab)
  my_chi_test = chisq.test(tab)
  print(my_chi_test)

  # extracting results
  my_param_name = i
  my_n_obs = sum(my_chi_test$observed)
  my_method = my_chi_test$method
  #my_test_statistic = my_chi_test$statistic[[1]]
  my_pval = my_chi_test$p.value

  # assiging to loop df
  df$clinical_categ_params = my_param_name
  df$n_obs = my_n_obs
  df$method = my_method
  #df$test_statistic = my_test_statistic
  df$p = my_pval

  print(df)
  chisq_df = rbind(chisq_df, df)
}

# formatting
chisq_df$p_format = format.pval(chisq_df$p)
chisq_df$p_signif = chisq_df$p

chisq_df = dplyr::mutate(chisq_df, p_signif = case_when(p_signif == 0.05 ~ "."
                                                      , p_signif <=0.0001 ~ '****'
                                                      , p_signif <=0.001 ~ '***'
                                                      , p_signif <=0.01 ~ '**'
                                                      , p_signif <0.05 ~ '*'
                                                      , p_signif == 0.1 ~ '.'
                                                      , TRUE ~ 'ns'))

# quick test
tab = table(stats_df$obesity, stats_df$smoking); tab
test1 = chisq.test(tab); test1
test2 = chisq_test(tab); test2 #rstatix
sum(tab)

#================================================================================
#--------------------
# Fishers test: without any OR
# see standalone script for ORs
#--------------------

fisher_df = data.frame()

for (i in my_categ_cols){
  cat(i, "\n===============\n")
  df = data.frame(clinical_categ_params = NA
                  , n_obs = NA
                  , method = NA
                  #, test_statistic = NA
                  , p = NA)

  my_param_val = (get(i, stats_df))
  tab = table(stats_df$obesity, my_param_val)
  print(tab)
  my_fisher_test = fisher.test(tab)
  print(my_fisher_test)

  # extracting results
  my_param_name = i
  my_n_obs = sum(tab)
  my_method = my_fisher_test$method
  #my_test_statistic = my_fisher_test$estimate[[1]]
  my_pval = my_fisher_test$p.value

  # assiging to loop df
  df$clinical_categ_params = my_param_name
  df$n_obs = my_n_obs
  df$method = my_method
  #df$test_statistic = my_test_statistic
  df$p = my_pval

  print(df)
  fisher_df = rbind(fisher_df, df)
}

# formatting
fisher_df$p_format = format.pval(fisher_df$p)
fisher_df$p_signif = fisher_df$p

fisher_df = dplyr::mutate(fisher_df, p_signif = case_when(p_signif == 0.05 ~ "."
                                                        , p_signif <=0.0001 ~ '****'
                                                        , p_signif <=0.001 ~ '***'
                                                        , p_signif <=0.01 ~ '**'
                                                        , p_signif <0.05 ~ '*'
                                                        , p_signif == 0.1 ~ '.'
                                                        , TRUE ~ 'ns'))

# quick test
tab_f = table(stats_df$obesity, stats_df$smoking); tab_f
test1_f = fisher.test(tab); test1_f
test2_f = fisher_test(tab); test2_f #rstatix
sum(tab_f)
test1_f$estimate

#================================================================================
#--------------------------------
# Combining chisq and fishers df
#---------------------------------
if ( all(colnames(chisq_df) == colnames(fisher_df)) &&  all(dim(chisq_df) == dim(fisher_df)) ){
  cat("Colnames AND dim match for both chisq and fisher df"
      , "\nNo. of rows:", nrow(chisq_df)
      , "\nNo. of cols:", ncol(chisq_df)
      , "\nProceeding to rbind")

  comb_stats_categ_df = rbind(chisq_df, fisher_df)

  cat("\nCombined df dim:")
  print(dim(comb_stats_categ_df))

}

# sort df according to p_signif AND clinical_categ_params
comb_stats_categ_df_f =  comb_stats_categ_df[order(comb_stats_categ_df$p_signif
                                                           , comb_stats_categ_df$clinical_categ_params),]
########################################################################
#******************
# write output file
#******************
cat("Chisq and fishers test results in:", outfile_clin_categ)
write.csv(comb_stats_categ_df_f, outfile_clin_categ, row.names = FALSE)