mosaic_2020/flu_stats_contingency.R

217 lines
7.1 KiB
R
Executable file

#!/usr/bin/Rscript
getwd()
setwd("~/git/mosaic_2020/")
getwd()
############################################################
# TASK: Contingency table analysis i.e chisq and fishers
# data: clincial data of flu positive adult patients
# group: obesity
# Chisq test
#https://www.google.com/search?q=chisq+test+on+long+format+data+in+R+using+group+by&source=lmns&bih=828&biw=1280&client=firefox-b-d&hl=en-GB&sa=X&ved=2ahUKEwjItpL7xI7tAhUGTBoKHXlSBa8Q_AUoAHoECAEQAA
############################################################
#=============
# Input
#=============
source("data_extraction_formatting_clinical.R")
str(clinical_df_ics)
clinical_df_ics$sfluv = as.integer(clinical_df_ics$sfluv)
clinical_df_ics$h1n1v = as.integer(clinical_df_ics$h1n1v)
#=============
# Output
#=============
outfile_name_clin_categ = paste0("flu_stats_clin_categ.csv")
outfile_clin_categ = paste0(outdir_stats, outfile_name_clin_categ)
outfile_clin_categ
########################################################################
# Chisq or fisher's test for clinical data b/w obseity groups
########################################################################
categorical_cols = c( "death"
#, "obesity"
#, "flustat"
, "sfluv"
, "h1n1v"
, "gender"
, "asthma"
#, "o2_sat_suppl" ---> not recoded!?
, "ethnicity"
, "smoking"
, "ia_cxr"
, "max_resp_score"
, "T1_resp_score"
, "com_noasthma"
, "T2_resp_score"
, "inresp_sev"
, "steroid"
, "age_bins"
, "o2_sat_bin"
, "onset_initial_bin"
, "t1_resp_recoded"
, "steroid_ics")
metadata_cols = c("mosaic", "obesity")
categ_df = clinical_df_ics[, c(metadata_cols, categorical_cols)]
# quick test
tab = table(categ_df$obesity, categ_df$death)
tab
chisq.test(tab)
#========================================================
# Data: for chisq and fisher
#========================================================
stats_df = subset(categ_df, select = -c(mosaic))
my_categ_cols = colnames(categ_df)[!colnames(categ_df)%in%metadata_cols]
if ( length(my_categ_cols) == ncol(categ_df) - length(metadata_cols) ){
cat("PASS: variables for chisq test successfully extracted")
}else{
cat("FAIL: length mismatch when extracting variables for chisq")
quit()
}
########################################################################
#--------------------
# chisq test
#--------------------
chisq_df = data.frame()
for (i in my_categ_cols){
#print(i)
df = data.frame(clinical_categ_params = NA
, n_obs = NA
, method = NA
#, test_statistic = NA
, p = NA)
my_param_val = (get(i, stats_df))
tab = table(stats_df$obesity, my_param_val)
print(tab)
my_chi_test = chisq.test(tab)
print(my_chi_test)
# extracting results
my_param_name = i
my_n_obs = sum(my_chi_test$observed)
my_method = my_chi_test$method
#my_test_statistic = my_chi_test$statistic[[1]]
my_pval = my_chi_test$p.value
# assiging to loop df
df$clinical_categ_params = my_param_name
df$n_obs = my_n_obs
df$method = my_method
#df$test_statistic = my_test_statistic
df$p = my_pval
print(df)
chisq_df = rbind(chisq_df, df)
}
# formatting
chisq_df$p_format = format.pval(chisq_df$p)
chisq_df$p_signif = chisq_df$p
chisq_df = dplyr::mutate(chisq_df, p_signif = case_when(p_signif == 0.05 ~ "."
, p_signif <=0.0001 ~ '****'
, p_signif <=0.001 ~ '***'
, p_signif <=0.01 ~ '**'
, p_signif <0.05 ~ '*'
, p_signif == 0.1 ~ '.'
, TRUE ~ 'ns'))
# quick test
tab = table(stats_df$obesity, stats_df$smoking); tab
test1 = chisq.test(tab); test1
test2 = chisq_test(tab); test2 #rstatix
sum(tab)
#================================================================================
#--------------------
# Fishers test: without any OR
# see standalone script for ORs
#--------------------
fisher_df = data.frame()
for (i in my_categ_cols){
cat(i, "\n===============\n")
df = data.frame(clinical_categ_params = NA
, n_obs = NA
, method = NA
#, test_statistic = NA
, p = NA)
my_param_val = (get(i, stats_df))
tab = table(stats_df$obesity, my_param_val)
print(tab)
my_fisher_test = fisher.test(tab)
print(my_fisher_test)
# extracting results
my_param_name = i
my_n_obs = sum(tab)
my_method = my_fisher_test$method
#my_test_statistic = my_fisher_test$estimate[[1]]
my_pval = my_fisher_test$p.value
# assiging to loop df
df$clinical_categ_params = my_param_name
df$n_obs = my_n_obs
df$method = my_method
#df$test_statistic = my_test_statistic
df$p = my_pval
print(df)
fisher_df = rbind(fisher_df, df)
}
# formatting
fisher_df$p_format = format.pval(fisher_df$p)
fisher_df$p_signif = fisher_df$p
fisher_df = dplyr::mutate(fisher_df, p_signif = case_when(p_signif == 0.05 ~ "."
, p_signif <=0.0001 ~ '****'
, p_signif <=0.001 ~ '***'
, p_signif <=0.01 ~ '**'
, p_signif <0.05 ~ '*'
, p_signif == 0.1 ~ '.'
, TRUE ~ 'ns'))
# quick test
tab_f = table(stats_df$obesity, stats_df$smoking); tab_f
test1_f = fisher.test(tab); test1_f
test2_f = fisher_test(tab); test2_f #rstatix
sum(tab_f)
test1_f$estimate
#================================================================================
#--------------------------------
# Combining chisq and fishers df
#---------------------------------
if ( all(colnames(chisq_df) == colnames(fisher_df)) && all(dim(chisq_df) == dim(fisher_df)) ){
cat("Colnames AND dim match for both chisq and fisher df"
, "\nNo. of rows:", nrow(chisq_df)
, "\nNo. of cols:", ncol(chisq_df)
, "\nProceeding to rbind")
comb_stats_categ_df = rbind(chisq_df, fisher_df)
cat("\nCombined df dim:")
print(dim(comb_stats_categ_df))
}
# sort df according to p_signif AND clinical_categ_params
comb_stats_categ_df_f = comb_stats_categ_df[order(comb_stats_categ_df$p_signif
, comb_stats_categ_df$clinical_categ_params),]
########################################################################
#******************
# write output file
#******************
cat("Chisq and fishers test results in:", outfile_clin_categ)
write.csv(comb_stats_categ_df_f, outfile_clin_categ, row.names = FALSE)