added flu_stats_fisher_only.R containing separate dfs based on var factor levels
This commit is contained in:
parent
74769b9a38
commit
a6cbaab40a
1 changed files with 181 additions and 0 deletions
181
flu_stats_fisher_only.R
Executable file
181
flu_stats_fisher_only.R
Executable file
|
@ -0,0 +1,181 @@
|
||||||
|
#!/usr/bin/Rscript
|
||||||
|
getwd()
|
||||||
|
setwd("~/git/mosaic_2020/")
|
||||||
|
getwd()
|
||||||
|
############################################################
|
||||||
|
# TASK: Contingency table analysis i.e chisq and fishers
|
||||||
|
# data: clincial data of flu positive adult patients
|
||||||
|
# group: obesity
|
||||||
|
|
||||||
|
# Chisq test
|
||||||
|
#https://www.google.com/search?q=chisq+test+on+long+format+data+in+R+using+group+by&source=lmns&bih=828&biw=1280&client=firefox-b-d&hl=en-GB&sa=X&ved=2ahUKEwjItpL7xI7tAhUGTBoKHXlSBa8Q_AUoAHoECAEQAA
|
||||||
|
############################################################
|
||||||
|
|
||||||
|
#=============
|
||||||
|
# Input
|
||||||
|
#=============
|
||||||
|
source("data_extraction_formatting_clinical.R")
|
||||||
|
str(clinical_df_ics)
|
||||||
|
|
||||||
|
clinical_df_ics$sfluv = as.integer(clinical_df_ics$sfluv)
|
||||||
|
clinical_df_ics$h1n1v = as.integer(clinical_df_ics$h1n1v)
|
||||||
|
|
||||||
|
#=============
|
||||||
|
# Output
|
||||||
|
#=============
|
||||||
|
|
||||||
|
########################################################################
|
||||||
|
# Fisher's test for clinical data b/w obesity groups
|
||||||
|
########################################################################
|
||||||
|
categorical_cols = c( "death"
|
||||||
|
#, "obesity"
|
||||||
|
#, "flustat"
|
||||||
|
, "sfluv"
|
||||||
|
, "h1n1v"
|
||||||
|
, "gender"
|
||||||
|
, "asthma"
|
||||||
|
#, "o2_sat_suppl" ---> not recoded!?
|
||||||
|
, "ethnicity"
|
||||||
|
, "smoking"
|
||||||
|
, "ia_cxr"
|
||||||
|
, "max_resp_score"
|
||||||
|
, "T1_resp_score"
|
||||||
|
, "com_noasthma"
|
||||||
|
, "T2_resp_score"
|
||||||
|
, "inresp_sev"
|
||||||
|
, "steroid"
|
||||||
|
, "age_bins"
|
||||||
|
, "o2_sat_bin"
|
||||||
|
, "onset_initial_bin"
|
||||||
|
, "t1_resp_recoded"
|
||||||
|
, "steroid_ics")
|
||||||
|
|
||||||
|
metadata_cols = c("mosaic", "obesity")
|
||||||
|
|
||||||
|
categ_df = clinical_df_ics[, c(metadata_cols, categorical_cols)]
|
||||||
|
|
||||||
|
my_categ_cols = colnames(categ_df)[!colnames(categ_df)%in%metadata_cols]
|
||||||
|
|
||||||
|
if ( length(my_categ_cols) == ncol(categ_df) - length(metadata_cols) ){
|
||||||
|
cat("PASS: variables for chisq test successfully extracted")
|
||||||
|
}else{
|
||||||
|
cat("FAIL: length mismatch when extracting variables for chisq")
|
||||||
|
quit()
|
||||||
|
}
|
||||||
|
|
||||||
|
#========================================================
|
||||||
|
# Data: for fisher:
|
||||||
|
#2 levels: with OR
|
||||||
|
#3 levels: without OR
|
||||||
|
#========================================================
|
||||||
|
stats_df = subset(categ_df, select = -c(mosaic))
|
||||||
|
|
||||||
|
stats_df_copy = stats_df
|
||||||
|
int_vars <- lapply(stats_df_copy, class)%in%c("integer", "numeric")
|
||||||
|
int_vars
|
||||||
|
stats_df_copy[, int_vars] <- lapply(stats_df_copy[, int_vars], as.factor)
|
||||||
|
str(stats_df_copy)
|
||||||
|
|
||||||
|
#-----------
|
||||||
|
# 2 levels
|
||||||
|
#-----------
|
||||||
|
two_lev = lapply(stats_df_copy, nlevels) == 2
|
||||||
|
fisher_cols_df1 = names(two_lev)[two_lev == TRUE]
|
||||||
|
fisher_cols_df1
|
||||||
|
|
||||||
|
cat("\nTotal no. of cols:", ncol(stats_df_copy)
|
||||||
|
, "\nNo. of vars with 2 factor levels:", length(fisher_cols_df1)
|
||||||
|
, "\nThese are:\n"
|
||||||
|
, fisher_cols_df1)
|
||||||
|
|
||||||
|
fisher_cols_df1 = fisher_cols_df1[!fisher_cols_df1%in%metadata_cols]
|
||||||
|
fisher_cols_df1
|
||||||
|
|
||||||
|
fisher_df1 = data.frame()
|
||||||
|
|
||||||
|
for (i in fisher_cols_df1){
|
||||||
|
cat(i, "\n===============\n")
|
||||||
|
df = data.frame(clinical_categ_params = NA, n_obs = NA, method = NA, test_statistic = NA, p = NA, ci_low = NA, ci_high = NA)
|
||||||
|
my_param_val = (get(i, stats_df))
|
||||||
|
tab = table(stats_df$obesity, my_param_val)
|
||||||
|
print(tab)
|
||||||
|
my_fisher_test = fisher.test(tab)
|
||||||
|
print(my_fisher_test)
|
||||||
|
|
||||||
|
# extracting results
|
||||||
|
my_param_name = i
|
||||||
|
my_n_obs = sum(tab)
|
||||||
|
my_method = my_fisher_test$method
|
||||||
|
my_test_statistic = my_fisher_test$estimate[[1]] # FIXME|
|
||||||
|
my_pval = my_fisher_test$p.value
|
||||||
|
my_ci_low = my_fisher_test$conf.int[[1]]
|
||||||
|
my_ci_hi = my_fisher_test$conf.int[[2]]
|
||||||
|
|
||||||
|
# assiging to loop df
|
||||||
|
df$clinical_categ_params = my_param_name
|
||||||
|
df$n_obs = my_n_obs
|
||||||
|
df$method = my_method
|
||||||
|
df$test_statistic = my_test_statistic
|
||||||
|
df$p = my_pval
|
||||||
|
df$ci_low = my_ci_low
|
||||||
|
df$ci_high = my_ci_hi
|
||||||
|
|
||||||
|
print(df)
|
||||||
|
fisher_df1 = rbind(fisher_df1, df) # FIXME, test_statistic col not getting created
|
||||||
|
}
|
||||||
|
|
||||||
|
#-----------
|
||||||
|
# >2 levels
|
||||||
|
#-----------
|
||||||
|
multi_lev = lapply(stats_df_copy, nlevels) > 2
|
||||||
|
fisher_cols_df2 = names(multi_lev)[multi_lev == TRUE]
|
||||||
|
fisher_cols_df2
|
||||||
|
|
||||||
|
cat("\nTotal no. of cols:", ncol(stats_df_copy)
|
||||||
|
, "\nNo. of vars with >2 factor levels:", length(fisher_cols_df2)
|
||||||
|
, "\nThese are:\n"
|
||||||
|
, fisher_cols_df2)
|
||||||
|
|
||||||
|
|
||||||
|
fisher_cols_df2 = fisher_cols_df2[!fisher_cols_df2%in%metadata_cols]
|
||||||
|
fisher_cols_df2
|
||||||
|
|
||||||
|
fisher_df2 = data.frame()
|
||||||
|
|
||||||
|
for (i in fisher_cols_df2){
|
||||||
|
cat(i, "\n===============\n")
|
||||||
|
df = data.frame(clinical_categ_params = NA, n_obs = NA, method = NA, test_statistic = NA , p = NA)
|
||||||
|
my_param_val = (get(i, stats_df))
|
||||||
|
tab = table(stats_df$obesity, my_param_val)
|
||||||
|
print(tab)
|
||||||
|
my_fisher_test = fisher.test(tab)
|
||||||
|
print(my_fisher_test)
|
||||||
|
|
||||||
|
# extracting results
|
||||||
|
my_param_name = i
|
||||||
|
my_n_obs = sum(tab)
|
||||||
|
my_method = my_fisher_test$method
|
||||||
|
my_test_statistic = "NA i.e >2 categories"
|
||||||
|
my_pval = my_fisher_test$p.value
|
||||||
|
|
||||||
|
# assiging to loop df
|
||||||
|
df$clinical_categ_params = my_param_name
|
||||||
|
df$n_obs = my_n_obs
|
||||||
|
df$method = my_method
|
||||||
|
df$test_statistic = my_test_statistic
|
||||||
|
df$p = my_pval
|
||||||
|
|
||||||
|
print(df)
|
||||||
|
fisher_df2 = rbind(fisher_df2, df)
|
||||||
|
}
|
||||||
|
#=======================================================================
|
||||||
|
# quick tests
|
||||||
|
tab2 = table(stats_df$obesity, stats_df$smoking)
|
||||||
|
tab2
|
||||||
|
test2 = fisher_test(tab2); test2 # rstatix, gives n but no other estimates
|
||||||
|
test3 = fisher.test(stats_df$obesity, stats_df$com_noasthma); test3
|
||||||
|
|
||||||
|
########################################################################
|
||||||
|
#******************
|
||||||
|
# write output file
|
||||||
|
#******************
|
Loading…
Add table
Add a link
Reference in a new issue