From a6cbaab40a57d7b1023bcb128c412d6ce32ceb3c Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 19 Nov 2020 18:05:57 +0000 Subject: [PATCH] added flu_stats_fisher_only.R containing separate dfs based on var factor levels --- flu_stats_fisher_only.R | 181 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 181 insertions(+) create mode 100755 flu_stats_fisher_only.R diff --git a/flu_stats_fisher_only.R b/flu_stats_fisher_only.R new file mode 100755 index 0000000..eade64d --- /dev/null +++ b/flu_stats_fisher_only.R @@ -0,0 +1,181 @@ +#!/usr/bin/Rscript +getwd() +setwd("~/git/mosaic_2020/") +getwd() +############################################################ +# TASK: Contingency table analysis i.e chisq and fishers +# data: clincial data of flu positive adult patients +# group: obesity + +# Chisq test +#https://www.google.com/search?q=chisq+test+on+long+format+data+in+R+using+group+by&source=lmns&bih=828&biw=1280&client=firefox-b-d&hl=en-GB&sa=X&ved=2ahUKEwjItpL7xI7tAhUGTBoKHXlSBa8Q_AUoAHoECAEQAA +############################################################ + +#============= +# Input +#============= +source("data_extraction_formatting_clinical.R") +str(clinical_df_ics) + +clinical_df_ics$sfluv = as.integer(clinical_df_ics$sfluv) +clinical_df_ics$h1n1v = as.integer(clinical_df_ics$h1n1v) + +#============= +# Output +#============= + +######################################################################## +# Fisher's test for clinical data b/w obesity groups +######################################################################## +categorical_cols = c( "death" + #, "obesity" + #, "flustat" + , "sfluv" + , "h1n1v" + , "gender" + , "asthma" + #, "o2_sat_suppl" ---> not recoded!? + , "ethnicity" + , "smoking" + , "ia_cxr" + , "max_resp_score" + , "T1_resp_score" + , "com_noasthma" + , "T2_resp_score" + , "inresp_sev" + , "steroid" + , "age_bins" + , "o2_sat_bin" + , "onset_initial_bin" + , "t1_resp_recoded" + , "steroid_ics") + +metadata_cols = c("mosaic", "obesity") + +categ_df = clinical_df_ics[, c(metadata_cols, categorical_cols)] + +my_categ_cols = colnames(categ_df)[!colnames(categ_df)%in%metadata_cols] + +if ( length(my_categ_cols) == ncol(categ_df) - length(metadata_cols) ){ + cat("PASS: variables for chisq test successfully extracted") +}else{ + cat("FAIL: length mismatch when extracting variables for chisq") + quit() +} + +#======================================================== +# Data: for fisher: +#2 levels: with OR +#3 levels: without OR +#======================================================== +stats_df = subset(categ_df, select = -c(mosaic)) + +stats_df_copy = stats_df +int_vars <- lapply(stats_df_copy, class)%in%c("integer", "numeric") +int_vars +stats_df_copy[, int_vars] <- lapply(stats_df_copy[, int_vars], as.factor) +str(stats_df_copy) + +#----------- +# 2 levels +#----------- +two_lev = lapply(stats_df_copy, nlevels) == 2 +fisher_cols_df1 = names(two_lev)[two_lev == TRUE] +fisher_cols_df1 + +cat("\nTotal no. of cols:", ncol(stats_df_copy) + , "\nNo. of vars with 2 factor levels:", length(fisher_cols_df1) + , "\nThese are:\n" + , fisher_cols_df1) + +fisher_cols_df1 = fisher_cols_df1[!fisher_cols_df1%in%metadata_cols] +fisher_cols_df1 + +fisher_df1 = data.frame() + +for (i in fisher_cols_df1){ + cat(i, "\n===============\n") + df = data.frame(clinical_categ_params = NA, n_obs = NA, method = NA, test_statistic = NA, p = NA, ci_low = NA, ci_high = NA) + my_param_val = (get(i, stats_df)) + tab = table(stats_df$obesity, my_param_val) + print(tab) + my_fisher_test = fisher.test(tab) + print(my_fisher_test) + + # extracting results + my_param_name = i + my_n_obs = sum(tab) + my_method = my_fisher_test$method + my_test_statistic = my_fisher_test$estimate[[1]] # FIXME| + my_pval = my_fisher_test$p.value + my_ci_low = my_fisher_test$conf.int[[1]] + my_ci_hi = my_fisher_test$conf.int[[2]] + + # assiging to loop df + df$clinical_categ_params = my_param_name + df$n_obs = my_n_obs + df$method = my_method + df$test_statistic = my_test_statistic + df$p = my_pval + df$ci_low = my_ci_low + df$ci_high = my_ci_hi + + print(df) + fisher_df1 = rbind(fisher_df1, df) # FIXME, test_statistic col not getting created +} + +#----------- +# >2 levels +#----------- +multi_lev = lapply(stats_df_copy, nlevels) > 2 +fisher_cols_df2 = names(multi_lev)[multi_lev == TRUE] +fisher_cols_df2 + +cat("\nTotal no. of cols:", ncol(stats_df_copy) + , "\nNo. of vars with >2 factor levels:", length(fisher_cols_df2) + , "\nThese are:\n" + , fisher_cols_df2) + + +fisher_cols_df2 = fisher_cols_df2[!fisher_cols_df2%in%metadata_cols] +fisher_cols_df2 + +fisher_df2 = data.frame() + +for (i in fisher_cols_df2){ + cat(i, "\n===============\n") + df = data.frame(clinical_categ_params = NA, n_obs = NA, method = NA, test_statistic = NA , p = NA) + my_param_val = (get(i, stats_df)) + tab = table(stats_df$obesity, my_param_val) + print(tab) + my_fisher_test = fisher.test(tab) + print(my_fisher_test) + + # extracting results + my_param_name = i + my_n_obs = sum(tab) + my_method = my_fisher_test$method + my_test_statistic = "NA i.e >2 categories" + my_pval = my_fisher_test$p.value + + # assiging to loop df + df$clinical_categ_params = my_param_name + df$n_obs = my_n_obs + df$method = my_method + df$test_statistic = my_test_statistic + df$p = my_pval + + print(df) + fisher_df2 = rbind(fisher_df2, df) +} +#======================================================================= +# quick tests +tab2 = table(stats_df$obesity, stats_df$smoking) +tab2 +test2 = fisher_test(tab2); test2 # rstatix, gives n but no other estimates +test3 = fisher.test(stats_df$obesity, stats_df$com_noasthma); test3 + +######################################################################## +#****************** +# write output file +#******************