#!/usr/bin/Rscript getwd() setwd("~/git/mosaic_2020/") getwd() ############################################################ # TASK: Contingency table analysis i.e chisq and fishers # data: clincial data of flu positive adult patients # group: obesity # Chisq test #https://www.google.com/search?q=chisq+test+on+long+format+data+in+R+using+group+by&source=lmns&bih=828&biw=1280&client=firefox-b-d&hl=en-GB&sa=X&ved=2ahUKEwjItpL7xI7tAhUGTBoKHXlSBa8Q_AUoAHoECAEQAA ############################################################ #============= # Input #============= source("data_extraction_formatting_clinical.R") str(clinical_df_ics) clinical_df_ics$sfluv = as.integer(clinical_df_ics$sfluv) clinical_df_ics$h1n1v = as.integer(clinical_df_ics$h1n1v) #============= # Output #============= outfile_name_clin_categ = paste0("flu_stats_clin_categ.csv") outfile_clin_categ = paste0(outdir_stats, outfile_name_clin_categ) outfile_clin_categ ######################################################################## # Chisq or fisher's test for clinical data b/w obseity groups ######################################################################## categorical_cols = c( "death" #, "obesity" #, "flustat" , "sfluv" , "h1n1v" , "gender" , "asthma" #, "o2_sat_suppl" ---> not recoded!? , "ethnicity" , "smoking" , "ia_cxr" , "max_resp_score" , "T1_resp_score" , "com_noasthma" , "T2_resp_score" , "inresp_sev" , "steroid" , "age_bins" , "o2_sat_bin" , "onset_initial_bin" , "t1_resp_recoded" , "steroid_ics") metadata_cols = c("mosaic", "obesity") categ_df = clinical_df_ics[, c(metadata_cols, categorical_cols)] # quick test tab = table(categ_df$obesity, categ_df$death) tab chisq.test(tab) #======================================================== # Data: for chisq and fisher #======================================================== stats_df = subset(categ_df, select = -c(mosaic)) my_categ_cols = colnames(categ_df)[!colnames(categ_df)%in%metadata_cols] if ( length(my_categ_cols) == ncol(categ_df) - length(metadata_cols) ){ cat("PASS: variables for chisq test successfully extracted") }else{ cat("FAIL: length mismatch when extracting variables for chisq") quit() } ######################################################################## #-------------------- # chisq test #-------------------- chisq_df = data.frame() for (i in my_categ_cols){ #print(i) df = data.frame(clinical_categ_params = NA , n_obs = NA , method = NA #, test_statistic = NA , p = NA) my_param_val = (get(i, stats_df)) tab = table(stats_df$obesity, my_param_val) print(tab) my_chi_test = chisq.test(tab) print(my_chi_test) # extracting results my_param_name = i my_n_obs = sum(my_chi_test$observed) my_method = my_chi_test$method #my_test_statistic = my_chi_test$statistic[[1]] my_pval = my_chi_test$p.value # assiging to loop df df$clinical_categ_params = my_param_name df$n_obs = my_n_obs df$method = my_method #df$test_statistic = my_test_statistic df$p = my_pval print(df) chisq_df = rbind(chisq_df, df) } # formatting chisq_df$p_format = format.pval(chisq_df$p) chisq_df$p_signif = chisq_df$p chisq_df = dplyr::mutate(chisq_df, p_signif = case_when(p_signif == 0.05 ~ "." , p_signif <=0.0001 ~ '****' , p_signif <=0.001 ~ '***' , p_signif <=0.01 ~ '**' , p_signif <0.05 ~ '*' , p_signif == 0.1 ~ '.' , TRUE ~ 'ns')) # quick test tab = table(stats_df$obesity, stats_df$smoking); tab test1 = chisq.test(tab); test1 test2 = chisq_test(tab); test2 #rstatix sum(tab) #================================================================================ #-------------------- # Fishers test: without any OR # see standalone script for ORs #-------------------- fisher_df = data.frame() for (i in my_categ_cols){ cat(i, "\n===============\n") df = data.frame(clinical_categ_params = NA , n_obs = NA , method = NA #, test_statistic = NA , p = NA) my_param_val = (get(i, stats_df)) tab = table(stats_df$obesity, my_param_val) print(tab) my_fisher_test = fisher.test(tab) print(my_fisher_test) # extracting results my_param_name = i my_n_obs = sum(tab) my_method = my_fisher_test$method #my_test_statistic = my_fisher_test$estimate[[1]] my_pval = my_fisher_test$p.value # assiging to loop df df$clinical_categ_params = my_param_name df$n_obs = my_n_obs df$method = my_method #df$test_statistic = my_test_statistic df$p = my_pval print(df) fisher_df = rbind(fisher_df, df) } # formatting fisher_df$p_format = format.pval(fisher_df$p) fisher_df$p_signif = fisher_df$p fisher_df = dplyr::mutate(fisher_df, p_signif = case_when(p_signif == 0.05 ~ "." , p_signif <=0.0001 ~ '****' , p_signif <=0.001 ~ '***' , p_signif <=0.01 ~ '**' , p_signif <0.05 ~ '*' , p_signif == 0.1 ~ '.' , TRUE ~ 'ns')) # quick test tab_f = table(stats_df$obesity, stats_df$smoking); tab_f test1_f = fisher.test(tab); test1_f test2_f = fisher_test(tab); test2_f #rstatix sum(tab_f) test1_f$estimate #================================================================================ #-------------------------------- # Combining chisq and fishers df #--------------------------------- if ( all(colnames(chisq_df) == colnames(fisher_df)) && all(dim(chisq_df) == dim(fisher_df)) ){ cat("Colnames AND dim match for both chisq and fisher df" , "\nNo. of rows:", nrow(chisq_df) , "\nNo. of cols:", ncol(chisq_df) , "\nProceeding to rbind") comb_stats_categ_df = rbind(chisq_df, fisher_df) cat("\nCombined df dim:") print(dim(comb_stats_categ_df)) } # sort df according to p_signif AND clinical_categ_params comb_stats_categ_df_f = comb_stats_categ_df[order(comb_stats_categ_df$p_signif , comb_stats_categ_df$clinical_categ_params),] ######################################################################## #****************** # write output file #****************** cat("Chisq and fishers test results in:", outfile_clin_categ) #write.csv(comb_stats_categ_df_f, outfile_clin_categ, row.names = FALSE) #================== #0 date not recorded #-1 not recorded #-2 n/a specified by clinician #-3 unknown specified by chisq_test(table(clinical_df_ics$obesity, clinical_df_ics$smoking)) chisq_test(table(clinical_df_ics$obesity, clinical_df_ics$max_resp_score)) chisq_test(table(clinical_df_ics$obesity, clinical_df_ics$T1_resp_score)) chisq_test(table(clinical_df_ics$obesity, clinical_df_ics$t1_resp_recoded)) obese_df = clinical_df_ics[clinical_df_ics$obesity == 1,] not_ob_df = clinical_df_ics[clinical_df_ics$obesity == 0,] wilcox.test(obese_df$age, not_ob_df$age, paired = F) wilcox.test(obese_df$los, not_ob_df$los, paired = F) wilcox.test(obese_df$o2_sat_admis, not_ob_df$o2_sat_admis, paired = F) wilcox.test(obese_df$onset_2_initial, not_ob_df$onset_2_initial, paired = F) wilcox.test(obese_df$onset2final, not_ob_df$onset2final, paired = F) wilcox.test(obese_df$onsfindeath, not_ob_df$onsfindeath, paired = F) clinical_df_ics$age clinical_df_ics$los clinical_df_ics$o2_sat_admis #***** (already bin) clinical_df_ics$onset_2_initial # ***** (already bin) clinical_df_ics$onset2final clinical_df_ics$onsfindeath