#!/usr/bin/Rscript getwd() setwd("~/git/mosaic_2020/") getwd() ############################################################ # TASK: Contingency table analysis i.e chisq and fishers # data: clincial data of flu positive adult patients # group: obesity # Chisq test #https://www.google.com/search?q=chisq+test+on+long+format+data+in+R+using+group+by&source=lmns&bih=828&biw=1280&client=firefox-b-d&hl=en-GB&sa=X&ved=2ahUKEwjItpL7xI7tAhUGTBoKHXlSBa8Q_AUoAHoECAEQAA ############################################################ #============= # Input #============= source("data_extraction_formatting_clinical.R") str(clinical_df_ics) clinical_df_ics$sfluv = as.integer(clinical_df_ics$sfluv) clinical_df_ics$h1n1v = as.integer(clinical_df_ics$h1n1v) #============= # Output #============= ######################################################################## # Fisher's test for clinical data b/w obesity groups ######################################################################## categorical_cols = c( "death" #, "obesity" #, "flustat" , "sfluv" , "h1n1v" , "gender" , "asthma" #, "o2_sat_suppl" ---> not recoded!? , "ethnicity" , "smoking" , "ia_cxr" , "max_resp_score" , "T1_resp_score" , "com_noasthma" , "T2_resp_score" , "inresp_sev" , "steroid" , "age_bins" , "o2_sat_bin" , "onset_initial_bin" , "t1_resp_recoded" , "steroid_ics") metadata_cols = c("mosaic", "obesity") categ_df = clinical_df_ics[, c(metadata_cols, categorical_cols)] my_categ_cols = colnames(categ_df)[!colnames(categ_df)%in%metadata_cols] if ( length(my_categ_cols) == ncol(categ_df) - length(metadata_cols) ){ cat("PASS: variables for chisq test successfully extracted") }else{ cat("FAIL: length mismatch when extracting variables for chisq") quit() } #======================================================== # Data: for fisher: #2 levels: with OR #3 levels: without OR #======================================================== stats_df = subset(categ_df, select = -c(mosaic)) stats_df_copy = stats_df int_vars <- lapply(stats_df_copy, class)%in%c("integer", "numeric") int_vars stats_df_copy[, int_vars] <- lapply(stats_df_copy[, int_vars], as.factor) str(stats_df_copy) #----------- # 2 levels #----------- two_lev = lapply(stats_df_copy, nlevels) == 2 fisher_cols_df1 = names(two_lev)[two_lev == TRUE] fisher_cols_df1 cat("\nTotal no. of cols:", ncol(stats_df_copy) , "\nNo. of vars with 2 factor levels:", length(fisher_cols_df1) , "\nThese are:\n" , fisher_cols_df1) fisher_cols_df1 = fisher_cols_df1[!fisher_cols_df1%in%metadata_cols] fisher_cols_df1 fisher_df1 = data.frame() for (i in fisher_cols_df1){ cat(i, "\n===============\n") df = data.frame(clinical_categ_params = NA, n_obs = NA, method = NA, test_statistic = NA, p = NA, ci_low = NA, ci_high = NA) my_param_val = (get(i, stats_df)) tab = table(stats_df$obesity, my_param_val) print(tab) my_fisher_test = fisher.test(tab) print(my_fisher_test) # extracting results my_param_name = i my_n_obs = sum(tab) my_method = my_fisher_test$method my_test_statistic = my_fisher_test$estimate[[1]] # FIXME| my_pval = my_fisher_test$p.value my_ci_low = my_fisher_test$conf.int[[1]] my_ci_hi = my_fisher_test$conf.int[[2]] # assiging to loop df df$clinical_categ_params = my_param_name df$n_obs = my_n_obs df$method = my_method df$test_statistic = my_test_statistic df$p = my_pval df$ci_low = my_ci_low df$ci_high = my_ci_hi print(df) fisher_df1 = rbind(fisher_df1, df) # FIXME, test_statistic col not getting created } #----------- # >2 levels #----------- multi_lev = lapply(stats_df_copy, nlevels) > 2 fisher_cols_df2 = names(multi_lev)[multi_lev == TRUE] fisher_cols_df2 cat("\nTotal no. of cols:", ncol(stats_df_copy) , "\nNo. of vars with >2 factor levels:", length(fisher_cols_df2) , "\nThese are:\n" , fisher_cols_df2) fisher_cols_df2 = fisher_cols_df2[!fisher_cols_df2%in%metadata_cols] fisher_cols_df2 fisher_df2 = data.frame() for (i in fisher_cols_df2){ cat(i, "\n===============\n") df = data.frame(clinical_categ_params = NA, n_obs = NA, method = NA, test_statistic = NA , p = NA) my_param_val = (get(i, stats_df)) tab = table(stats_df$obesity, my_param_val) print(tab) my_fisher_test = fisher.test(tab) print(my_fisher_test) # extracting results my_param_name = i my_n_obs = sum(tab) my_method = my_fisher_test$method my_test_statistic = "NA i.e >2 categories" my_pval = my_fisher_test$p.value # assiging to loop df df$clinical_categ_params = my_param_name df$n_obs = my_n_obs df$method = my_method df$test_statistic = my_test_statistic df$p = my_pval print(df) fisher_df2 = rbind(fisher_df2, df) } #======================================================================= # quick tests tab2 = table(stats_df$obesity, stats_df$smoking) tab2 test2 = fisher_test(tab2); test2 # rstatix, gives n but no other estimates test3 = fisher.test(stats_df$obesity, stats_df$com_noasthma); test3 ######################################################################## #****************** # write output file #******************