From 269918d696b79d07ba1271e053e92e4197d786bb Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Thu, 19 Nov 2020 17:48:08 +0000
Subject: [PATCH] separated fisher test only and renamed logistic_outcome col

---
 data_extraction_formatting_clinical.R |  13 +-
 flu_stats_contingency.R               | 215 +++++++++++---------------
 2 files changed, 98 insertions(+), 130 deletions(-)

diff --git a/data_extraction_formatting_clinical.R b/data_extraction_formatting_clinical.R
index 95ad909..5bb10bf 100644
--- a/data_extraction_formatting_clinical.R
+++ b/data_extraction_formatting_clinical.R
@@ -330,6 +330,12 @@ merging_cols = intersect( names(clinical_df), names(clinical_ics) )
 
 clinical_df_ics = merge(clinical_df, clinical_ics, by = merging_cols, all = T); clinical_df_ics 
 
+colnames(clinical_df_ics)
+
+# change colname of logistic_outcome
+c1 = which(colnames(clinical_df_ics) == "logistic_outcome")
+colnames(clinical_df_ics)[c1] <- "t1_resp_recoded"
+
 if (nrow(clinical_df_ics) == nrow(clinical_df) & nrow(clinical_ics)){
   cat("\nPASS: No. of rows match, nrow =", nrow(clinical_df_ics)
       , "\nChecking ncols...")
@@ -353,7 +359,10 @@ outfile_reg = paste0(outdir, outfile_name_reg)
 
 cat("\nWriting clinical file for regression:", outfile_reg)
 
-#write.csv(clinical_df_ics, file = outfile_reg)
+write.csv(clinical_df_ics, file = outfile_reg)
 ################################################################
 
-rm(age_bins, max_age_interval, max_in, min_in, o2_sat_bin, onset_initial_bin, tot_o2, tot_onset2ini, meta_data_cols)
+rm(age_bins, max_age_interval, max_in, min_in
+   , o2_sat_bin, onset_initial_bin, tot_o2
+   , tot_onset2ini, meta_data_cols
+   , clinical_df)
diff --git a/flu_stats_contingency.R b/flu_stats_contingency.R
index bf22982..a64ad95 100755
--- a/flu_stats_contingency.R
+++ b/flu_stats_contingency.R
@@ -3,20 +3,22 @@ getwd()
 setwd("~/git/mosaic_2020/")
 getwd()
 ############################################################
-# TASK: unpaired (time) analysis of clinical data
+# TASK: Contingency table analysis i.e chisq and fishers
 # data: clincial data of flu positive adult patients
 # group: obesity
+
+# Chisq test
+#https://www.google.com/search?q=chisq+test+on+long+format+data+in+R+using+group+by&source=lmns&bih=828&biw=1280&client=firefox-b-d&hl=en-GB&sa=X&ved=2ahUKEwjItpL7xI7tAhUGTBoKHXlSBa8Q_AUoAHoECAEQAA
 ############################################################
-#my_sample_type = "npa"
 
 #=============
 # Input
 #=============
 source("data_extraction_formatting_clinical.R")
-str(clinical_df)
+str(clinical_df_ics)
 
-clinical_df$sfluv = as.integer(clinical_df$sfluv)
-clinical_df$h1n1v  = as.integer(clinical_df$h1n1v)
+clinical_df_ics$sfluv = as.integer(clinical_df_ics$sfluv)
+clinical_df_ics$h1n1v  = as.integer(clinical_df_ics$h1n1v)
 
 #=============
 # Output
@@ -44,58 +46,28 @@ categorical_cols = c( "death"
                    , "com_noasthma"
                    , "T2_resp_score"
                    , "inresp_sev"
-                   , "steroid")
-
+                   , "steroid"
+                   , "age_bins"
+                   , "o2_sat_bin"
+                   , "onset_initial_bin"
+                   , "t1_resp_recoded"
+                   , "steroid_ics")
 
 metadata_cols = c("mosaic", "obesity")
 
-categ_df = clinical_df[, c(metadata_cols, categorical_cols)]
+categ_df = clinical_df_ics[, c(metadata_cols, categorical_cols)]
 
 # quick test
 tab = table(categ_df$obesity, categ_df$death)
 tab
 chisq.test(tab)
 
-pivot_cols = metadata_cols
-#pivot_cols = metadata_cols[!meta_data_cols%in%cols_to_omit];pivot_cols 
-expected_rows_categ_lf = nrow(categ_df) * ( length(categ_df) - length(pivot_cols) ); expected_rows_categ_lf
-
-colnames(categ_df)
-
-keycol <- "clinical_categ_params"
-valuecol <- "value"
-gathercols <- c("death"
-                #, "flustat"
-                , "sfluv", "h1n1v"
-                , "gender", "asthma"
-                #, "o2_sat_suppl"
-                , "ethnicity"
-                , "smoking", "ia_cxr", "max_resp_score", "T1_resp_score"
-                , "com_noasthma", "T2_resp_score", "inresp_sev", "steroid")
-
-categ_lf = gather_(categ_df, keycol, valuecol, gathercols)
-#categ_lf = gather(categ_df, clinical_categ_params, value, death:T2_resp_score, factor_key = F)
-
-if( nrow(categ_lf) == expected_rows_categ_lf){
-  cat("PASS: long format data created successfully"
-      , "\nnrow:", nrow(categ_lf)
-      , "\nncol:", ncol(categ_lf))
-}
-
-#=============================================
-# Chisq test: clinical categorical vars
-#https://www.google.com/search?q=chisq+test+on+long+format+data+in+R+using+group+by&source=lmns&bih=828&biw=1280&client=firefox-b-d&hl=en-GB&sa=X&ved=2ahUKEwjItpL7xI7tAhUGTBoKHXlSBa8Q_AUoAHoECAEQAA
-#=============================================
-
-
-
-#=============================================
-# wf data
-#=============================================
+#========================================================
+# Data: for chisq and fisher
+#========================================================
 stats_df = subset(categ_df, select = -c(mosaic))
-drop_cols = c("mosaic", "obesity")
 
-my_categ_cols = colnames(categ_df)[!colnames(categ_df)%in%drop_cols]
+my_categ_cols = colnames(categ_df)[!colnames(categ_df)%in%metadata_cols]
 
 if ( length(my_categ_cols) == ncol(categ_df) - length(drop_cols) ){
   cat("PASS: variables for chisq test successfully extracted")
@@ -103,16 +75,19 @@ if ( length(my_categ_cols) == ncol(categ_df) - length(drop_cols) ){
   cat("FAIL: length mismatch when extracting variables for chisq")
   quit()
 }
-
-# quick test
-tab = table(stats_df$obesity, stats_df$smoking)
-tab
-test1 = chisq.test(tab)
-
+########################################################################
+#--------------------
+# chisq test
+#--------------------
 chisq_df = data.frame()
 for (i in my_categ_cols){
   #print(i)
-  df = data.frame(clinical_categ_params = NA, n_obs = NA, method = NA, test_statistic = NA, p = NA)
+  df = data.frame(clinical_categ_params = NA
+                  , n_obs = NA
+                  , method = NA
+                  #, test_statistic = NA
+                  , p = NA)
+  
   my_param_val = (get(i, stats_df))
   tab = table(stats_df$obesity, my_param_val)
   print(tab)
@@ -123,20 +98,21 @@ for (i in my_categ_cols){
   my_param_name = i
   my_n_obs = sum(my_chi_test$observed)
   my_method = my_chi_test$method
-  my_test_statistic = my_chi_test$statistic[[1]]
+  #my_test_statistic = my_chi_test$statistic[[1]]
   my_pval = my_chi_test$p.value
   
   # assiging to loop df
   df$clinical_categ_params = my_param_name
   df$n_obs = my_n_obs
   df$method = my_method 
-  df$test_statistic = my_test_statistic
+  #df$test_statistic = my_test_statistic
   df$p = my_pval
   
   print(df)
   chisq_df = rbind(chisq_df, df)
 }
 
+# formatting
 chisq_df$p_format = format.pval(chisq_df$p)
 chisq_df$p_signif = chisq_df$p
 
@@ -147,34 +123,29 @@ chisq_df = dplyr::mutate(chisq_df, p_signif = case_when(p_signif == 0.05 ~ "."
                                                       , p_signif <0.05 ~ '*'
                                                       , p_signif == 0.1 ~ '.'
                                                       , TRUE ~ 'ns'))
-#=============================================
-# Fishers test
-#=============================================
+
 # quick test
-tab2 = table(stats_df$obesity, stats_df$smoking)
-tab2
-#test2 = fisher_test(tab2); test2
-test3 = fisher.test(stats_df$obesity, stats_df$com_noasthma); test3
+tab = table(stats_df$obesity, stats_df$smoking); tab
+test1 = chisq.test(tab); test1
+test2 = chisq_test(tab); test2 #rstatix
+sum(tab)
 
-stats_df_copy = stats_df
-int_vars <- lapply(stats_df_copy, class)%in%c("integer", "numeric")
-int_vars
-stats_df_copy[, int_vars] <- lapply(stats_df_copy[, int_vars], as.factor)
-str(stats_df_copy)
+#================================================================================
+#--------------------
+# Fishers test: without any OR
+# see standalone script for this!
+#--------------------
 
-two_lev = lapply(stats_df_copy, nlevels) == 2
-fisher_cols_df1  = names(two_lev)[two_lev == TRUE]
-fisher_cols_df1
+fisher_df = data.frame()
 
-drop_cols = c("mosaic", "obesity")
-fisher_cols_df1 = fisher_cols_df1[!fisher_cols_df1%in%drop_cols]
-fisher_cols_df1
-
-fisher_df1 = data.frame()
-
-for (i in fisher_cols_df1){
+for (i in my_categ_cols){
   cat(i, "\n===============\n")
-  df = data.frame(clinical_categ_params = NA, n_obs = NA, method = NA, test_statistic = NA, p = NA, ci_low = NA, ci_high = NA)
+  df = data.frame(clinical_categ_params = NA
+                  , n_obs = NA
+                  , method = NA
+                  #, test_statistic = NA
+                  , p = NA)
+  
   my_param_val = (get(i, stats_df))
   tab = table(stats_df$obesity, my_param_val)
   print(tab)
@@ -185,74 +156,62 @@ for (i in fisher_cols_df1){
   my_param_name = i
   my_n_obs = sum(tab)
   my_method = my_fisher_test$method
-  my_test_statistic = my_fisher_test$statistic[[1]] # FIXME|
+  #my_test_statistic = my_fisher_test$estimate[[1]]
   my_pval = my_fisher_test$p.value
-  my_ci_low = my_fisher_test$conf.int[[1]]
-  my_ci_hi = my_fisher_test$conf.int[[2]]
-  
+
   # assiging to loop df
   df$clinical_categ_params = my_param_name
   df$n_obs = my_n_obs
   df$method = my_method 
-  df$test_statistic = my_test_statistic
+  #df$test_statistic = my_test_statistic
   df$p = my_pval
-  df$ci_low = my_ci_low
-  df$ci_high = my_ci_hi
-  
+
   print(df)
-  fisher_df1 = rbind(fisher_df1, df)
+  fisher_df = rbind(fisher_df, df)
 }
 
-#=============
+# formatting
+fisher_df$p_format = format.pval(fisher_df$p)
+fisher_df$p_signif = fisher_df$p
 
-three_lev = lapply(stats_df_copy, nlevels) == 3
-fisher_cols_df2  = names(three_lev)[three_lev == TRUE]
-fisher_cols_df2
+fisher_df = dplyr::mutate(fisher_df, p_signif = case_when(p_signif == 0.05 ~ "."
+                                                        , p_signif <=0.0001 ~ '****'
+                                                        , p_signif <=0.001 ~ '***'
+                                                        , p_signif <=0.01 ~ '**'
+                                                        , p_signif <0.05 ~ '*'
+                                                        , p_signif == 0.1 ~ '.'
+                                                        , TRUE ~ 'ns'))
 
-#drop_cols = c("mosaic", "obesity")
-#fisher_cols_df2 = fisher_cols_df2[!fisher_cols_df2%in%drop_cols]
-#fisher_cols_df2
+# quick test
+tab_f = table(stats_df$obesity, stats_df$smoking); tab_f
+test1_f = fisher.test(tab); test1_f
+test2_f = fisher_test(tab); test2_f #rstatix
+sum(tab_f)
+test1_f$estimate
 
-fisher_df2 = data.frame()
-
-for (i in fisher_cols_df2){
-  cat(i, "\n===============\n")
-  df = data.frame(clinical_categ_params = NA, n_obs = NA, method = NA, test_statistic = NA , p = NA)
-  my_param_val = (get(i, stats_df))
-  tab = table(stats_df$obesity, my_param_val)
-  print(tab)
-  my_fisher_test = fisher.test(tab)
-  print(my_fisher_test)
+#================================================================================
+#--------------------------------
+# Combining chisq and fishers df
+#---------------------------------
+if ( all(colnames(chisq_df) == colnames(fisher_df)) &&  all(dim(chisq_df) == dim(fisher_df)) ){
+  cat("Colnames AND dim match for both chisq and fisher df"
+      , "\nNo. of rows:", nrow(chisq_df)
+      , "\nNo. of cols:", ncol(chisq_df)
+      , "\nProceeding to rbind")
   
-  # extracting results
-  my_param_name = i
-  my_n_obs = sum(tab)
-  my_method = my_fisher_test$method
-  my_test_statistic = ">2 categories"
-  my_pval = my_fisher_test$p.value
+  comb_stats_categ_df = rbind(chisq_df, fisher_df)
   
-  # assiging to loop df
-  df$clinical_categ_params = my_param_name
-  df$n_obs = my_n_obs
-  df$method = my_method 
-  df$test_statistic = my_test_statistic
-  df$p = my_pval
-  
-  print(df)
-  fisher_df2 = rbind(fisher_df2, df)
+  cat("\nCombined df dim:")
+  print(dim(comb_stats_categ_df))
+
 }
 
-
-
-
-
-
-
-
-
+# sort df according to p_signif AND clinical_categ_params
+comb_stats_categ_df_f =  comb_stats_categ_df[order(comb_stats_categ_df$p_signif
+                                                           , comb_stats_categ_df$clinical_categ_params),]
 ########################################################################
 #******************
 # write output file
 #******************
 cat("Chisq and fishers test results in:", outfile_clin_categ)
-#write.csv(stats_categ_df_f, outfile_clin_categ, row.names = FALSE)
+write.csv(comb_stats_categ_df_f, outfile_clin_categ, row.names = FALSE)