added flu_stats_fisher_only.R containing separate dfs based on var factor levels

2020-11-19 18:05:57 +00:00 · 2020-11-19 18:05:57 +00:00 · a6cbaab40a
commit a6cbaab40a
parent 74769b9a38
1 changed files with 181 additions and 0 deletions
--- a/flu_stats_fisher_only.R
+++ b/flu_stats_fisher_only.R
@ -0,0 +1,181 @@
 #!/usr/bin/Rscript   
 getwd()
 setwd("~/git/mosaic_2020/")
 getwd()
 ############################################################
 # TASK: Contingency table analysis i.e chisq and fishers
 # data: clincial data of flu positive adult patients
 # group: obesity
 # Chisq test
 #https://www.google.com/search?q=chisq+test+on+long+format+data+in+R+using+group+by&source=lmns&bih=828&biw=1280&client=firefox-b-d&hl=en-GB&sa=X&ved=2ahUKEwjItpL7xI7tAhUGTBoKHXlSBa8Q_AUoAHoECAEQAA
 ############################################################
 #=============
 # Input
 #=============
 source("data_extraction_formatting_clinical.R")
 str(clinical_df_ics)
 clinical_df_ics$sfluv = as.integer(clinical_df_ics$sfluv)
 clinical_df_ics$h1n1v  = as.integer(clinical_df_ics$h1n1v)
 #=============
 # Output
 #=============
 ########################################################################
 # Fisher's test for clinical data b/w obesity groups
 ########################################################################
 categorical_cols = c( "death"
                      #, "obesity"
                      #, "flustat"
                      , "sfluv"
                      , "h1n1v"
                      , "gender"
                      , "asthma"
                      #, "o2_sat_suppl" ---> not recoded!?
                      , "ethnicity"
                      , "smoking"
                      , "ia_cxr"
                      , "max_resp_score"
                      , "T1_resp_score"
                      , "com_noasthma"
                      , "T2_resp_score"
                      , "inresp_sev"
                      , "steroid"
                      , "age_bins"
                      , "o2_sat_bin"
                      , "onset_initial_bin"
                      , "t1_resp_recoded"
                      , "steroid_ics")
 metadata_cols = c("mosaic", "obesity")
 categ_df = clinical_df_ics[, c(metadata_cols, categorical_cols)]
 my_categ_cols = colnames(categ_df)[!colnames(categ_df)%in%metadata_cols]
 if ( length(my_categ_cols) == ncol(categ_df) - length(metadata_cols) ){
  cat("PASS: variables for chisq test successfully extracted")
 }else{
  cat("FAIL: length mismatch when extracting variables for chisq")
  quit()
 }
 #========================================================
 # Data: for fisher:
 #2 levels: with OR
 #3 levels: without OR
 #========================================================
 stats_df = subset(categ_df, select = -c(mosaic))
 stats_df_copy = stats_df
 int_vars <- lapply(stats_df_copy, class)%in%c("integer", "numeric")
 int_vars
 stats_df_copy[, int_vars] <- lapply(stats_df_copy[, int_vars], as.factor)
 str(stats_df_copy)
 #-----------
 # 2 levels
 #-----------
 two_lev = lapply(stats_df_copy, nlevels) == 2
 fisher_cols_df1  = names(two_lev)[two_lev == TRUE]
 fisher_cols_df1
 cat("\nTotal no. of cols:", ncol(stats_df_copy)
    , "\nNo. of vars with 2 factor levels:", length(fisher_cols_df1)
    , "\nThese are:\n"
    , fisher_cols_df1)
 fisher_cols_df1 = fisher_cols_df1[!fisher_cols_df1%in%metadata_cols]
 fisher_cols_df1
 fisher_df1 = data.frame()
 for (i in fisher_cols_df1){
  cat(i, "\n===============\n")
  df = data.frame(clinical_categ_params = NA, n_obs = NA, method = NA, test_statistic = NA, p = NA, ci_low = NA, ci_high = NA)
  my_param_val = (get(i, stats_df))
  tab = table(stats_df$obesity, my_param_val)
  print(tab)
  my_fisher_test = fisher.test(tab)
  print(my_fisher_test)
  # extracting results
  my_param_name = i
  my_n_obs = sum(tab)
  my_method = my_fisher_test$method
  my_test_statistic = my_fisher_test$estimate[[1]] # FIXME|
  my_pval = my_fisher_test$p.value
  my_ci_low = my_fisher_test$conf.int[[1]]
  my_ci_hi = my_fisher_test$conf.int[[2]]
  # assiging to loop df
  df$clinical_categ_params = my_param_name
  df$n_obs = my_n_obs
  df$method = my_method 
  df$test_statistic = my_test_statistic
  df$p = my_pval
  df$ci_low = my_ci_low
  df$ci_high = my_ci_hi
  print(df)
  fisher_df1 = rbind(fisher_df1, df) # FIXME, test_statistic col not getting created
 }
 #-----------
 # >2 levels
 #-----------
 multi_lev = lapply(stats_df_copy, nlevels) > 2
 fisher_cols_df2  = names(multi_lev)[multi_lev == TRUE]
 fisher_cols_df2
 cat("\nTotal no. of cols:", ncol(stats_df_copy)
    , "\nNo. of vars with >2 factor levels:", length(fisher_cols_df2)
    , "\nThese are:\n"
    , fisher_cols_df2)
 fisher_cols_df2 = fisher_cols_df2[!fisher_cols_df2%in%metadata_cols]
 fisher_cols_df2
 fisher_df2 = data.frame()
 for (i in fisher_cols_df2){
  cat(i, "\n===============\n")
  df = data.frame(clinical_categ_params = NA, n_obs = NA, method = NA, test_statistic = NA , p = NA)
  my_param_val = (get(i, stats_df))
  tab = table(stats_df$obesity, my_param_val)
  print(tab)
  my_fisher_test = fisher.test(tab)
  print(my_fisher_test)
  # extracting results
  my_param_name = i
  my_n_obs = sum(tab)
  my_method = my_fisher_test$method
  my_test_statistic = "NA i.e >2 categories"
  my_pval = my_fisher_test$p.value
  # assiging to loop df
  df$clinical_categ_params = my_param_name
  df$n_obs = my_n_obs
  df$method = my_method 
  df$test_statistic = my_test_statistic
  df$p = my_pval
  print(df)
  fisher_df2 = rbind(fisher_df2, df)
 }
 #=======================================================================
 # quick tests
 tab2 = table(stats_df$obesity, stats_df$smoking)
 tab2
 test2 = fisher_test(tab2); test2 # rstatix, gives n but no other estimates
 test3 = fisher.test(stats_df$obesity, stats_df$com_noasthma); test3
 ########################################################################
 #******************
 # write output file
 #******************