Reformat code to select the data frame (df) needed for analysis

2020-11-20 11:43:03 +00:00 · 2020-11-20 11:43:03 +00:00 · b72c4df796
commit b72c4df796
parent a6cbaab40a
7 changed files with 243 additions and 102 deletions
--- a/flu_stats_unpaired_clinical.R
+++ b/flu_stats_unpaired_clinical.R
@ -25,27 +25,28 @@ outfile_clinical_unpaired
 # Unpaired stats for clinical data b/w groups: wilcoxon UNpaired analysis 
 # No correction required
 ########################################################################
-
+str(clinical_df_ics)
 numerical_cols = c("age"
-                   #, "vl_pfu_ul_npa1"
+                   , "vl_pfu_ul_npa1"
                   , "los"
                   , "onset2final"
                   , "onsfindeath"
-                   , "onset_2_initial"
-                   , "o2_sat_admis")
+                   #, "onset_2_initial" # already bin
+                   #, "o2_sat_admis" # already bin
+)

 metadata_cols = c("mosaic", "obesity")

-clinical_df_numerical = clinical_df[, c(metadata_cols, numerical_cols)]
+clinical_df_numerical = clinical_df_ics[, c(metadata_cols, numerical_cols)]

 pivot_cols = metadata_cols
 #pivot_cols = metadata_cols[!meta_data_cols%in%cols_to_omit];pivot_cols 
 expected_rows_clinical_lf = nrow(clinical_df_numerical) * (length(clinical_df_numerical) - length(pivot_cols)); expected_rows_clinical_lf

-
+# column names for the long-format (lf) data
 keycol <- "clinical_params"
 valuecol <- "value"
-gathercols <- c("age", "los", "onset2final", "onsfindeath", "onset_2_initial", "o2_sat_admis")
+gathercols <- numerical_cols 

 clinical_lf = gather_(clinical_df_numerical, keycol, valuecol, gathercols)

@ -70,12 +71,15 @@ stats_un_clinical = compare_means(value~obesity
                            #, data = clinical_lf_comp
                            , paired = FALSE)

+head(stats_un_clinical)

+# rstatix: unpaired Wilcoxon test of value by obesity, per clinical parameter
 stat_df <- clinical_lf %>%
  group_by(clinical_params) %>%
  wilcox_test(value ~ obesity, paired = F) %>%
  add_significance("p")
 stat_df$p_format = round(stat_df$p, digits = 3)
+stat_df

 #----------------------------------------
 # calculate n_obs for each clinical param: Overall
@ -101,31 +105,39 @@ n_all_gp  = merge(n_all, n_gp
 #----------------------------------------
 # calculate n_obs for each clinical param: complete cases
 #----------------------------------------
-n_comp = data.frame(table(clinical_lf_comp$clinical_params))
+n_comp = data.frame(table(clinical_lf$clinical_params))
 colnames(n_comp) = c("clinical_params", "n_complete")
 n_comp$clinical_params = as.character(n_comp$clinical_params)
 n_comp

-n_gp_comp_lf = data.frame(table(clinical_lf_comp$clinical_params, clinical_lf_comp$obesity)); n_gp_comp_lf
+n_gp_comp_lf = data.frame(table(clinical_lf$clinical_params
+                                , clinical_lf$obesity)); n_gp_comp_lf
 n_gp_comp = spread(n_gp_comp_lf, "Var2", "Freq"); n_gp_comp
 colnames(n_gp_comp)
 colnames(n_gp_comp) = c("clinical_params"
                   , paste0("n_complete_gp", colnames(n_gp_comp)[2])
                   , paste0("n_complete_gp", colnames(n_gp_comp)[3]))

-
+#---------
+# merge 1
+#---------
 n_comp_gp  = merge(n_comp, n_gp_comp
                  , by = intersect( names(n_comp), names(n_gp_comp))
                  , all = T)
+n_comp_gp

+#---------
+# merge 2
+#---------
 merge_cols = intersect(names(n_all_gp), names(n_comp_gp)); merge_cols

 n_df = merge(n_all_gp, n_comp_gp, by = merge_cols, all = T); n_df

-#==================================
-# Merge: merge stats + n_obs df
-#===================================
+#----------------------------------
+# Merge 3: merge stats + n_obs df
+#----------------------------------
 merging_cols = intersect(names(stats_un_clinical), names(n_df)); merging_cols
+
 if (all(n_df$clinical_params%in%stats_un_clinical$clinical_params)) {
  cat("PASS: merging stats and n_obs on column/s:", merging_cols)
  stats_un_clinical = merge(stats_un_clinical, n_df, by = merging_cols, all = T)
@ -188,6 +200,7 @@ if( length(my_col_order2) == ncol(stats_clinical_df) && (all(my_col_order2%in%co
  quit()
 }  
 # assign nice column names like replace "." with "_"
+# same ordering as my_col_order2, just minor formatting
 colnames(stats_clinical_df_f) = c("clinical_params"
                                   , "method"
                                   , "group1"
@ -208,4 +221,4 @@ colnames(stats_clinical_df_f)
 # write output file
 #******************
 cat("UNpaired stats for clinical data for groups in:", outfile_clinical_unpaired)
-#write.csv(stats_clinical_df_f, outfile_clinical_unpaired, row.names = FALSE)
+write.csv(stats_clinical_df_f, outfile_clinical_unpaired, row.names = FALSE)