dded paired unpaired stats scripts

2020-10-23 11:32:51 +01:00 · 2020-10-23 11:32:51 +01:00 · 7add917155
commit 7add917155
parent b0c06b9704
4 changed files with 333 additions and 7 deletions
--- a/stats_paired.R
+++ b/stats_paired.R
@ -0,0 +1,111 @@
+#!/usr/bin/Rscript   
+getwd()
+setwd('~/git/covid_analysis/')
+getwd()
+############################################################
+# TASK: basic plots
+
+# useful links:
+# http://www.sthda.com/english/wiki/ggplot2-dot-plot-quick-start-guide-r-software-and-data-visualization
+############################################################
+# source data
+source("read_data.R")
+############################################################
+#=========================
+# output: paired_analysis
+#=========================
+stats_time_paired = paste0(outdir_stats, "stats_paired_v3.csv")
+
+############################################################
+# data assignment for stats
+wf = wf_data
+lf = lf_data
+########################################################################
+# Pairwise stats by timepoint: wilcoxon paired analysis with correction
+########################################################################
+# with adjustment: fdr and BH are identical
+my_adjust_method = "BH"
+stats_by_timepoint = compare_means(value~timepoint, group.by = "mediator"
+                                   , data = lf
+                                   , paired = TRUE
+                                   , p.adjust.method = my_adjust_method)
+
+
+# check: satisfied!!!!
+wilcox.test(wf$sESelectin_ngmL_t1, wf$sESelectin_ngmL_t2, paired = T)
+wilcox.test(wf$sRAGE_pgmL_t1, wf$sRAGE_pgmL_t2, paired = T)
+
+# delete unnecessary column
+stats_by_timepoint = subset(stats_by_timepoint, select = -c(.y.))
+
+# reflect stats method correctly
+stats_by_timepoint$method
+stats_by_timepoint$method = gsub("Wilcoxon", "Wilcoxon_paired", stats_by_timepoint$method)
+stats_by_timepoint$method
+
+# replace "." in colnames with "_"
+colnames(stats_by_timepoint)
+#names(stats_by_timepoint) = gsub("\.", "_", names(stats_by_timepoint)) # weird!!!!
+
+colnames(stats_by_timepoint) = c("mediator"
+                                 ,"group1"
+                                 ,"group2"
+                                 ,"p"
+                                 ,"p_adj"
+                                 ,"p_format"
+                                 ,"p_signif"
+                                 ,"method" ) 
+
+colnames(stats_by_timepoint)
+
+# add an extra column for padjust_signif
+stats_by_timepoint$padjust_signif = round(stats_by_timepoint$p_adj, digits = 2)
+
+# add appropriate symbols for padjust_signif
+#stats_by_timepoint = stats_by_timepoint %>%
+#  mutate(padjust_signif = case_when(padjust_signif == 0.05 ~ "."
+#                                    , padjust_signif <0.05 ~ '*'
+#                                    , padjust_signif <=0.01 ~ '**'
+#                                    , padjust_signif <=0.001 ~ '***'
+#                                    , padjust_signif <=0.0001 ~ '****'
+#                                    , TRUE ~ 'ns'))
+                                    
+stats_by_timepoint = dplyr::mutate(stats_by_timepoint, padjust_signif = case_when(padjust_signif == 0.05 ~ "."
+                                    , padjust_signif <=0.0001 ~ '****'
+                                    , padjust_signif <=0.001 ~ '***'
+                                    , padjust_signif <=0.01 ~ '**'
+                                    , padjust_signif <0.05 ~ '*'
+                                    , TRUE ~ 'ns'))
+
+# reorder columns
+print("preparing to reorder columns...")
+colnames(stats_by_timepoint)
+my_col_order2 = c("mediator"
+                  , "group1"
+                  , "group2"
+                  , "method"
+                  , "p"
+                  , "p_format"
+                  , "p_signif"
+                  , "p_adj"
+                  , "padjust_signif")
+
+if( length(my_col_order2) == ncol(stats_by_timepoint) && isin(my_col_order2, colnames(stats_by_timepoint)) ){
+  print("PASS: Reordering columns...")
+  stats_by_timepoint_f = stats_by_timepoint[, my_col_order2]
+  print("Successful: column reordering")
+  print("formatted df called:'stats_by_timepoint_f'")
+  cat('\nformatted df has the following dimensions\n')
+  print(dim(stats_by_timepoint_f ))
+} else{
+  cat(paste0("FAIL:Cannot reorder columns, length mismatch"
+             , "\nExpected column order for: ", ncol(stats_by_timepoint)
+             , "\nGot:", length(my_col_order2)))
+  quit()
+}    
+
+#******************
+# write output file
+#******************
+cat("Paired stats by timepoint will be:", stats_time_paired)
+write.csv(stats_by_timepoint_f, stats_time_paired, row.names = FALSE)
--- a/stats_unpaired.R
+++ b/stats_unpaired.R
@ -0,0 +1,211 @@
+#!/usr/bin/Rscript   
+getwd()
+setwd('~/git/covid_analysis/')
+getwd()
+############################################################
+# TASK: basic plots
+
+# useful links:
+# http://www.sthda.com/english/wiki/ggplot2-dot-plot-quick-start-guide-r-software-and-data-visualization
+############################################################
+# source data
+source("read_data.R")
+############################################################
+#============================
+# Output: unpaired analysis
+#============================
+stats_time_unpaired = paste0(outdir_stats, "stats_unpaired_v3.csv")
+
+############################################################
+# data assignment for stats
+wf = wf_data
+lf = lf_data
+
+########################################################################
+# Unpaired stats at each timepoint b/w groups: wilcoxon UNpaired analysis with correction
+#######################################################################
+# with adjustment: fdr and BH are identical
+my_adjust_method = "BH"
+
+#==============
+# unpaired: t1
+#==============
+lf_t1 = lf[lf$timepoint == "t1",]
+stats_un_t1 = compare_means(value~outcomes, group.by = "mediator"
+                                   , data = lf_t1 
+                                   , paired = FALSE
+                                   , p.adjust.method = my_adjust_method)
+
+stats_un_t1$timepoint = "t1"
+
+stats_un_t1 = as.data.frame(stats_un_t1)
+class(stats_un_t1)
+
+# check: satisfied!!!!
+wilcox.test(wf$sESelectin_ngmL_t1[wf$outcomes == 0], wf$sESelectin_ngmL_t1[wf$outcomes == 1]
+            , paired = FALSE)
+
+wilcox.test(wf$PF_units_t1[wf$outcomes==0], wf$PF_units_t1[wf$outcomes == 1]
+            , paired = FALSE)
+
+#==============
+# unpaired: t2
+#==============
+lf_t2 = lf[lf$timepoint == "t2",]
+stats_un_t2 = compare_means(value~outcomes, group.by = "mediator"
+                            , data = lf_t2 
+                            , paired = FALSE
+                            , p.adjust.method = my_adjust_method)
+
+stats_un_t2$timepoint = "t2"
+
+stats_un_t2 = as.data.frame(stats_un_t2)
+class(stats_un_t2)
+
+# check: satisfied!!!!
+wilcox.test(wf$sESelectin_ngmL_t2[wf$outcomes == 0], wf$sESelectin_ngmL_t2[wf$outcomes == 1]
+            , paired = FALSE)
+
+wilcox.test(wf$PF_units_t2[wf$outcomes==0], wf$PF_units_t2[wf$outcomes == 1]
+            , paired = FALSE)
+
+#==============
+# unpaired: t3
+#==============
+lf_t3 = lf[lf$timepoint == "t3",]
+stats_un_t3 = compare_means(value~outcomes, group.by = "mediator"
+                            , data = lf_t3 
+                            , paired = FALSE
+                            , p.adjust.method = my_adjust_method)
+
+stats_un_t3$timepoint = "t3"
+
+stats_un_t3 = as.data.frame(stats_un_t3)
+class(stats_un_t3)
+
+# check: satisfied!!!!
+wilcox.test(wf$sESelectin_ngmL_t3[wf$outcomes == 0], wf$sESelectin_ngmL_t3[wf$outcomes == 1]
+            , paired = FALSE)
+
+wilcox.test(wf$PF_units_t3[wf$outcomes==0], wf$PF_units_t3[wf$outcomes == 1]
+            , paired = FALSE)
+
+
+#==============
+# Rbind these dfs
+#==============
+str(stats_un_t1);str(stats_un_t2); str(stats_un_t3)
+
+n_dfs = 3
+
+if ( all.equal(nrow(stats_un_t1), nrow(stats_un_t2), nrow(stats_un_t3)) && 
+     all.equal(ncol(stats_un_t1), ncol(stats_un_t2), ncol(stats_un_t3)) ) {
+  expected_rows = nrow(stats_un_t1) * n_dfs
+  expected_cols = ncol(stats_un_t1)
+  print("PASS: expected_rows and cols variables generated for downstream sanity checks")
+}else{
+  cat("FAIL: dfs have different no. of rows and cols"
+      , "\nCheck harcoded value of n_dfs"
+      , "\nexpected_rows and cols could not be generated")
+  quit()
+}
+
+
+if ( all.equal(colnames(stats_un_t1), colnames(stats_un_t2), colnames(stats_un_t3)) ){
+  print("PASS: colnames match. Rbind the 3 dfs...")
+  combined_unpaired_stats = rbind(stats_un_t1, stats_un_t2, stats_un_t3)
+} else{
+  cat("FAIL: cannot combined dfs. Colnames don't match!")
+  quit()
+}
+
+
+if ( nrow(combined_unpaired_stats) == expected_rows && ncol(combined_unpaired_stats) == expected_cols ){
+  cat("PASS: combined_df has expected dimension"
+      , "\nNo. of rows in combined_df:", nrow(combined_unpaired_stats)
+      , "\nNo. of cols in combined_df:", ncol(combined_unpaired_stats) )
+}else{
+  cat("FAIL: combined_df dimension mismatch")
+  quit()
+}
+
+#===============================================================
+# formatting df
+# delete unnecessary column
+combined_unpaired_stats = subset(combined_unpaired_stats, select = -c(.y.))
+
+# reflect stats method correctly
+combined_unpaired_stats$method
+combined_unpaired_stats$method = gsub("Wilcoxon", "Wilcoxon_unpaired", combined_unpaired_stats$method)
+combined_unpaired_stats$method
+
+# replace "." in colnames with "_"
+colnames(combined_unpaired_stats)
+#names(combined_unpaired_stats) = gsub("\.", "_", names(combined_unpaired_stats)) # weird!!!!
+
+colnames(combined_unpaired_stats) = c("mediator"
+                                      ,"group1"
+                                      ,"group2"
+                                      ,"p"
+                                      ,"p_adj"
+                                      ,"p_format"
+                                      ,"p_signif"
+                                      ,"method"
+                                      , "timepoint") 
+
+colnames(combined_unpaired_stats)
+
+# add an extra column for padjust_signif
+combined_unpaired_stats$padjust_signif = round(combined_unpaired_stats$p_adj, digits = 2)
+
+# add appropriate symbols for padjust_signif
+#combined_unpaired_stats = combined_unpaired_stats %>%
+#  mutate(padjust_signif = case_when(padjust_signif == 0.05 ~ "."
+#                                    , padjust_signif <0.05 ~ '*'
+#                                    , padjust_signif <=0.01 ~ '**'
+#                                    , padjust_signif <=0.001 ~ '***'
+#                                    , padjust_signif <=0.0001 ~ '****'
+#                                    , TRUE ~ 'ns'))
+
+combined_unpaired_stats  = dplyr::mutate(combined_unpaired_stats, padjust_signif = case_when(padjust_signif == 0.05 ~ "."
+                                    , padjust_signif <=0.0001 ~ '****'
+                                    , padjust_signif <=0.001 ~ '***'
+                                    , padjust_signif <=0.01 ~ '**'
+                                    , padjust_signif <0.05 ~ '*'
+                                    , TRUE ~ 'ns'))
+
+
+# reorder columns
+print("preparing to reorder columns...")
+colnames(combined_unpaired_stats)
+my_col_order2 = c("mediator"
+                  , "timepoint"
+                  , "group1"
+                  , "group2"
+                  , "method"
+                  , "p"
+                  , "p_format"
+                  , "p_signif"
+                  , "p_adj"
+                  , "padjust_signif")
+
+if( length(my_col_order2) == ncol(combined_unpaired_stats) && isin(my_col_order2, colnames(combined_unpaired_stats)) ){
+  print("PASS: Reordering columns...")
+  combined_unpaired_stats_f = combined_unpaired_stats[, my_col_order2]
+  print("Successful: column reordering")
+  print("formatted df called:'combined_unpaired_stats_f'")
+  cat('\nformatted df has the following dimensions\n')
+  print(dim(combined_unpaired_stats_f ))
+} else{
+  cat(paste0("FAIL:Cannot reorder columns, length mismatch"
+             , "\nExpected column order for: ", ncol(combined_unpaired_stats)
+             , "\nGot:", length(my_col_order2)))
+  quit()
+}    
+
+#******************
+# write output file
+#******************
+cat("UNpaired stats for groups will be:", stats_time_unpaired)
+write.csv(combined_unpaired_stats_f, stats_time_unpaired, row.names = FALSE)
+
--- a/summary_stats_by_time_outcome.R
+++ b/summary_stats_by_time_outcome.R
@ -10,15 +10,16 @@ getwd()
 ############################################################
 # source data
 source("read_data.R")
-#==========================================================
-# define output filenames
+############################################################
+#=========================================
+# output: summary stats by time + outcome
+#=========================================
 summary_stats_time_outcome = paste0(outdir_stats, "summary_stats_timepoint_outcome_v3.csv")

-#==========================================================
+############################################################
 # data assignment for stats
 wf = wf_data
 lf = lf_data
-
 ############################################################

 #=======================================================
--- a/summary_stats_time.R
+++ b/summary_stats_time.R
@ -10,14 +10,17 @@ getwd()
 ############################################################
 # source data
 source("read_data.R")
-#==========================================================
-# define output filenames
+############################################################
+#===============================
+# output: summary stats by time
+#===============================
 summary_stats_timepoint_combined = paste0(outdir_stats, "summary_stats_timepoint_v3.csv")

-#==========================================================
+############################################################
 # data assignment for stats
 wf = wf_data
 lf = lf_data
+########################################################################
 #=======================================================
 # summary stats by timepoint and outcome: each mediator
 #=======================================================