From b0c06b97049ee1ebec6f3df0965eb59bc08b01ac Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Fri, 23 Oct 2020 11:11:21 +0100
Subject: [PATCH] added summary stats by time and outcome

---
 summary_stats_by_time_outcome.R | 199 ++++++++++++++++++++++++++++++++
 1 file changed, 199 insertions(+)
 create mode 100644 summary_stats_by_time_outcome.R

diff --git a/summary_stats_by_time_outcome.R b/summary_stats_by_time_outcome.R
new file mode 100644
index 0000000..b712422
--- /dev/null
+++ b/summary_stats_by_time_outcome.R
@@ -0,0 +1,199 @@
+#!/usr/bin/Rscript   
+getwd()
+setwd('~/git/covid_analysis/')
+getwd()
+############################################################
+# TASK: basic plots
+
+# useful links:
+# http://www.sthda.com/english/wiki/ggplot2-dot-plot-quick-start-guide-r-software-and-data-visualization
+############################################################
+# source data
+source("read_data.R")
+#==========================================================
+# define output filenames
+summary_stats_time_outcome = paste0(outdir_stats, "summary_stats_timepoint_outcome_v3.csv")
+
+#==========================================================
+# data assignment for stats
+wf = wf_data
+lf = lf_data
+
+############################################################
+
+#=======================================================
+# summary stats by timepoint and outcome: each mediator
+#=======================================================
+#******************************
+# all major summary stats: by time and outcome
+#******************************
+gp_stats = groupedstats::grouped_summary(
+  data = lf,
+  grouping.vars = c(mediator, timepoint, outcomes),
+  measures = value,
+  measures.type = "numeric")
+
+# convert to df
+gp_stats_df = as.data.frame(gp_stats)
+orig_ncols = ncol(gp_stats_df)
+
+print("Dropping skim cols...")
+ncols_drop = 2
+gp_stats_df  = subset(gp_stats_df , select = -c(skim_type, skim_variable))
+
+if(ncol(gp_stats_df) == orig_ncols - ncols_drop){
+  
+  print('PASS: cols dropped successfully and dimemsions matched')
+} else{
+  print("FAIL: dim mismatch. check harcoded value of ncol_drop")
+}
+
+class(gp_stats_df); str(gp_stats_df)
+
+# convert to factor to be consistent with to allow intersecting cols for merge
+gp_stats_df$mediator = as.character(gp_stats_df$mediator)
+gp_stats_df$timepoint = as.character(gp_stats_df$timepoint)
+
+class(gp_stats_df); str(gp_stats_df)
+
+#===============================================================
+
+#******************************
+# summary stats not covered 
+# by groupedstats
+#******************************
+gp_gmean = lf %>%
+  group_by(mediator, timepoint, outcomes) %>%
+  dplyr::summarise(total_subjects = n()
+            , mean = mean(value, na.rm = T)
+            , sd = sd(value, na.rm = T)
+            , mean_CI = qwraps2::frmtci(mean_ci(value, na_rm =T))
+            , gmean_SD = gmean_sd(value, na_rm = T, denote_sd = "paren")
+            , gmean_CI = qwraps2::frmtci(gmeanCI(value, na.rm = T)) 
+            , median =  median(value, na.rm = T)
+            , median_iqr = median_iqr(value, na_rm = T)
+            , skew = skewness(value, na.rm = T)
+            , kurtosis = kurtosis(value, na.rm = T)
+  )
+
+class(gp_gmean); str(gp_gmean)
+
+#***************
+# convert to df
+#***************
+gp_gmean_df = as.data.frame(gp_gmean)
+class(gp_gmean_df); str(gp_gmean_df)
+
+#***************
+# format cols displaying na_rm info, this is covered by complete column
+#***************
+gp_gmean_df$gmean_SD <- gsub("[0-9]+; ", "", gp_gmean_df$gmean_SD)
+gp_gmean_df$median_iqr <- gsub("[0-9]+; ", "", gp_gmean_df$median_iqr)
+
+############################################################
+
+#*******************
+# merge the two dfs
+#*******************
+common_cols = intersect(names(gp_gmean_df), names(gp_stats_df))
+print(common_cols)
+
+# sanity check
+for (i in common_cols){
+  #print(i)
+  if( all.equal(gp_gmean_df[i], gp_stats_df[i]) ){
+    print('PASS')
+    expected_rows = nlevels(factor(gp_gmean_df$mediator)) * nlevels(factor(gp_gmean_df$timepoint)) * nlevels(factor(gp_gmean_df$outcomes))
+  } else{
+    print('something went wrong!')
+  }
+}
+
+# combining dfs
+if (nrow(gp_gmean_df) && nrow(gp_stats_df) == expected_rows){
+  expected_cols = ncol(gp_gmean_df) + ncol(gp_stats_df) - length(common_cols)
+  print('combining dfs...')
+  
+  # combined_df of stats
+  combined_stats_df = merge(gp_stats_df, gp_gmean_df, by = common_cols, all = T )
+  
+  cat(paste0('nrows combined df: ', dim(combined_stats_df)[1])) 
+  cat(paste0('\nncols combined df: ', dim(combined_stats_df)[2])) 
+}
+
+# make complete_to_percentage
+combined_stats_df$complete_cases_percent = round(combined_stats_df$complete*100, digits = 2)
+head(combined_stats_df$complete_cases_percent)
+
+# rearrange cols
+orig_colnames = colnames(combined_stats_df)
+orig_colnames
+
+# rearrange columns
+my_col_order = c("mediator" ,"timepoint", "outcomes"
+                 , "n", "total_subjects" ,"missing"   
+                 , "complete", "complete_cases_percent"
+                 , "mean", "mean.conf.low" ,"mean.conf.high", "mean_CI"
+                 , "sd", "std.error"
+                 , "min" ,"median","p25","p75"   
+                 , "median_iqr","max", "gmean_SD"     
+                 , "gmean_CI","skew","kurtosis")
+
+length(my_col_order)
+
+if( (length(orig_colnames) == length(my_col_order)) && all(my_col_order %in% orig_colnames) ) {
+  cat("PASS: ncols match and colnames BOTH match for reordering"
+        , "\nReordering columns...")
+  # combined_df reordered
+  combined_stats_df_formatted =  combined_stats_df[, my_col_order]
+  print("Successful: column reordering")
+  print("formatted df called:'combined_stats_df_formatted'")
+  cat('\nformatted df has the following dimensions\n')
+  print(dim(combined_stats_df_formatted ))
+} else{
+  cat(paste0("FAIL:Cannot reorder columns, length mismatch"
+             , "\nExpected column order for: ", ncol(combined_stats_df)
+             , "\nGot:", length(my_col_order)))
+  quit()
+}    
+
+# rename cols
+orig_colnames_f = colnames(combined_stats_df_formatted)
+orig_colnames_f
+
+# rename cols
+new_colnames = c("mediator" ,"timepoint", "outcomes"
+                 , "n", "n_total_subjects" ,"n_missing"   
+                 , "complete", "complete_cases_percent"
+                 , "mean","mean_ci_low" ,"mean_ci_high", "mean_ci"
+                 , "sd", "std_error"
+                 , "min" ,"median","p25","p75"   
+                 , "median_iqr","max", "gmean_SD"     
+                 , "gmean_CI","skew","kurtosis")
+
+
+
+if(length(new_colnames) == length(orig_colnames_f)){
+  print("PASS: renaming columns...")
+  colnames(combined_stats_df_formatted) = new_colnames 
+  print("Columns renamed successfully")
+}
+
+
+# delete redundant cols
+#combined_stats_df_formatted = subset(combined_stats_df_formatted, select = -c(mean_ci, median_iqr))
+
+#cols_to_remove = c("complete", "mean_ci", "median_iqr")
+combined_stats_df_f = subset(combined_stats_df_formatted, select = -c(complete, mean_ci, median_iqr) )
+
+############################################################
+
+#****************
+# write file out
+#*****************
+cat("Summary stats by time and outcome will be:", summary_stats_time_outcome)
+write.csv(combined_stats_df_f, summary_stats_time_outcome, row.names = FALSE)
+
+############################################################
+
+