added summary stats by time and outcome

2020-10-23 11:11:21 +01:00 · 2020-10-23 11:11:21 +01:00 · b0c06b9704
commit b0c06b9704
parent ce962e7475
1 changed files with 199 additions and 0 deletions
--- a/summary_stats_by_time_outcome.R
+++ b/summary_stats_by_time_outcome.R
@ -0,0 +1,199 @@
 #!/usr/bin/Rscript   
 getwd()
 setwd('~/git/covid_analysis/')
 getwd()
 ############################################################
 # TASK: basic plots
 # useful links:
 # http://www.sthda.com/english/wiki/ggplot2-dot-plot-quick-start-guide-r-software-and-data-visualization
 ############################################################
 # source data
 source("read_data.R")
 #==========================================================
 # define output filenames
 summary_stats_time_outcome = paste0(outdir_stats, "summary_stats_timepoint_outcome_v3.csv")
 #==========================================================
 # data assignment for stats
 wf = wf_data
 lf = lf_data
 ############################################################
 #=======================================================
 # summary stats by timepoint and outcome: each mediator
 #=======================================================
 #******************************
 # all major summary stats: by time and outcome
 #******************************
 gp_stats = groupedstats::grouped_summary(
  data = lf,
  grouping.vars = c(mediator, timepoint, outcomes),
  measures = value,
  measures.type = "numeric")
 # convert to df
 gp_stats_df = as.data.frame(gp_stats)
 orig_ncols = ncol(gp_stats_df)
 print("Dropping skim cols...")
 ncols_drop = 2
 gp_stats_df  = subset(gp_stats_df , select = -c(skim_type, skim_variable))
 if(ncol(gp_stats_df) == orig_ncols - ncols_drop){
  print('PASS: cols dropped successfully and dimemsions matched')
 } else{
  print("FAIL: dim mismatch. check harcoded value of ncol_drop")
 }
 class(gp_stats_df); str(gp_stats_df)
 # convert to factor to be consistent with to allow intersecting cols for merge
 gp_stats_df$mediator = as.character(gp_stats_df$mediator)
 gp_stats_df$timepoint = as.character(gp_stats_df$timepoint)
 class(gp_stats_df); str(gp_stats_df)
 #===============================================================
 #******************************
 # summary stats not covered 
 # by groupedstats
 #******************************
 gp_gmean = lf %>%
  group_by(mediator, timepoint, outcomes) %>%
  dplyr::summarise(total_subjects = n()
            , mean = mean(value, na.rm = T)
            , sd = sd(value, na.rm = T)
            , mean_CI = qwraps2::frmtci(mean_ci(value, na_rm =T))
            , gmean_SD = gmean_sd(value, na_rm = T, denote_sd = "paren")
            , gmean_CI = qwraps2::frmtci(gmeanCI(value, na.rm = T)) 
            , median =  median(value, na.rm = T)
            , median_iqr = median_iqr(value, na_rm = T)
            , skew = skewness(value, na.rm = T)
            , kurtosis = kurtosis(value, na.rm = T)
  )
 class(gp_gmean); str(gp_gmean)
 #***************
 # convert to df
 #***************
 gp_gmean_df = as.data.frame(gp_gmean)
 class(gp_gmean_df); str(gp_gmean_df)
 #***************
 # format cols displaying na_rm info, this is covered by complete column
 #***************
 gp_gmean_df$gmean_SD <- gsub("[0-9]+; ", "", gp_gmean_df$gmean_SD)
 gp_gmean_df$median_iqr <- gsub("[0-9]+; ", "", gp_gmean_df$median_iqr)
 ############################################################
 #*******************
 # merge the two dfs
 #*******************
 common_cols = intersect(names(gp_gmean_df), names(gp_stats_df))
 print(common_cols)
 # sanity check
 for (i in common_cols){
  #print(i)
  if( all.equal(gp_gmean_df[i], gp_stats_df[i]) ){
    print('PASS')
    expected_rows = nlevels(factor(gp_gmean_df$mediator)) * nlevels(factor(gp_gmean_df$timepoint)) * nlevels(factor(gp_gmean_df$outcomes))
  } else{
    print('something went wrong!')
  }
 }
 # combining dfs
 if (nrow(gp_gmean_df) && nrow(gp_stats_df) == expected_rows){
  expected_cols = ncol(gp_gmean_df) + ncol(gp_stats_df) - length(common_cols)
  print('combining dfs...')
  # combined_df of stats
  combined_stats_df = merge(gp_stats_df, gp_gmean_df, by = common_cols, all = T )
  cat(paste0('nrows combined df: ', dim(combined_stats_df)[1])) 
  cat(paste0('\nncols combined df: ', dim(combined_stats_df)[2])) 
 }
 # make complete_to_percentage
 combined_stats_df$complete_cases_percent = round(combined_stats_df$complete*100, digits = 2)
 head(combined_stats_df$complete_cases_percent)
 # rearrange cols
 orig_colnames = colnames(combined_stats_df)
 orig_colnames
 # rearrange columns
 my_col_order = c("mediator" ,"timepoint", "outcomes"
                 , "n", "total_subjects" ,"missing"   
                 , "complete", "complete_cases_percent"
                 , "mean", "mean.conf.low" ,"mean.conf.high", "mean_CI"
                 , "sd", "std.error"
                 , "min" ,"median","p25","p75"   
                 , "median_iqr","max", "gmean_SD"     
                 , "gmean_CI","skew","kurtosis")
 length(my_col_order)
 if( (length(orig_colnames) == length(my_col_order)) && all(my_col_order %in% orig_colnames) ) {
  cat("PASS: ncols match and colnames BOTH match for reordering"
        , "\nReordering columns...")
  # combined_df reordered
  combined_stats_df_formatted =  combined_stats_df[, my_col_order]
  print("Successful: column reordering")
  print("formatted df called:'combined_stats_df_formatted'")
  cat('\nformatted df has the following dimensions\n')
  print(dim(combined_stats_df_formatted ))
 } else{
  cat(paste0("FAIL:Cannot reorder columns, length mismatch"
             , "\nExpected column order for: ", ncol(combined_stats_df)
             , "\nGot:", length(my_col_order)))
  quit()
 }    
 # rename cols
 orig_colnames_f = colnames(combined_stats_df_formatted)
 orig_colnames_f
 # rename cols
 new_colnames = c("mediator" ,"timepoint", "outcomes"
                 , "n", "n_total_subjects" ,"n_missing"   
                 , "complete", "complete_cases_percent"
                 , "mean","mean_ci_low" ,"mean_ci_high", "mean_ci"
                 , "sd", "std_error"
                 , "min" ,"median","p25","p75"   
                 , "median_iqr","max", "gmean_SD"     
                 , "gmean_CI","skew","kurtosis")
 if(length(new_colnames) == length(orig_colnames_f)){
  print("PASS: renaming columns...")
  colnames(combined_stats_df_formatted) = new_colnames 
  print("Columns renamed successfully")
 }
 # delete redundant cols
 #combined_stats_df_formatted = subset(combined_stats_df_formatted, select = -c(mean_ci, median_iqr))
 #cols_to_remove = c("complete", "mean_ci", "median_iqr")
 combined_stats_df_f = subset(combined_stats_df_formatted, select = -c(complete, mean_ci, median_iqr) )
 ############################################################
 #****************
 # write file out
 #*****************
 cat("Summary stats by time and outcome will be:", summary_stats_time_outcome)
 write.csv(combined_stats_df_f, summary_stats_time_outcome, row.names = FALSE)
 ############################################################