commit 7c343f8bde18f1bb3794bc3119d296d8d44e9fa5 Author: Tanushree Tunstall Date: Fri Oct 23 10:55:09 2020 +0100 added summary stats and read data file scripts diff --git a/Header_TT.R b/Header_TT.R new file mode 100755 index 0000000..379a250 --- /dev/null +++ b/Header_TT.R @@ -0,0 +1,99 @@ +#if (!require(tidyverse)){ +#install.packages("tidyverse") +#} + +#install.packages("tidyverse") +library(tidyverse) + +#install.packages("plyr") +library(plyr) + +#install.packages("psych") +library(psych) + +#install.packages("Hmisc") +library(Hmisc) + +#install.packages("rstatix") +library(rstatix) + +#install.packages("GGally") +library(GGally) + +#install.packages("ggpubr") +library(ggpubr) + +#install.packages("standardize") +library(standardize) + +#install.packages("quantable") +library(quantable) + +#install.packages("DescTools") +library(DescTools) + +#install.packages("scales") +library(scales) + +#install.packages("arsenal") +library(arsenal) + +#install.packages("ggcorrplot") +library(ggcorrplot) + +library("RColorBrewer") +library(gplots) +#install.packages("png") +library(png) + +library(corrplot) + +library(corrgram) + +library(ggrepel) +#install.packages("ggbeeswarm") +library(ggbeeswarm) + +#install.packages("ggridges") +library(ggridges) + +#install.packages("summarytools") +#library(summarytools) + +#install.packages("groupedstats") +library(groupedstats) + +#install.packages("moments") +library(moments) + +#install.packages("prob") +library(prob) # python type isin function + +#install.packages("cobalt") +#library(cobalt) # var.names + +library(magrittr) + +library(qwraps2) +#options(qwraps2_markup = "markdown") + + +#if (!requireNamespace("BiocManager", quietly = TRUE)) +# install.packages("BiocManager") +#BiocManager::install("ComplexHeatmap") + +#library(ComplexHeatmap) + + +my_kurtosis <- function(x) { + m4 <- mean((x - mean(x))^4) + kurtosis <- m4/(sd(x)^4) - 3 + kurtosis +} + +my_skewness <- function(x) { + m3 <- mean((x - mean(x))^3) + skewness <- m3/(sd(x)^3) + skewness +} + diff --git a/read_data.R b/read_data.R new file mode 100644 index 0000000..e052989 --- /dev/null +++ b/read_data.R @@ -0,0 +1,32 @@ +#!/usr/bin/Rscript +getwd() +setwd('~/git/covid_analysis/') +getwd() +############################################################ +# TASK: basic plots + +# useful links: +# http://www.sthda.com/english/wiki/ggplot2-dot-plot-quick-start-guide-r-software-and-data-visualization +############################################################ +# load libraries, packages and local imports + +source("Header_TT.R") +#========================================================== +datadir = "~/git/covid_data/" +outdir = "~/git/covid_data/output/" +outdir_plots = "~/git/covid_data/output/plots/" +outdir_stats = "~/git/covid_data/output/stats/" + +#========================================================== +# input data files +infile_icu_wf = paste0(datadir,"icu_covid_wf.csv") +infile_icu_lf = paste0(datadir,"icu_covid_lf.csv") + +#%%======================================================== +# read file +wf_data = read.csv(infile_icu_wf , stringsAsFactors = F) +dim(wf_data) + +lf_data = read.csv(infile_icu_lf , stringsAsFactors = F) +dim(lf_data) +#%%======================================================== diff --git a/summary_stats_time.R b/summary_stats_time.R new file mode 100755 index 0000000..4a3ec4f --- /dev/null +++ b/summary_stats_time.R @@ -0,0 +1,195 @@ +#!/usr/bin/Rscript +getwd() +setwd('~/git/covid_analysis/') +getwd() +############################################################ +# TASK: basic plots + +# useful links: +# http://www.sthda.com/english/wiki/ggplot2-dot-plot-quick-start-guide-r-software-and-data-visualization +############################################################ +# source data +source("read_data.R") +#========================================================== +# define output filenames +summary_stats_timepoint_combined = paste0(outdir_stats, "summary_stats_timepoint_v1.csv") + +#========================================================== +# data assignment for stats +wf = wf_data +lf = lf_data +#======================================================= +# summary stats by timepoint and outcome: each mediator +#======================================================= +#****************************** +# all major summary stats: by time and outcome +#****************************** +gp_stats = groupedstats::grouped_summary( + data = lf, + grouping.vars = c(mediator, timepoint), + measures = value, + measures.type = "numeric") + +# convert to df +gp_stats_df = as.data.frame(gp_stats) +orig_ncols = ncol(gp_stats_df) + +print("Dropping skim cols...") +ncols_drop = 2 +gp_stats_df = subset(gp_stats_df , select = -c(skim_type, skim_variable)) + +if(ncol(gp_stats_df) == orig_ncols - ncols_drop){ + + print('PASS: cols dropped successfully and dimemsions matched') +} else{ + print("FAIL: dim mismatch. check harcoded value of ncol_drop") +} + +class(gp_stats_df); str(gp_stats_df) + +# convert to factor to be consistent with to allow intersecting cols for merge +gp_stats_df$mediator = as.character(gp_stats_df$mediator) +gp_stats_df$timepoint = as.character(gp_stats_df$timepoint) + +class(gp_stats_df); str(gp_stats_df) + +#=============================================================== + +#****************************** +# summary stats not covered +# by groupedstats +#****************************** +gp_gmean = lf %>% + group_by(mediator, timepoint) %>% + dplyr::summarise(total_subjects = n() + , mean = mean(value, na.rm = T) + , sd = sd(value, na.rm = T) + , mean_CI = qwraps2::frmtci(mean_ci(value, na_rm =T)) + , gmean_SD = gmean_sd(value, na_rm = T, denote_sd = "paren") + , gmean_CI = qwraps2::frmtci(gmeanCI(value, na.rm = T)) + , median = median(value, na.rm = T) + , median_iqr = median_iqr(value, na_rm = T) + , skew = skewness(value, na.rm = T) + , kurtosis = kurtosis(value, na.rm = T) + ) + +class(gp_gmean); str(gp_gmean) + +#*************** +# convert to df +#*************** +gp_gmean_df = as.data.frame(gp_gmean) +class(gp_gmean_df); str(gp_gmean_df) + +#*************** +# format cols displaying na_rm info, this is covered by complete column +#*************** +gp_gmean_df$gmean_SD <- gsub("[0-9]+; ", "", gp_gmean_df$gmean_SD) +gp_gmean_df$median_iqr <- gsub("[0-9]+; ", "", gp_gmean_df$median_iqr) + +############################################################ + +#******************* +# merge the two dfs +#******************* +common_cols = intersect(names(gp_gmean_df), names(gp_stats_df)) +print(common_cols) + +# sanity check +for (i in common_cols){ + #print(i) + if( all.equal(gp_gmean_df[i], gp_stats_df[i]) ){ + print('PASS') + expected_rows = nlevels(factor(gp_gmean_df$mediator)) * nlevels(factor(gp_gmean_df$timepoint)) + } else{ + print('something went wrong!') + } +} + +# combining dfs +if (nrow(gp_gmean_df) && nrow(gp_stats_df) == expected_rows){ + expected_cols = ncol(gp_gmean_df) + ncol(gp_stats_df) - length(common_cols) + print('combining dfs...') + + # combined_df of stats + combined_stats_df = merge(gp_stats_df, gp_gmean_df, by = common_cols, all = T ) + + cat(paste0('nrows combined df: ', dim(combined_stats_df)[1])) + cat(paste0('\nncols combined df: ', dim(combined_stats_df)[2])) +} + +# make complete_to_percentage +combined_stats_df$complete_cases_percent = round(combined_stats_df$complete*100, digits = 2) +head(combined_stats_df$complete_cases_percent) + +# rearrange cols +orig_colnames = colnames(combined_stats_df) +orig_colnames + +# rearrange columns +my_col_order = c("mediator" ,"timepoint" + , "n", "total_subjects" ,"missing" + , "complete", "complete_cases_percent" + , "mean", "mean.conf.low" ,"mean.conf.high", "mean_CI" + , "sd", "std.error" + , "min" ,"median","p25","p75" + , "median_iqr","max", "gmean_SD" + , "gmean_CI","skew","kurtosis") + +length(my_col_order) + +if( (length(orig_colnames) == length(my_col_order)) && all(my_col_order %in% orig_colnames) ) { + cat("PASS: ncols match and colnames BOTH match for reordering" + , "\nReordering columns...") + # combined_df reordered + combined_stats_df_formatted = combined_stats_df[, my_col_order] + print("Successful: column reordering") + print("formatted df called:'combined_stats_df_formatted'") + cat('\nformatted df has the following dimensions\n') + print(dim(combined_stats_df_formatted )) +} else{ + cat(paste0("FAIL:Cannot reorder columns, length mismatch" + , "\nExpected column order for: ", ncol(combined_stats_df) + , "\nGot:", length(my_col_order))) + quit() +} + +# rename cols +orig_colnames_f = colnames(combined_stats_df_formatted) +orig_colnames_f + +# rename cols +new_colnames = c("mediator" ,"timepoint" + , "n", "n_total_subjects" ,"n_missing" + , "complete", "complete_cases_percent" + , "mean","mean_ci_low" ,"mean_ci_high", "mean_ci" + , "sd", "std_error" + , "min" ,"median","p25","p75" + , "median_iqr","max", "gmean_SD" + , "gmean_CI","skew","kurtosis") + + + +if(length(new_colnames) == length(orig_colnames_f)){ + print("PASS: renaming columns...") + colnames(combined_stats_df_formatted) = new_colnames + print("Columns renamed successfully") +} + + +# delete redundant cols +#combined_stats_df_formatted = subset(combined_stats_df_formatted, select = -c(mean_ci, median_iqr)) + +#cols_to_remove = c("complete", "mean_ci", "median_iqr") +combined_stats_df_f = subset(combined_stats_df_formatted, select = -c(complete, mean_ci, median_iqr) ) + +############################################################ + +#**************** +# write file out +#***************** +cat("Summary stats by timepoint will be:", summary_stats_timepoint_combined) +write.csv(combined_stats_df_f, summary_stats_timepoint_combined, row.names = FALSE) + +############################################################ +