added summary_stats_time_group for sam and serum
This commit is contained in:
parent
a2dbecd03d
commit
936bc23aa2
2 changed files with 420 additions and 0 deletions
211
summary_stats_time_group_sam.R
Executable file
211
summary_stats_time_group_sam.R
Executable file
|
@ -0,0 +1,211 @@
|
|||
#!/usr/bin/Rscript
|
||||
getwd()
|
||||
setwd("~/git/mosaic_2020/")
|
||||
getwd()
|
||||
############################################################
|
||||
# TASK: Summary stats for mediators
|
||||
# sample type: SAM
|
||||
# data: Flu positive adult patients
|
||||
# group: obesity
|
||||
############################################################
|
||||
my_sample_type = "sam"
|
||||
|
||||
#=============
|
||||
# Input
|
||||
#=============
|
||||
source("data_extraction_formatting.R")
|
||||
|
||||
# check: adult variable and age variable discrepancy!
|
||||
metadata_all$mosaic[metadata_all$adult==1 & metadata_all$age<=18]
|
||||
|
||||
#=============
|
||||
# Output
|
||||
#=============
|
||||
outfile_summ_stats = paste0("flu_summary_stats_time_obesity_", my_sample_type, ".csv")
|
||||
summary_stats_time_outcome = paste0(outdir_stats, outfile_summ_stats)
|
||||
|
||||
#===============================
|
||||
# data assignment for stats
|
||||
#================================
|
||||
wf = sam_wf[sam_wf$flustat == 1,]
|
||||
lf = sam_lf[sam_lf$flustat == 1,]
|
||||
lf$timepoint = paste0("t", lf$timepoint)
|
||||
lf = lf[!lf$mediator == "vitd",]
|
||||
########################################################################
|
||||
|
||||
#=======================================================
|
||||
# summary stats by timepoint and obesity: each mediator
|
||||
#=======================================================
|
||||
#******************************
|
||||
# all major summary stats: by time and outcome
|
||||
#******************************
|
||||
gp_stats = groupedstats::grouped_summary(
|
||||
data = lf,
|
||||
grouping.vars = c(mediator, timepoint, obesity),
|
||||
measures = value,
|
||||
measures.type = "numeric")
|
||||
|
||||
# convert to df
|
||||
gp_stats_df = as.data.frame(gp_stats)
|
||||
orig_ncols = ncol(gp_stats_df)
|
||||
|
||||
print("Dropping skim cols...")
|
||||
ncols_drop = 2
|
||||
gp_stats_df = subset(gp_stats_df , select = -c(skim_type, skim_variable))
|
||||
|
||||
if(ncol(gp_stats_df) == orig_ncols - ncols_drop){
|
||||
|
||||
print('PASS: cols dropped successfully and dimemsions matched')
|
||||
} else{
|
||||
print("FAIL: dim mismatch. check harcoded value of ncol_drop")
|
||||
}
|
||||
|
||||
class(gp_stats_df); str(gp_stats_df)
|
||||
|
||||
# convert to factor to be consistent with to allow intersecting cols for merge
|
||||
gp_stats_df$mediator = as.character(gp_stats_df$mediator)
|
||||
gp_stats_df$timepoint = as.character(gp_stats_df$timepoint)
|
||||
|
||||
class(gp_stats_df); str(gp_stats_df)
|
||||
|
||||
#===============================================================
|
||||
|
||||
#******************************
|
||||
# summary stats not covered
|
||||
# by groupedstats
|
||||
#******************************
|
||||
gp_gmean = lf %>%
|
||||
group_by(mediator, timepoint, obesity) %>%
|
||||
dplyr::summarise(total_subjects = n()
|
||||
, mean = mean(value, na.rm = T)
|
||||
, sd = sd(value, na.rm = T)
|
||||
, mean_CI = qwraps2::frmtci(mean_ci(value, na_rm =T))
|
||||
, gmean_SD = gmean_sd(value, na_rm = T, denote_sd = "paren")
|
||||
, gmean_CI = qwraps2::frmtci(gmeanCI(value, na.rm = T))
|
||||
, median = median(value, na.rm = T)
|
||||
, median_iqr = median_iqr(value, na_rm = T)
|
||||
, skew = skewness(value, na.rm = T)
|
||||
, kurtosis = kurtosis(value, na.rm = T)
|
||||
)
|
||||
|
||||
class(gp_gmean); str(gp_gmean)
|
||||
|
||||
#***************
|
||||
# convert to df
|
||||
#***************
|
||||
gp_gmean_df = as.data.frame(gp_gmean)
|
||||
class(gp_gmean_df); str(gp_gmean_df)
|
||||
|
||||
#***************
|
||||
# format cols displaying na_rm info, this is covered by complete column
|
||||
#***************
|
||||
gp_gmean_df$gmean_SD <- gsub("[0-9]+; ", "", gp_gmean_df$gmean_SD)
|
||||
gp_gmean_df$median_iqr <- gsub("[0-9]+; ", "", gp_gmean_df$median_iqr)
|
||||
|
||||
############################################################
|
||||
|
||||
#*******************
|
||||
# merge the two dfs
|
||||
#*******************
|
||||
common_cols = intersect(names(gp_gmean_df), names(gp_stats_df))
|
||||
print(common_cols)
|
||||
|
||||
# sanity check
|
||||
for (i in common_cols){
|
||||
#print(i)
|
||||
if( all.equal(gp_gmean_df[i], gp_stats_df[i]) ){
|
||||
print('PASS')
|
||||
expected_rows = nlevels(factor(gp_gmean_df$mediator)) * nlevels(factor(gp_gmean_df$timepoint)) * nlevels(factor(gp_gmean_df$obesity))
|
||||
} else{
|
||||
print('something went wrong!')
|
||||
}
|
||||
}
|
||||
|
||||
# combining dfs
|
||||
if (nrow(gp_gmean_df) && nrow(gp_stats_df) == expected_rows){
|
||||
expected_cols = ncol(gp_gmean_df) + ncol(gp_stats_df) - length(common_cols)
|
||||
print('combining dfs...')
|
||||
|
||||
# combined_df of stats
|
||||
combined_stats_df = merge(gp_stats_df, gp_gmean_df, by = common_cols, all = T )
|
||||
|
||||
cat(paste0('nrows combined df: ', dim(combined_stats_df)[1]))
|
||||
cat(paste0('\nncols combined df: ', dim(combined_stats_df)[2]))
|
||||
}
|
||||
|
||||
# make complete_to_percentage
|
||||
combined_stats_df$complete_cases_percent = round(combined_stats_df$complete*100, digits = 2)
|
||||
head(combined_stats_df$complete_cases_percent)
|
||||
|
||||
# rearrange cols
|
||||
orig_colnames = colnames(combined_stats_df)
|
||||
orig_colnames
|
||||
|
||||
# rearrange columns
|
||||
my_col_order = c("mediator" ,"timepoint", "obesity"
|
||||
, "n", "total_subjects" ,"missing"
|
||||
, "complete", "complete_cases_percent"
|
||||
, "mean", "mean.conf.low" ,"mean.conf.high", "mean_CI"
|
||||
, "sd", "std.error"
|
||||
, "min" ,"median","p25","p75"
|
||||
, "median_iqr","max", "gmean_SD"
|
||||
, "gmean_CI","skew","kurtosis")
|
||||
|
||||
length(my_col_order)
|
||||
|
||||
if( (length(orig_colnames) == length(my_col_order)) && all(my_col_order %in% orig_colnames) ) {
|
||||
cat("PASS: ncols match and colnames BOTH match for reordering"
|
||||
, "\nReordering columns...")
|
||||
# combined_df reordered
|
||||
combined_stats_df_formatted = combined_stats_df[, my_col_order]
|
||||
print("Successful: column reordering")
|
||||
print("formatted df called:'combined_stats_df_formatted'")
|
||||
cat('\nformatted df has the following dimensions\n')
|
||||
print(dim(combined_stats_df_formatted ))
|
||||
} else{
|
||||
cat(paste0("FAIL:Cannot reorder columns, length mismatch"
|
||||
, "\nExpected column order for: ", ncol(combined_stats_df)
|
||||
, "\nGot:", length(my_col_order)))
|
||||
quit()
|
||||
}
|
||||
|
||||
# rename cols
|
||||
orig_colnames_f = colnames(combined_stats_df_formatted)
|
||||
orig_colnames_f
|
||||
|
||||
# rename cols
|
||||
new_colnames = c("mediator" ,"timepoint", "obesity"
|
||||
, "n", "n_total_subjects" ,"n_missing"
|
||||
, "complete", "complete_cases_percent"
|
||||
, "mean","mean_ci_low" ,"mean_ci_high", "mean_ci"
|
||||
, "sd", "std_error"
|
||||
, "min" ,"median","p25","p75"
|
||||
, "median_iqr","max", "gmean_SD"
|
||||
, "gmean_CI","skew","kurtosis")
|
||||
|
||||
|
||||
|
||||
if(length(new_colnames) == length(orig_colnames_f)){
|
||||
print("PASS: renaming columns...")
|
||||
colnames(combined_stats_df_formatted) = new_colnames
|
||||
print("Columns renamed successfully")
|
||||
}
|
||||
|
||||
|
||||
# delete redundant cols
|
||||
#combined_stats_df_formatted = subset(combined_stats_df_formatted, select = -c(mean_ci, median_iqr))
|
||||
|
||||
#cols_to_remove = c("complete", "mean_ci", "median_iqr")
|
||||
combined_stats_df_f = subset(combined_stats_df_formatted, select = -c(complete, mean_ci, median_iqr) )
|
||||
combined_stats_df_f$sample_type = my_sample_type
|
||||
############################################################
|
||||
|
||||
#****************
|
||||
# write file out
|
||||
#*****************
|
||||
cat("Summary stats by time and outcome will be:", summary_stats_time_outcome)
|
||||
write.csv(combined_stats_df_f, summary_stats_time_outcome, row.names = FALSE)
|
||||
|
||||
############################################################
|
||||
|
||||
|
209
summary_stats_time_group_serum.R
Normal file
209
summary_stats_time_group_serum.R
Normal file
|
@ -0,0 +1,209 @@
|
|||
#!/usr/bin/Rscript
|
||||
getwd()
|
||||
setwd("~/git/mosaic_2020/")
|
||||
getwd()
|
||||
############################################################
|
||||
# TASK: Summary stats for mediators
|
||||
# sample type: SERUM
|
||||
# data: Flu positive adult patients
|
||||
# group: obesity
|
||||
############################################################
|
||||
my_sample_type = "serum"
|
||||
|
||||
#=============
|
||||
# Input
|
||||
#=============
|
||||
source("data_extraction_formatting.R")
|
||||
|
||||
# check: adult variable and age variable discrepancy!
|
||||
metadata_all$mosaic[metadata_all$adult==1 & metadata_all$age<=18]
|
||||
|
||||
#=============
|
||||
# Output
|
||||
#=============
|
||||
outfile_summ_stats = paste0("flu_summary_stats_time_obesity_", my_sample_type, ".csv")
|
||||
summary_stats_time_outcome = paste0(outdir_stats, outfile_summ_stats)
|
||||
|
||||
#===============================
|
||||
# data assignment for stats
|
||||
#================================
|
||||
wf = serum_wf[serum_wf$flustat == 1,]
|
||||
lf = serum_lf[serum_lf$flustat == 1,]
|
||||
lf$timepoint = paste0("t", lf$timepoint)
|
||||
########################################################################
|
||||
|
||||
#=======================================================
|
||||
# summary stats by timepoint and obesity: each mediator
|
||||
#=======================================================
|
||||
#******************************
|
||||
# all major summary stats: by time and outcome
|
||||
#******************************
|
||||
gp_stats = groupedstats::grouped_summary(
|
||||
data = lf,
|
||||
grouping.vars = c(mediator, timepoint, obesity),
|
||||
measures = value,
|
||||
measures.type = "numeric")
|
||||
|
||||
# convert to df
|
||||
gp_stats_df = as.data.frame(gp_stats)
|
||||
orig_ncols = ncol(gp_stats_df)
|
||||
|
||||
print("Dropping skim cols...")
|
||||
ncols_drop = 2
|
||||
gp_stats_df = subset(gp_stats_df , select = -c(skim_type, skim_variable))
|
||||
|
||||
if(ncol(gp_stats_df) == orig_ncols - ncols_drop){
|
||||
|
||||
print('PASS: cols dropped successfully and dimemsions matched')
|
||||
} else{
|
||||
print("FAIL: dim mismatch. check harcoded value of ncol_drop")
|
||||
}
|
||||
|
||||
class(gp_stats_df); str(gp_stats_df)
|
||||
|
||||
# convert to factor to be consistent with to allow intersecting cols for merge
|
||||
gp_stats_df$mediator = as.character(gp_stats_df$mediator)
|
||||
gp_stats_df$timepoint = as.character(gp_stats_df$timepoint)
|
||||
|
||||
class(gp_stats_df); str(gp_stats_df)
|
||||
|
||||
#===============================================================
|
||||
|
||||
#******************************
|
||||
# summary stats not covered
|
||||
# by groupedstats
|
||||
#******************************
|
||||
gp_gmean = lf %>%
|
||||
group_by(mediator, timepoint, obesity) %>%
|
||||
dplyr::summarise(total_subjects = n()
|
||||
, mean = mean(value, na.rm = T)
|
||||
, sd = sd(value, na.rm = T)
|
||||
, mean_CI = qwraps2::frmtci(mean_ci(value, na_rm =T))
|
||||
, gmean_SD = gmean_sd(value, na_rm = T, denote_sd = "paren")
|
||||
, gmean_CI = qwraps2::frmtci(gmeanCI(value, na.rm = T))
|
||||
, median = median(value, na.rm = T)
|
||||
, median_iqr = median_iqr(value, na_rm = T)
|
||||
, skew = skewness(value, na.rm = T)
|
||||
, kurtosis = kurtosis(value, na.rm = T)
|
||||
)
|
||||
|
||||
class(gp_gmean); str(gp_gmean)
|
||||
|
||||
#***************
|
||||
# convert to df
|
||||
#***************
|
||||
gp_gmean_df = as.data.frame(gp_gmean)
|
||||
class(gp_gmean_df); str(gp_gmean_df)
|
||||
|
||||
#***************
|
||||
# format cols displaying na_rm info, this is covered by complete column
|
||||
#***************
|
||||
gp_gmean_df$gmean_SD <- gsub("[0-9]+; ", "", gp_gmean_df$gmean_SD)
|
||||
gp_gmean_df$median_iqr <- gsub("[0-9]+; ", "", gp_gmean_df$median_iqr)
|
||||
|
||||
############################################################
|
||||
|
||||
#*******************
|
||||
# merge the two dfs
|
||||
#*******************
|
||||
common_cols = intersect(names(gp_gmean_df), names(gp_stats_df))
|
||||
print(common_cols)
|
||||
|
||||
# sanity check
|
||||
for (i in common_cols){
|
||||
#print(i)
|
||||
if( all.equal(gp_gmean_df[i], gp_stats_df[i]) ){
|
||||
print('PASS')
|
||||
expected_rows = nlevels(factor(gp_gmean_df$mediator)) * nlevels(factor(gp_gmean_df$timepoint)) * nlevels(factor(gp_gmean_df$obesity))
|
||||
} else{
|
||||
print('something went wrong!')
|
||||
}
|
||||
}
|
||||
|
||||
# combining dfs
|
||||
if (nrow(gp_gmean_df) && nrow(gp_stats_df) == expected_rows){
|
||||
expected_cols = ncol(gp_gmean_df) + ncol(gp_stats_df) - length(common_cols)
|
||||
print('combining dfs...')
|
||||
|
||||
# combined_df of stats
|
||||
combined_stats_df = merge(gp_stats_df, gp_gmean_df, by = common_cols, all = T )
|
||||
|
||||
cat(paste0('nrows combined df: ', dim(combined_stats_df)[1]))
|
||||
cat(paste0('\nncols combined df: ', dim(combined_stats_df)[2]))
|
||||
}
|
||||
|
||||
# make complete_to_percentage
|
||||
combined_stats_df$complete_cases_percent = round(combined_stats_df$complete*100, digits = 2)
|
||||
head(combined_stats_df$complete_cases_percent)
|
||||
|
||||
# rearrange cols
|
||||
orig_colnames = colnames(combined_stats_df)
|
||||
orig_colnames
|
||||
|
||||
# rearrange columns
|
||||
my_col_order = c("mediator" ,"timepoint", "obesity"
|
||||
, "n", "total_subjects" ,"missing"
|
||||
, "complete", "complete_cases_percent"
|
||||
, "mean", "mean.conf.low" ,"mean.conf.high", "mean_CI"
|
||||
, "sd", "std.error"
|
||||
, "min" ,"median","p25","p75"
|
||||
, "median_iqr","max", "gmean_SD"
|
||||
, "gmean_CI","skew","kurtosis")
|
||||
|
||||
length(my_col_order)
|
||||
|
||||
if( (length(orig_colnames) == length(my_col_order)) && all(my_col_order %in% orig_colnames) ) {
|
||||
cat("PASS: ncols match and colnames BOTH match for reordering"
|
||||
, "\nReordering columns...")
|
||||
# combined_df reordered
|
||||
combined_stats_df_formatted = combined_stats_df[, my_col_order]
|
||||
print("Successful: column reordering")
|
||||
print("formatted df called:'combined_stats_df_formatted'")
|
||||
cat('\nformatted df has the following dimensions\n')
|
||||
print(dim(combined_stats_df_formatted ))
|
||||
} else{
|
||||
cat(paste0("FAIL:Cannot reorder columns, length mismatch"
|
||||
, "\nExpected column order for: ", ncol(combined_stats_df)
|
||||
, "\nGot:", length(my_col_order)))
|
||||
quit()
|
||||
}
|
||||
|
||||
# rename cols
|
||||
orig_colnames_f = colnames(combined_stats_df_formatted)
|
||||
orig_colnames_f
|
||||
|
||||
# rename cols
|
||||
new_colnames = c("mediator" ,"timepoint", "obesity"
|
||||
, "n", "n_total_subjects" ,"n_missing"
|
||||
, "complete", "complete_cases_percent"
|
||||
, "mean","mean_ci_low" ,"mean_ci_high", "mean_ci"
|
||||
, "sd", "std_error"
|
||||
, "min" ,"median","p25","p75"
|
||||
, "median_iqr","max", "gmean_SD"
|
||||
, "gmean_CI","skew","kurtosis")
|
||||
|
||||
|
||||
|
||||
if(length(new_colnames) == length(orig_colnames_f)){
|
||||
print("PASS: renaming columns...")
|
||||
colnames(combined_stats_df_formatted) = new_colnames
|
||||
print("Columns renamed successfully")
|
||||
}
|
||||
|
||||
# delete redundant cols
|
||||
#combined_stats_df_formatted = subset(combined_stats_df_formatted, select = -c(mean_ci, median_iqr))
|
||||
|
||||
#cols_to_remove = c("complete", "mean_ci", "median_iqr")
|
||||
combined_stats_df_f = subset(combined_stats_df_formatted, select = -c(complete, mean_ci, median_iqr) )
|
||||
combined_stats_df_f$sample_type = my_sample_type
|
||||
############################################################
|
||||
|
||||
#****************
|
||||
# write file out
|
||||
#*****************
|
||||
cat("Summary stats by time and outcome will be:", summary_stats_time_outcome)
|
||||
write.csv(combined_stats_df_f, summary_stats_time_outcome, row.names = FALSE)
|
||||
|
||||
############################################################
|
||||
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue