diff --git a/stats_paired.R b/stats_paired.R new file mode 100644 index 0000000..e37c99c --- /dev/null +++ b/stats_paired.R @@ -0,0 +1,111 @@ +#!/usr/bin/Rscript +getwd() +setwd('~/git/covid_analysis/') +getwd() +############################################################ +# TASK: basic plots + +# useful links: +# http://www.sthda.com/english/wiki/ggplot2-dot-plot-quick-start-guide-r-software-and-data-visualization +############################################################ +# source data +source("read_data.R") +############################################################ +#========================= +# output: paired_analysis +#========================= +stats_time_paired = paste0(outdir_stats, "stats_paired_v3.csv") + +############################################################ +# data assignment for stats +wf = wf_data +lf = lf_data +######################################################################## +# Pairwise stats by timepoint: wilcoxon paired analysis with correction +######################################################################## +# with adjustment: fdr and BH are identical +my_adjust_method = "BH" +stats_by_timepoint = compare_means(value~timepoint, group.by = "mediator" + , data = lf + , paired = TRUE + , p.adjust.method = my_adjust_method) + + +# check: satisfied!!!! +wilcox.test(wf$sESelectin_ngmL_t1, wf$sESelectin_ngmL_t2, paired = T) +wilcox.test(wf$sRAGE_pgmL_t1, wf$sRAGE_pgmL_t2, paired = T) + +# delete unnecessary column +stats_by_timepoint = subset(stats_by_timepoint, select = -c(.y.)) + +# reflect stats method correctly +stats_by_timepoint$method +stats_by_timepoint$method = gsub("Wilcoxon", "Wilcoxon_paired", stats_by_timepoint$method) +stats_by_timepoint$method + +# replace "." in colnames with "_" +colnames(stats_by_timepoint) +#names(stats_by_timepoint) = gsub("\.", "_", names(stats_by_timepoint)) # weird!!!! + +colnames(stats_by_timepoint) = c("mediator" + ,"group1" + ,"group2" + ,"p" + ,"p_adj" + ,"p_format" + ,"p_signif" + ,"method" ) + +colnames(stats_by_timepoint) + +# add an extra column for padjust_signif +stats_by_timepoint$padjust_signif = round(stats_by_timepoint$p_adj, digits = 2) + +# add appropriate symbols for padjust_signif +#stats_by_timepoint = stats_by_timepoint %>% +# mutate(padjust_signif = case_when(padjust_signif == 0.05 ~ "." +# , padjust_signif <0.05 ~ '*' +# , padjust_signif <=0.01 ~ '**' +# , padjust_signif <=0.001 ~ '***' +# , padjust_signif <=0.0001 ~ '****' +# , TRUE ~ 'ns')) + +stats_by_timepoint = dplyr::mutate(stats_by_timepoint, padjust_signif = case_when(padjust_signif == 0.05 ~ "." + , padjust_signif <=0.0001 ~ '****' + , padjust_signif <=0.001 ~ '***' + , padjust_signif <=0.01 ~ '**' + , padjust_signif <0.05 ~ '*' + , TRUE ~ 'ns')) + +# reorder columns +print("preparing to reorder columns...") +colnames(stats_by_timepoint) +my_col_order2 = c("mediator" + , "group1" + , "group2" + , "method" + , "p" + , "p_format" + , "p_signif" + , "p_adj" + , "padjust_signif") + +if( length(my_col_order2) == ncol(stats_by_timepoint) && isin(my_col_order2, colnames(stats_by_timepoint)) ){ + print("PASS: Reordering columns...") + stats_by_timepoint_f = stats_by_timepoint[, my_col_order2] + print("Successful: column reordering") + print("formatted df called:'stats_by_timepoint_f'") + cat('\nformatted df has the following dimensions\n') + print(dim(stats_by_timepoint_f )) +} else{ + cat(paste0("FAIL:Cannot reorder columns, length mismatch" + , "\nExpected column order for: ", ncol(stats_by_timepoint) + , "\nGot:", length(my_col_order2))) + quit() +} + +#****************** +# write output file +#****************** +cat("Paired stats by timepoint will be:", stats_time_paired) +write.csv(stats_by_timepoint_f, stats_time_paired, row.names = FALSE) diff --git a/stats_unpaired.R b/stats_unpaired.R new file mode 100644 index 0000000..73600d5 --- /dev/null +++ b/stats_unpaired.R @@ -0,0 +1,211 @@ +#!/usr/bin/Rscript +getwd() +setwd('~/git/covid_analysis/') +getwd() +############################################################ +# TASK: basic plots + +# useful links: +# http://www.sthda.com/english/wiki/ggplot2-dot-plot-quick-start-guide-r-software-and-data-visualization +############################################################ +# source data +source("read_data.R") +############################################################ +#============================ +# Output: unpaired analysis +#============================ +stats_time_unpaired = paste0(outdir_stats, "stats_unpaired_v3.csv") + +############################################################ +# data assignment for stats +wf = wf_data +lf = lf_data + +######################################################################## +# Unpaired stats at each timepoint b/w groups: wilcoxon UNpaired analysis with correction +####################################################################### +# with adjustment: fdr and BH are identical +my_adjust_method = "BH" + +#============== +# unpaired: t1 +#============== +lf_t1 = lf[lf$timepoint == "t1",] +stats_un_t1 = compare_means(value~outcomes, group.by = "mediator" + , data = lf_t1 + , paired = FALSE + , p.adjust.method = my_adjust_method) + +stats_un_t1$timepoint = "t1" + +stats_un_t1 = as.data.frame(stats_un_t1) +class(stats_un_t1) + +# check: satisfied!!!! +wilcox.test(wf$sESelectin_ngmL_t1[wf$outcomes == 0], wf$sESelectin_ngmL_t1[wf$outcomes == 1] + , paired = FALSE) + +wilcox.test(wf$PF_units_t1[wf$outcomes==0], wf$PF_units_t1[wf$outcomes == 1] + , paired = FALSE) + +#============== +# unpaired: t2 +#============== +lf_t2 = lf[lf$timepoint == "t2",] +stats_un_t2 = compare_means(value~outcomes, group.by = "mediator" + , data = lf_t2 + , paired = FALSE + , p.adjust.method = my_adjust_method) + +stats_un_t2$timepoint = "t2" + +stats_un_t2 = as.data.frame(stats_un_t2) +class(stats_un_t2) + +# check: satisfied!!!! +wilcox.test(wf$sESelectin_ngmL_t2[wf$outcomes == 0], wf$sESelectin_ngmL_t2[wf$outcomes == 1] + , paired = FALSE) + +wilcox.test(wf$PF_units_t2[wf$outcomes==0], wf$PF_units_t2[wf$outcomes == 1] + , paired = FALSE) + +#============== +# unpaired: t3 +#============== +lf_t3 = lf[lf$timepoint == "t3",] +stats_un_t3 = compare_means(value~outcomes, group.by = "mediator" + , data = lf_t3 + , paired = FALSE + , p.adjust.method = my_adjust_method) + +stats_un_t3$timepoint = "t3" + +stats_un_t3 = as.data.frame(stats_un_t3) +class(stats_un_t3) + +# check: satisfied!!!! +wilcox.test(wf$sESelectin_ngmL_t3[wf$outcomes == 0], wf$sESelectin_ngmL_t3[wf$outcomes == 1] + , paired = FALSE) + +wilcox.test(wf$PF_units_t3[wf$outcomes==0], wf$PF_units_t3[wf$outcomes == 1] + , paired = FALSE) + + +#============== +# Rbind these dfs +#============== +str(stats_un_t1);str(stats_un_t2); str(stats_un_t3) + +n_dfs = 3 + +if ( all.equal(nrow(stats_un_t1), nrow(stats_un_t2), nrow(stats_un_t3)) && + all.equal(ncol(stats_un_t1), ncol(stats_un_t2), ncol(stats_un_t3)) ) { + expected_rows = nrow(stats_un_t1) * n_dfs + expected_cols = ncol(stats_un_t1) + print("PASS: expected_rows and cols variables generated for downstream sanity checks") +}else{ + cat("FAIL: dfs have different no. of rows and cols" + , "\nCheck harcoded value of n_dfs" + , "\nexpected_rows and cols could not be generated") + quit() +} + + +if ( all.equal(colnames(stats_un_t1), colnames(stats_un_t2), colnames(stats_un_t3)) ){ + print("PASS: colnames match. Rbind the 3 dfs...") + combined_unpaired_stats = rbind(stats_un_t1, stats_un_t2, stats_un_t3) +} else{ + cat("FAIL: cannot combined dfs. Colnames don't match!") + quit() +} + + +if ( nrow(combined_unpaired_stats) == expected_rows && ncol(combined_unpaired_stats) == expected_cols ){ + cat("PASS: combined_df has expected dimension" + , "\nNo. of rows in combined_df:", nrow(combined_unpaired_stats) + , "\nNo. of cols in combined_df:", ncol(combined_unpaired_stats) ) +}else{ + cat("FAIL: combined_df dimension mismatch") + quit() +} + +#=============================================================== +# formatting df +# delete unnecessary column +combined_unpaired_stats = subset(combined_unpaired_stats, select = -c(.y.)) + +# reflect stats method correctly +combined_unpaired_stats$method +combined_unpaired_stats$method = gsub("Wilcoxon", "Wilcoxon_unpaired", combined_unpaired_stats$method) +combined_unpaired_stats$method + +# replace "." in colnames with "_" +colnames(combined_unpaired_stats) +#names(combined_unpaired_stats) = gsub("\.", "_", names(combined_unpaired_stats)) # weird!!!! + +colnames(combined_unpaired_stats) = c("mediator" + ,"group1" + ,"group2" + ,"p" + ,"p_adj" + ,"p_format" + ,"p_signif" + ,"method" + , "timepoint") + +colnames(combined_unpaired_stats) + +# add an extra column for padjust_signif +combined_unpaired_stats$padjust_signif = round(combined_unpaired_stats$p_adj, digits = 2) + +# add appropriate symbols for padjust_signif +#combined_unpaired_stats = combined_unpaired_stats %>% +# mutate(padjust_signif = case_when(padjust_signif == 0.05 ~ "." +# , padjust_signif <0.05 ~ '*' +# , padjust_signif <=0.01 ~ '**' +# , padjust_signif <=0.001 ~ '***' +# , padjust_signif <=0.0001 ~ '****' +# , TRUE ~ 'ns')) + +combined_unpaired_stats = dplyr::mutate(combined_unpaired_stats, padjust_signif = case_when(padjust_signif == 0.05 ~ "." + , padjust_signif <=0.0001 ~ '****' + , padjust_signif <=0.001 ~ '***' + , padjust_signif <=0.01 ~ '**' + , padjust_signif <0.05 ~ '*' + , TRUE ~ 'ns')) + + +# reorder columns +print("preparing to reorder columns...") +colnames(combined_unpaired_stats) +my_col_order2 = c("mediator" + , "timepoint" + , "group1" + , "group2" + , "method" + , "p" + , "p_format" + , "p_signif" + , "p_adj" + , "padjust_signif") + +if( length(my_col_order2) == ncol(combined_unpaired_stats) && isin(my_col_order2, colnames(combined_unpaired_stats)) ){ + print("PASS: Reordering columns...") + combined_unpaired_stats_f = combined_unpaired_stats[, my_col_order2] + print("Successful: column reordering") + print("formatted df called:'combined_unpaired_stats_f'") + cat('\nformatted df has the following dimensions\n') + print(dim(combined_unpaired_stats_f )) +} else{ + cat(paste0("FAIL:Cannot reorder columns, length mismatch" + , "\nExpected column order for: ", ncol(combined_unpaired_stats) + , "\nGot:", length(my_col_order2))) + quit() +} + +#****************** +# write output file +#****************** +cat("UNpaired stats for groups will be:", stats_time_unpaired) +write.csv(combined_unpaired_stats_f, stats_time_unpaired, row.names = FALSE) + diff --git a/summary_stats_by_time_outcome.R b/summary_stats_by_time_outcome.R index b712422..8511d9f 100644 --- a/summary_stats_by_time_outcome.R +++ b/summary_stats_by_time_outcome.R @@ -10,15 +10,16 @@ getwd() ############################################################ # source data source("read_data.R") -#========================================================== -# define output filenames +############################################################ +#========================================= +# output: summary stats by time + outcome +#========================================= summary_stats_time_outcome = paste0(outdir_stats, "summary_stats_timepoint_outcome_v3.csv") -#========================================================== +############################################################ # data assignment for stats wf = wf_data lf = lf_data - ############################################################ #======================================================= diff --git a/summary_stats_time.R b/summary_stats_time.R index 127a11f..be1a9de 100755 --- a/summary_stats_time.R +++ b/summary_stats_time.R @@ -10,14 +10,17 @@ getwd() ############################################################ # source data source("read_data.R") -#========================================================== -# define output filenames +############################################################ +#=============================== +# output: summary stats by time +#=============================== summary_stats_timepoint_combined = paste0(outdir_stats, "summary_stats_timepoint_v3.csv") -#========================================================== +############################################################ # data assignment for stats wf = wf_data lf = lf_data +######################################################################## #======================================================= # summary stats by timepoint and outcome: each mediator #=======================================================