From 93973ed8505d68a6085b67dcae08258361507a20 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Sun, 18 Oct 2020 17:00:19 +0100 Subject: [PATCH] added stats_unpaired.R for sam, serum and npa --- mosaic_bmi_data_extraction.R | 158 ++++++++++++++++++---- stats_unpaired_npa.R | 255 +++++++++++++++++++++++++++++++++++ stats_unpaired_sam.R | 255 +++++++++++++++++++++++++++++++++++ stats_unpaired_serum.R | 255 +++++++++++++++++++++++++++++++++++ 4 files changed, 897 insertions(+), 26 deletions(-) create mode 100644 stats_unpaired_npa.R create mode 100644 stats_unpaired_sam.R create mode 100644 stats_unpaired_serum.R diff --git a/mosaic_bmi_data_extraction.R b/mosaic_bmi_data_extraction.R index 31020fe..0b9c76b 100644 --- a/mosaic_bmi_data_extraction.R +++ b/mosaic_bmi_data_extraction.R @@ -171,16 +171,22 @@ tail(colnames_check_f) # LF data ########################################################################## -#========= +#============== # lf data: sam -#========= +#============== str(sam_df) table(sam_df$obesity); table(sam_df$obese2) sam_df_adults = sam_df[sam_df$adult == 1,] -cols_to_omit = c("adult", "flustat", "type", "obesity" - , "height", "height_unit", "weight", "weight_unit","visual_est_bmi", "bmi_rating") +cols_to_omit = c("flustat", "type", "obesity" + , "height", "height_unit", "weight" + , "weight_unit", "visual_est_bmi", "bmi_rating") + +#sam_df_adults_clean = sam_df_adults[!cols_to_omit] + +wf_cols = colnames(sam_df_adults)[!colnames(sam_df_adults)%in%cols_to_omit] +sam_df_adults_clean = sam_df_adults[wf_cols] pivot_cols = meta_data_cols pivot_cols = meta_data_cols[!meta_data_cols%in%cols_to_omit];pivot_cols @@ -194,44 +200,144 @@ if (length(pivot_cols) == length(meta_data_cols) - length(cols_to_omit)){ quit() } -expected_rows_sam_lf = nrow(sam_df_adults) * (length(sam_df_adults) - length(pivot_cols)); expected_rows_sam_lf +expected_rows_sam_lf = nrow(sam_df_adults_clean) * (length(sam_df_adults_clean) - length(pivot_cols)); expected_rows_sam_lf # using regex: -sam_adults_lf = sam_df_adults %>% - tidyr::pivot_longer(-all_of(pivot_cols), names_to = c("mediator", "sample_type", "timepoint"), - names_pattern = "(.*)_(.*)([1-3]{1})", - values_to = "value") +sam_adults_lf = sam_df_adults_clean %>% + tidyr::pivot_longer(-all_of(pivot_cols) + , names_to = c("mediator", "sample_type", "timepoint") + , names_pattern = "(.*)_(.*)([1-3]{1})" + , values_to = "value") -if ((nrow(sam_lf) == expected_rows_sam_lf) & (sum(table(is.na(sam_adults_lf$mediator))) == expected_rows_sam_lf)) { +if ( + (nrow(sam_adults_lf) == expected_rows_sam_lf) & (sum(table(is.na(sam_adults_lf$mediator))) == expected_rows_sam_lf) + ) { cat(paste0("PASS: long format data has correct no. of rows and NA in mediator:" - , "\nNo. of rows: ", nrow(sam_lf) - , "\nNo. of cols: ", ncol(sam_lf))) + , "\nNo. of rows: ", nrow(sam_adults_lf) + , "\nNo. of cols: ", ncol(sam_adults_lf))) } else{ cat(paste0("FAIL:long format data has unexpected no. of rows or NAs in mediator" , "\nExpected no. of rows: ", expected_rows_sam_lf - , "\nGot: ", nrow(sam_lf) + , "\nGot: ", nrow(sam_adults_lf) + , "\ncheck expected rows calculation!")) + quit() +} + +#library(data.table) +#foo = sam_df_adults[1:10] +#long <- melt(setDT(sam_df_adults), id.vars = pivot_cols, variable.name = "levels") + +#============== +# lf data: serum +#============== +str(serum_df) +table(serum_df$obesity); table(serum_df$obese2) + +serum_df_adults = serum_df[serum_df$adult == 1,] + + +cols_to_omit = c("flustat", "type", "obesity" + , "height", "height_unit", "weight", "weight_unit","visual_est_bmi", "bmi_rating") + +#serum_df_adults_clean = serum_df_adults[!cols_to_omit] + +wf_cols = colnames(serum_df_adults)[!colnames(serum_df_adults)%in%cols_to_omit] +serum_df_adults_clean = serum_df_adults[wf_cols] + +pivot_cols = meta_data_cols +pivot_cols = meta_data_cols[!meta_data_cols%in%cols_to_omit];pivot_cols + +if (length(pivot_cols) == length(meta_data_cols) - length(cols_to_omit)){ + cat("PASS: pivot cols successfully extracted") +}else{ + cat("FAIL: length mismatch! pivot cols could not be extracted" + , "\nExpected length:", length(meta_data_cols) - length(cols_to_omit) + , "\nGot:",length(pivot_cols) ) + quit() +} + +expected_rows_serum_lf = nrow(serum_df_adults_clean) * (length(serum_df_adults_clean) - length(pivot_cols)); expected_rows_serum_lf + +# using regex: +serum_adults_lf = serum_df_adults_clean %>% + tidyr::pivot_longer(-all_of(pivot_cols) + , names_to = c("mediator", "sample_type", "timepoint") + , names_pattern = "(.*)_(.*)([1-3]{1})" + , values_to = "value") + +if ( + (nrow(serum_adults_lf) == expected_rows_serum_lf) & (sum(table(is.na(serum_adults_lf$mediator))) == expected_rows_serum_lf) +) { + cat(paste0("PASS: long format data has correct no. of rows and NA in mediator:" + , "\nNo. of rows: ", nrow(serum_adults_lf) + , "\nNo. of cols: ", ncol(serum_adults_lf))) +} else{ + cat(paste0("FAIL:long format data has unexpected no. of rows or NAs in mediator" + , "\nExpected no. of rows: ", expected_rows_serum_lf + , "\nGot: ", nrow(serum_adults_lf) + , "\ncheck expected rows calculation!")) + quit() +} + +#============== +# lf data: npa +#============== +str(npa_df) +table(npa_df$obesity); table(npa_df$obese2) + +npa_df_adults = npa_df[npa_df$adult == 1,] + + +cols_to_omit = c("flustat", "type", "obesity" + , "height", "height_unit", "weight", "weight_unit","visual_est_bmi", "bmi_rating") + +#npa_df_adults_clean = npa_df_adults[!cols_to_omit] + +wf_cols = colnames(npa_df_adults)[!colnames(npa_df_adults)%in%cols_to_omit] +npa_df_adults_clean = npa_df_adults[wf_cols] + +pivot_cols = meta_data_cols +pivot_cols = meta_data_cols[!meta_data_cols%in%cols_to_omit];pivot_cols + +if (length(pivot_cols) == length(meta_data_cols) - length(cols_to_omit)){ + cat("PASS: pivot cols successfully extracted") +}else{ + cat("FAIL: length mismatch! pivot cols could not be extracted" + , "\nExpected length:", length(meta_data_cols) - length(cols_to_omit) + , "\nGot:",length(pivot_cols) ) + quit() +} + +expected_rows_npa_lf = nrow(npa_df_adults_clean) * (length(npa_df_adults_clean) - length(pivot_cols)); expected_rows_npa_lf + +# using regex: +npa_adults_lf = npa_df_adults_clean %>% + tidyr::pivot_longer(-all_of(pivot_cols) + , names_to = c("mediator", "sample_type", "timepoint") + , names_pattern = "(.*)_(.*)([1-3]{1})" + , values_to = "value") + +if ( + (nrow(npa_adults_lf) == expected_rows_npa_lf) & (sum(table(is.na(npa_adults_lf$mediator))) == expected_rows_npa_lf) +) { + cat(paste0("PASS: long format data has correct no. of rows and NA in mediator:" + , "\nNo. of rows: ", nrow(npa_adults_lf) + , "\nNo. of cols: ", ncol(npa_adults_lf))) +} else{ + cat(paste0("FAIL:long format data has unexpected no. of rows or NAs in mediator" + , "\nExpected no. of rows: ", expected_rows_npa_lf + , "\nGot: ", nrow(npa_adults_lf) , "\ncheck expected rows calculation!")) quit() } - - - - - - - - - - - - - +############################################################################### # remove unnecessary variables rm(sam_regex, sam_regex_log_days, sam_cols, sam_cols_b, sam_cols_clean, sam_cols_i, sam_cols_to_extract, sam_cols_to_omit) rm(serum_regex, serum_regex_log_days, serum_cols, serum_cols_clean, serum_cols_i, serum_cols_to_extract, serum_cols_to_omit) rm(npa_regex, npa_regex_log_days, npa_cols, npa_cols_clean, npa_cols_i, npa_cols_to_extract, npa_cols_to_omit) rm(all_df) rm(colnames_check) +rm(i, j, expected_cols, start, wf_cols, extra_cols, cols_to_omit) diff --git a/stats_unpaired_npa.R b/stats_unpaired_npa.R new file mode 100644 index 0000000..57e98a3 --- /dev/null +++ b/stats_unpaired_npa.R @@ -0,0 +1,255 @@ +#!/usr/bin/Rscript +getwd() +setwd('~/git/mosaic_2020/') +getwd() +############################################################ +# TASK: summary stats of mediators by time and outcome +############################################################ +# load libraries and packages + +source("../Header_TT.R") +library(tidyverse) +library(ggpubr) +library(rstatix) +library(Hmisc) +library(qwraps2) +#========================================================== +#datadir = "~/git/covid19/Data" +source("mosaic_bmi_data_extraction_v2.R") + + +#============= +# Input +#============= + +#infile_icu_wf = paste0(datadir,"/icu_covid_wf.csv") +#infile_icu_lf = paste0(datadir,"/icu_covid_lf.csv") + +# version 2 +#infile_icu_wf = paste0(datadir,"/icu_covid_wf_v3.csv") +#infile_icu_lf = paste0(datadir,"/icu_covid_lf_v3.csv") + +#npa_adults_lf + +#============= +# Output +#============= +outdir = paste0("~/git/mosaic_2020/version1") + +# unpaired analysis +stats_time_unpaired = paste0(outdir, "stats_unpaired_npa.csv") + +#%%======================================================== +# read file +#wf_data = read.csv(infile_icu_wf , stringsAsFactors = F) +#dim(wf_data) + +#lf_data = read.csv(infile_icu_lf , stringsAsFactors = F) +#dim(lf_data) + +#%%======================================================== +# data assignment for stats +#wf = wf_data +#lf = lf_data +wf = npa_df_adults_clean +lf = npa_adults_lf +table(lf$timepoint) +lf$timepoint = paste0("t", lf$timepoint) + +######################################################################## +# Unpaired stats at each timepoint b/w groups: wilcoxon UNpaired analysis with correction +####################################################################### +# with adjustment: fdr and BH are identical +my_adjust_method = "BH" + +#============== +# unpaired: t1 +#============== +lf_t1 = lf[lf$timepoint == "t1",] +sum(is.na(lf_t1$value)) + +foo = lf_t1[which(is.na(lf_t1$value)),] +ci = which(is.na(lf_t1$value)) + +#lf_t1_comp = lf_t1[-ci,] +lf_t1_comp = lf_t1[-which(is.na(lf_t1$value)),] +stats_un_t1 = compare_means(value~obese2 + , group.by = "mediator" + #, data = lf_t1 + , data = lf_t1_comp + , paired = FALSE + , p.adjust.method = my_adjust_method) + +foo$mosaic[!unique(foo$mosaic)%in%unique(lf_t1_comp$mosaic)] + +stats_un_t1$timepoint = "t1" +stats_un_t1$n_obs = length(unique(lf_t1_comp$mosaic)) # CHECK + +stats_un_t1 = as.data.frame(stats_un_t1) +class(stats_un_t1) + +# check: satisfied!!!! +wilcox.test() + + +#============== +# unpaired: t2 +#============== +lf_t2 = lf[lf$timepoint == "t2",] +lf_t2_comp = lf_t2[-which(is.na(lf_t2$value)),] + +stats_un_t2 = compare_means(value~obese2 + , group.by = "mediator" + #, data = lf_t2 + , data = lf_t2_comp + , paired = FALSE + , p.adjust.method = my_adjust_method) +stats_un_t2$timepoint = "t2" +stats_un_t2$n_obs = length(unique(lf_t2_comp$mosaic)) # CHECK + +stats_un_t2 = as.data.frame(stats_un_t2) +class(stats_un_t2) + +# check: satisfied!!!! +wilcox.test() + +#============== +# unpaired: t3 +#============== +lf_t3 = lf[lf$timepoint == "t3",] +lf_t3_comp = lf_t3[-which(is.na(lf_t3$value)),] + +stats_un_t3 = compare_means(value~obese2 + , group.by = "mediator" + #, data = lf_t3 + , data = lf_t3_comp + , paired = FALSE + , p.adjust.method = my_adjust_method) + +stats_un_t3$timepoint = "t3" +stats_un_t3$n_obs = length(unique(lf_t3_comp$mosaic)) # CHECK + +stats_un_t3 = as.data.frame(stats_un_t3) +class(stats_un_t3) + +# check: satisfied!!!! +wilcox.test() + +#============== +# Rbind these dfs +#============== +str(stats_un_t1);str(stats_un_t2); str(stats_un_t3) + +n_dfs = 3 + +if ( all.equal(nrow(stats_un_t1), nrow(stats_un_t2), nrow(stats_un_t3)) && + all.equal(ncol(stats_un_t1), ncol(stats_un_t2), ncol(stats_un_t3)) ) { + expected_rows = nrow(stats_un_t1) * n_dfs + expected_cols = ncol(stats_un_t1) + print("PASS: expected_rows and cols variables generated for downstream sanity checks") +}else{ + cat("FAIL: dfs have different no. of rows and cols" + , "\nCheck harcoded value of n_dfs" + , "\nexpected_rows and cols could not be generated") + quit() +} + +if ( all.equal(colnames(stats_un_t1), colnames(stats_un_t2), colnames(stats_un_t3)) ){ + print("PASS: colnames match. Rbind the 3 dfs...") + combined_unpaired_stats = rbind(stats_un_t1, stats_un_t2, stats_un_t3) +} else{ + cat("FAIL: cannot combined dfs. Colnames don't match!") + quit() +} + +if ( nrow(combined_unpaired_stats) == expected_rows && ncol(combined_unpaired_stats) == expected_cols ){ + cat("PASS: combined_df has expected dimension" + , "\nNo. of rows in combined_df:", nrow(combined_unpaired_stats) + , "\nNo. of cols in combined_df:", ncol(combined_unpaired_stats) ) +}else{ + cat("FAIL: combined_df dimension mismatch") + quit() +} + +#=============================================================== +# formatting df +# delete unnecessary column +combined_unpaired_stats = subset(combined_unpaired_stats, select = -c(.y.)) + +# reflect stats method correctly +combined_unpaired_stats$method +combined_unpaired_stats$method = gsub("Wilcoxon", "Wilcoxon_unpaired", combined_unpaired_stats$method) +combined_unpaired_stats$method + +# replace "." in colnames with "_" +colnames(combined_unpaired_stats) +#names(combined_unpaired_stats) = gsub("\.", "_", names(combined_unpaired_stats)) # weird!!!! + +colnames(combined_unpaired_stats) = c("mediator" + , "group1" + , "group2" + , "p" + , "p_adj" + , "p_format" + , "p_signif" + , "method" + , "timepoint" + , "n_obs") + +colnames(combined_unpaired_stats) +combined_unpaired_stats$sample_type = "npa" + +# add an extra column for padjust_signif +combined_unpaired_stats$padjust_signif = round(combined_unpaired_stats$p_adj, digits = 2) + +# add appropriate symbols for padjust_signif +#combined_unpaired_stats = combined_unpaired_stats %>% +# mutate(padjust_signif = case_when(padjust_signif == 0.05 ~ "." +# , padjust_signif <0.05 ~ '*' +# , padjust_signif <=0.01 ~ '**' +# , padjust_signif <=0.001 ~ '***' +# , padjust_signif <=0.0001 ~ '****' +# , TRUE ~ 'ns')) + +combined_unpaired_stats = dplyr::mutate(combined_unpaired_stats, padjust_signif = case_when(padjust_signif == 0.05 ~ "." + , padjust_signif <=0.0001 ~ '****' + , padjust_signif <=0.001 ~ '***' + , padjust_signif <=0.01 ~ '**' + , padjust_signif <0.05 ~ '*' + , TRUE ~ 'ns')) + +# reorder columns +print("preparing to reorder columns...") +colnames(combined_unpaired_stats) +my_col_order2 = c("mediator" + , "timepoint" + , "group1" + , "group2" + , "method" + , "p" + , "p_format" + , "p_signif" + , "p_adj" + , "padjust_signif") + +if( length(my_col_order2) == ncol(combined_unpaired_stats) && isin(my_col_order2, colnames(combined_unpaired_stats)) ){ + print("PASS: Reordering columns...") + combined_unpaired_stats_f = combined_unpaired_stats[, my_col_order2] + print("Successful: column reordering") + print("formatted df called:'combined_unpaired_stats_f'") + cat('\nformatted df has the following dimensions\n') + print(dim(combined_unpaired_stats_f )) +} else{ + cat(paste0("FAIL:Cannot reorder columns, length mismatch" + , "\nExpected column order for: ", ncol(combined_unpaired_stats) + , "\nGot:", length(my_col_order2))) + quit() +} + +combined_unpaired_stats_f_npa = combined_unpaired_stats_f +#****************** +# write output file +#****************** +cat("UNpaired stats for groups will be:", stats_time_unpaired) +write.csv(combined_unpaired_stats_f, stats_time_unpaired, row.names = FALSE) + diff --git a/stats_unpaired_sam.R b/stats_unpaired_sam.R new file mode 100644 index 0000000..b0dd627 --- /dev/null +++ b/stats_unpaired_sam.R @@ -0,0 +1,255 @@ +#!/usr/bin/Rscript +getwd() +setwd('~/git/mosaic_2020/') +getwd() +############################################################ +# TASK: summary stats of mediators by time and outcome +############################################################ +# load libraries and packages + +source("../Header_TT.R") +library(tidyverse) +library(ggpubr) +library(rstatix) +library(Hmisc) +library(qwraps2) +#========================================================== +#datadir = "~/git/covid19/Data" +source("mosaic_bmi_data_extraction_v2.R") + + +#============= +# Input +#============= + +#infile_icu_wf = paste0(datadir,"/icu_covid_wf.csv") +#infile_icu_lf = paste0(datadir,"/icu_covid_lf.csv") + +# version 2 +#infile_icu_wf = paste0(datadir,"/icu_covid_wf_v3.csv") +#infile_icu_lf = paste0(datadir,"/icu_covid_lf_v3.csv") + +#sam_adults_lf +#============= +# Output +#============= +outdir = paste0("~/git/mosaic_2020/version1") + +# unpaired analysis +stats_time_unpaired = paste0(outdir, "stats_unpaired_sam.csv") + +#%%======================================================== +# read file +#wf_data = read.csv(infile_icu_wf , stringsAsFactors = F) +#dim(wf_data) + +#lf_data = read.csv(infile_icu_lf , stringsAsFactors = F) +#dim(lf_data) + +#%%======================================================== +# data assignment for stats +#wf = wf_data +#lf = lf_data +wf = sam_df_adults_clean +lf = sam_adults_lf +table(lf$timepoint) +lf$timepoint = paste0("t", lf$timepoint) + +######################################################################## +# Unpaired stats at each timepoint b/w groups: wilcoxon UNpaired analysis with correction +####################################################################### +# with adjustment: fdr and BH are identical +my_adjust_method = "BH" + +#============== +# unpaired: t1 +#============== +lf_t1 = lf[lf$timepoint == "t1",] +sum(is.na(lf_t1$value)) + +foo = lf_t1[which(is.na(lf_t1$value)),] +ci = which(is.na(lf_t1$value)) + +#lf_t1_comp = lf_t1[-ci,] +lf_t1_comp = lf_t1[-which(is.na(lf_t1$value)),] +stats_un_t1 = compare_means(value~obese2 + , group.by = "mediator" + #, data = lf_t1 + , data = lf_t1_comp + , paired = FALSE + , p.adjust.method = my_adjust_method) + +foo$mosaic[!unique(foo$mosaic)%in%unique(lf_t1_comp$mosaic)] + +stats_un_t1$timepoint = "t1" +stats_un_t1$n_obs = length(unique(lf_t1_comp$mosaic)) # CHECK + +stats_un_t1 = as.data.frame(stats_un_t1) +class(stats_un_t1) + +# check: satisfied!!!! +wilcox.test() + + +#============== +# unpaired: t2 +#============== +lf_t2 = lf[lf$timepoint == "t2",] +lf_t2_comp = lf_t2[-which(is.na(lf_t2$value)),] + +stats_un_t2 = compare_means(value~obese2 + , group.by = "mediator" + #, data = lf_t2 + , data = lf_t2_comp + , paired = FALSE + , p.adjust.method = my_adjust_method) +stats_un_t2$timepoint = "t2" +stats_un_t2$n_obs = length(unique(lf_t2_comp$mosaic)) # CHECK + +stats_un_t2 = as.data.frame(stats_un_t2) +class(stats_un_t2) + +# check: satisfied!!!! +wilcox.test() + +#============== +# unpaired: t3 +#============== +lf_t3 = lf[lf$timepoint == "t3",] +lf_t3_comp = lf_t3[-which(is.na(lf_t3$value)),] + +stats_un_t3 = compare_means(value~obese2 + , group.by = "mediator" + #, data = lf_t3 + , data = lf_t3_comp + , paired = FALSE + , p.adjust.method = my_adjust_method) + +stats_un_t3$timepoint = "t3" +stats_un_t3$n_obs = length(unique(lf_t3_comp$mosaic)) # CHECK + +stats_un_t3 = as.data.frame(stats_un_t3) +class(stats_un_t3) + +# check: satisfied!!!! +wilcox.test() + +#============== +# Rbind these dfs +#============== +str(stats_un_t1);str(stats_un_t2); str(stats_un_t3) + +n_dfs = 3 + +if ( all.equal(nrow(stats_un_t1), nrow(stats_un_t2), nrow(stats_un_t3)) && + all.equal(ncol(stats_un_t1), ncol(stats_un_t2), ncol(stats_un_t3)) ) { + expected_rows = nrow(stats_un_t1) * n_dfs + expected_cols = ncol(stats_un_t1) + print("PASS: expected_rows and cols variables generated for downstream sanity checks") +}else{ + cat("FAIL: dfs have different no. of rows and cols" + , "\nCheck harcoded value of n_dfs" + , "\nexpected_rows and cols could not be generated") + quit() +} + +if ( all.equal(colnames(stats_un_t1), colnames(stats_un_t2), colnames(stats_un_t3)) ){ + print("PASS: colnames match. Rbind the 3 dfs...") + combined_unpaired_stats = rbind(stats_un_t1, stats_un_t2, stats_un_t3) +} else{ + cat("FAIL: cannot combined dfs. Colnames don't match!") + quit() +} + +if ( nrow(combined_unpaired_stats) == expected_rows && ncol(combined_unpaired_stats) == expected_cols ){ + cat("PASS: combined_df has expected dimension" + , "\nNo. of rows in combined_df:", nrow(combined_unpaired_stats) + , "\nNo. of cols in combined_df:", ncol(combined_unpaired_stats) ) +}else{ + cat("FAIL: combined_df dimension mismatch") + quit() +} + +#=============================================================== +# formatting df +# delete unnecessary column +combined_unpaired_stats = subset(combined_unpaired_stats, select = -c(.y.)) + +# reflect stats method correctly +combined_unpaired_stats$method +combined_unpaired_stats$method = gsub("Wilcoxon", "Wilcoxon_unpaired", combined_unpaired_stats$method) +combined_unpaired_stats$method + +# replace "." in colnames with "_" +colnames(combined_unpaired_stats) +#names(combined_unpaired_stats) = gsub("\.", "_", names(combined_unpaired_stats)) # weird!!!! + +colnames(combined_unpaired_stats) = c("mediator" + , "group1" + , "group2" + , "p" + , "p_adj" + , "p_format" + , "p_signif" + , "method" + , "timepoint" + , "n_obs") + +colnames(combined_unpaired_stats) +combined_unpaired_stats$sample_type = "sam" + +# add an extra column for padjust_signif +combined_unpaired_stats$padjust_signif = round(combined_unpaired_stats$p_adj, digits = 2) + +# add appropriate symbols for padjust_signif +#combined_unpaired_stats = combined_unpaired_stats %>% +# mutate(padjust_signif = case_when(padjust_signif == 0.05 ~ "." +# , padjust_signif <0.05 ~ '*' +# , padjust_signif <=0.01 ~ '**' +# , padjust_signif <=0.001 ~ '***' +# , padjust_signif <=0.0001 ~ '****' +# , TRUE ~ 'ns')) + +combined_unpaired_stats = dplyr::mutate(combined_unpaired_stats, padjust_signif = case_when(padjust_signif == 0.05 ~ "." + , padjust_signif <=0.0001 ~ '****' + , padjust_signif <=0.001 ~ '***' + , padjust_signif <=0.01 ~ '**' + , padjust_signif <0.05 ~ '*' + , TRUE ~ 'ns')) + +# reorder columns +print("preparing to reorder columns...") +colnames(combined_unpaired_stats) +my_col_order2 = c("mediator" + , "timepoint" + , "group1" + , "group2" + , "method" + , "p" + , "p_format" + , "p_signif" + , "p_adj" + , "padjust_signif") + +if( length(my_col_order2) == ncol(combined_unpaired_stats) && isin(my_col_order2, colnames(combined_unpaired_stats)) ){ + print("PASS: Reordering columns...") + combined_unpaired_stats_f = combined_unpaired_stats[, my_col_order2] + print("Successful: column reordering") + print("formatted df called:'combined_unpaired_stats_f'") + cat('\nformatted df has the following dimensions\n') + print(dim(combined_unpaired_stats_f )) +} else{ + cat(paste0("FAIL:Cannot reorder columns, length mismatch" + , "\nExpected column order for: ", ncol(combined_unpaired_stats) + , "\nGot:", length(my_col_order2))) + quit() +} + +combined_unpaired_stats_f_sam = combined_unpaired_stats_f + +#****************** +# write output file +#****************** +cat("UNpaired stats for groups will be:", stats_time_unpaired) +write.csv(combined_unpaired_stats_f, stats_time_unpaired, row.names = FALSE) + diff --git a/stats_unpaired_serum.R b/stats_unpaired_serum.R new file mode 100644 index 0000000..11cc2cd --- /dev/null +++ b/stats_unpaired_serum.R @@ -0,0 +1,255 @@ +#!/usr/bin/Rscript +getwd() +setwd('~/git/mosaic_2020/') +getwd() +############################################################ +# TASK: summary stats of mediators by time and outcome +############################################################ +# load libraries and packages + +#source("../Header_TT.R") +library(tidyverse) +library(ggpubr) +library(rstatix) +library(Hmisc) +library(qwraps2) +#========================================================== +#datadir = "~/git/covid19/Data" +source("mosaic_bmi_data_extraction_v2.R") + + +#============= +# Input +#============= + +#infile_icu_wf = paste0(datadir,"/icu_covid_wf.csv") +#infile_icu_lf = paste0(datadir,"/icu_covid_lf.csv") + +# version 2 +#infile_icu_wf = paste0(datadir,"/icu_covid_wf_v3.csv") +#infile_icu_lf = paste0(datadir,"/icu_covid_lf_v3.csv") + +#serum_adults_lf + +#============= +# Output +#============= +outdir = paste0("~/git/mosaic_2020/version1") + +# unpaired analysis +stats_time_unpaired = paste0(outdir, "stats_unpaired_serum.csv") + +#%%======================================================== +# read file +#wf_data = read.csv(infile_icu_wf , stringsAsFactors = F) +#dim(wf_data) + +#lf_data = read.csv(infile_icu_lf , stringsAsFactors = F) +#dim(lf_data) + +#%%======================================================== +# data assignment for stats +#wf = wf_data +#lf = lf_data +wf = serum_df_adults_clean +lf = serum_adults_lf +table(lf$timepoint) +lf$timepoint = paste0("t", lf$timepoint) + +######################################################################## +# Unpaired stats at each timepoint b/w groups: wilcoxon UNpaired analysis with correction +####################################################################### +# with adjustment: fdr and BH are identical +my_adjust_method = "BH" + +#============== +# unpaired: t1 +#============== +lf_t1 = lf[lf$timepoint == "t1",] +sum(is.na(lf_t1$value)) + +foo = lf_t1[which(is.na(lf_t1$value)),] +ci = which(is.na(lf_t1$value)) + +#lf_t1_comp = lf_t1[-ci,] +lf_t1_comp = lf_t1[-which(is.na(lf_t1$value)),] +stats_un_t1 = compare_means(value~obese2 + , group.by = "mediator" + #, data = lf_t1 + , data = lf_t1_comp + , paired = FALSE + , p.adjust.method = my_adjust_method) + +foo$mosaic[!unique(foo$mosaic)%in%unique(lf_t1_comp$mosaic)] + +stats_un_t1$timepoint = "t1" +stats_un_t1$n_obs = length(unique(lf_t1_comp$mosaic)) # CHECK + +stats_un_t1 = as.data.frame(stats_un_t1) +class(stats_un_t1) + +# check: satisfied!!!! +#wilcox.test() + +#============== +# unpaired: t2 +#============== +lf_t2 = lf[lf$timepoint == "t2",] +lf_t2_comp = lf_t2[-which(is.na(lf_t2$value)),] + +stats_un_t2 = compare_means(value~obese2 + , group.by = "mediator" + #, data = lf_t2 + , data = lf_t2_comp + , paired = FALSE + , p.adjust.method = my_adjust_method) +stats_un_t2$timepoint = "t2" +stats_un_t2$n_obs = length(unique(lf_t2_comp$mosaic)) # CHECK + +stats_un_t2 = as.data.frame(stats_un_t2) +class(stats_un_t2) + +# check: satisfied!!!! +wilcox.test() + +#============== +# unpaired: t3 +#============== +lf_t3 = lf[lf$timepoint == "t3",] +lf_t3_comp = lf_t3[-which(is.na(lf_t3$value)),] + +stats_un_t3 = compare_means(value~obese2 + , group.by = "mediator" + #, data = lf_t3 + , data = lf_t3_comp + , paired = FALSE + , p.adjust.method = my_adjust_method) + +stats_un_t3$timepoint = "t3" +stats_un_t3$n_obs = length(unique(lf_t3_comp$mosaic)) # CHECK + +stats_un_t3 = as.data.frame(stats_un_t3) +class(stats_un_t3) + +# check: satisfied!!!! +wilcox.test() + +#============== +# Rbind these dfs +#============== +str(stats_un_t1);str(stats_un_t2); str(stats_un_t3) + +n_dfs = 3 + +if ( all.equal(nrow(stats_un_t1), nrow(stats_un_t2), nrow(stats_un_t3)) && + all.equal(ncol(stats_un_t1), ncol(stats_un_t2), ncol(stats_un_t3)) ) { + expected_rows = nrow(stats_un_t1) * n_dfs + expected_cols = ncol(stats_un_t1) + print("PASS: expected_rows and cols variables generated for downstream sanity checks") +}else{ + cat("FAIL: dfs have different no. of rows and cols" + , "\nCheck harcoded value of n_dfs" + , "\nexpected_rows and cols could not be generated") + quit() +} + +if ( all.equal(colnames(stats_un_t1), colnames(stats_un_t2), colnames(stats_un_t3)) ){ + print("PASS: colnames match. Rbind the 3 dfs...") + combined_unpaired_stats = rbind(stats_un_t1, stats_un_t2, stats_un_t3) +} else{ + cat("FAIL: cannot combined dfs. Colnames don't match!") + quit() +} + +if ( nrow(combined_unpaired_stats) == expected_rows && ncol(combined_unpaired_stats) == expected_cols ){ + cat("PASS: combined_df has expected dimension" + , "\nNo. of rows in combined_df:", nrow(combined_unpaired_stats) + , "\nNo. of cols in combined_df:", ncol(combined_unpaired_stats) ) +}else{ + cat("FAIL: combined_df dimension mismatch") + quit() +} + +#=============================================================== +# formatting df +# delete unnecessary column +combined_unpaired_stats = subset(combined_unpaired_stats, select = -c(.y.)) + +# reflect stats method correctly +combined_unpaired_stats$method +combined_unpaired_stats$method = gsub("Wilcoxon", "Wilcoxon_unpaired", combined_unpaired_stats$method) +combined_unpaired_stats$method + +# replace "." in colnames with "_" +colnames(combined_unpaired_stats) +#names(combined_unpaired_stats) = gsub("\.", "_", names(combined_unpaired_stats)) # weird!!!! + +colnames(combined_unpaired_stats) = c("mediator" + , "group1" + , "group2" + , "p" + , "p_adj" + , "p_format" + , "p_signif" + , "method" + , "timepoint" + , "n_obs") + +colnames(combined_unpaired_stats) +combined_unpaired_stats$sample_type = "serum" + +# add an extra column for padjust_signif +combined_unpaired_stats$padjust_signif = round(combined_unpaired_stats$p_adj, digits = 2) + +# add appropriate symbols for padjust_signif +#combined_unpaired_stats = combined_unpaired_stats %>% +# mutate(padjust_signif = case_when(padjust_signif == 0.05 ~ "." +# , padjust_signif <0.05 ~ '*' +# , padjust_signif <=0.01 ~ '**' +# , padjust_signif <=0.001 ~ '***' +# , padjust_signif <=0.0001 ~ '****' +# , TRUE ~ 'ns')) + +combined_unpaired_stats = dplyr::mutate(combined_unpaired_stats, padjust_signif = case_when(padjust_signif == 0.05 ~ "." + , padjust_signif <=0.0001 ~ '****' + , padjust_signif <=0.001 ~ '***' + , padjust_signif <=0.01 ~ '**' + , padjust_signif <0.05 ~ '*' + , TRUE ~ 'ns')) + +# reorder columns +print("preparing to reorder columns...") +colnames(combined_unpaired_stats) +my_col_order2 = c("mediator" + , "timepoint" + , "group1" + , "group2" + , "method" + , "p" + , "p_format" + , "p_signif" + , "p_adj" + , "padjust_signif") + +if( length(my_col_order2) == ncol(combined_unpaired_stats) && isin(my_col_order2, colnames(combined_unpaired_stats)) ){ + print("PASS: Reordering columns...") + combined_unpaired_stats_f = combined_unpaired_stats[, my_col_order2] + print("Successful: column reordering") + print("formatted df called:'combined_unpaired_stats_f'") + cat('\nformatted df has the following dimensions\n') + print(dim(combined_unpaired_stats_f )) +} else{ + cat(paste0("FAIL:Cannot reorder columns, length mismatch" + , "\nExpected column order for: ", ncol(combined_unpaired_stats) + , "\nGot:", length(my_col_order2))) + quit() +} + +combined_unpaired_stats_f_serum = combined_unpaired_stats_f + +#****************** +# write output file +#****************** +cat("UNpaired stats for groups will be:", stats_time_unpaired) +write.csv(combined_unpaired_stats_f, stats_time_unpaired, row.names = FALSE) +