diff --git a/stats_unpaired_npa.R b/stats_unpaired_npa.R index 57e98a3..f84f94a 100644 --- a/stats_unpaired_npa.R +++ b/stats_unpaired_npa.R @@ -1,58 +1,32 @@ #!/usr/bin/Rscript getwd() -setwd('~/git/mosaic_2020/') +setwd("~/git/mosaic_2020/") getwd() ############################################################ -# TASK: summary stats of mediators by time and outcome +# TASK: unpaired (time) analysis of mediators: NPA ############################################################ -# load libraries and packages - -source("../Header_TT.R") -library(tidyverse) -library(ggpubr) -library(rstatix) -library(Hmisc) -library(qwraps2) -#========================================================== -#datadir = "~/git/covid19/Data" -source("mosaic_bmi_data_extraction_v2.R") - - #============= # Input #============= +source("data_extraction_formatting.R") -#infile_icu_wf = paste0(datadir,"/icu_covid_wf.csv") -#infile_icu_lf = paste0(datadir,"/icu_covid_lf.csv") +# clear variables +rm(sam_adults_lf, sam_df_adults_clean +, serum_adults_lf, serum_df_adults_clean) +rm(colnames_sam_df, expected_rows_sam_lf +, colnames_serum_df, expected_rows_serum_lf) -# version 2 -#infile_icu_wf = paste0(datadir,"/icu_covid_wf_v3.csv") -#infile_icu_lf = paste0(datadir,"/icu_covid_lf_v3.csv") - -#npa_adults_lf +rm(pivot_cols) #============= -# Output +# Output: unpaired analysis of time for npa #============= -outdir = paste0("~/git/mosaic_2020/version1") - -# unpaired analysis -stats_time_unpaired = paste0(outdir, "stats_unpaired_npa.csv") - -#%%======================================================== -# read file -#wf_data = read.csv(infile_icu_wf , stringsAsFactors = F) -#dim(wf_data) - -#lf_data = read.csv(infile_icu_lf , stringsAsFactors = F) -#dim(lf_data) - +stats_time_unpaired_npa = paste0(outdir_stats, "stats_time_unpaired_npa.csv") #%%======================================================== # data assignment for stats -#wf = wf_data -#lf = lf_data wf = npa_df_adults_clean lf = npa_adults_lf +#%%======================================================== table(lf$timepoint) lf$timepoint = paste0("t", lf$timepoint) @@ -73,7 +47,7 @@ ci = which(is.na(lf_t1$value)) #lf_t1_comp = lf_t1[-ci,] lf_t1_comp = lf_t1[-which(is.na(lf_t1$value)),] -stats_un_t1 = compare_means(value~obese2 +stats_un_t1 = compare_means(value~obesity , group.by = "mediator" #, data = lf_t1 , data = lf_t1_comp @@ -83,14 +57,39 @@ stats_un_t1 = compare_means(value~obese2 foo$mosaic[!unique(foo$mosaic)%in%unique(lf_t1_comp$mosaic)] stats_un_t1$timepoint = "t1" -stats_un_t1$n_obs = length(unique(lf_t1_comp$mosaic)) # CHECK stats_un_t1 = as.data.frame(stats_un_t1) class(stats_un_t1) +# calculate n_obs for each mediator +n_t1 = data.frame(table(lf_t1_comp$mediator)) +colnames(n_t1) = c("mediator", "n_obs") +n_t1$mediator = as.character(n_t1$mediator) + +# merge stats + n_obs df +merging_cols = intersect(names(stats_un_t1), names(n_t1)); merging_cols +if (all(n_t1$mediator%in%stats_un_t1$mediator)) { + cat("PASS: merging stats and n_obs on column/s:", merging_cols) + stats_un_t1 = merge(stats_un_t1, n_t1, by = merging_cols, all = T) + cat("\nsuccessfull merge:" + , "\nnrow:", nrow(stats_un_t1) + , "\nncol:", ncol(stats_un_t1)) +}else{ + nf = n_t1$mediator[!n_t1$mediator%in%stats_un_t1$mediator] + stats_un_t1 = merge(stats_un_t1, n_t1, by = merging_cols, all.y = T) + cat("\nMerged with caution:" + , "\nnrows mismatch:", nf + , "not found in stats possibly due to all obs being LLODs" + , "\nintroduced NAs for:", nf + , "\nnrow:", nrow(stats_un_t1) + , "\nncol:", ncol(stats_un_t1)) +} + # check: satisfied!!!! wilcox.test() +rm(n_t1) +rm(lf_t1_comp) #============== # unpaired: t2 @@ -98,28 +97,54 @@ wilcox.test() lf_t2 = lf[lf$timepoint == "t2",] lf_t2_comp = lf_t2[-which(is.na(lf_t2$value)),] -stats_un_t2 = compare_means(value~obese2 +stats_un_t2 = compare_means(value~obesity , group.by = "mediator" #, data = lf_t2 , data = lf_t2_comp , paired = FALSE , p.adjust.method = my_adjust_method) stats_un_t2$timepoint = "t2" -stats_un_t2$n_obs = length(unique(lf_t2_comp$mosaic)) # CHECK stats_un_t2 = as.data.frame(stats_un_t2) class(stats_un_t2) +# calculate n_obs for each mediator +n_t2 = data.frame(table(lf_t2_comp$mediator)) +colnames(n_t2) = c("mediator", "n_obs") +n_t2$mediator = as.character(n_t2$mediator) + +# merge stats + n_obs df +merging_cols = intersect(names(stats_un_t2), names(n_t2)); merging_cols +if (all(n_t2$mediator%in%stats_un_t2$mediator)) { + cat("PASS: merging stats and n_obs on column/s:", merging_cols) + stats_un_t2 = merge(stats_un_t2, n_t2, by = merging_cols, all = T) + cat("\nsuccessfull merge:" + , "\nnrow:", nrow(stats_un_t2) + , "\nncol:", ncol(stats_un_t2)) +}else{ + nf = n_t2$mediator[!n_t2$mediator%in%stats_un_t2$mediator] + stats_un_t2 = merge(stats_un_t2, n_t2, by = merging_cols, all.y = T) + cat("\nMerged with caution:" + , "\nnrows mismatch:", nf + , "not found in stats possibly due to all obs being LLODs" + , "\nintroduced NAs for:", nf + , "\nnrow:", nrow(stats_un_t2) + , "\nncol:", ncol(stats_un_t2)) +} + # check: satisfied!!!! wilcox.test() +rm(n_t2) +rm(lf_t2_comp) + #============== # unpaired: t3 #============== lf_t3 = lf[lf$timepoint == "t3",] lf_t3_comp = lf_t3[-which(is.na(lf_t3$value)),] -stats_un_t3 = compare_means(value~obese2 +stats_un_t3 = compare_means(value~obesity , group.by = "mediator" #, data = lf_t3 , data = lf_t3_comp @@ -127,14 +152,40 @@ stats_un_t3 = compare_means(value~obese2 , p.adjust.method = my_adjust_method) stats_un_t3$timepoint = "t3" -stats_un_t3$n_obs = length(unique(lf_t3_comp$mosaic)) # CHECK stats_un_t3 = as.data.frame(stats_un_t3) class(stats_un_t3) +# calculate n_obs for each mediator +n_t3 = data.frame(table(lf_t3_comp$mediator)) +colnames(n_t3) = c("mediator", "n_obs") +n_t3$mediator = as.character(n_t3$mediator) + +# merge stats + n_obs df +merging_cols = intersect(names(stats_un_t3), names(n_t3)); merging_cols +if (all(n_t3$mediator%in%stats_un_t3$mediator)) { + cat("PASS: merging stats and n_obs on column/s:", merging_cols) + stats_un_t3 = merge(stats_un_t3, n_t3, by = merging_cols, all = T) + cat("\nsuccessfull merge:" + , "\nnrow:", nrow(stats_un_t3) + , "\nncol:", ncol(stats_un_t3)) +}else{ + nf = n_t3$mediator[!n_t3$mediator%in%stats_un_t3$mediator] + stats_un_t3 = merge(stats_un_t3, n_t3, by = merging_cols, all.y = T) + cat("\nMerged with caution:" + , "\nnrows mismatch:", nf + , "not found in stats possibly due to all obs being LLODs" + , "\nintroduced NAs for:", nf + , "\nnrow:", nrow(stats_un_t3) + , "\nncol:", ncol(stats_un_t3)) +} + # check: satisfied!!!! wilcox.test() +rm(n_t3) +rm(lf_t3_comp) + #============== # Rbind these dfs #============== @@ -178,7 +229,8 @@ combined_unpaired_stats = subset(combined_unpaired_stats, select = -c(.y.)) # reflect stats method correctly combined_unpaired_stats$method -combined_unpaired_stats$method = gsub("Wilcoxon", "Wilcoxon_unpaired", combined_unpaired_stats$method) +#combined_unpaired_stats$method = gsub("Wilcoxon", "Wilcoxon_unpaired", combined_unpaired_stats$method) +combined_unpaired_stats$method = "wilcoxon unpaired" combined_unpaired_stats$method # replace "." in colnames with "_" @@ -203,14 +255,6 @@ combined_unpaired_stats$sample_type = "npa" combined_unpaired_stats$padjust_signif = round(combined_unpaired_stats$p_adj, digits = 2) # add appropriate symbols for padjust_signif -#combined_unpaired_stats = combined_unpaired_stats %>% -# mutate(padjust_signif = case_when(padjust_signif == 0.05 ~ "." -# , padjust_signif <0.05 ~ '*' -# , padjust_signif <=0.01 ~ '**' -# , padjust_signif <=0.001 ~ '***' -# , padjust_signif <=0.0001 ~ '****' -# , TRUE ~ 'ns')) - combined_unpaired_stats = dplyr::mutate(combined_unpaired_stats, padjust_signif = case_when(padjust_signif == 0.05 ~ "." , padjust_signif <=0.0001 ~ '****' , padjust_signif <=0.001 ~ '***' @@ -223,6 +267,8 @@ print("preparing to reorder columns...") colnames(combined_unpaired_stats) my_col_order2 = c("mediator" , "timepoint" + , "sample_type" + , "n_obs" , "group1" , "group2" , "method" @@ -232,7 +278,7 @@ my_col_order2 = c("mediator" , "p_adj" , "padjust_signif") -if( length(my_col_order2) == ncol(combined_unpaired_stats) && isin(my_col_order2, colnames(combined_unpaired_stats)) ){ +if( length(my_col_order2) == ncol(combined_unpaired_stats) && (all(my_col_order2%in%colnames(combined_unpaired_stats))) ){ print("PASS: Reordering columns...") combined_unpaired_stats_f = combined_unpaired_stats[, my_col_order2] print("Successful: column reordering") @@ -244,12 +290,10 @@ if( length(my_col_order2) == ncol(combined_unpaired_stats) && isin(my_col_order2 , "\nExpected column order for: ", ncol(combined_unpaired_stats) , "\nGot:", length(my_col_order2))) quit() -} - -combined_unpaired_stats_f_npa = combined_unpaired_stats_f +} + #****************** # write output file #****************** -cat("UNpaired stats for groups will be:", stats_time_unpaired) -write.csv(combined_unpaired_stats_f, stats_time_unpaired, row.names = FALSE) - +cat("UNpaired stats for groups will be:", stats_time_unpaired_npa) +write.csv(combined_unpaired_stats_f, stats_time_unpaired_npa, row.names = FALSE) \ No newline at end of file diff --git a/stats_unpaired_sam.R b/stats_unpaired_sam.R index b0dd627..0894ef1 100644 --- a/stats_unpaired_sam.R +++ b/stats_unpaired_sam.R @@ -1,57 +1,32 @@ #!/usr/bin/Rscript getwd() -setwd('~/git/mosaic_2020/') +setwd("~/git/mosaic_2020/") getwd() ############################################################ -# TASK: summary stats of mediators by time and outcome +# TASK: unpaired (time) analysis of mediators: SAM ############################################################ -# load libraries and packages - -source("../Header_TT.R") -library(tidyverse) -library(ggpubr) -library(rstatix) -library(Hmisc) -library(qwraps2) -#========================================================== -#datadir = "~/git/covid19/Data" -source("mosaic_bmi_data_extraction_v2.R") - - #============= # Input #============= +source("data_extraction_formatting.R") -#infile_icu_wf = paste0(datadir,"/icu_covid_wf.csv") -#infile_icu_lf = paste0(datadir,"/icu_covid_lf.csv") +# clear variables +rm(npa_adults_lf, npa_df_adults_clean +, serum_adults_lf, serum_df_adults_clean) +rm(colnames_npa_df, expected_rows_npa_lf +, colnames_serum_df, expected_rows_serum_lf) -# version 2 -#infile_icu_wf = paste0(datadir,"/icu_covid_wf_v3.csv") -#infile_icu_lf = paste0(datadir,"/icu_covid_lf_v3.csv") +rm(pivot_cols) -#sam_adults_lf #============= -# Output +# Output: unpaired analysis of time for sam #============= -outdir = paste0("~/git/mosaic_2020/version1") - -# unpaired analysis -stats_time_unpaired = paste0(outdir, "stats_unpaired_sam.csv") - -#%%======================================================== -# read file -#wf_data = read.csv(infile_icu_wf , stringsAsFactors = F) -#dim(wf_data) - -#lf_data = read.csv(infile_icu_lf , stringsAsFactors = F) -#dim(lf_data) - +stats_time_unpaired_sam = paste0(outdir_stats, "stats_time_unpaired_sam.csv") #%%======================================================== # data assignment for stats -#wf = wf_data -#lf = lf_data wf = sam_df_adults_clean lf = sam_adults_lf +#%%======================================================== table(lf$timepoint) lf$timepoint = paste0("t", lf$timepoint) @@ -72,7 +47,7 @@ ci = which(is.na(lf_t1$value)) #lf_t1_comp = lf_t1[-ci,] lf_t1_comp = lf_t1[-which(is.na(lf_t1$value)),] -stats_un_t1 = compare_means(value~obese2 +stats_un_t1 = compare_means(value~obesity , group.by = "mediator" #, data = lf_t1 , data = lf_t1_comp @@ -82,14 +57,39 @@ stats_un_t1 = compare_means(value~obese2 foo$mosaic[!unique(foo$mosaic)%in%unique(lf_t1_comp$mosaic)] stats_un_t1$timepoint = "t1" -stats_un_t1$n_obs = length(unique(lf_t1_comp$mosaic)) # CHECK stats_un_t1 = as.data.frame(stats_un_t1) class(stats_un_t1) +# calculate n_obs for each mediator +n_t1 = data.frame(table(lf_t1_comp$mediator)) +colnames(n_t1) = c("mediator", "n_obs") +n_t1$mediator = as.character(n_t1$mediator) + +# merge stats + n_obs df +merging_cols = intersect(names(stats_un_t1), names(n_t1)); merging_cols +if (all(n_t1$mediator%in%stats_un_t1$mediator)) { + cat("PASS: merging stats and n_obs on column/s:", merging_cols) + stats_un_t1 = merge(stats_un_t1, n_t1, by = merging_cols, all = T) + cat("\nsuccessfull merge:" + , "\nnrow:", nrow(stats_un_t1) + , "\nncol:", ncol(stats_un_t1)) +}else{ + nf = n_t1$mediator[!n_t1$mediator%in%stats_un_t1$mediator] + stats_un_t1 = merge(stats_un_t1, n_t1, by = merging_cols, all.y = T) + cat("\nMerged with caution:" + , "\nnrows mismatch:", nf + , "not found in stats possibly due to all obs being LLODs" + , "\nintroduced NAs for:", nf + , "\nnrow:", nrow(stats_un_t1) + , "\nncol:", ncol(stats_un_t1)) +} + # check: satisfied!!!! wilcox.test() +rm(n_t1) +rm(lf_t1_comp) #============== # unpaired: t2 @@ -97,28 +97,54 @@ wilcox.test() lf_t2 = lf[lf$timepoint == "t2",] lf_t2_comp = lf_t2[-which(is.na(lf_t2$value)),] -stats_un_t2 = compare_means(value~obese2 +stats_un_t2 = compare_means(value~obesity , group.by = "mediator" #, data = lf_t2 , data = lf_t2_comp , paired = FALSE , p.adjust.method = my_adjust_method) stats_un_t2$timepoint = "t2" -stats_un_t2$n_obs = length(unique(lf_t2_comp$mosaic)) # CHECK stats_un_t2 = as.data.frame(stats_un_t2) class(stats_un_t2) +# calculate n_obs for each mediator +n_t2 = data.frame(table(lf_t2_comp$mediator)) +colnames(n_t2) = c("mediator", "n_obs") +n_t2$mediator = as.character(n_t2$mediator) + +# merge stats + n_obs df +merging_cols = intersect(names(stats_un_t2), names(n_t2)); merging_cols +if (all(n_t2$mediator%in%stats_un_t2$mediator)) { + cat("PASS: merging stats and n_obs on column/s:", merging_cols) + stats_un_t2 = merge(stats_un_t2, n_t2, by = merging_cols, all = T) + cat("\nsuccessfull merge:" + , "\nnrow:", nrow(stats_un_t2) + , "\nncol:", ncol(stats_un_t2)) +}else{ + nf = n_t2$mediator[!n_t2$mediator%in%stats_un_t2$mediator] + stats_un_t2 = merge(stats_un_t2, n_t2, by = merging_cols, all.y = T) + cat("\nMerged with caution:" + , "\nnrows mismatch:", nf + , "not found in stats possibly due to all obs being LLODs" + , "\nintroduced NAs for:", nf + , "\nnrow:", nrow(stats_un_t2) + , "\nncol:", ncol(stats_un_t2)) +} + # check: satisfied!!!! wilcox.test() +rm(n_t2) +rm(lf_t2_comp) + #============== # unpaired: t3 #============== lf_t3 = lf[lf$timepoint == "t3",] lf_t3_comp = lf_t3[-which(is.na(lf_t3$value)),] -stats_un_t3 = compare_means(value~obese2 +stats_un_t3 = compare_means(value~obesity , group.by = "mediator" #, data = lf_t3 , data = lf_t3_comp @@ -126,13 +152,40 @@ stats_un_t3 = compare_means(value~obese2 , p.adjust.method = my_adjust_method) stats_un_t3$timepoint = "t3" -stats_un_t3$n_obs = length(unique(lf_t3_comp$mosaic)) # CHECK stats_un_t3 = as.data.frame(stats_un_t3) class(stats_un_t3) +# calculate n_obs for each mediator +n_t3 = data.frame(table(lf_t3_comp$mediator)) +colnames(n_t3) = c("mediator", "n_obs") +n_t3$mediator = as.character(n_t3$mediator) + +# merge stats + n_obs df +merging_cols = intersect(names(stats_un_t3), names(n_t3)); merging_cols +if (all(n_t3$mediator%in%stats_un_t3$mediator)) { + cat("PASS: merging stats and n_obs on column/s:", merging_cols) + stats_un_t3 = merge(stats_un_t3, n_t3, by = merging_cols, all = T) + cat("\nsuccessfull merge:" + , "\nnrow:", nrow(stats_un_t3) + , "\nncol:", ncol(stats_un_t3)) +}else{ + nf = n_t3$mediator[!n_t3$mediator%in%stats_un_t3$mediator] + stats_un_t3 = merge(stats_un_t3, n_t3, by = merging_cols, all.y = T) + cat("\nMerged with caution:" + , "\nnrows mismatch:", nf + , "not found in stats possibly due to all obs being LLODs" + , "\nintroduced NAs for:", nf + , "\nnrow:", nrow(stats_un_t3) + , "\nncol:", ncol(stats_un_t3)) +} + # check: satisfied!!!! -wilcox.test() +# FIXME: supply the col name automatically? +wilcox.test(wf$ifna2a_sam3[wf$obesity == 1], wf$ifna2a_sam3[wf$obesity == 0]) + +rm(n_t3) +rm(lf_t3_comp) #============== # Rbind these dfs @@ -177,7 +230,8 @@ combined_unpaired_stats = subset(combined_unpaired_stats, select = -c(.y.)) # reflect stats method correctly combined_unpaired_stats$method -combined_unpaired_stats$method = gsub("Wilcoxon", "Wilcoxon_unpaired", combined_unpaired_stats$method) +#combined_unpaired_stats$method = gsub("Wilcoxon", "Wilcoxon_unpaired", combined_unpaired_stats$method) +combined_unpaired_stats$method = "wilcoxon unpaired" combined_unpaired_stats$method # replace "." in colnames with "_" @@ -202,14 +256,6 @@ combined_unpaired_stats$sample_type = "sam" combined_unpaired_stats$padjust_signif = round(combined_unpaired_stats$p_adj, digits = 2) # add appropriate symbols for padjust_signif -#combined_unpaired_stats = combined_unpaired_stats %>% -# mutate(padjust_signif = case_when(padjust_signif == 0.05 ~ "." -# , padjust_signif <0.05 ~ '*' -# , padjust_signif <=0.01 ~ '**' -# , padjust_signif <=0.001 ~ '***' -# , padjust_signif <=0.0001 ~ '****' -# , TRUE ~ 'ns')) - combined_unpaired_stats = dplyr::mutate(combined_unpaired_stats, padjust_signif = case_when(padjust_signif == 0.05 ~ "." , padjust_signif <=0.0001 ~ '****' , padjust_signif <=0.001 ~ '***' @@ -222,6 +268,8 @@ print("preparing to reorder columns...") colnames(combined_unpaired_stats) my_col_order2 = c("mediator" , "timepoint" + , "sample_type" + , "n_obs" , "group1" , "group2" , "method" @@ -231,11 +279,11 @@ my_col_order2 = c("mediator" , "p_adj" , "padjust_signif") -if( length(my_col_order2) == ncol(combined_unpaired_stats) && isin(my_col_order2, colnames(combined_unpaired_stats)) ){ +if( length(my_col_order2) == ncol(combined_unpaired_stats) && all(my_col_order2%in%colnames(combined_unpaired_stats)) ){ print("PASS: Reordering columns...") combined_unpaired_stats_f = combined_unpaired_stats[, my_col_order2] print("Successful: column reordering") - print("formatted df called:'combined_unpaired_stats_f'") + print("formatted df called:'combined_unpaired_stats_f'") cat('\nformatted df has the following dimensions\n') print(dim(combined_unpaired_stats_f )) } else{ @@ -245,11 +293,8 @@ if( length(my_col_order2) == ncol(combined_unpaired_stats) && isin(my_col_order2 quit() } -combined_unpaired_stats_f_sam = combined_unpaired_stats_f - #****************** # write output file #****************** -cat("UNpaired stats for groups will be:", stats_time_unpaired) -write.csv(combined_unpaired_stats_f, stats_time_unpaired, row.names = FALSE) - +cat("UNpaired stats for groups will be:", stats_time_unpaired_sam) +write.csv(combined_unpaired_stats_f, stats_time_unpaired_sam, row.names = FALSE) \ No newline at end of file diff --git a/stats_unpaired_serum.R b/stats_unpaired_serum.R index 11cc2cd..dd78831 100644 --- a/stats_unpaired_serum.R +++ b/stats_unpaired_serum.R @@ -1,58 +1,32 @@ #!/usr/bin/Rscript getwd() -setwd('~/git/mosaic_2020/') +setwd("~/git/mosaic_2020/") getwd() ############################################################ -# TASK: summary stats of mediators by time and outcome +# TASK: unpaired (time) analysis of mediators: serum ############################################################ -# load libraries and packages - -#source("../Header_TT.R") -library(tidyverse) -library(ggpubr) -library(rstatix) -library(Hmisc) -library(qwraps2) -#========================================================== -#datadir = "~/git/covid19/Data" -source("mosaic_bmi_data_extraction_v2.R") - - #============= # Input #============= +source("data_extraction_formatting.R") -#infile_icu_wf = paste0(datadir,"/icu_covid_wf.csv") -#infile_icu_lf = paste0(datadir,"/icu_covid_lf.csv") +# clear variables +rm(sam_adults_lf, sam_df_adults_clean +, npa_adults_lf, npa_df_adults_clean) +rm(colnames_sam_df, expected_rows_sam_lf +, colnames_npa_df, expected_rows_npa_lf) -# version 2 -#infile_icu_wf = paste0(datadir,"/icu_covid_wf_v3.csv") -#infile_icu_lf = paste0(datadir,"/icu_covid_lf_v3.csv") - -#serum_adults_lf +rm(pivot_cols) #============= -# Output +# Output: unpaired analysis of time for serum #============= -outdir = paste0("~/git/mosaic_2020/version1") - -# unpaired analysis -stats_time_unpaired = paste0(outdir, "stats_unpaired_serum.csv") - -#%%======================================================== -# read file -#wf_data = read.csv(infile_icu_wf , stringsAsFactors = F) -#dim(wf_data) - -#lf_data = read.csv(infile_icu_lf , stringsAsFactors = F) -#dim(lf_data) - +stats_time_unpaired_serum = paste0(outdir_stats, "stats_time_unpaired_serum.csv") #%%======================================================== # data assignment for stats -#wf = wf_data -#lf = lf_data wf = serum_df_adults_clean lf = serum_adults_lf +#%%======================================================== table(lf$timepoint) lf$timepoint = paste0("t", lf$timepoint) @@ -73,7 +47,7 @@ ci = which(is.na(lf_t1$value)) #lf_t1_comp = lf_t1[-ci,] lf_t1_comp = lf_t1[-which(is.na(lf_t1$value)),] -stats_un_t1 = compare_means(value~obese2 +stats_un_t1 = compare_means(value~obesity , group.by = "mediator" #, data = lf_t1 , data = lf_t1_comp @@ -83,13 +57,39 @@ stats_un_t1 = compare_means(value~obese2 foo$mosaic[!unique(foo$mosaic)%in%unique(lf_t1_comp$mosaic)] stats_un_t1$timepoint = "t1" -stats_un_t1$n_obs = length(unique(lf_t1_comp$mosaic)) # CHECK stats_un_t1 = as.data.frame(stats_un_t1) class(stats_un_t1) +# calculate n_obs for each mediator +n_t1 = data.frame(table(lf_t1_comp$mediator)) +colnames(n_t1) = c("mediator", "n_obs") +n_t1$mediator = as.character(n_t1$mediator) + +# merge stats + n_obs df +merging_cols = intersect(names(stats_un_t1), names(n_t1)); merging_cols +if (all(n_t1$mediator%in%stats_un_t1$mediator)) { + cat("PASS: merging stats and n_obs on column/s:", merging_cols) + stats_un_t1 = merge(stats_un_t1, n_t1, by = merging_cols, all = T) + cat("\nsuccessfull merge:" + , "\nnrow:", nrow(stats_un_t1) + , "\nncol:", ncol(stats_un_t1)) +}else{ + nf = n_t1$mediator[!n_t1$mediator%in%stats_un_t1$mediator] + stats_un_t1 = merge(stats_un_t1, n_t1, by = merging_cols, all.y = T) + cat("\nMerged with caution:" + , "\nnrows mismatch:", nf + , "not found in stats possibly due to all obs being LLODs" + , "\nintroduced NAs for:", nf + , "\nnrow:", nrow(stats_un_t1) + , "\nncol:", ncol(stats_un_t1)) +} + # check: satisfied!!!! -#wilcox.test() +wilcox.test() + +rm(n_t1) +rm(lf_t1_comp) #============== # unpaired: t2 @@ -97,28 +97,54 @@ class(stats_un_t1) lf_t2 = lf[lf$timepoint == "t2",] lf_t2_comp = lf_t2[-which(is.na(lf_t2$value)),] -stats_un_t2 = compare_means(value~obese2 +stats_un_t2 = compare_means(value~obesity , group.by = "mediator" #, data = lf_t2 , data = lf_t2_comp , paired = FALSE , p.adjust.method = my_adjust_method) stats_un_t2$timepoint = "t2" -stats_un_t2$n_obs = length(unique(lf_t2_comp$mosaic)) # CHECK stats_un_t2 = as.data.frame(stats_un_t2) class(stats_un_t2) +# calculate n_obs for each mediator +n_t2 = data.frame(table(lf_t2_comp$mediator)) +colnames(n_t2) = c("mediator", "n_obs") +n_t2$mediator = as.character(n_t2$mediator) + +# merge stats + n_obs df +merging_cols = intersect(names(stats_un_t2), names(n_t2)); merging_cols +if (all(n_t2$mediator%in%stats_un_t2$mediator)) { + cat("PASS: merging stats and n_obs on column/s:", merging_cols) + stats_un_t2 = merge(stats_un_t2, n_t2, by = merging_cols, all = T) + cat("\nsuccessfull merge:" + , "\nnrow:", nrow(stats_un_t2) + , "\nncol:", ncol(stats_un_t2)) +}else{ + nf = n_t2$mediator[!n_t2$mediator%in%stats_un_t2$mediator] + stats_un_t2 = merge(stats_un_t2, n_t2, by = merging_cols, all.y = T) + cat("\nMerged with caution:" + , "\nnrows mismatch:", nf + , "not found in stats possibly due to all obs being LLODs" + , "\nintroduced NAs for:", nf + , "\nnrow:", nrow(stats_un_t2) + , "\nncol:", ncol(stats_un_t2)) +} + # check: satisfied!!!! wilcox.test() +rm(n_t2) +rm(lf_t2_comp) + #============== # unpaired: t3 #============== lf_t3 = lf[lf$timepoint == "t3",] lf_t3_comp = lf_t3[-which(is.na(lf_t3$value)),] -stats_un_t3 = compare_means(value~obese2 +stats_un_t3 = compare_means(value~obesity , group.by = "mediator" #, data = lf_t3 , data = lf_t3_comp @@ -126,14 +152,41 @@ stats_un_t3 = compare_means(value~obese2 , p.adjust.method = my_adjust_method) stats_un_t3$timepoint = "t3" -stats_un_t3$n_obs = length(unique(lf_t3_comp$mosaic)) # CHECK stats_un_t3 = as.data.frame(stats_un_t3) class(stats_un_t3) + +# calculate n_obs for each mediator +n_t3 = data.frame(table(lf_t3_comp$mediator)) +colnames(n_t3) = c("mediator", "n_obs") +n_t3$mediator = as.character(n_t3$mediator) + +# merge stats + n_obs df +merging_cols = intersect(names(stats_un_t3), names(n_t3)); merging_cols +if (all(n_t3$mediator%in%stats_un_t3$mediator)) { + cat("PASS: merging stats and n_obs on column/s:", merging_cols) + stats_un_t3 = merge(stats_un_t3, n_t3, by = merging_cols, all = T) + cat("\nsuccessfull merge:" + , "\nnrow:", nrow(stats_un_t3) + , "\nncol:", ncol(stats_un_t3)) +}else{ + nf = n_t3$mediator[!n_t3$mediator%in%stats_un_t3$mediator] + stats_un_t3 = merge(stats_un_t3, n_t3, by = merging_cols, all.y = T) + cat("\nMerged with caution:" + , "\nnrows mismatch:", nf + , "not found in stats possibly due to all obs being LLODs" + , "\nintroduced NAs for:", nf + , "\nnrow:", nrow(stats_un_t3) + , "\nncol:", ncol(stats_un_t3)) +} + # check: satisfied!!!! wilcox.test() +rm(n_t3) +rm(lf_t3_comp) + #============== # Rbind these dfs #============== @@ -177,7 +230,8 @@ combined_unpaired_stats = subset(combined_unpaired_stats, select = -c(.y.)) # reflect stats method correctly combined_unpaired_stats$method -combined_unpaired_stats$method = gsub("Wilcoxon", "Wilcoxon_unpaired", combined_unpaired_stats$method) +#combined_unpaired_stats$method = gsub("Wilcoxon", "Wilcoxon_unpaired", combined_unpaired_stats$method) +combined_unpaired_stats$method = "wilcoxon unpaired" combined_unpaired_stats$method # replace "." in colnames with "_" @@ -202,14 +256,6 @@ combined_unpaired_stats$sample_type = "serum" combined_unpaired_stats$padjust_signif = round(combined_unpaired_stats$p_adj, digits = 2) # add appropriate symbols for padjust_signif -#combined_unpaired_stats = combined_unpaired_stats %>% -# mutate(padjust_signif = case_when(padjust_signif == 0.05 ~ "." -# , padjust_signif <0.05 ~ '*' -# , padjust_signif <=0.01 ~ '**' -# , padjust_signif <=0.001 ~ '***' -# , padjust_signif <=0.0001 ~ '****' -# , TRUE ~ 'ns')) - combined_unpaired_stats = dplyr::mutate(combined_unpaired_stats, padjust_signif = case_when(padjust_signif == 0.05 ~ "." , padjust_signif <=0.0001 ~ '****' , padjust_signif <=0.001 ~ '***' @@ -222,6 +268,8 @@ print("preparing to reorder columns...") colnames(combined_unpaired_stats) my_col_order2 = c("mediator" , "timepoint" + , "sample_type" + , "n_obs" , "group1" , "group2" , "method" @@ -231,12 +279,12 @@ my_col_order2 = c("mediator" , "p_adj" , "padjust_signif") -if( length(my_col_order2) == ncol(combined_unpaired_stats) && isin(my_col_order2, colnames(combined_unpaired_stats)) ){ +if( length(my_col_order2) == ncol(combined_unpaired_stats) && all(my_col_order2%in%colnames(combined_unpaired_stats)) ){ print("PASS: Reordering columns...") combined_unpaired_stats_f = combined_unpaired_stats[, my_col_order2] print("Successful: column reordering") print("formatted df called:'combined_unpaired_stats_f'") - cat('\nformatted df has the following dimensions\n') + cat("\nformatted df has the following dimensions\n") print(dim(combined_unpaired_stats_f )) } else{ cat(paste0("FAIL:Cannot reorder columns, length mismatch" @@ -245,11 +293,8 @@ if( length(my_col_order2) == ncol(combined_unpaired_stats) && isin(my_col_order2 quit() } -combined_unpaired_stats_f_serum = combined_unpaired_stats_f - #****************** # write output file #****************** -cat("UNpaired stats for groups will be:", stats_time_unpaired) -write.csv(combined_unpaired_stats_f, stats_time_unpaired, row.names = FALSE) - +cat("UNpaired stats for groups will be:", stats_time_unpaired_serum) +write.csv(combined_unpaired_stats_f, stats_time_unpaired_serum, row.names = FALSE) \ No newline at end of file