diff --git a/boxplot_linear_na.R b/boxplot_linear_na.R new file mode 100644 index 0000000..12f883e --- /dev/null +++ b/boxplot_linear_na.R @@ -0,0 +1,166 @@ +#!/usr/bin/Rscript +getwd() +setwd("~/git/mosaic_2020/") +getwd() +############################################################ +# TASK: boxplots at T1 +# FIXME: currently not rendering, problem with NAs for stats? +############################################################ +my_samples = "npa_sam_serum" +#============= +# Input +#============= +#source("data_extraction_formatting_non_asthmatics.R") +source("plot_data_na.R") + +# check: adult variable and age variable discrepancy! +metadata_all$mosaic[metadata_all$adult==1 & metadata_all$age<=18] + +#============= +# Output: +#============= +outfile_bp = paste0("boxplots_linear_NA_", my_samples, ".pdf") +output_boxplot = paste0(outdir_plots, outfile_bp); output_boxplot + +#=============================== +# data assignment for plots +#================================ +#----------- +# npa +#----------- +##wf_fp_npa = npa_wf[npa_wf$flustat == 1,] +#lf_fp_npa = npa_lf[npa_lf$flustat == 1,] +#lf_fp_npa$timepoint = paste0("t", lf_fp_npa$timepoint) +#lf_fp_npa$timepoint = as.factor(lf_fp_npa$timepoint) +#lf_fp_npa$obesity = as.factor(lf_fp_npa$obesity) + +#table(lf_fp_npa$mediator) +#head(lf_fp_npa$value[lf_fp_npa$mediator == "vitd"]) +#lf_fp_npa = lf_fp_npa[!lf_fp_npa$mediator == "vitd",] + +#----------- +# sam +#----------- +##wf_fp_sam = samm_wf[samm_wf$flustat == 1,] +#lf_fp_sam = sam_lf[sam_lf$flustat == 1,] +#lf_fp_sam$timepoint = paste0("t", lf_fp_sam$timepoint) +#lf_fp_sam$timepoint = as.factor(lf_fp_sam$timepoint) +#lf_fp_sam$obesity = as.factor(lf_fp_sam$obesity) + +#table(lf_fp_sam$mediator) +#head(lf_fp_sam$value[lf_fp_sam$mediator == "vitd"]) +#lf_fp_sam = lf_fp_sam[!lf_fp_sam$mediator == "vitd",] + +#----------- +# serum +#----------- +##wf_fp_serum = serum_wf[serum_wf$flustat == 1,] +#lf_fp_serum = serum_lf[serum_lf$flustat == 1,] +#lf_fp_serum$timepoint = paste0("t", lf_fp_serum$timepoint) +#lf_fp_serum$timepoint = as.factor(lf_fp_serum$timepoint) +#lf_fp_serum$obesity = as.factor(lf_fp_serum$obesity) + +######################################################################## +cat("Output plots will be in:", output_boxplot) +pdf(output_boxplot, width = 20, height = 15) + +#======================================================================= +# NPA +#======================================================================= +if (is.factor(lf_fp_npa$timepoint) && is.factor(lf_fp_npa$timepoint)){ + cat ("PASS: required groups are factors") +} +#------------------------------------------ +title_npa_linear = "NPA (Linear)" +#----------------------------------------- +bxp_npa_linear <- ggboxplot(lf_fp_npa, x = "timepoint", y = "value", + color = "obesity", palette = c("#00BFC4", "#F8766D")) + + facet_wrap(~mediator, nrow = 7, ncol = 5, scales = "free_y", shrink = T)+ + #scale_y_log10() + + theme(axis.text.x = element_text(size = 15) + , axis.text.y = element_text(size = 15 + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = 15) + , axis.title.y = element_text(size = 15) + , plot.title = element_text(size = 20, hjust = 0.5) + , strip.text.x = element_text(size = 15, colour = "black") + , legend.title = element_text(color = "black", size = 20) + , legend.text = element_text(size = 15) + , legend.direction = "horizontal") + + labs(title = title_npa_linear + , x = "" + , y = "Levels") + +bxp_npa_linear + +#======================================================================= +# SAM +#======================================================================= +if (is.factor(lf_fp_sam$timepoint) && is.factor(lf_fp_sam$timepoint)){ + cat ("PASS: required groups are factors") +} + +#------------------------------------------ +title_sam_linear = "SAM (Linear)" +#----------------------------------------- +bxp_sam_linear <- ggboxplot(lf_fp_sam, x = "timepoint", y = "value", + color = "obesity", palette = c("#00BFC4", "#F8766D")) + + facet_wrap(~mediator, nrow = 7, ncol = 5, scales = "free_y", shrink = T)+ + #scale_y_log10() + + theme(axis.text.x = element_text(size = 15) + , axis.text.y = element_text(size = 15 + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = 15) + , axis.title.y = element_text(size = 15) + , plot.title = element_text(size = 20, hjust = 0.5) + , strip.text.x = element_text(size = 15, colour = "black") + , legend.title = element_text(color = "black", size = 20) + , legend.text = element_text(size = 15) + , legend.direction = "horizontal") + + labs(title = title_sam_linear + , x = "" + , y = "Levels") + +bxp_sam_linear + +#======================================================================= +# SERUM +#======================================================================= +if (is.factor(lf_fp_serum$timepoint) && is.factor(lf_fp_serum$timepoint)){ + cat ("PASS: required groups are factors") +} + +table(lf_fp_serum$mediator) + +#------------------------------------------ +title_serum_linear = "SERUM (Linear)" +#----------------------------------------- +bxp_serum_linear <- ggboxplot(lf_fp_serum, x = "timepoint", y = "value", + color = "obesity", palette = c("#00BFC4", "#F8766D")) + + facet_wrap(~mediator, nrow = 7, ncol = 5, scales = "free_y", shrink = T)+ + #scale_y_log10() + + theme(axis.text.x = element_text(size = 15) + , axis.text.y = element_text(size = 15 + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = 15) + , axis.title.y = element_text(size = 15) + , plot.title = element_text(size = 20, hjust = 0.5) + , strip.text.x = element_text(size = 15, colour = "black") + , legend.title = element_text(color = "black", size = 20) + , legend.text = element_text(size = 15) + , legend.direction = "horizontal") + + labs(title = title_serum_linear + , x = "" + , y = "Levels") + +bxp_serum_linear + +#========================================================================== +dev.off() +############################################################################ diff --git a/boxplot_log_na.R b/boxplot_log_na.R new file mode 100755 index 0000000..7eff01c --- /dev/null +++ b/boxplot_log_na.R @@ -0,0 +1,170 @@ +#!/usr/bin/Rscript +getwd() +setwd("~/git/mosaic_2020/") +getwd() +############################################################ +# TASK: boxplots at T1 +# FIXME: currently not rendering, problem with NAs for stats? +############################################################ +my_samples = "npa_sam_serum" +#============= +# Input +#============= +#source("data_extraction_formatting_non_asthmatics.R") +source("plot_data_na.R") + +# check: adult variable and age variable discrepancy! +metadata_all$mosaic[metadata_all$adult==1 & metadata_all$age<=18] + +#============= +# Output: +#============= +outfile_bp_log = paste0("boxplots_log_NA_", my_samples, ".pdf") +output_boxplot_log = paste0(outdir_plots, outfile_bp_log); output_boxplot_log + +#=============================== +# data assignment for plots +#================================ +#----------- +# npa +#----------- +#wf_fp_npa = npa_wf[npa_wf$flustat == 1,] +lf_fp_npa = npa_lf[npa_lf$flustat == 1,] +lf_fp_npa$timepoint = paste0("t", lf_fp_npa$timepoint) +lf_fp_npa$timepoint = as.factor(lf_fp_npa$timepoint) +lf_fp_npa$obesity = as.factor(lf_fp_npa$obesity) + +table(lf_fp_npa$mediator) +head(lf_fp_npa$value[lf_fp_npa$mediator == "vitd"]) +lf_fp_npa = lf_fp_npa[!lf_fp_npa$mediator == "vitd",] + +#----------- +# sam +#----------- +#wf_fp_sam = samm_wf[samm_wf$flustat == 1,] +lf_fp_sam = sam_lf[sam_lf$flustat == 1,] +lf_fp_sam$timepoint = paste0("t", lf_fp_sam$timepoint) +lf_fp_sam$timepoint = as.factor(lf_fp_sam$timepoint) +lf_fp_sam$obesity = as.factor(lf_fp_sam$obesity) + +table(lf_fp_sam$mediator) +head(lf_fp_sam$value[lf_fp_sam$mediator == "vitd"]) +lf_fp_sam = lf_fp_sam[!lf_fp_sam$mediator == "vitd",] + +#----------- +# serum +#----------- +#wf_fp_serum = serum_wf[serum_wf$flustat == 1,] +lf_fp_serum = serum_lf[serum_lf$flustat == 1,] +lf_fp_serum$timepoint = paste0("t", lf_fp_serum$timepoint) +lf_fp_serum$timepoint = as.factor(lf_fp_serum$timepoint) +lf_fp_serum$obesity = as.factor(lf_fp_serum$obesity) + +######################################################################## +cat("Output plots will be in:", output_boxplot_log) +pdf(output_boxplot_log, width = 20, height = 15) + +#======================================================================= +# NPA +#======================================================================= +if (is.factor(lf_fp_npa$timepoint) && is.factor(lf_fp_npa$timepoint)){ + cat ("PASS: required groups are factors") +} + +#------------------------------------ +title_npa_log = "NPA (Log)" +#----------------------------------- + +bxp_npa_log <- ggboxplot(lf_fp_npa, x = "timepoint", y = "value", + color = "obesity", palette = c("#00BFC4", "#F8766D")) + + facet_wrap(~mediator, nrow = 7, ncol = 5, scales = "free_y", shrink = F)+ + scale_y_log10() + + theme(axis.text.x = element_text(size = 15) + , axis.text.y = element_text(size = 15 + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = 15) + , axis.title.y = element_text(size = 15) + , plot.title = element_text(size = 20, hjust = 0.5) + , strip.text.x = element_text(size = 15, colour = "black") + , legend.title = element_text(color = "black", size = 20) + , legend.text = element_text(size = 15) + , legend.direction = "horizontal") + + labs(title = title_npa_log + , x = "" + , y = "Levels (Log)") + +bxp_npa_log + +#======================================================================= +# SAM +#======================================================================= +if (is.factor(lf_fp_sam$timepoint) && is.factor(lf_fp_sam$timepoint)){ + cat ("PASS: required groups are factors") +} + +#------------------------------------ +title_sam_log = "SAM (Log)" +#----------------------------------- + +bxp_sam_log <- ggboxplot(lf_fp_sam, x = "timepoint", y = "value", + color = "obesity", palette = c("#00BFC4", "#F8766D")) + + facet_wrap(~mediator, nrow = 7, ncol = 5, scales = "free_y", shrink = T)+ + scale_y_log10() + + theme(axis.text.x = element_text(size = 15) + , axis.text.y = element_text(size = 15 + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = 15) + , axis.title.y = element_text(size = 15) + , plot.title = element_text(size = 20, hjust = 0.5) + , strip.text.x = element_text(size = 15, colour = "black") + , legend.title = element_text(color = "black", size = 20) + , legend.text = element_text(size = 15) + , legend.direction = "horizontal") + + labs(title = title_sam_log + , x = "" + , y = "Levels (Log)") + +bxp_sam_log + +#======================================================================= +# SERUM +#======================================================================= +if (is.factor(lf_fp_serum$timepoint) && is.factor(lf_fp_serum$timepoint)){ + cat ("PASS: required groups are factors") +} + +table(lf_fp_serum$mediator) + +#------------------------------------ +title_serum_log = "SERUM (Log)" +#----------------------------------- + +bxp_serum_log <- ggboxplot(lf_fp_serum, x = "timepoint", y = "value", + color = "obesity", palette = c("#00BFC4", "#F8766D")) + + facet_wrap(~mediator, nrow = 7, ncol = 5, scales = "free_y", shrink = T)+ + scale_y_log10() + + theme(axis.text.x = element_text(size = 15) + , axis.text.y = element_text(size = 15 + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = 15) + , axis.title.y = element_text(size = 15) + , plot.title = element_text(size = 20, hjust = 0.5) + , strip.text.x = element_text(size = 15, colour = "black") + , legend.title = element_text(color = "black", size = 20) + , legend.text = element_text(size = 15) + , legend.direction = "horizontal") + + labs(title = title_serum_log + , x = "" + , y = "Levels (Log)") + +bxp_serum_log + +#========================================================================== +dev.off() +############################################################################ diff --git a/data_extraction_formatting_non_asthmatics.R b/data_extraction_formatting_non_asthmatics.R new file mode 100755 index 0000000..b977b38 --- /dev/null +++ b/data_extraction_formatting_non_asthmatics.R @@ -0,0 +1,357 @@ +#!/usr/bin/Rscript +getwd() +setwd('~/git/mosaic_2020/') +getwd() +######################################################################## +# TASK: Extract relevant columns from mosaic adults data +# sam +# serum +# npa +######################################################################## +#==================== +# Input: source data +#==================== +source("read_data.R") + +#============================ +# Data to use: Important step +#============================ +# select df to use +my_data = fp_adults_na + +# clear unnecessary variables +rm(all_df, adult_df, fp_adults) + +######################################################################## +if ( sum(table(my_data$asthma)) && sum(table(my_data$asthma, my_data$ia_exac_copd)) && sum(table(my_data$obesity)) == nrow(my_data) ){ + cat("PASS: fp adults WITHOUT asthma extracted sucessfully") +}else{ + cat("FAIL: numbers mismatch. Please check") + quit() +} + +#========= +# sam +#========= +sam_regex = regex(".*_sam[1-3]{1}$", ignore_case = T) +sam_cols_i = str_extract(colnames(my_data), sam_regex) # not boolean +#sam_cols_b = colnames(my_data)%in%sam_cols_i # boolean + +sam_cols = colnames(my_data)[colnames(my_data)%in%sam_cols_i] +sam_cols + +# this contains log columns + daysamp_samXX: omitting these +sam_regex_log_days = regex("log|day.*_sam[1-3]{1}$", ignore_case = T, perl = T) +sam_cols_to_omit = sam_cols[grepl(sam_regex_log_days, sam_cols)]; sam_cols_to_omit +sam_cols_to_omit + +sam_cols_clean = sam_cols[!sam_cols%in%sam_cols_to_omit]; sam_cols_clean +length(sam_cols_clean) + +if( length(sam_cols_clean) == length(sam_cols) - length(sam_cols_to_omit) ){ + cat("PASS: clean cols extracted" + , "\nNo. of clean SAM cols to extract:", length(sam_cols_clean)) +}else{ + cat("FAIL: length mismatch. Aborting further cols extraction" + , "Expected length:", length(sam_cols) - length(sam_cols_to_omit) + , "Got:", length(sam_cols_clean) ) +} + +sam_cols_to_extract = c(meta_data_cols, sam_cols_clean) + +cat("Extracting SAM cols + metadata_cols") + +if ( length(sam_cols_to_extract) == length(meta_data_cols) + length(sam_cols_clean) ){ + cat("Extracing", length(sam_cols_to_extract), "columns for sam") + sam_df = my_data[, sam_cols_to_extract] +}else{ + cat("FAIL: length mismatch" + , "Expeceted to extract:", length(meta_data_cols) + length(sam_cols_clean), "columns" + , "Got:", length(sam_cols_to_extract)) +} + +colnames_sam_df = colnames(sam_df); colnames_sam_df + +#========= +# serum +#========= +serum_regex = regex(".*_serum[1-3]{1}$", ignore_case = T) +serum_cols_i = str_extract(colnames(my_data), serum_regex) # not boolean +#serum_cols_b = colnames(my_data)%in%serum_cols_i # boolean + +serum_cols = colnames(my_data)[colnames(my_data)%in%serum_cols_i] + +# this contains log columns + dayserump_serumXX: omitting these +serum_regex_log_days = regex("log|day.*_serum[1-3]{1}$", ignore_case = T, perl = T) +serum_cols_to_omit = serum_cols[grepl(serum_regex_log_days, serum_cols)]; serum_cols_to_omit + +serum_cols_clean = serum_cols[!serum_cols%in%serum_cols_to_omit]; serum_cols_clean +length(serum_cols_clean) + +if( length(serum_cols_clean) == length(serum_cols) - length(serum_cols_to_omit) ){ + cat("PASS: clean cols extracted" + , "\nNo. of clean serum cols to extract:", length(serum_cols_clean)) +}else{ + cat("FAIL: length mismatch. Aborting further cols extraction" + , "Expected length:", length(serum_cols) - length(serum_cols_to_omit) + , "Got:", length(serum_cols_clean) ) +} + +serum_cols_to_extract = c(meta_data_cols, serum_cols_clean) + +cat("Extracting SERUM cols + metadata_cols") + +if ( length(serum_cols_to_extract) == length(meta_data_cols) + length(serum_cols_clean) ){ + cat("Extracing", length(serum_cols_to_extract), "columns for serum") + serum_df = my_data[, serum_cols_to_extract] +}else{ + cat("FAIL: length mismatch" + , "Expeceted to extract:", length(meta_data_cols) + length(serum_cols_clean), "columns" + , "Got:", length(serum_cols_to_extract)) +} + +colnames_serum_df = colnames(serum_df); colnames_serum_df + +#========= +# npa +#========= +npa_regex = regex(".*_npa[1-3]{1}$", ignore_case = T) +npa_cols_i = str_extract(colnames(my_data), npa_regex) # not boolean +#npa_cols_b = colnames(my_data)%in%npa_cols_i # boolean + +npa_cols = colnames(my_data)[colnames(my_data)%in%npa_cols_i] + +# this contains log columns + daynpap_npaXX: omitting these +npa_regex_log_days = regex("log|day|vl_samptime|ct.*_npa[1-3]{1}$", ignore_case = T, perl = T) +npa_cols_to_omit = npa_cols[grepl(npa_regex_log_days, npa_cols)]; npa_cols_to_omit + +npa_cols_clean = npa_cols[!npa_cols%in%npa_cols_to_omit]; npa_cols_clean +length(npa_cols_clean) + +if( length(npa_cols_clean) == length(npa_cols) - length(npa_cols_to_omit) ){ + cat("PASS: clean cols extracted" + , "\nNo. of clean npa cols to extract:", length(npa_cols_clean)) +}else{ + cat("FAIL: length mismatch. Aborting further cols extraction" + , "Expected length:", length(npa_cols) - length(npa_cols_to_omit) + , "Got:", length(npa_cols_clean) ) +} + +npa_cols_to_extract = c(meta_data_cols, npa_cols_clean) + +cat("Extracting NPA cols + metadata_cols") + +if ( length(npa_cols_to_extract) == length(meta_data_cols) + length(npa_cols_clean) ){ + cat("Extracing", length(npa_cols_to_extract), "columns for npa") + npa_df = my_data[, npa_cols_to_extract] +}else{ + cat("FAIL: length mismatch" + , "Expeceted to extract:", length(meta_data_cols) + length(npa_cols_clean), "columns" + , "Got:", length(npa_cols_to_extract)) +} + +colnames_npa_df = colnames(npa_df); colnames_npa_df + +#============== +# quick checks +#============== +colnames_check = as.data.frame(cbind(colnames_sam_df, colnames_serum_df, colnames_npa_df)) +tail(colnames_check) # gives a warning message due to differing no. of rows for cbind! + +# put NA where a match doesn't exist +# unmatched lengths +#colnames_check[117,1] <- NA +#colnames_check[117,2] <- NA + +if ( ncol(sam_df) == ncol(serum_df) ){ + start = ncol(sam_df)+1 + extra_cols = start:ncol(npa_df) +} + +colnames_check_f = colnames_check +tail(colnames_check_f) + +for (i in extra_cols){ + for (j in 1:2) { + cat("\ni:", i + ,"\nj:", j) + colnames_check_f[i,j] <- NA + #colnames_check_f[i, j]< - NA + + } +} +tail(colnames_check_f) + +########################################################################## +# LF data +########################################################################## +cols_to_omit = c("adult" + #, "obese2" + #, "height", "height_unit", "weight" + #, "weight_unit", "visual_est_bmi", "bmi_rating" + ) + +pivot_cols = meta_data_cols +# subselect pivot_cols +pivot_cols = meta_data_cols[!meta_data_cols%in%cols_to_omit];pivot_cols +ncols_omitted = table(meta_data_cols%in%cols_to_omit)[[2]] +ncols_omitted + +#============== +# lf data: sam +#============== +str(sam_df) +table(sam_df$obesity) +#table(sam_df$obese2) + +#sam_df_adults = sam_df[sam_df$adult == 1,] # resolved at source and only dealing wit age as adult +sam_df_adults = sam_df + +wf_cols = colnames(sam_df_adults)[!colnames(sam_df_adults)%in%cols_to_omit] +sam_wf = sam_df_adults[wf_cols] + +if (length(pivot_cols) == length(meta_data_cols) - ncols_omitted){ + cat("PASS: pivot cols successfully extracted") +}else{ + cat("FAIL: length mismatch! pivot cols could not be extracted" + , "\nExpected length:", length(meta_data_cols) - ncols_omitted + , "\nGot:",length(pivot_cols) ) + quit() +} + +expected_rows_sam_lf = nrow(sam_wf) * (length(sam_wf) - length(pivot_cols)); expected_rows_sam_lf + +# using regex: +sam_lf = sam_wf %>% + tidyr::pivot_longer(-all_of(pivot_cols) + , names_to = c("mediator", "sample_type", "timepoint") + , names_pattern = "(.*)_(.*)([1-3]{1})" + , values_to = "value") + +if ( + (nrow(sam_lf) == expected_rows_sam_lf) & (sum(table(is.na(sam_lf$mediator))) == expected_rows_sam_lf) + ) { + cat(paste0("PASS: long format data has correct no. of rows and NA in mediator:" + , "\nNo. of rows: ", nrow(sam_lf) + , "\nNo. of cols: ", ncol(sam_lf))) +} else{ + cat(paste0("FAIL:long format data has unexpected no. of rows or NAs in mediator" + , "\nExpected no. of rows: ", expected_rows_sam_lf + , "\nGot: ", nrow(sam_lf) + , "\ncheck expected rows calculation!")) + quit() +} + +#library(data.table) +#foo = sam_df_adults[1:10] +#long <- melt(setDT(sam_df_adults), id.vars = pivot_cols, variable.name = "levels") + +#============== +# lf data: serum +#============== +str(serum_df) +table(serum_df$obesity) +#table(serum_df$obese2) + +#serum_df_adults = serum_df[serum_df$adult == 1,] # extract based on age +serum_df_adults = serum_df + +wf_cols = colnames(serum_df_adults)[!colnames(serum_df_adults)%in%cols_to_omit] +serum_wf = serum_df_adults[wf_cols] + +if (length(pivot_cols) == length(meta_data_cols) - ncols_omitted){ + cat("PASS: pivot cols successfully extracted") +}else{ + cat("FAIL: length mismatch! pivot cols could not be extracted" + , "\nExpected length:", length(meta_data_cols) - ncols_omitted + , "\nGot:",length(pivot_cols) ) + quit() +} + +expected_rows_serum_lf = nrow(serum_wf) * (length(serum_wf) - length(pivot_cols)); expected_rows_serum_lf + +# using regex: +serum_lf = serum_wf %>% + tidyr::pivot_longer(-all_of(pivot_cols) + , names_to = c("mediator", "sample_type", "timepoint") + , names_pattern = "(.*)_(.*)([1-3]{1})" + , values_to = "value") + +if ( + (nrow(serum_lf) == expected_rows_serum_lf) & (sum(table(is.na(serum_lf$mediator))) == expected_rows_serum_lf) +) { + cat(paste0("PASS: long format data has correct no. of rows and NA in mediator:" + , "\nNo. of rows: ", nrow(serum_lf) + , "\nNo. of cols: ", ncol(serum_lf))) +} else{ + cat(paste0("FAIL:long format data has unexpected no. of rows or NAs in mediator" + , "\nExpected no. of rows: ", expected_rows_serum_lf + , "\nGot: ", nrow(serum_lf) + , "\ncheck expected rows calculation!")) + quit() +} + +#============== +# lf data: npa +#============== +str(npa_df) +table(npa_df$obesity) +#table(npa_df$obese2) + +#npa_df_adults = npa_df[npa_df$adult == 1,] # extract based on age +npa_df_adults = npa_df + +wf_cols = colnames(npa_df_adults)[!colnames(npa_df_adults)%in%cols_to_omit] +npa_wf = npa_df_adults[wf_cols] + +if (length(pivot_cols) == length(meta_data_cols) - ncols_omitted){ + cat("PASS: pivot cols successfully extracted") +}else{ + cat("FAIL: length mismatch! pivot cols could not be extracted" + , "\nExpected length:", length(meta_data_cols) - ncols_omitted + , "\nGot:",length(pivot_cols) ) + quit() +} + +expected_rows_npa_lf = nrow(npa_wf) * (length(npa_wf) - length(pivot_cols)); expected_rows_npa_lf + +# using regex: +npa_lf = npa_wf %>% + tidyr::pivot_longer(-all_of(pivot_cols) + , names_to = c("mediator", "sample_type", "timepoint") + , names_pattern = "(.*)_(.*)([1-3]{1})" + , values_to = "value") + +if ( + (nrow(npa_lf) == expected_rows_npa_lf) & (sum(table(is.na(npa_lf$mediator))) == expected_rows_npa_lf) +) { + cat(paste0("PASS: long format data has correct no. of rows and NA in mediator:" + , "\nNo. of rows: ", nrow(npa_lf) + , "\nNo. of cols: ", ncol(npa_lf))) +} else{ + cat(paste0("FAIL:long format data has unexpected no. of rows or NAs in mediator" + , "\nExpected no. of rows: ", expected_rows_npa_lf + , "\nGot: ", nrow(npa_lf) + , "\ncheck expected rows calculation!")) + quit() +} + +############################################################################### +# remove unnecessary variables +rm(sam_regex, sam_regex_log_days, sam_cols, sam_cols_clean, sam_cols_i, sam_cols_to_extract, sam_cols_to_omit) +rm(serum_regex, serum_regex_log_days, serum_cols, serum_cols_clean, serum_cols_i, serum_cols_to_extract, serum_cols_to_omit) +rm(npa_regex, npa_regex_log_days, npa_cols, npa_cols_clean, npa_cols_i, npa_cols_to_extract, npa_cols_to_omit) +rm(my_data) +rm(colnames_check) +rm(i, j + #, expected_cols + , start, wf_cols, extra_cols, cols_to_omit) + +# rm not_clean dfs +rm(sam_df_adults, serum_df_adults, npa_df_adults) + +# rm df +rm(sam_df, serum_df, npa_df) +rm(colnames_check_f + #, fp_adults_na) +) diff --git a/flu_stats_unpaired_na_npa.R b/flu_stats_unpaired_na_npa.R new file mode 100755 index 0000000..3f5d3de --- /dev/null +++ b/flu_stats_unpaired_na_npa.R @@ -0,0 +1,381 @@ +#!/usr/bin/Rscript +getwd() +setwd("~/git/mosaic_2020/") +getwd() +############################################################ +# TASK: unpaired (time) analysis of mediators: +# sample type: NPA +# data: Flu positive adult patients +# group: obesity +############################################################ +my_sample_type = "npa" + +#============= +# Input +#============= +source("data_extraction_formatting_non_asthmatics.R") + +# check: adult variable and age variable discrepancy! +metadata_all$mosaic[metadata_all$adult==1 & metadata_all$age<=18] + +#============= +# Output +#============= +outfile_name = paste0("flu_stats_time_unpaired_NA_", my_sample_type, ".csv") +flu_stats_time_unpaired_na = paste0(outdir_stats, outfile_name) + +#=============================== +# data assignment for stats +#================================ +wf = npa_wf[npa_wf$flustat == 1,] +lf = npa_lf[npa_lf$flustat == 1,] +lf$timepoint = paste0("t", lf$timepoint) +lf = lf[!lf$mediator == "vitd",] +######################################################################## +# clear variables +rm(sam_lf, sam_wf +, serum_lf, serum_wf) +rm(colnames_sam_df, expected_rows_sam_lf +, colnames_serum_df, expected_rows_serum_lf) + +rm(pivot_cols) + +# sanity checks +table(lf$timepoint) + +######################################################################## +# Unpaired stats at each timepoint b/w groups: wilcoxon UNpaired analysis +# with correction +######################################################################## +# with adjustment: fdr and BH are identical +my_adjust_method = "BH" + +#============== +# unpaired: t1 +#============== +lf_t1 = lf[lf$timepoint == "t1",] +sum(is.na(lf_t1$value)) + +foo = lf_t1[which(is.na(lf_t1$value)),] +#ci = which(is.na(lf_t1$value)) +#lf_t1_comp = lf_t1[-ci,] + +lf_t1_comp = lf_t1[-which(is.na(lf_t1$value)),] +stats_un_t1 = compare_means(value~obesity + , group.by = "mediator" + #, data = lf_t1 + , data = lf_t1_comp + , paired = FALSE + , p.adjust.method = my_adjust_method) + +foo$mosaic[!unique(foo$mosaic)%in%unique(lf_t1_comp$mosaic)] + +# add timepoint and convert to df +stats_un_t1$timepoint = "t1" +stats_un_t1 = as.data.frame(stats_un_t1) +class(stats_un_t1) + +#---------------------------------------- +# calculate n_obs for each mediator: t1 +#---------------------------------------- +#n_t1 = data.frame(table(lf_t1_comp$mediator)) +n_t1_all = data.frame(table(lf_t1$mediator)) +colnames(n_t1_all) = c("mediator", "n_obs") +n_t1_all$mediator = as.character(n_t1_all$mediator) + +n_t1_comp = data.frame(table(lf_t1_comp$mediator)) +colnames(n_t1_comp) = c("mediator", "n_obs_complete") +n_t1_comp$mediator = as.character(n_t1_comp$mediator) + +merge_cols = intersect(names(n_t1_all), names(n_t1_comp)); merge_cols +n_t1= merge(n_t1_all, n_t1_comp, by = merge_cols, all = T) + +#================================== +# Merge: merge stats + n_obs df +#=================================== +merging_cols = intersect(names(stats_un_t1), names(n_t1)); merging_cols +if (all(n_t1$mediator%in%stats_un_t1$mediator)) { + cat("PASS: merging stats and n_obs on column/s:", merging_cols) + stats_un_t1 = merge(stats_un_t1, n_t1, by = merging_cols, all = T) + cat("\nsuccessfull merge:" + , "\nnrow:", nrow(stats_un_t1) + , "\nncol:", ncol(stats_un_t1)) +}else{ + nf = n_t1$mediator[!n_t1$mediator%in%stats_un_t1$mediator] + stats_un_t1 = merge(stats_un_t1, n_t1, by = merging_cols, all.y = T) + cat("\nMerged with caution:" + , "\nnrows mismatch:", nf + , "not found in stats possibly due to all obs being LLODs" + , "\nintroduced NAs for:", nf + , "\nnrow:", nrow(stats_un_t1) + , "\nncol:", ncol(stats_un_t1)) +} + +# add bonferroni adjustment as well +stats_un_t1$p_adj_bonferroni = p.adjust(stats_un_t1$p, method = "bonferroni") + +rm(n_t1) +rm(lf_t1_comp) + +#============== +# unpaired: t2 +#============== +lf_t2 = lf[lf$timepoint == "t2",] +lf_t2_comp = lf_t2[-which(is.na(lf_t2$value)),] + +stats_un_t2 = compare_means(value~obesity + , group.by = "mediator" + #, data = lf_t2 + , data = lf_t2_comp + , paired = FALSE + , p.adjust.method = my_adjust_method) + +# add timepoint and convert to df +stats_un_t2$timepoint = "t2" +stats_un_t2 = as.data.frame(stats_un_t2) +class(stats_un_t2) + +#---------------------------------------- +# calculate n_obs for each mediator: t2 +#---------------------------------------- +#n_t2 = data.frame(table(lf_t2_comp$mediator)) +n_t2_all = data.frame(table(lf_t2$mediator)) +colnames(n_t2_all) = c("mediator", "n_obs") +n_t2_all$mediator = as.character(n_t2_all$mediator) + +n_t2_comp = data.frame(table(lf_t2_comp$mediator)) +colnames(n_t2_comp) = c("mediator", "n_obs_complete") +n_t2_comp$mediator = as.character(n_t2_comp$mediator) + +merge_cols = intersect(names(n_t2_all), names(n_t2_comp)); merge_cols +n_t2= merge(n_t2_all, n_t2_comp, by = merge_cols, all = T) + +#================================== +# Merge: merge stats + n_obs df +#================================== +merging_cols = intersect(names(stats_un_t2), names(n_t2)); merging_cols +if (all(n_t2$mediator%in%stats_un_t2$mediator)) { + cat("PASS: merging stats and n_obs on column/s:", merging_cols) + stats_un_t2 = merge(stats_un_t2, n_t2, by = merging_cols, all = T) + cat("\nsuccessfull merge:" + , "\nnrow:", nrow(stats_un_t2) + , "\nncol:", ncol(stats_un_t2)) +}else{ + nf = n_t2$mediator[!n_t2$mediator%in%stats_un_t2$mediator] + stats_un_t2 = merge(stats_un_t2, n_t2, by = merging_cols, all.y = T) + cat("\nMerged with caution:" + , "\nnrows mismatch:", nf + , "not found in stats possibly due to all obs being LLODs" + , "\nintroduced NAs for:", nf + , "\nnrow:", nrow(stats_un_t2) + , "\nncol:", ncol(stats_un_t2)) +} + +# add bonferroni adjustment as well +stats_un_t2$p_adj_bonferroni = p.adjust(stats_un_t2$p, method = "bonferroni") + +rm(n_t2) +rm(lf_t2_comp) + +#============== +# unpaired: t3 +#============== +lf_t3 = lf[lf$timepoint == "t3",] +lf_t3_comp = lf_t3[-which(is.na(lf_t3$value)),] + +stats_un_t3 = compare_means(value~obesity + , group.by = "mediator" + #, data = lf_t3 + , data = lf_t3_comp + , paired = FALSE + , p.adjust.method = my_adjust_method) + +# add timepoint and convert to df +stats_un_t3$timepoint = "t3" +stats_un_t3 = as.data.frame(stats_un_t3) +class(stats_un_t3) + +#---------------------------------------- +# calculate n_obs for each mediator: t3 +#---------------------------------------- +#n_t3 = data.frame(table(lf_t3_comp$mediator)) +n_t3_all = data.frame(table(lf_t3$mediator)) +colnames(n_t3_all) = c("mediator", "n_obs") +n_t3_all$mediator = as.character(n_t3_all$mediator) + +n_t3_comp = data.frame(table(lf_t3_comp$mediator)) +colnames(n_t3_comp) = c("mediator", "n_obs_complete") +n_t3_comp$mediator = as.character(n_t3_comp$mediator) + +merge_cols = intersect(names(n_t3_all), names(n_t3_comp)); merge_cols +n_t3= merge(n_t3_all, n_t3_comp, by = merge_cols, all = T) + +#================================== +# Merge: merge stats + n_obs df +#================================== +merging_cols = intersect(names(stats_un_t3), names(n_t3)); merging_cols +if (all(n_t3$mediator%in%stats_un_t3$mediator)) { + cat("PASS: merging stats and n_obs on column/s:", merging_cols) + stats_un_t3 = merge(stats_un_t3, n_t3, by = merging_cols, all = T) + cat("\nsuccessfull merge:" + , "\nnrow:", nrow(stats_un_t3) + , "\nncol:", ncol(stats_un_t3)) +}else{ + nf = n_t3$mediator[!n_t3$mediator%in%stats_un_t3$mediator] + stats_un_t3 = merge(stats_un_t3, n_t3, by = merging_cols, all.y = T) + cat("\nMerged with caution:" + , "\nnrows mismatch:", nf + , "not found in stats possibly due to all obs being LLODs" + , "\nintroduced NAs for:", nf + , "\nnrow:", nrow(stats_un_t3) + , "\nncol:", ncol(stats_un_t3)) +} + +# add bonferroni adjustment as well +stats_un_t3$p_adj_bonferroni = p.adjust(stats_un_t3$p, method = "bonferroni") + +rm(n_t3) +rm(lf_t3_comp) + +######################################################################## +#============== +# Rbind these dfs +#============== +str(stats_un_t1);str(stats_un_t2); str(stats_un_t3) + +n_dfs = 3 + +if ( all.equal(nrow(stats_un_t1), nrow(stats_un_t2), nrow(stats_un_t3)) && + all.equal(ncol(stats_un_t1), ncol(stats_un_t2), ncol(stats_un_t3)) ) { + expected_rows = nrow(stats_un_t1) * n_dfs + expected_cols = ncol(stats_un_t1) + print("PASS: expected_rows and cols variables generated for downstream sanity checks") +}else{ + cat("FAIL: dfs have different no. of rows and cols" + , "\nCheck harcoded value of n_dfs" + , "\nexpected_rows and cols could not be generated") + quit() +} + +if ( all.equal(colnames(stats_un_t1), colnames(stats_un_t2), colnames(stats_un_t3)) ){ + print("PASS: colnames match. Rbind the 3 dfs...") + combined_unpaired_stats = rbind(stats_un_t1, stats_un_t2, stats_un_t3) +} else{ + cat("FAIL: cannot combined dfs. Colnames don't match!") + quit() +} + +if ( nrow(combined_unpaired_stats) == expected_rows && ncol(combined_unpaired_stats) == expected_cols ){ + cat("PASS: combined_df has expected dimension" + , "\nNo. of rows in combined_df:", nrow(combined_unpaired_stats) + , "\nNo. of cols in combined_df:", ncol(combined_unpaired_stats) ) +}else{ + cat("FAIL: combined_df dimension mismatch") + quit() +} + +####################################################################### +#================= +# formatting df +#================= +# delete: unnecessary column +combined_unpaired_stats = subset(combined_unpaired_stats, select = -c(.y.)) + +# add sample_type +cat("Adding sample type info as a column", my_sample_type, "...") +combined_unpaired_stats$sample_type = my_sample_type + +# add: reflect stats method correctly i.e paired or unpaired +# incase there are NA due to LLODs, the gsub won't work! +#combined_unpaired_stats$method = gsub("Wilcoxon", "Wilcoxon_unpaired", combined_unpaired_stats$method) +combined_unpaired_stats$method = "wilcoxon unpaired" +combined_unpaired_stats$method + +# add an extra column for padjust_signif: my_adjust_method +combined_unpaired_stats$padjust_signif = combined_unpaired_stats$p.adj +# add appropriate symbols for padjust_signif: my_adjust_method +combined_unpaired_stats = dplyr::mutate(combined_unpaired_stats, padjust_signif = case_when(padjust_signif == 0.05 ~ "." + , padjust_signif <=0.0001 ~ '****' + , padjust_signif <=0.001 ~ '***' + , padjust_signif <=0.01 ~ '**' + , padjust_signif <0.05 ~ '*' + , TRUE ~ 'ns')) +# add an extra column for p_bon_signif +combined_unpaired_stats$p_bon_signif = combined_unpaired_stats$p_adj_bonferroni +# add appropriate symbols for p_bon_signif +combined_unpaired_stats = dplyr::mutate(combined_unpaired_stats, p_bon_signif = case_when(p_bon_signif == 0.05 ~ "." + , p_bon_signif <=0.0001 ~ '****' + , p_bon_signif <=0.001 ~ '***' + , p_bon_signif <=0.01 ~ '**' + , p_bon_signif <0.05 ~ '*' + , TRUE ~ 'ns')) +# reorder columns +print("preparing to reorder columns...") +colnames(combined_unpaired_stats) +my_col_order2 = c("mediator" + , "timepoint" + , "sample_type" + , "n_obs" + , "n_obs_complete" + , "group1" + , "group2" + , "method" + , "p" + , "p.format" + , "p.signif" + , "p.adj" + , "padjust_signif" + , "p_adj_bonferroni" + , "p_bon_signif") + +if( length(my_col_order2) == ncol(combined_unpaired_stats) && (all(my_col_order2%in%colnames(combined_unpaired_stats))) ){ + print("PASS: Reordering columns...") + combined_unpaired_stats_f = combined_unpaired_stats[, my_col_order2] + print("Successful: column reordering") + print("formatted df called:'combined_unpaired_stats_f'") + cat('\nformatted df has the following dimensions\n') + print(dim(combined_unpaired_stats_f )) +} else{ + cat(paste0("FAIL:Cannot reorder columns, length mismatch" + , "\nExpected column order for: ", ncol(combined_unpaired_stats) + , "\nGot:", length(my_col_order2))) + quit() +} +# assign nice column names like replace "." with "_" +colnames(combined_unpaired_stats_f) = c("mediator" + , "timepoint" + , "sample_type" + , "n_obs" + , "n_obs_complete" + , "group1" + , "group2" + , "method" + , "p" + , "p_format" + , "p_signif" + , paste0("p_adj_fdr_", my_adjust_method) + , paste0("p_", my_adjust_method, "_signif") + , "p_adj_bonferroni" + , "p_bon_signif") + +colnames(combined_unpaired_stats_f) + +#--------------- +# quick summary +#--------------- +# count how many meds are significant +n_sig = length(combined_unpaired_stats_f$mediator[combined_unpaired_stats_f$p_signif<0.05 & !is.na(combined_unpaired_stats_f$p_signif<0.05)]) +sig_meds = combined_unpaired_stats_f[(combined_unpaired_stats_f$p_signif<0.05 & !is.na(combined_unpaired_stats_f$p_signif<0.05)),] + +sig_meds$med_time = paste0(sig_meds$mediator, "@", sig_meds$timepoint) + +cat("\nTotal no. of statistically significant mediators in", toupper(my_sample_type) + , "are:", n_sig + , "\nThese are:", sig_meds$med_time) + +######################################################################## +#****************** +# write output file +#****************** +cat("\nUNpaired stats for groups will be:", flu_stats_time_unpaired_na) +#write.csv(combined_unpaired_stats_f, flu_stats_time_unpaired_na, row.names = FALSE) diff --git a/flu_stats_unpaired_na_sam.R b/flu_stats_unpaired_na_sam.R new file mode 100755 index 0000000..b90ac0d --- /dev/null +++ b/flu_stats_unpaired_na_sam.R @@ -0,0 +1,383 @@ +#!/usr/bin/Rscript +getwd() +setwd("~/git/mosaic_2020/") +getwd() +############################################################ +# TASK: unpaired (time) analysis of mediators: +# sample type: SAM +# data: Flu positive adult patients +# group: obesity +############################################################ +my_sample_type = "sam" + +#============= +# Input +#============= +source("data_extraction_formatting_non_asthmatics.R") + +# check: adult variable and age variable discrepancy! +metadata_all$mosaic[metadata_all$adult==1 & metadata_all$age<=18] + +#============= +# Output +#============= +outfile_name = paste0("flu_stats_time_unpaired_NA_", my_sample_type, ".csv") +flu_stats_time_unpaired_na = paste0(outdir_stats, outfile_name) + +#=============================== +# data assignment for stats +#================================ +wf = sam_wf[sam_wf$flustat == 1,] +lf = sam_lf[sam_lf$flustat == 1,] +lf$timepoint = paste0("t", lf$timepoint) +lf = lf[!lf$mediator == "vitd",] +######################################################################## +# clear variables +rm(npa_lf, npa_wf +, serum_lf, serum_wf) +rm(colnames_npa_df, expected_rows_npa_lf +, colnames_serum_df, expected_rows_serum_lf) + +rm(pivot_cols) + +# sanity checks +table(lf$timepoint) +length(unique(lf$mosaic)) + +######################################################################## +# Unpaired stats at each timepoint b/w groups: wilcoxon UNpaired analysis +# with correction +####################################################################### +# with adjustment: fdr and BH are identical +my_adjust_method = "BH" + +#============== +# unpaired: t1 +#============== +lf_t1 = lf[lf$timepoint == "t1",] +sum(is.na(lf_t1$value)) + +foo = lf_t1[which(is.na(lf_t1$value)),] +#ci = which(is.na(lf_t1$value)) +#lf_t1_comp = lf_t1[-ci,] + +lf_t1_comp = lf_t1[-which(is.na(lf_t1$value)),] +stats_un_t1 = compare_means(value~obesity + , group.by = "mediator" + #, data = lf_t1 + , data = lf_t1_comp + , paired = FALSE + , p.adjust.method = my_adjust_method) + +foo$mosaic[!unique(foo$mosaic)%in%unique(lf_t1_comp$mosaic)] + +# add timepoint and convert to df +stats_un_t1$timepoint = "t1" +stats_un_t1 = as.data.frame(stats_un_t1) +class(stats_un_t1) + +#---------------------------------------- +# calculate n_obs for each mediator: t1 +#---------------------------------------- +#n_t1 = data.frame(table(lf_t1_comp$mediator)) +n_t1_all = data.frame(table(lf_t1$mediator)) +colnames(n_t1_all) = c("mediator", "n_obs") +n_t1_all$mediator = as.character(n_t1_all$mediator) + +n_t1_comp = data.frame(table(lf_t1_comp$mediator)) +colnames(n_t1_comp) = c("mediator", "n_obs_complete") +n_t1_comp$mediator = as.character(n_t1_comp$mediator) + +merge_cols = intersect(names(n_t1_all), names(n_t1_comp)); merge_cols +n_t1= merge(n_t1_all, n_t1_comp, by = merge_cols, all = T) + +#================================== +# Merge: merge stats + n_obs df +#=================================== +merging_cols = intersect(names(stats_un_t1), names(n_t1)); merging_cols +if (all(n_t1$mediator%in%stats_un_t1$mediator)) { + cat("PASS: merging stats and n_obs on column/s:", merging_cols) + stats_un_t1 = merge(stats_un_t1, n_t1, by = merging_cols, all = T) + cat("\nsuccessfull merge:" + , "\nnrow:", nrow(stats_un_t1) + , "\nncol:", ncol(stats_un_t1)) +}else{ + nf = n_t1$mediator[!n_t1$mediator%in%stats_un_t1$mediator] + stats_un_t1 = merge(stats_un_t1, n_t1, by = merging_cols, all.y = T) + cat("\nMerged with caution:" + , "\nnrows mismatch:", nf + , "not found in stats possibly due to all obs being LLODs" + , "\nintroduced NAs for:", nf + , "\nnrow:", nrow(stats_un_t1) + , "\nncol:", ncol(stats_un_t1)) +} + +# add bonferroni adjustment as well +stats_un_t1$p_adj_bonferroni = p.adjust(stats_un_t1$p, method = "bonferroni") + +rm(n_t1) +rm(lf_t1_comp) + +#============== +# unpaired: t2 +#============== +lf_t2 = lf[lf$timepoint == "t2",] +lf_t2_comp = lf_t2[-which(is.na(lf_t2$value)),] + +stats_un_t2 = compare_means(value~obesity + , group.by = "mediator" + #, data = lf_t2 + , data = lf_t2_comp + , paired = FALSE + , p.adjust.method = my_adjust_method) +# add timepoint and convert to df +stats_un_t2$timepoint = "t2" +stats_un_t2 = as.data.frame(stats_un_t2) +class(stats_un_t2) + +#---------------------------------------- +# calculate n_obs for each mediator: t2 +#---------------------------------------- +#n_t2 = data.frame(table(lf_t2_comp$mediator)) +n_t2_all = data.frame(table(lf_t2$mediator)) +colnames(n_t2_all) = c("mediator", "n_obs") +n_t2_all$mediator = as.character(n_t2_all$mediator) + +n_t2_comp = data.frame(table(lf_t2_comp$mediator)) +colnames(n_t2_comp) = c("mediator", "n_obs_complete") +n_t2_comp$mediator = as.character(n_t2_comp$mediator) + +merge_cols = intersect(names(n_t2_all), names(n_t2_comp)); merge_cols +n_t2= merge(n_t2_all, n_t2_comp, by = merge_cols, all = T) + +#================================== +# Merge: merge stats + n_obs df +#=================================== +merging_cols = intersect(names(stats_un_t2), names(n_t2)); merging_cols +if (all(n_t2$mediator%in%stats_un_t2$mediator)) { + cat("PASS: merging stats and n_obs on column/s:", merging_cols) + stats_un_t2 = merge(stats_un_t2, n_t2, by = merging_cols, all = T) + cat("\nsuccessfull merge:" + , "\nnrow:", nrow(stats_un_t2) + , "\nncol:", ncol(stats_un_t2)) +}else{ + nf = n_t2$mediator[!n_t2$mediator%in%stats_un_t2$mediator] + stats_un_t2 = merge(stats_un_t2, n_t2, by = merging_cols, all.y = T) + cat("\nMerged with caution:" + , "\nnrows mismatch:", nf + , "not found in stats possibly due to all obs being LLODs" + , "\nintroduced NAs for:", nf + , "\nnrow:", nrow(stats_un_t2) + , "\nncol:", ncol(stats_un_t2)) +} + +# add bonferroni adjustment as well +stats_un_t2$p_adj_bonferroni = p.adjust(stats_un_t2$p, method = "bonferroni") + +rm(n_t2) +rm(lf_t2_comp) + +#============== +# unpaired: t3 +#============== +lf_t3 = lf[lf$timepoint == "t3",] +lf_t3_comp = lf_t3[-which(is.na(lf_t3$value)),] + +stats_un_t3 = compare_means(value~obesity + , group.by = "mediator" + #, data = lf_t3 + , data = lf_t3_comp + , paired = FALSE + , p.adjust.method = my_adjust_method) +# add timepoint and convert to df +stats_un_t3$timepoint = "t3" +stats_un_t3 = as.data.frame(stats_un_t3) +class(stats_un_t3) + +#---------------------------------------- +# calculate n_obs for each mediator: t3 +#---------------------------------------- +#n_t3 = data.frame(table(lf_t3_comp$mediator)) +n_t3_all = data.frame(table(lf_t3$mediator)) +colnames(n_t3_all) = c("mediator", "n_obs") +n_t3_all$mediator = as.character(n_t3_all$mediator) + +n_t3_comp = data.frame(table(lf_t3_comp$mediator)) +colnames(n_t3_comp) = c("mediator", "n_obs_complete") +n_t3_comp$mediator = as.character(n_t3_comp$mediator) + +merge_cols = intersect(names(n_t3_all), names(n_t3_comp)); merge_cols +n_t3= merge(n_t3_all, n_t3_comp, by = merge_cols, all = T) + +#================================== +# Merge: merge stats + n_obs df +#================================== +merging_cols = intersect(names(stats_un_t3), names(n_t3)); merging_cols +if (all(n_t3$mediator%in%stats_un_t3$mediator)) { + cat("PASS: merging stats and n_obs on column/s:", merging_cols) + stats_un_t3 = merge(stats_un_t3, n_t3, by = merging_cols, all = T) + cat("\nsuccessfull merge:" + , "\nnrow:", nrow(stats_un_t3) + , "\nncol:", ncol(stats_un_t3)) +}else{ + nf = n_t3$mediator[!n_t3$mediator%in%stats_un_t3$mediator] + stats_un_t3 = merge(stats_un_t3, n_t3, by = merging_cols, all.y = T) + cat("\nMerged with caution:" + , "\nnrows mismatch:", nf + , "not found in stats possibly due to all obs being LLODs" + , "\nintroduced NAs for:", nf + , "\nnrow:", nrow(stats_un_t3) + , "\nncol:", ncol(stats_un_t3)) +} + +# check: satisfied!!!! +# FIXME: supply the col name automatically? +wilcox.test(wf$ifna2a_sam3[wf$obesity == 1], wf$ifna2a_sam3[wf$obesity == 0]) + +# add bonferroni adjustment as well +stats_un_t3$p_adj_bonferroni = p.adjust(stats_un_t3$p, method = "bonferroni") + +rm(n_t3) +rm(lf_t3_comp) +######################################################################## +#================= +# Rbind these dfs +#================= +str(stats_un_t1);str(stats_un_t2); str(stats_un_t3) + +n_dfs = 3 + +if ( all.equal(nrow(stats_un_t1), nrow(stats_un_t2), nrow(stats_un_t3)) && + all.equal(ncol(stats_un_t1), ncol(stats_un_t2), ncol(stats_un_t3)) ) { + expected_rows = nrow(stats_un_t1) * n_dfs + expected_cols = ncol(stats_un_t1) + print("PASS: expected_rows and cols variables generated for downstream sanity checks") +}else{ + cat("FAIL: dfs have different no. of rows and cols" + , "\nCheck harcoded value of n_dfs" + , "\nexpected_rows and cols could not be generated") + quit() +} + +if ( all.equal(colnames(stats_un_t1), colnames(stats_un_t2), colnames(stats_un_t3)) ){ + print("PASS: colnames match. Rbind the 3 dfs...") + combined_unpaired_stats = rbind(stats_un_t1, stats_un_t2, stats_un_t3) +} else{ + cat("FAIL: cannot combined dfs. Colnames don't match!") + quit() +} + +if ( nrow(combined_unpaired_stats) == expected_rows && ncol(combined_unpaired_stats) == expected_cols ){ + cat("PASS: combined_df has expected dimension" + , "\nNo. of rows in combined_df:", nrow(combined_unpaired_stats) + , "\nNo. of cols in combined_df:", ncol(combined_unpaired_stats) ) +}else{ + cat("FAIL: combined_df dimension mismatch") + quit() +} + +####################################################################### +#================= +# formatting df +#================= +# delete: unnecessary column +combined_unpaired_stats = subset(combined_unpaired_stats, select = -c(.y.)) + +# add sample_type +cat("Adding sample type info as a column", my_sample_type, "...") +combined_unpaired_stats$sample_type = my_sample_type + +# add: reflect stats method correctly i.e paired or unpaired +# incase there are NA due to LLODs, the gsub won't work! +#combined_unpaired_stats$method = gsub("Wilcoxon", "Wilcoxon_unpaired", combined_unpaired_stats$method) +combined_unpaired_stats$method = "wilcoxon unpaired" +combined_unpaired_stats$method + +# add an extra column for padjust_signif: my_adjust_method +combined_unpaired_stats$padjust_signif = combined_unpaired_stats$p.adj +# add appropriate symbols for padjust_signif: my_adjust_method +combined_unpaired_stats = dplyr::mutate(combined_unpaired_stats, padjust_signif = case_when(padjust_signif == 0.05 ~ "." + , padjust_signif <=0.0001 ~ '****' + , padjust_signif <=0.001 ~ '***' + , padjust_signif <=0.01 ~ '**' + , padjust_signif <0.05 ~ '*' + , TRUE ~ 'ns')) +# add an extra column for p_bon_signif +combined_unpaired_stats$p_bon_signif = combined_unpaired_stats$p_adj_bonferroni +# add appropriate symbols for p_bon_signif +combined_unpaired_stats = dplyr::mutate(combined_unpaired_stats, p_bon_signif = case_when(p_bon_signif == 0.05 ~ "." + , p_bon_signif <=0.0001 ~ '****' + , p_bon_signif <=0.001 ~ '***' + , p_bon_signif <=0.01 ~ '**' + , p_bon_signif <0.05 ~ '*' + , TRUE ~ 'ns')) +# reorder columns +print("preparing to reorder columns...") +colnames(combined_unpaired_stats) +my_col_order2 = c("mediator" + , "timepoint" + , "sample_type" + , "n_obs" + , "n_obs_complete" + , "group1" + , "group2" + , "method" + , "p" + , "p.format" + , "p.signif" + , "p.adj" + , "padjust_signif" + , "p_adj_bonferroni" + , "p_bon_signif") + +if( length(my_col_order2) == ncol(combined_unpaired_stats) && (all(my_col_order2%in%colnames(combined_unpaired_stats))) ){ + print("PASS: Reordering columns...") + combined_unpaired_stats_f = combined_unpaired_stats[, my_col_order2] + print("Successful: column reordering") + print("formatted df called:'combined_unpaired_stats_f'") + cat('\nformatted df has the following dimensions\n') + print(dim(combined_unpaired_stats_f )) +} else{ + cat(paste0("FAIL:Cannot reorder columns, length mismatch" + , "\nExpected column order for: ", ncol(combined_unpaired_stats) + , "\nGot:", length(my_col_order2))) + quit() +} +# assign nice column names like replace "." with "_" +colnames(combined_unpaired_stats_f) = c("mediator" + , "timepoint" + , "sample_type" + , "n_obs" + , "n_obs_complete" + , "group1" + , "group2" + , "method" + , "p" + , "p_format" + , "p_signif" + , paste0("p_adj_fdr_", my_adjust_method) + , paste0("p_", my_adjust_method, "_signif") + , "p_adj_bonferroni" + , "p_bon_signif") + +colnames(combined_unpaired_stats_f) + +#--------------- +# quick summary +#--------------- +# count how many meds are significant +n_sig = length(combined_unpaired_stats_f$mediator[combined_unpaired_stats_f$p_signif<0.05 & !is.na(combined_unpaired_stats_f$p_signif<0.05)]) +sig_meds = combined_unpaired_stats_f[(combined_unpaired_stats_f$p_signif<0.05 & !is.na(combined_unpaired_stats_f$p_signif<0.05)),] + +sig_meds$med_time = paste0(sig_meds$mediator, "@", sig_meds$timepoint) + +cat("\nTotal no. of statistically significant mediators in", toupper(my_sample_type) + , "are:", n_sig + , "\nThese are:", sig_meds$med_time) + +####################################################################### +#****************** +# write output file +#****************** +cat("\nUNpaired stats for groups will be:", flu_stats_time_unpaired_na) +#write.csv(combined_unpaired_stats_f, flu_stats_time_unpaired_na, row.names = FALSE) diff --git a/flu_stats_unpaired_na_serum.R b/flu_stats_unpaired_na_serum.R new file mode 100755 index 0000000..97649c3 --- /dev/null +++ b/flu_stats_unpaired_na_serum.R @@ -0,0 +1,376 @@ +#!/usr/bin/Rscript +getwd() +setwd("~/git/mosaic_2020/") +getwd() +############################################################ +# TASK: unpaired (time) analysis of mediators: serum +############################################################ +my_sample_type = "serum" + +#============= +# Input +#============= +source("data_extraction_formatting_non_asthmatics.R") + +# check: adult variable and age variable discrepancy! +metadata_all$mosaic[metadata_all$adult==1 & metadata_all$age<=18] + +#============= +# Output +#============= +outfile_name = paste0("flu_stats_time_unpaired_NA_", my_sample_type, ".csv") +flu_stats_time_unpaired_na = paste0(outdir_stats, outfile_name) + +#=============================== +# data assignment for stats +#================================ +wf = serum_wf[serum_wf$flustat == 1,] +lf = serum_lf[serum_lf$flustat == 1,] +lf$timepoint = paste0("t", lf$timepoint) + +######################################################################## +# clear variables +rm(sam_lf, sam_wf +, npa_lf, npa_wf) +rm(colnames_sam_df, expected_rows_sam_lf +, colnames_npa_df, expected_rows_npa_lf) + +rm(pivot_cols) + +# sanity checks +table(lf$timepoint) + +######################################################################## +# Unpaired stats at each timepoint b/w groups: wilcoxon UNpaired analysis +# with correction +####################################################################### +# with adjustment: fdr and BH are identical +my_adjust_method = "BH" + +#============== +# unpaired: t1 +#============== +lf_t1 = lf[lf$timepoint == "t1",] +sum(is.na(lf_t1$value)) + +foo = lf_t1[which(is.na(lf_t1$value)),] +ci = which(is.na(lf_t1$value)) +#lf_t1_comp = lf_t1[-ci,] + +lf_t1_comp = lf_t1[-which(is.na(lf_t1$value)),] +stats_un_t1 = compare_means(value~obesity + , group.by = "mediator" + #, data = lf_t1 + , data = lf_t1_comp + , paired = FALSE + , p.adjust.method = my_adjust_method) + +foo$mosaic[!unique(foo$mosaic)%in%unique(lf_t1_comp$mosaic)] + +# add timepoint and convert to df +stats_un_t1$timepoint = "t1" +stats_un_t1 = as.data.frame(stats_un_t1) +class(stats_un_t1) + +#---------------------------------------- +# calculate n_obs for each mediator: t1 +#---------------------------------------- +#n_t1 = data.frame(table(lf_t1_comp$mediator)) +n_t1_all = data.frame(table(lf_t1$mediator)) +colnames(n_t1_all) = c("mediator", "n_obs") +n_t1_all$mediator = as.character(n_t1_all$mediator) + +n_t1_comp = data.frame(table(lf_t1_comp$mediator)) +colnames(n_t1_comp) = c("mediator", "n_obs_complete") +n_t1_comp$mediator = as.character(n_t1_comp$mediator) + +merge_cols = intersect(names(n_t1_all), names(n_t1_comp)); merge_cols +n_t1= merge(n_t1_all, n_t1_comp, by = merge_cols, all = T) + +#================================== +# Merge: merge stats + n_obs df +#================================== +merging_cols = intersect(names(stats_un_t1), names(n_t1)); merging_cols +if (all(n_t1$mediator%in%stats_un_t1$mediator)) { + cat("PASS: merging stats and n_obs on column/s:", merging_cols) + stats_un_t1 = merge(stats_un_t1, n_t1, by = merging_cols, all = T) + cat("\nsuccessfull merge:" + , "\nnrow:", nrow(stats_un_t1) + , "\nncol:", ncol(stats_un_t1)) +}else{ + nf = n_t1$mediator[!n_t1$mediator%in%stats_un_t1$mediator] + stats_un_t1 = merge(stats_un_t1, n_t1, by = merging_cols, all.y = T) + cat("\nMerged with caution:" + , "\nnrows mismatch:", nf + , "not found in stats possibly due to all obs being LLODs" + , "\nintroduced NAs for:", nf + , "\nnrow:", nrow(stats_un_t1) + , "\nncol:", ncol(stats_un_t1)) +} + +# add bonferroni adjustment as well +stats_un_t1$p_adj_bonferroni = p.adjust(stats_un_t1$p, method = "bonferroni") + +rm(n_t1) +rm(lf_t1_comp) + +#============== +# unpaired: t2 +#============== +lf_t2 = lf[lf$timepoint == "t2",] +lf_t2_comp = lf_t2[-which(is.na(lf_t2$value)),] + +stats_un_t2 = compare_means(value~obesity + , group.by = "mediator" + #, data = lf_t2 + , data = lf_t2_comp + , paired = FALSE + , p.adjust.method = my_adjust_method) +# add timepoint and convert to df +stats_un_t2$timepoint = "t2" +stats_un_t2 = as.data.frame(stats_un_t2) +class(stats_un_t2) + +#---------------------------------------- +# calculate n_obs for each mediator: t2 +#---------------------------------------- +#n_t2 = data.frame(table(lf_t2_comp$mediator)) +n_t2_all = data.frame(table(lf_t2$mediator)) +colnames(n_t2_all) = c("mediator", "n_obs") +n_t2_all$mediator = as.character(n_t2_all$mediator) + +n_t2_comp = data.frame(table(lf_t2_comp$mediator)) +colnames(n_t2_comp) = c("mediator", "n_obs_complete") +n_t2_comp$mediator = as.character(n_t2_comp$mediator) + +merge_cols = intersect(names(n_t2_all), names(n_t2_comp)); merge_cols +n_t2= merge(n_t2_all, n_t2_comp, by = merge_cols, all = T) + +#================================== +# Merge: merge stats + n_obs df +#================================== +merging_cols = intersect(names(stats_un_t2), names(n_t2)); merging_cols +if (all(n_t2$mediator%in%stats_un_t2$mediator)) { + cat("PASS: merging stats and n_obs on column/s:", merging_cols) + stats_un_t2 = merge(stats_un_t2, n_t2, by = merging_cols, all = T) + cat("\nsuccessfull merge:" + , "\nnrow:", nrow(stats_un_t2) + , "\nncol:", ncol(stats_un_t2)) +}else{ + nf = n_t2$mediator[!n_t2$mediator%in%stats_un_t2$mediator] + stats_un_t2 = merge(stats_un_t2, n_t2, by = merging_cols, all.y = T) + cat("\nMerged with caution:" + , "\nnrows mismatch:", nf + , "not found in stats possibly due to all obs being LLODs" + , "\nintroduced NAs for:", nf + , "\nnrow:", nrow(stats_un_t2) + , "\nncol:", ncol(stats_un_t2)) +} + +# add bonferroni adjustment as well +stats_un_t2$p_adj_bonferroni = p.adjust(stats_un_t2$p, method = "bonferroni") + +rm(n_t2) +rm(lf_t2_comp) + +#============== +# unpaired: t3 +#============== +lf_t3 = lf[lf$timepoint == "t3",] +lf_t3_comp = lf_t3[-which(is.na(lf_t3$value)),] + +stats_un_t3 = compare_means(value~obesity + , group.by = "mediator" + #, data = lf_t3 + , data = lf_t3_comp + , paired = FALSE + , p.adjust.method = my_adjust_method) + +# add timepoint and convert to df +stats_un_t3$timepoint = "t3" +stats_un_t3 = as.data.frame(stats_un_t3) +class(stats_un_t3) + +#---------------------------------------- +# calculate n_obs for each mediator: t3 +#---------------------------------------- +#n_t3 = data.frame(table(lf_t3_comp$mediator)) +n_t3_all = data.frame(table(lf_t3$mediator)) +colnames(n_t3_all) = c("mediator", "n_obs") +n_t3_all$mediator = as.character(n_t3_all$mediator) + +n_t3_comp = data.frame(table(lf_t3_comp$mediator)) +colnames(n_t3_comp) = c("mediator", "n_obs_complete") +n_t3_comp$mediator = as.character(n_t3_comp$mediator) + +merge_cols = intersect(names(n_t3_all), names(n_t3_comp)); merge_cols +n_t3 = merge(n_t3_all, n_t3_comp, by = merge_cols, all = T) + +#================================== +# Merge: merge stats + n_obs df +#================================== +merging_cols = intersect(names(stats_un_t3), names(n_t3)); merging_cols +if (all(n_t3$mediator%in%stats_un_t3$mediator)) { + cat("PASS: merging stats and n_obs on column/s:", merging_cols) + stats_un_t3 = merge(stats_un_t3, n_t3, by = merging_cols, all = T) + cat("\nsuccessfull merge:" + , "\nnrow:", nrow(stats_un_t3) + , "\nncol:", ncol(stats_un_t3)) +}else{ + nf = n_t3$mediator[!n_t3$mediator%in%stats_un_t3$mediator] + stats_un_t3 = merge(stats_un_t3, n_t3, by = merging_cols, all.y = T) + cat("\nMerged with caution:" + , "\nnrows mismatch:", nf + , "not found in stats possibly due to all obs being LLODs" + , "\nintroduced NAs for:", nf + , "\nnrow:", nrow(stats_un_t3) + , "\nncol:", ncol(stats_un_t3)) +} + +# add bonferroni adjustment as well +stats_un_t3$p_adj_bonferroni = p.adjust(stats_un_t3$p, method = "bonferroni") + +rm(n_t3) +rm(lf_t3_comp) +######################################################################## +#============== +# Rbind these dfs +#============== +str(stats_un_t1);str(stats_un_t2); str(stats_un_t3) + +n_dfs = 3 + +if ( all.equal(nrow(stats_un_t1), nrow(stats_un_t2), nrow(stats_un_t3)) && + all.equal(ncol(stats_un_t1), ncol(stats_un_t2), ncol(stats_un_t3)) ) { + expected_rows = nrow(stats_un_t1) * n_dfs + expected_cols = ncol(stats_un_t1) + print("PASS: expected_rows and cols variables generated for downstream sanity checks") +}else{ + cat("FAIL: dfs have different no. of rows and cols" + , "\nCheck harcoded value of n_dfs" + , "\nexpected_rows and cols could not be generated") + quit() +} + +if ( all.equal(colnames(stats_un_t1), colnames(stats_un_t2), colnames(stats_un_t3)) ){ + print("PASS: colnames match. Rbind the 3 dfs...") + combined_unpaired_stats = rbind(stats_un_t1, stats_un_t2, stats_un_t3) +} else{ + cat("FAIL: cannot combined dfs. Colnames don't match!") + quit() +} + +if ( nrow(combined_unpaired_stats) == expected_rows && ncol(combined_unpaired_stats) == expected_cols ){ + cat("PASS: combined_df has expected dimension" + , "\nNo. of rows in combined_df:", nrow(combined_unpaired_stats) + , "\nNo. of cols in combined_df:", ncol(combined_unpaired_stats) ) +}else{ + cat("FAIL: combined_df dimension mismatch") + quit() +} + +####################################################################### +#================= +# formatting df +#================= +# delete: unnecessary column +combined_unpaired_stats = subset(combined_unpaired_stats, select = -c(.y.)) + +# add sample_type +cat("Adding sample type info as a column", my_sample_type, "...") +combined_unpaired_stats$sample_type = my_sample_type + +# add: reflect stats method correctly i.e paired or unpaired +# incase there are NA due to LLODs, the gsub won't work! +#combined_unpaired_stats$method = gsub("Wilcoxon", "Wilcoxon_unpaired", combined_unpaired_stats$method) +combined_unpaired_stats$method = "wilcoxon unpaired" +combined_unpaired_stats$method + +# add an extra column for padjust_signif: my_adjust_method +combined_unpaired_stats$padjust_signif = combined_unpaired_stats$p.adj +# add appropriate symbols for padjust_signif: my_adjust_method +combined_unpaired_stats = dplyr::mutate(combined_unpaired_stats, padjust_signif = case_when(padjust_signif == 0.05 ~ "." + , padjust_signif <=0.0001 ~ '****' + , padjust_signif <=0.001 ~ '***' + , padjust_signif <=0.01 ~ '**' + , padjust_signif <0.05 ~ '*' + , TRUE ~ 'ns')) +# add an extra column for p_bon_signif +combined_unpaired_stats$p_bon_signif = combined_unpaired_stats$p_adj_bonferroni +# add appropriate symbols for p_bon_signif +combined_unpaired_stats = dplyr::mutate(combined_unpaired_stats, p_bon_signif = case_when(p_bon_signif == 0.05 ~ "." + , p_bon_signif <=0.0001 ~ '****' + , p_bon_signif <=0.001 ~ '***' + , p_bon_signif <=0.01 ~ '**' + , p_bon_signif <0.05 ~ '*' + , TRUE ~ 'ns')) +# reorder columns +print("preparing to reorder columns...") +colnames(combined_unpaired_stats) +my_col_order2 = c("mediator" + , "timepoint" + , "sample_type" + , "n_obs" + , "n_obs_complete" + , "group1" + , "group2" + , "method" + , "p" + , "p.format" + , "p.signif" + , "p.adj" + , "padjust_signif" + , "p_adj_bonferroni" + , "p_bon_signif") + +if( length(my_col_order2) == ncol(combined_unpaired_stats) && (all(my_col_order2%in%colnames(combined_unpaired_stats))) ){ + print("PASS: Reordering columns...") + combined_unpaired_stats_f = combined_unpaired_stats[, my_col_order2] + print("Successful: column reordering") + print("formatted df called:'combined_unpaired_stats_f'") + cat('\nformatted df has the following dimensions\n') + print(dim(combined_unpaired_stats_f )) +} else{ + cat(paste0("FAIL:Cannot reorder columns, length mismatch" + , "\nExpected column order for: ", ncol(combined_unpaired_stats) + , "\nGot:", length(my_col_order2))) + quit() +} +# assign nice column names like replace "." with "_" +colnames(combined_unpaired_stats_f) = c("mediator" + , "timepoint" + , "sample_type" + , "n_obs" + , "n_obs_complete" + , "group1" + , "group2" + , "method" + , "p" + , "p_format" + , "p_signif" + , paste0("p_adj_fdr_", my_adjust_method) + , paste0("p_", my_adjust_method, "_signif") + , "p_adj_bonferroni" + , "p_bon_signif") + +colnames(combined_unpaired_stats_f) +#--------------- +# quick summary +#--------------- +# count how many meds are significant +n_sig = length(combined_unpaired_stats_f$mediator[combined_unpaired_stats_f$p_signif<0.05 & !is.na(combined_unpaired_stats_f$p_signif<0.05)]) +sig_meds = combined_unpaired_stats_f[(combined_unpaired_stats_f$p_signif<0.05 & !is.na(combined_unpaired_stats_f$p_signif<0.05)),] + +sig_meds$med_time = paste0(sig_meds$mediator, "@", sig_meds$timepoint) + +cat("\nTotal no. of statistically significant mediators in", toupper(my_sample_type) + , "are:", n_sig + , "\nThese are:", sig_meds$med_time) + + +######################################################################## +#****************** +# write output file +#****************** +cat("\nUNpaired stats for groups will be:", flu_stats_time_unpaired_na) +#write.csv(combined_unpaired_stats_f, flu_stats_time_unpaired_na, row.names = FALSE) diff --git a/logistic_regression_TEMPLATE.R b/logistic_regression_TEMPLATE.R new file mode 100644 index 0000000..d483858 --- /dev/null +++ b/logistic_regression_TEMPLATE.R @@ -0,0 +1,97 @@ +#!/usr/bin/Rscript +getwd() +setwd('~/git/mosaic_2020/') +getwd() +######################################################################## +# TASK: Run regression analysis +# npa +######################################################################## +#================================================================================= +# TO DO: +# Simple stats b/w obesity and non-obesity to consider including in reg analysis +# Include NPA statistically sign params +# Rerun graphs and plots without asthma? +#================================================================================= + +#==================== +# Input: source data +#==================== +source("data_extraction_formatting_clinical.R") + +rm(fp_adults, metadata_all) + +######################################################################## +my_data = reg_data +######################################################################### +# check factor of each column +lapply(my_data, class) + +character_vars <- lapply(my_data, class) == "character" +character_vars +table(character_vars) + +factor_vars <- lapply(my_data, class) == "factor" +table(factor_vars) + +my_data[, character_vars] <- lapply(my_data[, character_vars], as.factor) +factor_vars <- lapply(my_data, class) == "factor" +factor_vars +table(factor_vars) + +# check again +lapply(my_data, class) + +table(my_data$ethnicity) +my_data$ethnicity = as.factor(my_data$ethnicity) +class(my_data$ethnicity) + +colnames(my_data) +reg_param = c("age" + , "age_bins" + #, "death" # outcome + , "asthma" + , "obesity" + , "gender" + , "los" + , "o2_sat_admis" + #, "logistic_outcome" + #, "steroid_ics" + , "ethnicity" + , "smoking" + , "sfluv" + , "h1n1v" + , "ia_cxr" + , "max_resp_score" + , "T1_resp_score" + , "com_noasthma" + , "onset_initial_bin") + +for(i in reg_param) { + # print (i) + p_form = as.formula(paste("death ~ ", i ,sep = "")) + model_reg = glm(p_form , family = binomial, data = my_data) + print(summary(model_reg)) + print(exp(cbind(OR = coef(model_reg), confint(model_reg)))) + #print (PseudoR2(model_reg)) + cat("=================================================================================\n") +} + + +full_mod = glm(death ~ asthma + + gender + + age_bins + + los + + #ethnicity + + onset_initial_bin + + o2_sat_bin + + com_noasthma + + obesity + + #ia_cxr + + smoking + + #sfluv + + #h1n1v + max_resp_score + + T1_resp_score + + , family = "binomial", data = my_data) + +summary(full_mod)