224 lines
7.9 KiB
R
Executable file
224 lines
7.9 KiB
R
Executable file
#!/usr/bin/Rscript
|
|
# Print the launch directory, switch to the project root, then print the
# directory again to confirm the switch. The relative path given to source()
# later in this script is resolved against this working directory.
getwd()
setwd("~/git/mosaic_2020/")
getwd()
|
|
############################################################
|
|
# TASK: unpaired (time) analysis of clinical data
|
|
# data: clinical data of flu positive adult patients
|
|
# group: obesity
|
|
############################################################
|
|
#my_sample_type = "npa"
|
|
|
|
#=============
# Input
#=============
# Run the clinical data extraction/formatting script. Objects used below that
# are not created in this file (e.g. clinical_df_ics, outdir_stats) are
# presumably defined by it -- TODO confirm.
source("data_extraction_formatting_clinical.R")

#=============
# Output
#=============
# File name and full path of the CSV holding the unpaired stats. The path is
# built by plain concatenation, so outdir_stats is assumed to already end with
# a path separator -- TODO confirm.
outfile_name_clinical <- "flu_stats_clinical_unpaired.csv"
outfile_clinical_unpaired <- paste0(outdir_stats, outfile_name_clinical)
outfile_clinical_unpaired
|
|
|
|
########################################################################
|
|
# Unpaired stats for clinical data b/w groups: wilcoxon UNpaired analysis
|
|
# No correction required
|
|
########################################################################
|
|
# Print the structure of the formatted clinical data for the run log.
str(clinical_df_ics)

# Numerical clinical variables to be compared between obesity groups.
numerical_cols <- c(
  "age",
  "vl_pfu_ul_npa1",
  "los",
  "onset2final",
  "onsfindeath"
  #, "onset_2_initial" # already bin
  #, "o2_sat_admis"# already bin
)

# Columns carried through the reshape unchanged.
metadata_cols <- c("mosaic", "obesity")

clinical_df_numerical <- clinical_df_ics[, c(metadata_cols, numerical_cols)]

pivot_cols <- metadata_cols
#pivot_cols = metadata_cols[!meta_data_cols%in%cols_to_omit];pivot_cols

# Wide -> long produces one row per (input row, gathered column).
expected_rows_clinical_lf <- nrow(clinical_df_numerical) *
  (length(clinical_df_numerical) - length(pivot_cols))
expected_rows_clinical_lf

# lf data colnames
keycol <- "clinical_params"
valuecol <- "value"
gathercols <- numerical_cols

clinical_lf <- gather_(clinical_df_numerical, keycol, valuecol, gathercols)

# Sanity check on the reshape. A mismatch now stops the script instead of
# letting the downstream statistics run on malformed data.
if (nrow(clinical_lf) == expected_rows_clinical_lf) {
  cat("PASS: long format data created successfully"
      , "\nnrow:", nrow(clinical_lf)
      , "\nncol:", ncol(clinical_lf))
} else {
  stop("FAIL: long format data has an unexpected number of rows"
       , "\nExpected nrow: ", expected_rows_clinical_lf
       , "\nGot nrow: ", nrow(clinical_lf)
       , call. = FALSE)
}
|
|
|
|
#====================
# unpaired: clinical
#====================
# Number of missing values in the long-format data (printed for the log).
sum(is.na(clinical_lf$value))

# Rows with a missing value, kept for inspection.
foo <- clinical_lf[which(is.na(clinical_lf$value)), ]

# Complete cases only. A logical mask is used rather than `-which(...)`:
# when no value is missing, which() returns integer(0), and negative indexing
# with a zero-length vector selects ZERO rows, silently emptying the data.
clinical_lf_comp <- clinical_lf[!is.na(clinical_lf$value), ]
|
|
|
|
# Unpaired comparison of `value` between obesity groups, computed separately
# for each clinical parameter. The long-format data including NA rows is
# passed; the complete-case alternative is kept below, commented out.
stats_un_clinical <- compare_means(
  value ~ obesity,
  group.by = "clinical_params",
  data = clinical_lf,
  #, data = clinical_lf_comp
  paired = FALSE
)

# Preview of the resulting stats table.
head(stats_un_clinical)
|
|
|
|
# rstatix
# Same comparison through rstatix: group by clinical parameter, run an
# unpaired Wilcoxon test of value by obesity, then annotate using the "p"
# column.
stat_df <- clinical_lf %>%
  group_by(clinical_params) %>%
  wilcox_test(value ~ obesity, paired = FALSE) %>%
  add_significance("p")

# p-values rounded to three decimal places.
stat_df[["p_format"]] <- round(stat_df[["p"]], digits = 3)
stat_df
|
|
|
|
#----------------------------------------
# calculate n_obs for each clinical param: Overall
#----------------------------------------
#n_t1 = data.frame(table(lf_t1_comp$mediator))

# Row count per clinical parameter. The table is built on the parameter name
# only, so rows whose value is NA are counted as well.
n_all <- data.frame(table(clinical_lf$clinical_params))
names(n_all) <- c("clinical_params", "n")
n_all$clinical_params <- as.character(n_all$clinical_params)

# Row count per clinical parameter and obesity group, then spread so that each
# obesity level becomes its own column.
n_gp_lf <- data.frame(table(clinical_lf$clinical_params, clinical_lf$obesity))
n_gp <- spread(n_gp_lf, "Var2", "Freq")
n_gp
colnames(n_gp)

# Prefix the group columns (2nd and 3rd) with "n_gp"; the code assumes exactly
# two obesity levels.
colnames(n_gp) <- c("clinical_params", paste0("n_gp", colnames(n_gp)[2:3]))

n_gp$clinical_params <- as.character(n_gp$clinical_params)

# Overall and per-group counts side by side, joined on the shared column(s).
n_all_gp <- merge(
  n_all, n_gp,
  by = intersect(names(n_all), names(n_gp)),
  all = TRUE
)
|
|
|
|
#----------------------------------------
# calculate n_obs for each clinical param: complete cases
#----------------------------------------
# Count only rows with a non-missing value. Previously these counts were taken
# from the full long-format data, so every n_complete* column merely repeated
# the corresponding overall n* column. The mask is computed here (logical
# indexing) so the counts are correct even when no value is missing.
lf_complete <- clinical_lf[!is.na(clinical_lf$value), ]

# Complete observations per clinical parameter.
n_comp <- data.frame(table(lf_complete$clinical_params))
colnames(n_comp) <- c("clinical_params", "n_complete")
n_comp$clinical_params <- as.character(n_comp$clinical_params)
n_comp

# Complete observations per clinical parameter and obesity group, spread so
# that each obesity level becomes its own column.
n_gp_comp_lf <- data.frame(table(lf_complete$clinical_params
                                 , lf_complete$obesity))
n_gp_comp_lf
n_gp_comp <- spread(n_gp_comp_lf, "Var2", "Freq")
n_gp_comp
colnames(n_gp_comp)

# Prefix the group columns (2nd and 3rd) with "n_complete_gp"; the code
# assumes exactly two obesity levels.
colnames(n_gp_comp) <- c("clinical_params"
                         , paste0("n_complete_gp", colnames(n_gp_comp)[2])
                         , paste0("n_complete_gp", colnames(n_gp_comp)[3]))
|
|
|
|
#---------
# merge 1
#---------
# Complete-case counts: per-parameter totals joined with per-group totals.
n_comp_gp <- merge(
  n_comp, n_gp_comp,
  by = intersect(names(n_comp), names(n_gp_comp)),
  all = TRUE
)
n_comp_gp

#---------
# merge 2
#---------
# Overall counts joined with complete-case counts on their shared column(s).
merge_cols <- intersect(names(n_all_gp), names(n_comp_gp))
merge_cols

n_df <- merge(n_all_gp, n_comp_gp, by = merge_cols, all = TRUE)
n_df
|
|
|
|
#----------------------------------
# Merge 3: merge stats + n_obs df
#----------------------------------
# Columns present in both the stats table and the n_obs table are the keys.
merging_cols <- intersect(names(stats_un_clinical), names(n_df))
merging_cols

if (!all(n_df$clinical_params %in% stats_un_clinical$clinical_params)) {
  # At least one parameter has counts but no stats row: keep every n_obs row
  # so such parameters still appear, with NA in the stats columns.
  nf <- n_df$clinical_params[!n_df$clinical_params %in% stats_un_clinical$clinical_params]
  stats_un_clinical <- merge(stats_un_clinical, n_df, by = merging_cols, all.y = TRUE)
  cat("\nMerged with caution:"
      , "\nnrows mismatch:", nf
      , "\nnot found in stats possibly due to all obs being missing"
      , "\nintroduced NAs for:", nf
      , "\nnrow:", nrow(stats_un_clinical)
      , "\nncol:", ncol(stats_un_clinical))
} else {
  # Every counted parameter has a stats row: full outer merge.
  cat("PASS: merging stats and n_obs on column/s:", merging_cols)
  stats_un_clinical <- merge(stats_un_clinical, n_df, by = merging_cols, all = TRUE)
  cat("\nsuccessfull merge:"
      , "\nnrow:", nrow(stats_un_clinical)
      , "\nncol:", ncol(stats_un_clinical))
}
|
|
|
|
#######################################################################
|
|
#=================
# formatting df
#=================
# Drop the columns that are not needed in the output (.y. and p.adj).
stats_clinical_df <- subset(
  stats_un_clinical,
  select = -c(.y., p.adj)
)

# Label the method explicitly as unpaired. The label is assigned outright
# rather than substituted into the existing text, because rows that carry NA
# (in case of LLODs) would not be changed by gsub.
#stats_clinical_df$method = gsub("Wilcoxon", "Wilcoxon_unpaired", stats_clinical_df$method)
stats_clinical_df$method <- "wilcoxon unpaired"
stats_clinical_df$method
|
|
|
|
# reorder columns
print("preparing to reorder columns...")
colnames(stats_clinical_df)

# Desired column order of the output table. The n_gp0/n_gp1 names assume the
# obesity groups are labelled 0 and 1.
my_col_order2 <- c("clinical_params"
                   , "method"
                   , "group1"
                   , "group2"
                   , "n"
                   , "n_gp0"
                   , "n_gp1"
                   , "n_complete"
                   , "n_complete_gp0"
                   , "n_complete_gp1"
                   , "p"
                   , "p.format"
                   , "p.signif")

# Reorder only when the expected names and the actual columns match exactly
# (same count, and every expected name present).
if (length(my_col_order2) == ncol(stats_clinical_df) &&
    all(my_col_order2 %in% colnames(stats_clinical_df))) {
  print("PASS: Reordering columns...")
  stats_clinical_df_f <- stats_clinical_df[, my_col_order2]
  print("Successful: column reordering")
  print("formatted df called:'stats_clinical_df_f'")
  cat('\nformatted df has the following dimensions\n')
  print(dim(stats_clinical_df_f))
} else {
  # Report exactly which names disagree so the mismatch can be diagnosed.
  cols_missing <- setdiff(my_col_order2, colnames(stats_clinical_df))
  cols_unexpected <- setdiff(colnames(stats_clinical_df), my_col_order2)
  cat(paste0("FAIL:Cannot reorder columns, length or names mismatch"
             , "\nExpected column order for: ", ncol(stats_clinical_df)
             , "\nGot:", length(my_col_order2)
             , "\nElse check colnames to see if they exist in both"
             , "\nExpected but missing from data: ", paste(cols_missing, collapse = ", ")
             , "\nIn data but not expected: ", paste(cols_unexpected, collapse = ", ")
             , "\n"))
  # Exit with a non-zero status so callers can detect the failure; a bare
  # quit() reports success (status 0).
  quit(status = 1)
}
|
|
# Tidy the column names: replace "." with "_" (p.format -> p_format,
# p.signif -> p_signif). The order is unchanged. At this point the columns are
# exactly those of my_col_order2, where only these two names contain a dot, so
# a fixed-string substitution gives the same names as listing them by hand.
colnames(stats_clinical_df_f) <- gsub(
  ".", "_",
  colnames(stats_clinical_df_f),
  fixed = TRUE
)

colnames(stats_clinical_df_f)
|
|
########################################################################
|
|
#******************
# write output file
#******************
# Announce the destination, then write the formatted stats table as CSV
# without row names.
cat("UNpaired stats for clinical data for groups in:", outfile_clinical_unpaired)
write.csv(
  stats_clinical_df_f,
  file = outfile_clinical_unpaired,
  row.names = FALSE
)
|