#!/usr/bin/Rscript
getwd()
# NOTE(review): the relative source() call below depends on this working
# directory, so the setwd() is kept as-is.
setwd("~/git/mosaic_2020/")
getwd()
############################################################
# TASK: unpaired (time) analysis of clinical data
# data: clinical data of flu positive adult patients
# group: obesity
############################################################
#my_sample_type = "npa"

#=============
# Input
#=============
# Presumably defines `clinical_df` and `outdir_stats` and loads the packages
# used below (neither is created in this script) -- TODO confirm.
source("data_extraction_formatting_clinical.R")

#=============
# Output
#=============
outfile_name_clinical <- paste0("flu_stats_clinical_unpaired.csv")
outfile_clinical_unpaired <- paste0(outdir_stats, outfile_name_clinical)
outfile_clinical_unpaired

########################################################################
# Unpaired stats for clinical data b/w groups: wilcoxon UNpaired analysis
# No correction required
########################################################################
numerical_cols <- c("age"
                    #, "vl_pfu_ul_npa1"
                    , "los"
                    , "onset2final"
                    , "onsfindeath"
                    , "onset_2_initial"
                    , "o2_sat_admis")

metadata_cols <- c("mosaic", "obesity")

clinical_df_numerical <- clinical_df[, c(metadata_cols, numerical_cols)]

pivot_cols <- metadata_cols
#pivot_cols = metadata_cols[!meta_data_cols%in%cols_to_omit];pivot_cols

# Long format has one row per (input row, gathered column) pair.
expected_rows_clinical_lf <- nrow(clinical_df_numerical) *
  (ncol(clinical_df_numerical) - length(pivot_cols))
expected_rows_clinical_lf

keycol <- "clinical_params"
valuecol <- "value"
# Reuse numerical_cols instead of a second hand-typed copy, so the columns
# selected above and the columns gathered here cannot drift apart.
gathercols <- numerical_cols

clinical_lf <- gather_(clinical_df_numerical, keycol, valuecol, gathercols)

if (nrow(clinical_lf) == expected_rows_clinical_lf) {
  cat("PASS: long format data created successfully"
      , "\nnrow:", nrow(clinical_lf)
      , "\nncol:", ncol(clinical_lf))
} else {
  # Previously a mismatch was silently ignored and the analysis carried on.
  stop("FAIL: long format data has ", nrow(clinical_lf),
       " rows, expected ", expected_rows_clinical_lf, call. = FALSE)
}

#====================
# unpaired: clinical
#====================
value_is_missing <- is.na(clinical_lf$value)
sum(value_is_missing)

# Rows with a missing value; not used further in this script.
foo <- clinical_lf[value_is_missing, ]

# Complete cases. A logical mask is used on purpose: the former
# `clinical_lf[-which(is.na(...)), ]` returns ZERO rows when nothing is
# missing, because `-integer(0)` selects no rows at all.
clinical_lf_comp <- clinical_lf[!value_is_missing, ]

stats_un_clinical <- compare_means(value ~ obesity
                                   , group.by = "clinical_params"
                                   , data = clinical_lf
                                   #, data = clinical_lf_comp
                                   , paired = FALSE)

stat_df <- clinical_lf %>%
  group_by(clinical_params) %>%
  wilcox_test(value ~ obesity, paired = FALSE) %>%
  add_significance("p")
stat_df$p_format <- round(stat_df$p, digits = 3)

#----------------------------------------
# calculate n_obs for each clinical param: Overall
#----------------------------------------
#n_t1 = data.frame(table(lf_t1_comp$mediator))
n_all <- data.frame(table(clinical_lf$clinical_params))
colnames(n_all) <- c("clinical_params", "n")
n_all$clinical_params <- as.character(n_all$clinical_params)

# Per-group counts: one row per clinical param, one column per obesity level.
n_gp_lf <- data.frame(table(clinical_lf$clinical_params, clinical_lf$obesity))
n_gp <- spread(n_gp_lf, "Var2", "Freq"); n_gp
colnames(n_gp)
# Prefix every group column (all but the first) rather than hard-coding
# positions 2 and 3; identical result for two groups.
colnames(n_gp) <- c("clinical_params", paste0("n_gp", colnames(n_gp)[-1]))
n_gp$clinical_params <- as.character(n_gp$clinical_params)

n_all_gp <- merge(n_all, n_gp
                  , by = intersect(names(n_all), names(n_gp))
                  , all = TRUE)

#----------------------------------------
# calculate n_obs for each clinical param: complete cases
#----------------------------------------
n_comp <- data.frame(table(clinical_lf_comp$clinical_params))
colnames(n_comp) <- c("clinical_params", "n_complete")
n_comp$clinical_params <- as.character(n_comp$clinical_params)
n_comp

n_gp_comp_lf <- data.frame(table(clinical_lf_comp$clinical_params
                                 , clinical_lf_comp$obesity)); n_gp_comp_lf
n_gp_comp <- spread(n_gp_comp_lf, "Var2", "Freq"); n_gp_comp
colnames(n_gp_comp)
colnames(n_gp_comp) <- c("clinical_params"
                         , paste0("n_complete_gp", colnames(n_gp_comp)[-1]))
# Match the key type used by the other count tables before merging.
n_gp_comp$clinical_params <- as.character(n_gp_comp$clinical_params)

n_comp_gp <- merge(n_comp, n_gp_comp
                   , by = intersect(names(n_comp), names(n_gp_comp))
                   , all = TRUE)

merge_cols <- intersect(names(n_all_gp), names(n_comp_gp)); merge_cols

n_df <- merge(n_all_gp, n_comp_gp, by = merge_cols, all = TRUE); n_df

#==================================
# Merge: merge stats + n_obs df
#===================================
merging_cols <- intersect(names(stats_un_clinical), names(n_df)); merging_cols

if (all(n_df$clinical_params %in% stats_un_clinical$clinical_params)) {
  cat("PASS: merging stats and n_obs on column/s:", merging_cols)
  stats_un_clinical <- merge(stats_un_clinical, n_df, by = merging_cols, all = TRUE)
  cat("\nsuccessfull merge:"
      , "\nnrow:", nrow(stats_un_clinical)
      , "\nncol:", ncol(stats_un_clinical))
} else {
  # Params counted in n_df but absent from the stats table; all.y = TRUE keeps
  # them, with NA in the stats columns.
  nf <- n_df$clinical_params[!n_df$clinical_params %in% stats_un_clinical$clinical_params]
  stats_un_clinical <- merge(stats_un_clinical, n_df, by = merging_cols, all.y = TRUE)
  cat("\nMerged with caution:"
      , "\nnrows mismatch:", nf
      , "\nnot found in stats possibly due to all obs being missing"
      , "\nintroduced NAs for:", nf
      , "\nnrow:", nrow(stats_un_clinical)
      , "\nncol:", ncol(stats_un_clinical))
}

#######################################################################
#=================
# formatting df
#=================
# delete: unnecessary column
stats_clinical_df <- subset(stats_un_clinical, select = -c(.y., p.adj))

# add: reflect stats method correctly i.e paired or unpaired
# incase there are NA due to LLODs, the gsub won't work!
#stats_clinical_df$method = gsub("Wilcoxon", "Wilcoxon_unpaired", stats_clinical_df$method)
# Assign the label to every row outright instead of rewriting the existing
# text, so rows whose method is NA are labelled as well.
stats_clinical_df$method <- "wilcoxon unpaired"
stats_clinical_df$method

# reorder columns
print("preparing to reorder columns...")
colnames(stats_clinical_df)

my_col_order2 <- c("clinical_params"
                   , "method"
                   , "group1"
                   , "group2"
                   , "n"
                   , "n_gp0"
                   , "n_gp1"
                   , "n_complete"
                   , "n_complete_gp0"
                   , "n_complete_gp1"
                   , "p"
                   , "p.format"
                   , "p.signif")

if (length(my_col_order2) == ncol(stats_clinical_df) &&
    all(my_col_order2 %in% colnames(stats_clinical_df))) {
  print("PASS: Reordering columns...")
  stats_clinical_df_f <- stats_clinical_df[, my_col_order2]
  print("Successful: column reordering")
  print("formatted df called:'stats_clinical_df_f'")
  cat('\nformatted df has the following dimensions\n')
  print(dim(stats_clinical_df_f))
} else {
  # Name the offending columns so the mismatch can be fixed without
  # re-running the script by hand.
  missing_cols <- setdiff(my_col_order2, colnames(stats_clinical_df))
  unexpected_cols <- setdiff(colnames(stats_clinical_df), my_col_order2)
  # stop() rather than quit(): quit() defaults to exit status 0, which reports
  # success to the calling shell, and it terminates an interactive session.
  stop("FAIL:Cannot reorder columns, length or names mismatch"
       , "\nColumns in data frame: ", ncol(stats_clinical_df)
       , "\nColumns in requested order: ", length(my_col_order2)
       , "\nMissing from data frame: ", paste(missing_cols, collapse = ", ")
       , "\nNot in requested order: ", paste(unexpected_cols, collapse = ", ")
       , call. = FALSE)
}

# assign nice column names like replace "." with "_"
# Done by substitution instead of re-typing all names positionally, so the
# names cannot get out of step with the column order above.
colnames(stats_clinical_df_f) <- gsub(".", "_", colnames(stats_clinical_df_f), fixed = TRUE)
colnames(stats_clinical_df_f)

########################################################################
#******************
# write output file
#******************
# NOTE(review): the write.csv() call below is commented out, so this message
# prints the intended path but no file is written -- confirm this is intended.
cat("UNpaired stats for clinical data for groups in:", outfile_clinical_unpaired)
#write.csv(stats_clinical_df_f, outfile_clinical_unpaired, row.names = FALSE)