# Dashboards/ml/global.R
#
# 190 lines
# 6.3 KiB
# R

library(shiny)
library(shinyjs)
library(shinydashboard)
#library("wesanderson") # ayyyy lmao hipster af
library(dplyr)
library(ggplot2)
library(grid) # for the info box
library(plotly)
library(shinycssloaders)
# make shiny non-stupid
#options(shiny.launch.browser = FALSE) # i am a big girl and can tie my own laces
#options(shiny.port = 8000) # don't change the port every time
#options(shiny.host = '0.0.0.0') # This means "listen to all addresses on all interfaces"
#options(width=120)
#options(DT.options = list(scrollX = TRUE))
# FIXME: this hard-coded CSV is read only so that the resampling types are
# known before the real data files are loaded; get rid of it.
# The data root is the home-directory checkout for interactive sessions and
# the shiny-server checkout otherwise.
data_dir <- if (interactive()) {
  print("Interactive Session, using home directories")
  "~/git/"
} else {
  "/srv/shiny-server/git/"
}
thing <- read.csv(
  paste0(data_dir, "Data/ml_combined/genes/pnca_70_30_actual.csv")
)
# Split identifiers used when naming loaded data sets.
split_type <- c("cd_7030", "cd_8020", "cd_sl", "none")

# File-name suffix of each split, in the same order as split_type.
split_file <- c(
  "_70_30_complete", "_80_20_complete", "_sl_complete", "_none_complete"
)

# Maps a file-name suffix to its split identifier; necessary because the names
# will be wrong otherwise.
# NOTE(review): the last `files` entry is "_none" while split_file uses
# "_none_complete" -- confirm which suffix is intended.
split_map <- data.frame(
  files = c("_70_30_complete", "_80_20_complete", "_sl_complete", "_none"),
  splits = split_type
)

# Columns read from a results table to fill the plot info box.
metadata_cols <- c(
  "n_training_size",
  "n_test_size",
  "n_trainingY_ratio",
  "n_testY_ratio",
  "resampling",
  "n_features"
)

# Hardcoded list of drugs and genes; `combo` pairs them row by row.
drug <- c("ethambutol", "isoniazid", "pyrazinamide", "rifampicin", "streptomycin")
gene <- c("embb", "katg", "pnca", "rpob", "gid")
combo <- data.frame(drug = drug, gene = gene)
# Loader for per-gene CSVs, e.g.
# <data_dir>/LSHTM_ML/output/genes/pnca_70_30_complete.csv
# Every file that can be read is stored in `loaded_files` under the key
# "<gene>_baselineC_<split type>", e.g. "pnca_baselineC_cd_7030".
loaded_files <- list()

# split_file and split_type are parallel vectors, so they are paired by
# position. Looking the suffix up in split_map found no match for
# "_none_complete" (split_map lists "_none"), which silently produced the key
# "<gene>_baselineC_" with no split type at the end.
stopifnot(length(split_file) == length(split_type))

for (x in gene) {
  for (i in seq_along(split_file)) {
    filename <- paste0(
      data_dir, "LSHTM_ML/output/genes/", x, split_file[i], ".csv"
    )
    load_name <- paste0(x, "_baselineC_", split_type[i])
    # Best-effort load: nothing else has to happen when a file is missing or
    # unreadable, so the failure is reported and the file is skipped.
    tryCatch(
      loaded_files[[load_name]] <- read.csv(filename),
      error = function(e) {
        message("could not load ", filename, ": ", conditionMessage(e))
      }
    )
  }
}
# Loader for the combined-model CSVs:
# <data_dir>/LSHTM_ML/output/combined/<N>genes_logo_skf_BT_<gene><suffix>.csv
# for every gene, every suffix below and N = 1..6. Each file that can be read
# is stored in `loaded_files` under its file name without the ".csv"
# extension, minus its first column; unreadable files are skipped.
for (x in gene) {
  for (ac in c("_actual", "_complete", "_FS")) {
    for (gene_count in 1:6) {
      store_name <- paste0(gene_count, "genes_logo_skf_BT_", x, ac)
      filename <- paste0(
        data_dir, "LSHTM_ML/output/combined/", store_name, ".csv"
      )
      # tryCatch is necessary here rather than try() because the result is
      # manipulated afterwards; a failed read becomes NULL and is skipped
      temp_df <- tryCatch(read.csv(filename), error = function(e) NULL)
      if (!is.null(temp_df)) {
        # throw away first column; a negative index with drop = FALSE keeps a
        # data frame even when only one column is left, whereas
        # 2:ncol(temp_df) collapsed a two-column file into a bare vector and
        # failed on a one-column file
        loaded_files[[store_name]] <- temp_df[, -1, drop = FALSE]
        print(paste0("loaded file: ", filename, "into var: ", store_name))
      }
    }
  }
}
# Names of the score columns (presumably the choices offered for makeplot's
# `selection` argument -- confirm against the UI code).
scores <- c(
  "F1", "ROC_AUC", "JCC", "MCC",
  "Accuracy", "Recall", "Precision"
)
# Distinct resampling types found in the hard-coded reference table `thing`.
# The superassignment is kept as in the original.
resample_types <<- unique(thing$resampling)
# Draw a dodged bar chart of one score per model for a single resampling type.
#
# Arguments:
#   x                        data frame to plot. The code reads its
#                            `resampling`, `Model_name` and `source_data`
#                            columns, the column named by `selection` and, for
#                            the info box, the columns in the global
#                            `metadata_cols`.
#   selection                name of the score column to plot, e.g. 'MCC'
#   resampler                resampling type to keep, e.g. 'none'
#   display_infobox          overlay a text box with run metadata on the plot
#   display_combined         add the "Genes Trained" line to the info box
#   gene                     shown in the info box only
#   drug                     accepted but not used by the current body
#   combined_training_genes  shown in the info box only
#
# Returns the ggplot object.
makeplot <- function(x, # the DataFrame to plot
                     selection, # scoring method e.g. 'MCC'
                     resampler, # resampling type e.g. 'none'
                     display_infobox = TRUE, # display the infobox on top of the plot
                     display_combined = TRUE, # show stuff that only applies to "combined model" plots
                     gene = "NOT SET", # used only for the info box
                     drug = "NOT SET", # used only for the info box
                     combined_training_genes = "999" # used only for the info box
) {
  # Keep only the rows of the requested resampling type. %in% never returns
  # NA, so rows with a missing `resampling` value are dropped instead of
  # coming back as all-NA rows.
  plot_data <- x[x$resampling %in% resampler, , drop = FALSE]

  # The y axis always includes 0 and only extends below it for negative
  # scores. na.rm keeps a missing score from breaking the comparison.
  y_coord_min <- min(0, plot_data[[selection]], na.rm = TRUE)

  # The info box is optional: the layer stays NULL (a no-op when added to a
  # plot) unless it was requested. Previously the annotation was added
  # unconditionally although its grob was only built when display_infobox was
  # TRUE.
  infobox_layer <- NULL
  if (display_infobox) {
    # t() turns the first row into a one-column matrix whose entries follow
    # the order of metadata_cols; kept as is so the values are formatted
    # exactly as before.
    metadata <- t(plot_data[1, metadata_cols])
    genes_trained <- if (display_combined) {
      paste0("\nGenes Trained: ", combined_training_genes)
    } else {
      ""
    }
    metatext <- paste0(
      "Train/Test: ", metadata[1], "/", metadata[2],
      "\nTrain/Test Target Ratio: ", metadata[3], "/", metadata[4],
      "\nResampling: ", metadata[5],
      "\nFeatures: ", metadata[6],
      genes_trained,
      "\nTest Gene: ", gene
    )
    infobox_layer <- annotation_custom(
      grobTree(
        textGrob(
          metatext,
          x = 0.01,
          y = 0.90,
          hjust = 0,
          gp = gpar(col = "black")
        )
      )
    )
  }

  ggplot(
    data = plot_data,
    aes_string(
      x = "Model_name",
      y = selection,
      fill = "source_data",
      group = selection
    )
  ) +
    geom_bar(
      stat = "identity",
      width = 0.75,
      position = position_dodge2(padding = 0.1, preserve = "total", reverse = TRUE)
    ) +
    coord_cartesian(ylim = c(y_coord_min, 1)) +
    scale_fill_manual(values = c("BT" = "#605ca8", "CV" = "#bebddb")) +
    infobox_layer +
    # doesn't work with plotly but looks nice
    geom_label(
      aes_string(label = selection),
      position = position_dodge(width = -0.75),
      vjust = 1.5,
      alpha = 0.75,
      fill = "white"
    ) +
    labs(x = "", y = paste(selection, "Score")) +
    theme(
      axis.text.x = element_text(angle = 90)
    )
}