ML dashboard/Score Selector initial commit

2022-09-02 16:09:33 +00:00 · 2022-09-02 16:09:33 +00:00 · 5a4535f747
commit 5a4535f747
parent 8a8b36d725
3 changed files with 382 additions and 0 deletions
--- a/ml/global.R
+++ b/ml/global.R
@ -0,0 +1,235 @@
+library(shiny)
+library(shinyjs)
+library(shinydashboard)
+#library("wesanderson") # ayyyy lmao hipster af
+library(dplyr)
+library(ggplot2)
+library(grid) # for the info box
+library(plotly)
+library(shinycssloaders)
+
+# make shiny non-stupid
+#options(shiny.launch.browser = FALSE) # i am a big girl and can tie my own laces
+#options(shiny.port = 8000) # don't change the port every time
+#options(shiny.host = '0.0.0.0') # This means "listen to all addresses on all interfaces"
+#options(width=120)
+#options(DT.options = list(scrollX = TRUE))
+
+# FIXME: remove this hardcoded read. The table is loaded here only so that the
+# resampling types are known before the real result files have been loaded.
+thing <- read.csv(file = "/srv/shiny-server/git/Data/ml_combined/genes/pnca_70_30_actual.csv")
+
+# list of split labels
+split_type <- c(
+  "7030", "8020", "sl",
+  "cd_7030", "cd_8020", "cd_sl",
+  "cd_none_bts", "cd_rt"
+)
+
+# file-name suffix of the CSV belonging to each split
+split_file <- c(
+  "_70_30_actual", "_70_30_complete",
+  "_80_20_actual", "_80_20_complete",
+  "_sl_actual", "_sl_complete",
+  "_none_bts_complete", "_rt_complete"
+)
+
+# Lookup table from file-name suffix to split label. It is necessary because
+# split_type and split_file are not listed in matching order, so pairing them
+# by position would give the wrong names.
+split_map <- data.frame(
+  files = c(
+    "_70_30_actual", "_70_30_complete",
+    "_80_20_actual", "_80_20_complete",
+    "_sl_actual", "_sl_complete",
+    "_none_bts_complete", "_rt_complete"
+  ),
+  splits = c(
+    "7030", "cd_7030",
+    "8020", "cd_8020",
+    "sl", "cd_sl",
+    "cd_none_bts", "cd_rt"
+  )
+)
+
+# columns of a result table that makeplot() prints in its info box
+metadata_cols <- c(
+  "n_training_size", "n_test_size",
+  "n_trainingY_ratio", "n_testY_ratio",
+  "resampling", "n_features"
+)
+
+# hardcoded list of drugs and genes; combo pairs them by position
+drug <- c("ethambutol", "isoniazid", "pyrazinamide", "rifampicin", "streptomycin")
+gene <- c("embb", "katg", "pnca", "rpob", "gid")
+combo <- data.frame(drug = drug, gene = gene)
+
+# Loader for per-gene CSVs: reads one table per gene and split and stores it
+# in loaded_files under the key "<gene>_baselineC_<split label>".
+#"/home/sethp/git/Data/ml_combined/genes/pnca_70_30_complete.csv"
+loaded_files <- list()
+for (x in gene) {
+  for (split in split_file) {
+    # <gene><suffix> is the base name of the CSV
+    filedata <- paste0(x, split)
+    filename <- paste0('/srv/shiny-server/git/LSHTM_ML/output/genes/', filedata, '.csv')
+
+    # translate the file suffix into its split label via split_map
+    load_name <- paste0(x, '_baselineC_', split_map$splits[split_map$files == split])
+
+    # best effort: try() reports a file that cannot be read and the loop moves on
+    try(loaded_files[[load_name]] <- read.csv(filename))
+  }
+}
+# Loader for the combined-model CSVs. For every gene, every '_actual' /
+# '_complete' variant and every gene count (6, 5) it reads
+#   <gene_count>genes_logo_skf_BT_<gene><variant>.csv
+# and stores the table, minus its first column, in loaded_files under the
+# file's base name.
+#
+# A file that cannot be read is skipped with a warning. Previously the result
+# of a failed read was ignored, so the loop either stopped on an undefined
+# temp_df or stored the previous iteration's table under the wrong key.
+for (x in gene) {
+  for (ac in c('_actual','_complete')){
+    for (gene_count in c(6,5)){
+      store_name <- paste0(gene_count, "genes_logo_skf_BT_", x, ac)
+      filename <- paste0('/srv/shiny-server/git/LSHTM_ML/output/combined/', store_name, ".csv")
+      print(filename)
+      print(store_name)
+
+      temp_df <- tryCatch(
+        read.csv(filename),
+        error = function(e) {
+          warning("Could not read ", filename, ": ", conditionMessage(e), call. = FALSE)
+          NULL
+        }
+      )
+      if (is.null(temp_df)) {
+        next
+      }
+
+      # throw away first column; drop = FALSE keeps a data frame even when
+      # only one column is left, and -1 is safe for a single-column table
+      loaded_files[[store_name]] <- temp_df[, -1, drop = FALSE]
+    }
+  }
+}
+
+# 
+# loaded_files_old=list()
+# for (x in drug) {
+#   for (split in split_type){
+#     filename = paste0('/home/sethp/git/Data/',
+#                       x,
+#                       '/output/ml/tts_',
+#                       split,
+#                       '/',
+#                       combo[drug==x,"gene"],
+#                       '_baselineC_',
+#                       split,
+#                       '.csv')
+#     filedata = paste0(combo[drug==x,"gene"],
+#                       '_baselineC_',
+#                       split
+#                       )
+#     print(c(filename, filedata))
+# 
+#     try({loaded_files_old[[filedata]] = read.csv(filename)})
+#   }
+# }
+
+#plot_data    = thing[thing$resampling=='none',]
+# FIXME commented out for the moment because we need to use
+# this before the data is actually loaded :-(
+# scores = colnames(thing %>% dplyr::select(-c("Model_name",
+#                                              "source_data",
+#                                              "resampling"
+#                                              )
+#                                           )
+#                   )
+# names of the score columns, hardcoded for the reason given in the FIXME above
+scores <- c("F1", "ROC_AUC", "JCC", "MCC", "Accuracy", "Recall", "Precision")
+
+# resampling types found in the placeholder table `thing`
+resample_types <<- unique(thing$resampling)
+
+# Bar chart of one score for every model in a result table.
+#
+# Arguments:
+#   x                        data frame to plot; the code reads its columns
+#                            Model_name, source_data, resampling, the column
+#                            named by `selection` and, for the info box, the
+#                            columns listed in metadata_cols
+#   selection                name of the score column to plot, e.g. 'MCC'
+#   resampler                rows with x$resampling equal to this are plotted,
+#                            e.g. 'none'
+#   display_infobox          draw the text box with the run metadata
+#   display_combined         add the "Genes Trained" line to the info box
+#   gene                     shown in the info box only
+#   drug                     currently unused
+#   combined_training_genes  shown in the info box only
+#
+# Returns the ggplot object.
+#
+# The info box layer is now added only when display_infobox is TRUE; before,
+# display_infobox = FALSE failed because the grob was never created but was
+# still passed to annotation_custom().
+makeplot <- function(x, # the DataFrame to plot
+                     selection, # scoring method e.g. 'MCC'
+                     resampler, # resampling type e.g. 'none'
+                     display_infobox         = TRUE, # display the infobox on top of the plot
+                     display_combined        = TRUE, # show stuff that only applies to "combined model" plots
+                     gene                    = 'NOT SET', # used only for the info box
+                     drug                    = 'NOT SET', # used only for the info box
+                     combined_training_genes = '999' # used only for the info box
+                     ){
+  plot_data <- x[x$resampling == resampler, ]
+
+  # the y axis starts at 0 unless the plotted score goes below it
+  y_coord_min <- min(plot_data[selection])
+  if (y_coord_min > 0) {
+    y_coord_min <- 0
+  }
+
+  score_plot <- ggplot(data = plot_data,
+                       aes_string(x = "Model_name",
+                                  y = selection,
+                                  fill = "source_data",
+                                  group = selection)) +
+    geom_bar(stat = "identity",
+             width = 0.75,
+             position = position_dodge2(padding = 0.1, preserve = 'total', reverse = TRUE)) +
+    coord_cartesian(ylim = c(y_coord_min, 1)) +
+    scale_fill_manual(values = c("BT" = "#605ca8",
+                                 "CV" = "#bebddb"))
+
+  if (display_infobox) {
+    # metadata of the first plotted row, transposed so it is indexed by position
+    metadata <- t(plot_data[1, metadata_cols])
+
+    # only "combined model" plots report which genes were trained on
+    genes_trained <- if (display_combined) {
+      paste0("\nGenes Trained: ", combined_training_genes)
+    } else {
+      ""
+    }
+    metatext <- paste0("Train/Test: ", metadata[1], "/", metadata[2],
+                       "\nTrain/Test Target Ratio: ", metadata[3], "/", metadata[4],
+                       "\nResampling: ", metadata[5],
+                       "\nFeatures: ", metadata[6],
+                       genes_trained,
+                       "\nTest Gene: ", gene)
+
+    info_grob <- grobTree(textGrob(metatext,
+                                   x = 0.01,
+                                   y = 0.90,
+                                   hjust = 0,
+                                   gp = gpar(col = "black")))
+    # added before geom_label so the layer order matches the original plot
+    score_plot <- score_plot + annotation_custom(info_grob)
+  }
+
+  score_plot +
+    # value labels on the bars; doesn't work with plotly but looks nice :-(
+    # (geom_text works with plotly, but it has no label background)
+    geom_label(aes_string(label = selection),
+               position = position_dodge(width = -0.75),
+               vjust = 1.5,
+               alpha = 0.75,
+               fill = "white") +
+    labs(x = "", y = paste(selection, "Score")) +
+    theme(axis.text.x = element_text(angle = 90))
+}