diff --git a/scripts/functions/plotting_globals.R b/scripts/functions/plotting_globals.R
index fc66b78..19a4b55 100644
--- a/scripts/functions/plotting_globals.R
+++ b/scripts/functions/plotting_globals.R
@@ -52,7 +52,7 @@ angstroms_symbol <<- "\u212b"
 # Delta symbol
 #===============
 delta_symbol <<- "\u0394"; delta_symbol
-stability_suffix <- paste0(delta_symbol, delta_symbol, "G")
+stability_suffix <- paste0(delta_symbol, delta_symbol, "G Kcal/mol")
 
 #==========
 # Colours
diff --git a/scripts/thesis_summary_table.R b/scripts/thesis_summary_table.R
new file mode 100644
index 0000000..5a73592
--- /dev/null
+++ b/scripts/thesis_summary_table.R
@@ -0,0 +1,239 @@
+# Build the LaTeX appendix tables of ML model scores for the thesis.
+#
+# Reads per-gene and combined-gene score CSVs from below data_dir, keeps the
+# rows with resampling == "none", pivots every metric into one column per
+# source_data value and writes one .tex file per gene and table family into
+# latex_outdir.
+
+library(xtable)
+library(dplyr)
+library(tidyverse)
+
+if (interactive()) {
+  print("Interactive Session, using home directories")
+  data_dir <- "~/git/"
+} else {
+  data_dir <- "/srv/shiny-server/git/"
+}
+
+latex_outdir <- "~/git/Writing/thesis/tex/appendices/"
+
+# list of splits; the disabled alternatives are kept as comments
+split_type <- c(
+  # "cd_7030",
+  # "cd_8020",
+  "cd_sl"
+  # "none"
+)
+split_choicenames <- c(
+  # "70:30",
+  # "80:20",
+  "Scaling law"
+  # "CV thresholds"
+)
+split_file <- c(
+  # "_70_30_complete",
+  # "_80_20_complete",
+  "_sl_complete"
+  # "_none_complete"
+)
+split_file_FS <- c(
+  # "_70_30_complete",
+  # "_80_20_complete",
+  "_sl_complete"
+)
+
+# Maps a file-name suffix to the split name used in the loaded_files keys;
+# necessary because the names will be wrong otherwise
+split_map <- data.frame(
+  files = c(
+    # "_70_30_complete",
+    # "_80_20_complete",
+    "_sl_complete"
+    # "_none_complete"
+  ),
+  splits = c(
+    # "cd_7030",
+    # "cd_8020",
+    "cd_sl"
+    # "none"
+  )
+)
+
+metadata_cols <- c(
+  "n_training_size", "n_test_size", "n_trainingY_ratio", "n_testY_ratio",
+  "resampling", "n_features"
+)
+
+# hardcoded list of drugs, their display names and the matching genes
+drug <- c(
+  "ethambutol", "isoniazid", "pyrazinamide", "rifampicin", "streptomycin"
+)
+drug_choicenames <- c(
+  "EmbB-ethambutol", "KatG-isoniazid", "PncA-pyrazinamide",
+  "RpoB-rifampicin", "GidB-streptomycin"
+)
+gene <- c("embb", "katg", "pnca", "rpob", "gid")
+combo <- data.frame(drug, gene)
+
+# Columns kept from every loaded score file
+score_cols <- c(
+  "MCC",
+  "F1",
+  "Accuracy",
+  "JCC",
+  "Recall",
+  "Precision",
+  "source_data",
+  "Model_name"
+)
+
+# Metric columns that are spread over the source_data values, in table order
+metric_cols <- c("MCC", "F1", "Accuracy", "Recall", "Precision", "JCC")
+
+# Read a CSV and return NULL instead of stopping when the read fails, so a
+# missing input file does not abort the whole script.
+# quiet = TRUE leaves out this function's own warning about the failure.
+read_csv_or_null <- function(path, quiet = FALSE) {
+  tryCatch(
+    read.csv(path),
+    error = function(e) {
+      if (!quiet) {
+        warning(
+          "Could not read ", path, ": ", conditionMessage(e),
+          call. = FALSE
+        )
+      }
+      NULL
+    }
+  )
+}
+
+# Keep the resampling == "none" rows and the score_cols columns of one score
+# table, then rename its source_data values. relabel is a named character
+# vector: names are the old values, elements the new ones.
+prepare_scores <- function(scores, relabel) {
+  scores <- scores[scores$resampling %in% "none", score_cols]
+  src <- as.character(scores$source_data)
+  hit <- src %in% names(relabel)
+  src[hit] <- unname(relabel[src[hit]])
+  scores$source_data <- src
+  scores
+}
+
+# Pivot one prepared score table to one row per Model_name, write it to
+# out_tex as a LaTeX table and return the data frame that was written.
+write_score_table <- function(scores, out_tex) {
+  wide <- scores %>%
+    pivot_wider(
+      names_from = source_data,
+      values_from = all_of(metric_cols),
+      names_sep = " ",
+      names_sort = TRUE
+    )
+  wide <- as.data.frame(wide)
+
+  # the "0" prefix only makes names_sort place the Train columns first
+  colnames(wide) <- gsub("0Train", "Train", colnames(wide))
+  rownames(wide) <- wide$Model_name
+  wide <- subset(wide, select = -Model_name)
+
+  print(out_tex)
+  print(xtable(wide), type = "latex", file = out_tex)
+  wide
+}
+
+loaded_files <- list()
+genes_dir <- paste0(data_dir, "LSHTM_ML/output/genes/")
+combined_dir <- paste0(data_dir, "LSHTM_ML/output/combined/")
+
+# Loader for per-gene CSVs
+for (x in gene) {
+  for (split in split_file) {
+    load_name <- paste0(
+      x, "_baselineC_", split_map$splits[split_map$files == split]
+    )
+    print(load_name)
+    loaded_files[[load_name]] <- read_csv_or_null(
+      paste0(genes_dir, x, split, ".csv")
+    )
+  }
+}
+
+# Loader for per-gene Feature Selection CSVs
+for (x in gene) {
+  for (split in split_file_FS) {
+    load_name <- paste0(
+      x, "_baselineC_", split_map$splits[split_map$files == split], "_FS"
+    )
+    print(load_name)
+    loaded_files[[load_name]] <- read_csv_or_null(
+      paste0(genes_dir, x, split, "_FS.csv")
+    )
+  }
+}
+
+# Loader for combined data; files that cannot be read are skipped
+for (x in gene) {
+  for (ac in c("_complete", "_FS")) {
+    for (gene_count in 1:5) {
+      store_name <- paste0(gene_count, "genes_logo_skf_BT_", x, ac)
+      filename <- paste0(combined_dir, store_name, ".csv")
+      temp_df <- read_csv_or_null(filename, quiet = TRUE)
+      if (!is.null(temp_df)) {
+        # throw away first column; drop = FALSE keeps a data frame
+        loaded_files[[store_name]] <- temp_df[, -1, drop = FALSE]
+        print(paste0("loaded file: ", filename, "into var: ", store_name))
+      }
+    }
+  }
+}
+
+# One entry per table family:
+#   df_name    - sprintf() template of the loaded_files key for one gene
+#   tex_prefix - output file name prefix; "<gene>.tex" is appended
+#   relabel    - source_data renames (CV/BT are rewritten to Train/Test, and
+#                Train becomes "0Train" so that it gets presented first)
+table_specs <- list(
+  list(
+    df_name = "%s_baselineC_cd_sl",
+    tex_prefix = "ml_table-",
+    relabel = c(Train = "0Train")
+  ),
+  list(
+    df_name = "%s_baselineC_cd_sl_FS",
+    tex_prefix = "ml_table_FS-",
+    relabel = c(CV = "0Train", BT = "Test")
+  ),
+  list(
+    df_name = "5genes_logo_skf_BT_%s_complete",
+    tex_prefix = "ml_table_combined-",
+    relabel = c(Train = "0Train")
+  ),
+  list(
+    df_name = "1genes_logo_skf_BT_%s_FS",
+    tex_prefix = "ml_table_combined-FS-",
+    relabel = c(Train = "0Train")
+  )
+)
+
+# Write the tables family by family. A data set that was not loaded is
+# reported and skipped so that the remaining tables are still written.
+for (spec in table_specs) {
+  for (x in gene) {
+    df_name <- sprintf(spec$df_name, x)
+    # [[ ]] matches names exactly; $ would partially match e.g. the _FS entry
+    if (is.null(loaded_files[[df_name]])) {
+      warning("No data loaded for ", df_name, "; table skipped", call. = FALSE)
+      next
+    }
+    loaded_files[[df_name]] <- prepare_scores(
+      loaded_files[[df_name]], spec$relabel
+    )
+    out_tex <- paste0(latex_outdir, spec$tex_prefix, x, ".tex")
+    processed_df <- write_score_table(loaded_files[[df_name]], out_tex)
+    # every family reuses <gene>_out, so it ends up holding the table of the
+    # last family written for that gene
+    assign(paste0(x, "_out"), processed_df)
+  }
+}