table generator
This commit is contained in:
parent
c2d6eb49ea
commit
e9d841d989
2 changed files with 304 additions and 1 deletions
303
scripts/thesis_summary_table.R
Normal file
303
scripts/thesis_summary_table.R
Normal file
|
@ -0,0 +1,303 @@
|
|||
library(xtable)
|
||||
library(dplyr)
|
||||
library(tidyverse)
|
||||
# Pick the data root: the home checkout when run interactively, the Shiny
# server checkout otherwise.
data_dir <- if (interactive()) {
  print("Interactive Session, using home directories")
  "~/git/"
} else {
  "/srv/shiny-server/git/"
}

# Directory that receives the generated LaTeX tables.
latex_outdir <- "~/git/Writing/thesis/tex/appendices/"

# NOTE(review): `thing` is not referenced anywhere else in this script --
# confirm whether it is still needed. The read is guarded so that a missing
# file does not abort table generation.
thing_path <- paste0(data_dir, "Data/ml_combined/genes/pnca_70_30_actual.csv")
thing <- if (file.exists(thing_path)) {
  read.csv(thing_path)
} else {
  message("Skipping optional input, file not found: ", thing_path)
  NULL
}
|
||||
|
||||
# list of splits
# Only the scaling-law entries are active; the alternatives are commented out.
split_type <- c(
  # "cd_7030",
  # "cd_8020",
  "cd_sl"
  # "none"
)

# Display names, in the same order as split_type.
split_choicenames <- c(
  # "70:30",
  # "80:20",
  "Scaling law"
  # "CV thresholds"
)

# File-name suffixes of the per-gene result CSVs.
split_file <- c(
  # "_70_30_complete",
  # "_80_20_complete",
  "_sl_complete"
  # "_none_complete"
)

# File-name suffixes of the per-gene feature-selection CSVs ("_FS" is
# appended when the file name is built).
split_file_FS <- c(
  # "_70_30_complete",
  # "_80_20_complete",
  "_sl_complete"
)

# necessary because the names will be wrong otherwise
# Maps each file-name suffix to the split label used when naming loaded data.
split_map <- data.frame(
  files = c(
    # "_70_30_complete",
    # "_80_20_complete",
    "_sl_complete"
    # "_none_complete"
  ),
  splits = c(
    # "cd_7030",
    # "cd_8020",
    "cd_sl"
    # "none"
  )
)

metadata_cols <- c(
  "n_training_size", "n_test_size", "n_trainingY_ratio", "n_testY_ratio",
  "resampling", "n_features"
)

# hardcoded list of drugs
drug <- c("ethambutol", "isoniazid", "pyrazinamide", "rifampicin", "streptomycin")

drug_choicenames <- c(
  "EmbB-ethambutol", "KatG-isoniazid", "PncA-pyrazinamide",
  "RpoB-rifampicin", "GidB-streptomycin"
)

gene <- c("embb", "katg", "pnca", "rpob", "gid")

# One row per drug/gene pair, matched by position.
combo <- data.frame(drug, gene)

# Columns kept from every results file: six metrics plus the two identifiers.
score_cols <- c(
  "MCC",
  "F1",
  "Accuracy",
  "JCC",
  "Recall",
  "Precision",
  "source_data",
  "Model_name"
)
|
||||
|
||||
|
||||
# Loader for per-gene CSVs
# Example of a source path:
# "/home/sethp/git/Data/ml_combined/genes/pnca_70_30_complete.csv"
# Each file that can be read is stored in `loaded_files` under
# "<gene>_baselineC_<split label>"; unreadable files are reported and skipped.
loaded_files <- list()
for (x in gene) {
  for (split in split_file) {
    filename <- paste0(data_dir, "LSHTM_ML/output/genes/", x, split, ".csv")
    split_label <- split_map$splits[split_map$files == split]
    load_name <- paste0(x, "_baselineC_", split_label)
    print(load_name)

    # A missing file is not fatal: report it and leave the entry unset.
    loaded <- tryCatch(
      read.csv(filename),
      error = function(e) {
        message("Could not read ", filename, ": ", conditionMessage(e))
        NULL
      }
    )
    if (!is.null(loaded)) {
      loaded_files[[load_name]] <- loaded
    }
  }
}
|
||||
# Loader for per-gene Feature Selection CSVs
# Reads "<gene><split suffix>_FS.csv" and stores it in `loaded_files` under
# "<gene>_baselineC_<split label>_FS"; unreadable files are reported and
# skipped.
for (x in gene) {
  for (split in split_file_FS) {
    filename <- paste0(data_dir, "LSHTM_ML/output/genes/", x, split, "_FS.csv")
    split_label <- split_map$splits[split_map$files == split]
    load_name <- paste0(x, "_baselineC_", split_label, "_FS")
    print(load_name)

    # A missing file is not fatal: report it and leave the entry unset.
    loaded <- tryCatch(
      read.csv(filename),
      error = function(e) {
        message("Could not read ", filename, ": ", conditionMessage(e))
        NULL
      }
    )
    if (!is.null(loaded)) {
      loaded_files[[load_name]] <- loaded
    }
  }
}
|
||||
|
||||
|
||||
# Funky loader for combined data
# Tries every "<n>genes_logo_skf_BT_<gene><suffix>.csv" for n = 1..5 and both
# suffixes. Files that cannot be read are skipped silently; readable ones are
# stored in `loaded_files` under the file's base name with their first column
# removed.
for (x in gene) {
  for (ac in c("_complete", "_FS")) {
    for (gene_count in seq_len(5)) {
      store_name <- paste0(gene_count, "genes_logo_skf_BT_", x, ac)
      filename <- paste0(
        data_dir, "LSHTM_ML/output/combined/", store_name, ".csv"
      )

      # tryCatch rather than try(): the result is post-processed below, so a
      # failed read has to be distinguishable (NULL) from a successful one.
      temp_df <- tryCatch(read.csv(filename), error = function(e) NULL)
      if (is.null(temp_df)) {
        next
      }

      # Throw away the first column. drop = FALSE keeps a data frame even
      # when only one column remains.
      loaded_files[[store_name]] <- temp_df[, -1, drop = FALSE]
      print(paste0("loaded file: ", filename, " into var: ", store_name))
    }
  }
}
|
||||
|
||||
# Reduce one results data frame to the rows and columns used in the tables.
#
# df:      a loaded results data frame with a `resampling` column and all of
#          `score_cols`.
# relabel: named character vector, names = existing `source_data` values,
#          values = their replacements.
# Returns the rows with resampling == "none", restricted to `score_cols`,
# with `source_data` relabelled. Rows whose `resampling` is NA are dropped.
prepare_scores <- function(df, relabel) {
  df <- df[df$resampling %in% "none", score_cols]
  for (old_label in names(relabel)) {
    df[df$source_data %in% old_label, "source_data"] <- relabel[[old_label]]
  }
  df
}

# Rewrite CV/BT to Train/Test and ensure that Train gets presented first:
# the training label becomes "0Train" so that it sorts ahead of "Test".
score_targets <- list(
  list(
    names = paste0(gene, "_baselineC_cd_sl"),
    relabel = c(Train = "0Train")
  ),
  list(
    names = paste0(gene, "_baselineC_cd_sl_FS"),
    relabel = c(CV = "0Train", BT = "Test")
  ),
  list(
    names = paste0("5genes_logo_skf_BT_", gene, "_complete"),
    relabel = c(Train = "0Train")
  ),
  list(
    names = paste0("1genes_logo_skf_BT_", gene, "_FS"),
    relabel = c(Train = "0Train")
  )
)

for (target in score_targets) {
  for (df_name in target$names) {
    # The loaders tolerate missing files, so an entry may be absent here.
    if (is.null(loaded_files[[df_name]])) {
      message("Skipping ", df_name, ": not loaded")
      next
    }
    loaded_files[[df_name]] <- prepare_scores(
      loaded_files[[df_name]], target$relabel
    )
  }
}
|
||||
|
||||
# Metric columns that are spread into "<metric> <source>" columns.
metric_cols <- c("MCC", "F1", "Accuracy", "Recall", "Precision", "JCC")

# Pivot one prepared results data frame to wide form and write it as LaTeX.
#
# x:           gene name, used in the output file name.
# file_prefix: output file name prefix, e.g. "ml_table-".
# df_name:     name of the data frame inside `loaded_files`.
# Writes "<latex_outdir><file_prefix><x>.tex" and returns the wide data frame
# (one row per Model_name, used as row names). Returns NULL without writing
# anything when `df_name` was never loaded.
write_gene_table <- function(x, file_prefix, df_name) {
  scores <- loaded_files[[df_name]]
  if (is.null(scores)) {
    message("Skipping table for ", df_name, ": not loaded")
    return(NULL)
  }

  out_tex <- paste0(latex_outdir, file_prefix, x, ".tex")
  wide <- as.data.frame(
    pivot_wider(
      scores,
      names_from = source_data,
      values_from = all_of(metric_cols),
      names_sep = " ",
      names_sort = TRUE
    )
  )

  # "0Train" only exists to control the sort order; show it as "Train".
  colnames(wide) <- gsub("0Train", "Train", colnames(wide))
  rownames(wide) <- wide$Model_name
  wide <- wide[, colnames(wide) != "Model_name", drop = FALSE]

  print(out_tex)
  # `type` is an argument of the print method, not of xtable().
  print(xtable(wide), type = "latex", file = out_tex)
  wide
}

# Normal tables
for (x in gene) {
  processed_df <- write_gene_table(
    x, "ml_table-", paste0(x, "_baselineC_cd_sl")
  )
  assign(paste0(x, "_out"), processed_df)
}

# Debug preview of the last per-gene table. huxtable is not attached by this
# script, so only run the preview when the package is installed.
if (!is.null(processed_df) && requireNamespace("huxtable", quietly = TRUE)) {
  huxtable::quick_latex(
    huxtable::huxtable(processed_df),
    file = "/tmp/foo.tex",
    open = FALSE
  )
}

# Feature Selection tables
for (x in gene) {
  processed_df <- write_gene_table(
    x, "ml_table_FS-", paste0(x, "_baselineC_cd_sl_FS")
  )
  assign(paste0(x, "_out"), processed_df)
}

# Combined Normal
for (x in gene) {
  processed_df <- write_gene_table(
    x, "ml_table_combined-", paste0("5genes_logo_skf_BT_", x, "_complete")
  )
  assign(paste0(x, "_out"), processed_df)
}

# Combined Feature Selection
for (x in gene) {
  processed_df <- write_gene_table(
    x, "ml_table_combined-FS-", paste0("1genes_logo_skf_BT_", x, "_FS")
  )
  assign(paste0(x, "_out"), processed_df)
}
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue