From 33e3b5a0a6ab19bb4fefa707acdf2d8f97819caf Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Sun, 10 Jul 2022 20:00:35 +0100 Subject: [PATCH] various bugs --- scripts/count_vars_ML.R | 47 +++++++++++++++++------------ scripts/ml/ml_functions/MultClfs.py | 7 +++-- scripts/ml/ml_iterator.py | 9 ++++-- 3 files changed, 39 insertions(+), 24 deletions(-) diff --git a/scripts/count_vars_ML.R b/scripts/count_vars_ML.R index 65a2a77..6f87c92 100644 --- a/scripts/count_vars_ML.R +++ b/scripts/count_vars_ML.R @@ -213,32 +213,41 @@ str(a) ################################################### ################################################### # -# source("~/git/LSHTM_analysis/config/alr.R") -# source("~/git/LSHTM_analysis/config/embb.R") -# source("~/git/LSHTM_analysis/config/gid.R") -# source("~/git/LSHTM_analysis/config/katg.R") -# source("~/git/LSHTM_analysis/config/pnca.R") -# source("~/git/LSHTM_analysis/config/rpob.R") +source("~/git/LSHTM_analysis/config/alr.R") +source("~/git/LSHTM_analysis/config/embb.R") +source("~/git/LSHTM_analysis/config/gid.R") +source("~/git/LSHTM_analysis/config/katg.R") +source("~/git/LSHTM_analysis/config/pnca.R") +source("~/git/LSHTM_analysis/config/rpob.R") # -# df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv") -# df3 = read.csv(df3_filename) +df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv") +df3 = read.csv(df3_filename) # -# # mutationinformation -# length(unique((df3$mutationinformation))) +# mutationinformation +length(unique((df3$mutationinformation))) # # #dm _om -# table(df3$mutation_info) -# table(df3$mutation_info_labels) -# table(df3$mutation_info_orig) -# table(df3$mutation_info_labels_orig) -# -# # test_set -# na_count <-sapply(df3, function(y) sum(length(which(is.na(y))))) -# na_count[drug] +table(df3$mutation_info) +table(df3$mutation_info_labels) +table(df3$mutation_info_orig) +table(df3$mutation_info_labels_orig) + +# test_set +na_count <-sapply(df3, function(y) sum(length(which(is.na(y))))) +na_count[drug] # # # training set -# table(df3[drug]) +table(df3[drug]) # # # drtype: MDR and XDR # #table(df3$drtype) orig i.e. incorrect ones! # table(df3$drtype_mode_labels) + + +df3_complete = df3 +table(df3_complete$dst_mode) + + +df3_actual = df3[!is.na(df3$dst), ] +table(df3_actual$dst_mode) + diff --git a/scripts/ml/ml_functions/MultClfs.py b/scripts/ml/ml_functions/MultClfs.py index 3e6c729..c4a5be1 100755 --- a/scripts/ml/ml_functions/MultClfs.py +++ b/scripts/ml/ml_functions/MultClfs.py @@ -354,9 +354,10 @@ def MultModelsCl(input_df, target y_pred = cross_val_predict(model_pipeline , input_df , target - , cv = sel_cv - #, groups = group - , **njobs) + #, commented out thing, + , cv=sel_cv + , **njobs + ) #_tn, _fp, _fn, _tp = confusion_matrix(y_pred, y).ravel() # internally tn, fp, fn, tp = confusion_matrix(y_pred, target).ravel() diff --git a/scripts/ml/ml_iterator.py b/scripts/ml/ml_iterator.py index bfa3675..6daf527 100755 --- a/scripts/ml/ml_iterator.py +++ b/scripts/ml/ml_iterator.py @@ -48,8 +48,13 @@ ml_gene_drugD = {'pncA' : 'pyrazinamide' , 'gid' : 'streptomycin' } gene_dataD={} -split_types = ['70_30', '80_20', 'sl'] -split_data_types = ['actual', 'complete'] +split_types = ['70_30', + '80_20', + 'sl' + ] +split_data_types = ['actual', + 'complete' + ] for gene, drug in ml_gene_drugD.items(): print ('\nGene:', gene