From 33e3b5a0a6ab19bb4fefa707acdf2d8f97819caf Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Sun, 10 Jul 2022 20:00:35 +0100
Subject: [PATCH] enable df3 summary counts in count_vars_ML.R; reformat cross_val_predict call and split lists

---
 scripts/count_vars_ML.R             | 47 +++++++++++++++++------------
 scripts/ml/ml_functions/MultClfs.py |  7 +++--
 scripts/ml/ml_iterator.py           |  9 ++++--
 3 files changed, 39 insertions(+), 24 deletions(-)

diff --git a/scripts/count_vars_ML.R b/scripts/count_vars_ML.R
index 65a2a77..6f87c92 100644
--- a/scripts/count_vars_ML.R
+++ b/scripts/count_vars_ML.R
@@ -213,32 +213,41 @@ str(a)
 ###################################################
 ###################################################
 # 
-# source("~/git/LSHTM_analysis/config/alr.R")
-# source("~/git/LSHTM_analysis/config/embb.R")
-# source("~/git/LSHTM_analysis/config/gid.R")
-# source("~/git/LSHTM_analysis/config/katg.R")
-# source("~/git/LSHTM_analysis/config/pnca.R")
-# source("~/git/LSHTM_analysis/config/rpob.R")
+source("~/git/LSHTM_analysis/config/alr.R")
+source("~/git/LSHTM_analysis/config/embb.R")
+source("~/git/LSHTM_analysis/config/gid.R")
+source("~/git/LSHTM_analysis/config/katg.R")
+source("~/git/LSHTM_analysis/config/pnca.R")
+source("~/git/LSHTM_analysis/config/rpob.R")
 # 
-# df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv")
-# df3 = read.csv(df3_filename)
+df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv")
+df3 = read.csv(df3_filename)
 # 
-# # mutationinformation
-# length(unique((df3$mutationinformation)))
+# mutationinformation
+length(unique((df3$mutationinformation)))
 # 
 # #dm _om
-# table(df3$mutation_info)
-# table(df3$mutation_info_labels)
-# table(df3$mutation_info_orig)
-# table(df3$mutation_info_labels_orig)
-# 
-# # test_set
-# na_count <-sapply(df3, function(y) sum(length(which(is.na(y)))))
-# na_count[drug]
+table(df3$mutation_info)
+table(df3$mutation_info_labels)
+table(df3$mutation_info_orig)
+table(df3$mutation_info_labels_orig)
+ 
+# test_set
+na_count <-sapply(df3, function(y) sum(length(which(is.na(y)))))
+na_count[drug]
 # 
 # # training set
-# table(df3[drug])
+table(df3[drug])
 # 
 # # drtype: MDR and XDR
 # #table(df3$drtype) orig i.e. incorrect ones!
 # table(df3$drtype_mode_labels)
+
+
+df3_complete = df3
+table(df3_complete$dst_mode)
+
+
+df3_actual =  df3[!is.na(df3$dst), ]
+table(df3_actual$dst_mode)
+
diff --git a/scripts/ml/ml_functions/MultClfs.py b/scripts/ml/ml_functions/MultClfs.py
index 3e6c729..c4a5be1 100755
--- a/scripts/ml/ml_functions/MultClfs.py
+++ b/scripts/ml/ml_functions/MultClfs.py
@@ -354,9 +354,10 @@ def MultModelsCl(input_df, target
            y_pred   = cross_val_predict(model_pipeline
                                         , input_df
                                         , target
-                                        , cv = sel_cv
-                                        #, groups = group
-                                        , **njobs)
+                                        #, groups = group
+                                        , cv=sel_cv
+                                        , **njobs
+                                        )
             #_tn, _fp, _fn, _tp = confusion_matrix(y_pred, y).ravel() # internally
            tn, fp, fn, tp = confusion_matrix(y_pred, target).ravel()
     
diff --git a/scripts/ml/ml_iterator.py b/scripts/ml/ml_iterator.py
index bfa3675..6daf527 100755
--- a/scripts/ml/ml_iterator.py
+++ b/scripts/ml/ml_iterator.py
@@ -48,8 +48,13 @@ ml_gene_drugD = {'pncA'   : 'pyrazinamide'
                  , 'gid'  : 'streptomycin'
                  }
 gene_dataD={}
-split_types = ['70_30', '80_20', 'sl']
-split_data_types = ['actual', 'complete']
+split_types = ['70_30',
+               '80_20',
+               'sl'
+               ]
+split_data_types = ['actual',
+                    'complete'
+                    ]
 
 for gene, drug in ml_gene_drugD.items():
     print ('\nGene:', gene