From 11af00f1db0e7c07d0a0fe107b311ef608fe3166 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Fri, 1 Jul 2022 21:40:14 +0100
Subject: [PATCH] changed ml output dirs and ready to run fs

---
 scripts/ml/combined_model/cm_logo_skf.py    |  8 +-
 scripts/ml/combined_model/cm_ml_iterator.py | 26 +++---
 scripts/ml/ml_iterator.py                   |  8 +-
 scripts/ml/ml_iterator_fs.py                | 82 +++++++++---------
 scripts/ml/running_ml_scripts.txt           | 95 ---------------------
 5 files changed, 67 insertions(+), 152 deletions(-)
 mode change 100644 => 100755 scripts/ml/ml_iterator_fs.py
 delete mode 100644 scripts/ml/running_ml_scripts.txt

diff --git a/scripts/ml/combined_model/cm_logo_skf.py b/scripts/ml/combined_model/cm_logo_skf.py
index f4cf311..24d6af9 100755
--- a/scripts/ml/combined_model/cm_logo_skf.py
+++ b/scripts/ml/combined_model/cm_logo_skf.py
@@ -80,6 +80,8 @@ homedir = os.path.expanduser("~")
 sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
 sys.path
 ###############################################################################
+outdir = homedir + '/git/LSHTM_ML/output/combined/'
+
 #====================
 # Import ML functions 
 #====================
@@ -92,6 +94,9 @@ skf_cv = StratifiedKFold(n_splits = 10 , shuffle = True, random_state = 42)
 
 #logo = LeaveOneGroupOut()
 
+########################################################################
+# COMPLETE data: No tts_split
+########################################################################
 #%%
 def CMLogoSkf(combined_df
         , all_genes =  ["embb", "katg", "rpob", "pnca", "gid", "alr"]
@@ -125,7 +130,8 @@ def CMLogoSkf(combined_df
             
         tts_split_type = "logo_skf_BT_" + bts_gene
         
-        outFile = "/home/tanu/git/Data/ml_combined/" + str(n_tr_genes+1) + "genes_" + tts_split_type + ".csv"
+        outFile = outdir + str(n_tr_genes+1) + "genes_" + tts_split_type + ".csv"
+        
         print(outFile)
     
         #-------
diff --git a/scripts/ml/combined_model/cm_ml_iterator.py b/scripts/ml/combined_model/cm_ml_iterator.py
index e6ea9d2..20e8b0a 100755
--- a/scripts/ml/combined_model/cm_ml_iterator.py
+++ b/scripts/ml/combined_model/cm_ml_iterator.py
@@ -15,19 +15,19 @@ homedir = os.path.expanduser("~")
 sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
 sys.path
 ###############################################################################
+outdir = homedir + '/git/LSHTM_ML/output/combined/'
+
 #====================
 # Import ML functions 
 #====================
-from MultClfs import *
+#from MultClfs import *
+from MultClfs_logo_skf import *
 from GetMLData import *
 from SplitTTS import *
 
-# param dict for getmldata()
-combined_model_paramD = {'data_combined_model'   : False
-                    , 'use_or'                   : False
-                    , 'omit_all_genomic_features': False
-                    , 'write_maskfile'           : False
-                    , 'write_outfile'            : False }
+# Input data
+from ml_data_combined import *
+
 ###############################################################################
 #ml_genes = ["pncA", "embB", "katG", "rpoB", "gid"]
 
@@ -54,7 +54,7 @@ for gene, drug in ml_gene_drugD.items():
 
     for split_type in split_types:
         for data_type in split_data_types:
-            out_filename = (gene.lower()+'_'+split_type+'_'+data_type+'.csv')
+            out_filename = outdir + gene.lower()+ '_' + split_type + '_' + data_type + '.csv'
             tempD=split_tts(gene_dataD[gene_low]
                       , data_type = data_type
                       , split_type = split_type
@@ -88,14 +88,8 @@ for gene, drug in ml_gene_drugD.items():
             
             mmDD = {}
             for k, v in paramD.items():
-                scoresD = MultModelsCl(**paramD[k]
-                                    , tts_split_type = split_type
-                                    , skf_cv = skf_cv
-                                    , blind_test_df =  tempD['X_bts']
-                                    , blind_test_target = tempD['y_bts']
-                                    , add_cm = True 
-                                    , add_yn = True
-                                    , return_formatted_output = True)
+                scoresD = MultModelsCl_logo_skf(**paramD[k]
+                XXXXXXXXXXXXXXXXXXXXXXX
                 mmDD[k] = scoresD
 
             # Extracting the dfs from within the dict and concatenating to output as one df
diff --git a/scripts/ml/ml_iterator.py b/scripts/ml/ml_iterator.py
index e6ea9d2..a7dc7c6 100755
--- a/scripts/ml/ml_iterator.py
+++ b/scripts/ml/ml_iterator.py
@@ -15,6 +15,8 @@ homedir = os.path.expanduser("~")
 sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
 sys.path
 ###############################################################################
+outdir = homedir + '/git/LSHTM_ML/output/genes/'
+
 #====================
 # Import ML functions 
 #====================
@@ -54,7 +56,9 @@ for gene, drug in ml_gene_drugD.items():
 
     for split_type in split_types:
         for data_type in split_data_types:
-            out_filename = (gene.lower()+'_'+split_type+'_'+data_type+'.csv')
+        
+            out_filename = outdir + gene.lower() + '_' + split_type + '_' + data_type + '.csv'
+            
             tempD=split_tts(gene_dataD[gene_low]
                       , data_type = data_type
                       , split_type = split_type
@@ -103,5 +107,5 @@ for gene, drug in ml_gene_drugD.items():
                 out_wf= pd.concat(mmDD, ignore_index = True)
             
             out_wf_f = out_wf.sort_values(by = ['resampling', 'source_data', 'MCC'], ascending = [True, True, False], inplace = False)
-            out_wf_f.to_csv(('/home/tanu/git/Data/ml_combined/genes/'+out_filename), index = False)
+            out_wf_f.to_csv(out_filename, index = False)
 
diff --git a/scripts/ml/ml_iterator_fs.py b/scripts/ml/ml_iterator_fs.py
old mode 100644
new mode 100755
index 60ec2db..57d93e2
--- a/scripts/ml/ml_iterator_fs.py
+++ b/scripts/ml/ml_iterator_fs.py
@@ -15,6 +15,8 @@ homedir = os.path.expanduser("~")
 sys.path.append(homedir + '/git/LSHTM_analysis/scripts/ml/ml_functions')
 sys.path
 ###############################################################################
+outdir = homedir + '/git/LSHTM_ML/output/fs/'
+
 #====================
 # Import ML functions 
 #====================
@@ -31,7 +33,8 @@ combined_model_paramD = {'data_combined_model'   : False
                     , 'write_outfile'            : False }
 ###############################################################################
 #ml_genes = ["pncA", "embB", "katG", "rpoB", "gid"]
-outdir = homedir + '/git/Data/ml_combined/fs/'
+# outdir = homedir + '/git/Data/ml_combined/fs/'
+
 ml_gene_drugD = {'pncA'   : 'pyrazinamide'
                  # , 'embB' : 'ethambutol'
                  # , 'katG' : 'isoniazid'
@@ -39,26 +42,27 @@ ml_gene_drugD = {'pncA'   : 'pyrazinamide'
                  # , 'gid'  : 'streptomycin'
                  }
 gene_dataD={}
-#split_types = ['70_30', '80_20', 'sl']
-#split_data_types = ['actual', 'complete']
-split_types = ['70_30']
+split_types = ['70_30', '80_20', 'sl']
 split_data_types = ['actual', 'complete']
+#split_types = ['70_30']
+#split_data_types = ['actual', 'complete']
 
-fs_models  = [('Logistic Regression'       , LogisticRegression(**rs) )]
-# fs_models = [('AdaBoost Classifier'     , AdaBoostClassifier(**rs) )
-#           , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
-#           , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
-#           , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
-#           , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
-#           , ('LDA'                       , LinearDiscriminantAnalysis() )
-#           , ('Logistic Regression'       , LogisticRegression(**rs) )
-#           , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
-#           , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
-#           , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000 ) ) 
-#           , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
-#           , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
-#           , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
-#           ]
+#fs_models  = [('Logistic Regression'       , LogisticRegression(**rs) )]
+
+fs_models = [('AdaBoost Classifier'     , AdaBoostClassifier(**rs) )
+          , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
+          , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
+          , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
+          , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
+          , ('LDA'                       , LinearDiscriminantAnalysis() )
+          , ('Logistic Regression'       , LogisticRegression(**rs) )
+          , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
+          , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
+          , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000 ) ) 
+          , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
+          , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
+          , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
+          ]
 
 for gene, drug in ml_gene_drugD.items():
     print ('\nGene:', gene
@@ -88,26 +92,28 @@ for gene, drug in ml_gene_drugD.items():
                                         , 'target'         : tempD['y']
                                         , 'var_type'       : 'mixed'
                                         , 'resampling_type': 'none'}
-                    ,'smnc_paramD': { 'input_df'          : tempD['X_smnc']
-                                     , 'target'           : tempD['y_smnc']
-                                     , 'var_type'         : 'mixed'
-                                     , 'resampling_type'  : 'smnc'}
-                    # , 'ros_paramD': { 'input_df'           : tempD['X_ros']
-                    #                 , 'target'             : tempD['y_ros']
-                    #                 , 'var_type'           : 'mixed'
-                    #                 , 'resampling_type'    : 'ros'}
-                    # , 'rus_paramD' : { 'input_df'          : tempD['X_rus']
-                    #                   , 'target'           : tempD['y_rus']
-                    #                   , 'var_type'         : 'mixed'
-                    #                   , 'resampling_type'  : 'rus'}
-                    # , 'rouC_paramD' : { 'input_df'         : tempD['X_rouC']
-                    #                     , 'target'         : tempD['y_rouC']
-                    #                     , 'var_type'       : 'mixed'
-                    #                     , 'resampling_type': 'rouC'}
+                    
+                    , 'smnc_paramD'  : { 'input_df'          : tempD['X_smnc']
+                                        , 'target'           : tempD['y_smnc']
+                                        , 'var_type'         : 'mixed'
+                                        , 'resampling_type'  : 'smnc'}
+
+                    , 'ros_paramD'   : { 'input_df'           : tempD['X_ros']
+                                        , 'target'             : tempD['y_ros']
+                                        , 'var_type'           : 'mixed'
+                                        , 'resampling_type'    : 'ros'}
+
+                    , 'rus_paramD'   : { 'input_df'          : tempD['X_rus']
+                                        , 'target'           : tempD['y_rus']
+                                        , 'var_type'         : 'mixed'
+                                        , 'resampling_type'  : 'rus'}
+
+                    , 'rouC_paramD'  : { 'input_df'         : tempD['X_rouC']
+                                        , 'target'         : tempD['y_rouC']
+                                        , 'var_type'       : 'mixed'
+                                        , 'resampling_type': 'rouC'}
                     }
-            #for m in fs_models:
-            #    print(m)
-            
+
             out_fsD = {}
             index = 1
             for model_name, model_fn in fs_models:
diff --git a/scripts/ml/running_ml_scripts.txt b/scripts/ml/running_ml_scripts.txt
deleted file mode 100644
index 279cb9d..0000000
--- a/scripts/ml/running_ml_scripts.txt
+++ /dev/null
@@ -1,95 +0,0 @@
-########################################################################
-
-#                          70/30 [WITHOUT OR]
-
-########################################################################
-
-=-----------------------------------=
-# actual data
-#------------------------------------=
-
-time ./run_7030.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_7030_.txt
-time ./run_7030.py -g embB -d ethambutol 2>&1 | tee log_embb_7030_.txt
-time ./run_7030.py -g katG -d isoniazid 2>&1 | tee log_katg_7030_.txt
-time ./run_7030.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_7030_.txt
-time ./run_7030.py -g gid -d streptomycin 2>&1 | tee log_gid_7030_.txt
-time ./run_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_7030_.txt
-
-=-----------------------------------=
-# COMPLETE data
-#------------------------------------=
-
-time ./run_cd_7030.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_cd_7030_.txt
-time ./run_cd_7030.py -g embB -d ethambutol 2>&1 | tee log_embb_cd_7030_.txt
-time ./run_cd_7030.py -g katG -d isoniazid 2>&1 | tee log_katg_cd_7030_.txt
-time ./run_cd_7030.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_cd_7030_.txt
-time ./run_cd_7030.py -g gid -d streptomycin 2>&1 | tee log_gid_cd_7030_.txt
-time ./run_cd_7030.py -g alr -d cycloserine 2>&1 | tee log_alr_cd_7030_.txt
-
-
-########################################################################
-
-#                      80/20 [WITHOUT OR]
-
-########################################################################
-=-----------------------------------=
-# actual data
-#------------------------------------=
-
-time ./run_8020.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_8020_.txt
-time ./run_8020.py -g embB -d ethambutol 2>&1 | tee log_embb_8020_.txt
-time ./run_8020.py -g katG -d isoniazid 2>&1 | tee log_katg_8020_.txt
-time ./run_8020.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_8020_.txt
-time ./run_8020.py -g gid -d streptomycin 2>&1 | tee log_gid_8020_.txt
-time ./run_8020.py -g alr -d cycloserine 2>&1 | tee log_alr_8020_.txt
-
-=-----------------------------------=
-# COMPLETE data
-#------------------------------------=
-
-time ./run_cd_8020.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_cd_8020_.txt
-time ./run_cd_8020.py -g embB -d ethambutol 2>&1 | tee log_embb_cd_8020_.txt
-time ./run_cd_8020.py -g katG -d isoniazid 2>&1 | tee log_katg_cd_8020_.txt
-time ./run_cd_8020.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_cd_8020_.txt
-time ./run_cd_8020.py -g gid -d streptomycin 2>&1 | tee log_gid_cd_8020_.txt
-time ./run_cd_8020.py -g alr -d cycloserine 2>&1 | tee log_alr_cd_8020_.txt
-########################################################################
-
-#                          SL [WITHOUT OR]
-
-########################################################################
-
-=-----------------------------------=
-# actual data
-#------------------------------------=
-time ./run_sl.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_sl_.txt
-time ./run_sl.py -g embB -d ethambutol 2>&1 | tee log_embb_sl_.txt
-time ./run_sl.py -g katG -d isoniazid 2>&1 | tee log_katg_sl_.txt
-time ./run_sl.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_sl_.txt
-time ./run_sl.py -g gid -d streptomycin 2>&1 | tee log_gid_sl_.txt
-time ./run_sl.py -g alr -d cycloserine 2>&1 | tee log_alr_sl_.txt
-
-=-----------------------------------=
-# COMPLETE data
-#------------------------------------=
-time ./run_cd_sl.py -g pncA -d pyrazinamide 2>&1 | tee log_pnca_cd_sl_.txt
-time ./run_cd_sl.py -g embB -d ethambutol 2>&1 | tee log_embb_cd_sl_.txt
-time ./run_cd_sl.py -g katG -d isoniazid 2>&1 | tee log_katg_cd_sl_.txt
-time ./run_cd_sl.py -g rpoB -d rifampicin 2>&1 | tee log_rpob_cd_sl_.txt
-time ./run_cd_sl.py -g gid -d streptomycin 2>&1 | tee log_gid_cd_sl_.txt
-time ./run_cd_sl.py -g alr -d cycloserine 2>&1 | tee log_alr_cd_sl_.txt
-
-
-########################################################################
-
-########################################################################
-########################################################################
-######################   Feature Selection    ##########################
-########################################################################
-########################################################################
-
-# 7030
-time ./run_FS.py -g pncA -d pyrazinamide 2>&1 | tee log_FS_pnca_7030.txt
-
-
-time ./run_FS_7030.py -g pncA -d pyrazinamide 2>&1 | tee log_FS_pnca_7030_.txt