From 79cb89a0197d1d73e733f754eb4a627d0ccf7e2a Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 5 Jul 2022 16:06:03 +0100
Subject: [PATCH] WIP: rename XGBClf to MultClfs_fi, pass n_jobs to XGBoost, drop driver script

---
 scripts/ml/ml_functions/MultClfs.py    |  2 +-
 scripts/ml/ml_functions/MultClfs_fi.py | 93 +++++++++++---------------
 2 files changed, 39 insertions(+), 56 deletions(-)

diff --git a/scripts/ml/ml_functions/MultClfs.py b/scripts/ml/ml_functions/MultClfs.py
index d3b684a..688caf3 100755
--- a/scripts/ml/ml_functions/MultClfs.py
+++ b/scripts/ml/ml_functions/MultClfs.py
@@ -227,7 +227,7 @@ def MultModelsCl(input_df, target, skf_cv
                 , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
                 , ('SVC'                       , SVC(**rs) ) 
                 , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
-                , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False) )
+                , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) )
              ]
                 
     mm_skf_scoresD = {}
diff --git a/scripts/ml/ml_functions/MultClfs_fi.py b/scripts/ml/ml_functions/MultClfs_fi.py
index 3803bd7..89562e2 100644
--- a/scripts/ml/ml_functions/MultClfs_fi.py
+++ b/scripts/ml/ml_functions/MultClfs_fi.py
@@ -74,6 +74,8 @@ from sklearn.impute import KNNImputer as KNN
 import json
 import argparse
 import re
+from sklearn.model_selection import LeaveOneGroupOut
+
 #%% GLOBALS
 rs = {'random_state': 42}
 njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores
@@ -95,6 +97,9 @@ rskf_cv = RepeatedStratifiedKFold(n_splits = 10
                                   , n_repeats = 3
                                   , **rs)
 
+logo = LeaveOneGroupOut()
+
+
 mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
 jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
 ###############################################################################
@@ -116,7 +121,7 @@ def remove(string):
 # Run Multiple Classifiers
 ############################
 # Multiple Classification - Model Pipeline
-def XGBClf(input_df, target, sel_cv
+def MultClfs_fi(input_df, target, sel_cv
                        , blind_test_df
                        , blind_test_target
                        , tts_split_type 
@@ -175,9 +180,37 @@ def XGBClf(input_df, target, sel_cv
     #======================================================
     # Specify multiple Classification Models  
     #======================================================
-    models = [ ('XGBoost' , XGBClassifier(**rs, verbosity = 3, use_label_encoder = False, **njobs) ) 
-              , ( 'Random Forest', RandomForestClassifier(**rs, **njobs, n_estimators = 1000))
-              , ('Logistic Regression', LogisticRegression(**rs))]    
+    models = [('AdaBoost Classifier'     , AdaBoostClassifier(**rs) )
+               # , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True) )
+               #  , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
+               #  , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
+               #  , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
+               #  , ('Gradient Boosting'         , GradientBoostingClassifier(**rs) )
+               #  , ('Gaussian NB'               , GaussianNB() )
+               #  , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
+               #  , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
+               #  , ('LDA'                       , LinearDiscriminantAnalysis() )
+               #  , ('Logistic Regression'       , LogisticRegression(**rs) )
+               #  , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
+               #  , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
+               #  , ('Multinomial'               , MultinomialNB() )
+               #  , ('Naive Bayes'               , BernoulliNB() )
+               #  , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
+               #  , ('QDA'                       , QuadraticDiscriminantAnalysis() )
+               #  , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) 
+               #  # , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
+               #  #                                                         , n_estimators     = 1000
+               #  #                                                         , bootstrap        = True
+               #  #                                                         , oob_score        = True
+               #  #                                                         , **njobs
+               #  #                                                         , **rs
+               #  #                                                         , max_features     = 'auto') ) 
+               #   , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
+               #   , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
+               #   , ('SVC'                       , SVC(**rs) ) 
+                , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
+                , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) )
+             ]
        
     mm_skf_scoresD = {}
     
@@ -270,54 +303,4 @@ def XGBClf(input_df, target, sel_cv
            mm_skf_scoresD[model_name]['bts_jcc']       = round(jaccard_score(blind_test_target, bts_predict),2)
            
     return(mm_skf_scoresD)
-#%%
-sel_cv = skf_cv
-# param dict for getmldata()
-combined_model_paramD = {'data_combined_model'   : False
-                    , 'use_or'                   : False
-                    , 'omit_all_genomic_features': False
-                    , 'write_maskfile'           : False
-                    , 'write_outfile'            : False }
-#df = getmldata(gene, drug, **combined_model_paramD)
-df = getmldata('pncA', 'pyrazinamide', **combined_model_paramD)
-
-df2 = split_tts(df
-          , data_type = 'actual'
-          , split_type = '80_20'
-          , oversampling = False
-          , dst_colname = 'dst'
-          , target_colname = 'dst_mode'
-          , include_gene_name = True
-          , random_state = 42 # default
-      )
-
-all(df2['X'].columns.isin(['gene_name']))
-
-
-fooD = XGBClf (input_df = df2['X']
-                , target = df2['y']
-                , sel_cv = skf_cv
-                , run_blind_test = True
-                , blind_test_df =  df2['X_bts']
-                , blind_test_target =  df2['y_bts']
-                , tts_split_type  = '80_20'
-                , var_type = 'mixed'
-                , resampling_type = 'none' # default
-)
-
-
-for k, v in fooD.items():
-    print('\nK:', k
-          , '\nTRAIN MCC:', fooD[k]['test_mcc']
-          ,  '\nBTS MCC:' , fooD[k]['bts_mcc'] )
-    
-#%%
-# # fit model no training data
-# model = XGBClassifier()
-# model.fit( df2['X'], df2['y'])
-# # feature importance
-# print(model.feature_importances_)
-# # plot
-# pyplot.bar(range(len(model.feature_importances_)), model.feature_importances_)
-# pyplot.show()
-
+#%%
\ No newline at end of file