Moved the logo_skf function to del, as MultClfs is now used for combined data

2022-07-28 12:24:24 +01:00 · 2022-07-28 12:24:24 +01:00 · 2c50124b1b
commit 2c50124b1b
parent a6532ddfa3
8 changed files with 71 additions and 1735 deletions
--- a/scripts/ml/ml_functions/MultClfs_logo_skf.py
+++ b/scripts/ml/ml_functions/MultClfs_logo_skf.py
@ -77,9 +77,11 @@ import re
 import itertools
 from sklearn.model_selection import LeaveOneGroupOut
 from sklearn.decomposition import PCA
+from sklearn.naive_bayes import ComplementNB
+from sklearn.dummy import DummyClassifier

 #%% GLOBALS
-#rs = {'random_state': 42}
+#rs = {'random_state': 42} # INSIDE FUNCTION CALL NOW
 #njobs = {'n_jobs': os.cpu_count() } # the number of jobs should equal the number of CPU cores

 scoring_fn =  ({ 'mcc'        : make_scorer(matthews_corrcoef)
@ -90,8 +92,7 @@ scoring_fn =  ({ 'mcc'        : make_scorer(matthews_corrcoef)
                , 'roc_auc'   : make_scorer(roc_auc_score)
                , 'jcc'       : make_scorer(jaccard_score)
            }) 
- 
-# for sel_cv
+# for sel_cv INSIDE FUNCTION CALL NOW
 #skf_cv = StratifiedKFold(n_splits = 10
 #                          #, shuffle = False, random_state= None)
 #                           , shuffle = True, **rs)
@ -149,25 +150,25 @@ scoreBT_mapD = {'bts_mcc'          : 'MCC'
 ############################
 # Multiple Classification - Model Pipeline
 def MultModelsCl_logo_skf(input_df
-                       , target
-                       , sel_cv
-                       , tts_split_type
-                       , resampling_type
-                       #, group = None
-                       
-                       , add_cm = True # adds confusion matrix based on cross_val_predict
-                       , add_yn = True  # adds target var class numbers
-                       , var_type = ['numerical', 'categorical','mixed']
-                       , scale_numeric = ['min_max', 'std', 'min_max_neg', 'none'] 
+                 , target
+                 , sel_cv
+                 , tts_split_type
+                 , resampling_type
+                 #, group = None
+                
+                 , add_cm = True # adds confusion matrix based on cross_val_predict
+                 , add_yn = True  # adds target var class numbers
+                 , var_type = ['numerical', 'categorical','mixed']
+                 , scale_numeric = ['min_max', 'std', 'min_max_neg', 'none'] 

-                       , run_blind_test = True
-                       , blind_test_df = pd.DataFrame()
-                       , blind_test_target = pd.Series(dtype = int)
-                       , return_formatted_output = True
+                 , run_blind_test = True
+                 , blind_test_df = pd.DataFrame()
+                 , blind_test_target = pd.Series(dtype = int)
+                 , return_formatted_output = True

-                       , random_state = 42
-                       , n_jobs = os.cpu_count() # the number of jobs should equal the number of CPU cores
-                       ):
+                 , random_state = 42
+                 , n_jobs = os.cpu_count() # the number of jobs should equal the number of CPU cores
+                 ):

    '''
    @ param input_df: input features 
@ -189,7 +190,15 @@ def MultModelsCl_logo_skf(input_df
 #%% Func globals        
    rs = {'random_state': random_state}
    njobs = {'n_jobs': n_jobs}
+    
+    skf_cv = StratifiedKFold(n_splits = 10
+                              #, shuffle = False, random_state= None)
+                              , shuffle = True,**rs)

+    rskf_cv = RepeatedStratifiedKFold(n_splits = 10
+                                      , n_repeats = 3
+                                      , **rs)
+    logo = LeaveOneGroupOut()

    # select CV type:           
    # if group == None:
@ -252,8 +261,10 @@ def MultModelsCl_logo_skf(input_df
    #======================================================
    # Specify multiple Classification Models  
    #======================================================
-    models = [('AdaBoost Classifier'            , AdaBoostClassifier(**rs) )
-               , ('Bagging Classifier'          , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
+    models = [('AdaBoost Classifier'         , AdaBoostClassifier(**rs) )
+              , ('Bagging Classifier'        , BaggingClassifier(**rs, **njobs, bootstrap = True, oob_score = True, verbose = 3, n_estimators = 100) )
+               #, ('Bernoulli NB'               , BernoulliNB() ) # pks Naive Bayes, CAUTION
+               , ('Complement NB'             , ComplementNB() )
               , ('Decision Tree'             , DecisionTreeClassifier(**rs) ) 
               , ('Extra Tree'                , ExtraTreeClassifier(**rs) )
               , ('Extra Trees'               , ExtraTreesClassifier(**rs) ) 
@ -265,23 +276,23 @@ def MultModelsCl_logo_skf(input_df
               , ('Logistic Regression'       , LogisticRegression(**rs) )
               , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
               , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
-               , ('Multinomial'               , MultinomialNB() )
-               , ('Naive Bayes'               , BernoulliNB() )
+               , ('Multinomial NB'            , MultinomialNB() )
               , ('Passive Aggresive'         , PassiveAggressiveClassifier(**rs, **njobs) )
               , ('QDA'                       , QuadraticDiscriminantAnalysis() )
               , ('Random Forest'             , RandomForestClassifier(**rs, n_estimators = 1000, **njobs ) ) 
               , ('Random Forest2'            , RandomForestClassifier(min_samples_leaf = 5
-                                                                       , n_estimators     = 1000
-                                                                       , bootstrap        = True
-                                                                       , oob_score        = True
-                                                                       , **njobs
-                                                                       , **rs
-                                                                       , max_features     = 'auto') ) 
-                , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
-                , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
-                , ('SVC'                       , SVC(**rs) ) 
-                , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
-                , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder = False, **njobs) )
+                                                                          , n_estimators     = 1000
+                                                                          , bootstrap        = True
+                                                                          , oob_score        = True
+                                                                          , **njobs
+                                                                          , **rs
+                                                                          , max_features     = 'auto') ) 
+               , ('Ridge Classifier'          , RidgeClassifier(**rs)  )
+               , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
+               , ('SVC'                       , SVC(**rs) ) 
+               , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
+               , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder = False, **njobs) )
+               , ('Dummy Classifier'          , DummyClassifier(strategy = 'most_frequent') )
             ]
                
    mm_skf_scoresD = {}