added intra_model_gscv.py: tells me which hyperparams are best within each model, so I can pick each model's best-tuned version and then compare across models ('INTER' model comparison)
This commit is contained in:
parent
d3b6fe13a6
commit
ffd3ce6ee3
1 changed file with 170 additions and 0 deletions
170
intra_model_gscv.py
Normal file
@@ -0,0 +1,170 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 18 16:44:18 2022

@author: tanu
"""
# Custom GridSearch <intra model>
# https://stackoverflow.com/questions/23045318/grid-search-over-multiple-classifiers
from copy import deepcopy
import pprint as pp

from sklearn.base import BaseEstimator
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

# Assumption: rs and njobs are shared kwarg dicts defined once for the whole
# project; the values below are placeholders so this file runs standalone.
rs = {'random_state': 42}
njobs = {'n_jobs': -1}

#%% ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - the classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    #def score(self, X, y):
    #    return self.estimator.score(X, y)

    #def recall_score(self, X, y):
    #    return self.estimator.recall_score(X, y)
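
#%% (Added sketch) How ClfSwitcher is meant to be used.
# ClfSwitcher is never called in this file; this hedged example, following the
# Stack Overflow pattern linked above, shows a single pipeline whose 'clf'
# step is swapped between estimators by GridSearchCV. The param grids and the
# names X, y, skf_cv are assumptions, so the block is left commented out.
# switcher_pipe = Pipeline([('pre', MinMaxScaler()), ('clf', ClfSwitcher())])
# switcher_params = [
#     {'clf__estimator': [SGDClassifier(**rs)],
#      'clf__estimator__penalty': ['l2', 'l1']},
#     {'clf__estimator': [KNeighborsClassifier()],
#      'clf__estimator__n_neighbors': [3, 7, 10]},
# ]
# gs_switch = GridSearchCV(switcher_pipe, switcher_params, cv = skf_cv, **njobs)
# gs_switch.fit(X, y)
# print(gs_switch.best_params_)   # includes which estimator won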
#%% Custom GridSearch: IntraModel [orig]
# Assumption: module-level scoring/refit default for this original version;
# the later grid_search() takes the equivalent dict as a parameter instead.
scoring_refit = {'scoring': 'recall', 'refit': 'recall'}

def grid_search2(input_df, target, skf_cv, var_type = ['numerical', 'categorical', 'mixed']):

    pipeline1 = Pipeline((
        ('pre', MinMaxScaler())
        , ('clf', DecisionTreeClassifier(**rs))
    ))

    pipeline2 = Pipeline((
        ('pre', MinMaxScaler())
        , ('clf', KNeighborsClassifier())
    ))

    parameters1 = {
        'clf__max_depth': [2, 4, 6, 8, 10]
        , 'clf__criterion': ['gini', 'entropy']
        , 'clf__max_features': ['auto', None]   # note: 'auto' was removed for trees in sklearn >= 1.3; use 'sqrt'/'log2' there
        , 'clf__max_leaf_nodes': [10, 20, 30, 40]
    }

    parameters2 = {
        'clf__n_neighbors': [3, 7, 10],
        'clf__weights': ['uniform', 'distance']
    }

    pips = [pipeline1
            , pipeline2
            ]

    pars = [parameters1
            , parameters2
            ]

    print("\nStarting Gridsearch")
    for i in range(len(pars)):
        print('IIIII===>', i)
        gs = GridSearchCV(pips[i], pars[i]
                          , cv = skf_cv
                          , **scoring_refit
                          #, refit=False
                          , **njobs
                          , verbose=3)
        gs = gs.fit(input_df, target)   # was gs.fit(X, y): fit the function args, not globals
        print("Finished Gridsearch")
        print('\nBest model:', gs.best_params_)
        print('\nBest score:', gs.best_score_)
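
#%% (Added sketch) Example call for grid_search2, which only prints results.
# Unlike grid_search() below, nothing is returned; X, y and skf_cv are
# assumed to be prepared elsewhere in the project.
# grid_search2(X, y, skf_cv = skf_cv)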
#%% Custom grid_search: Intra-Model [with return]
def grid_search(input_df, target
                , skf_cv
                , chosen_scoreD   # scoring_refit
                #, var_type = ['numerical', 'categorical', 'mixed']
                ):
    # Pipeline_params
    pipeline1 = Pipeline((
        ('pre', MinMaxScaler())
        , ('clf', DecisionTreeClassifier(**rs))
    ))

    pipeline2 = Pipeline((
        ('pre', MinMaxScaler())
        , ('clf', KNeighborsClassifier())
    ))

    parameters1 = {
        'clf__max_depth': [2, 4, 6, 8, 10]
        , 'clf__criterion': ['gini', 'entropy']
        , 'clf__max_features': ['auto', None]   # see note on 'auto' above
        , 'clf__max_leaf_nodes': [10, 20, 30, 40]
    }

    parameters2 = {
        'clf__n_neighbors': [3, 7, 10],
        'clf__weights': ['uniform', 'distance']
    }

    all_parameters = [parameters1
                      , parameters2
                      ]
    all_pipelines = [pipeline1
                     , pipeline2
                     ]
    print("\nStarting Gridsearch")

    # Run gridsearch for each model
    out = {}
    for i in range(len(all_parameters)):
        model_name = str(all_pipelines[i].steps[1][1])

        print("\nStarting Gridsearch for model:", model_name, i)
        gs = GridSearchCV(all_pipelines[i], all_parameters[i]
                          , cv = skf_cv
                          #, refit=False
                          , **chosen_scoreD
                          , **njobs
                          , verbose=3)
        gs = gs.fit(input_df, target)
        print('=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=')
        print("Finished Gridsearch")
        print('\nScore type used to choose best model:', chosen_scoreD['refit'])   # was the global chosen_score

        best_score = gs.best_score_
        best_params = deepcopy(gs.best_params_)
        print('\nBest score:', best_score, '\ntype: ', type(best_score))
        print('\nBest params:', best_params, '\ntype: ', type(best_params))

        out[model_name] = best_params
        out[model_name].update(chosen_scoreD.copy())
        out[model_name].update({'best_score': gs.best_score_})
    return out

#%% call CUSTOM grid_search: INTRA model [with return]
# call
# Assumption: X, y (features, target) and skf_cv (e.g. a StratifiedKFold
# splitter) are prepared elsewhere in the project.
chosen_score = {'scoring': 'recall'
                , 'refit': 'recall'}
mcc_score_fn = {'chosen_scoreD': {'scoring': {'mcc': make_scorer(matthews_corrcoef)
                                              , 'refit': 'mcc'}   # bug: 'refit' is nested inside
                                  }                               # 'scoring', so GridSearchCV
                }                                                 # rejects these kwargs

intra_models = grid_search(X, y
                           , skf_cv = skf_cv
                           , chosen_scoreD = chosen_score
                           #, **mcc_score_fn   # doesn't work (see nesting bug above)
                           )
pp.pprint(intra_models)

# TO DO: combine other score metrics for the best model: this will show which
# hyperparams are best under each metric
# (shouldn't be hard, as the dicts for a given model type have the same shape)
# TO DO: allow specifying a different score and appending it to the model's entry
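
#%% (Added sketch) One way to tackle the first TO DO: GridSearchCV accepts a
# dict of scorers plus refit=<metric name>, after which cv_results_ carries a
# mean_test_<name> column per metric. Note that 'refit' sits BESIDE 'scoring',
# not inside it (the nesting bug in mcc_score_fn above). Names here are
# assumptions, so the block is left commented out.
# multi_scoreD = {'scoring': {'mcc': make_scorer(matthews_corrcoef)
#                             , 'recall': 'recall'}
#                 , 'refit': 'mcc'}
# intra_models_mcc = grid_search(X, y
#                                , skf_cv = skf_cv
#                                , chosen_scoreD = multi_scoreD)
# pp.pprint(intra_models_mcc)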