#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 18 16:44:18 2022
@author: tanu
"""
# Custom GridSearch <intra model>
# https://stackoverflow.com/questions/23045318/grid-search-over-multiple-classifiers

from copy import deepcopy
import pprint

from sklearn.base import BaseEstimator
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

pp = pprint.PrettyPrinter(indent = 4)

# NOTE: rs (random-state kwargs), njobs (n_jobs kwargs), scoring_refit,
# skf_cv (a CV splitter) and the data (X, y) are expected to be defined by
# the driver script that runs this module.

#%% ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - the classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    #def score(self, X, y):
    #    return self.estimator.score(X, y)

    #def recall_score(self, X, y):
    #    return self.estimator.recall_score(X, y)
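#%% Example: grid search ACROSS classifiers with ClfSwitcher [sketch]
# A minimal sketch (not part of the original script) of how ClfSwitcher can be
# combined with GridSearchCV to search over several classifiers at once, as in
# the StackOverflow link above. The grids, cv = 5 and scoring = 'recall' below
# are illustrative assumptions only.
def clf_switcher_example(X, y):
    pipe = Pipeline([('pre', MinMaxScaler())
                     , ('clf', ClfSwitcher())])
    switch_params = [
        {'clf__estimator': [DecisionTreeClassifier(random_state = 42)],
         'clf__estimator__max_depth': [2, 4, 6]},
        {'clf__estimator': [KNeighborsClassifier()],
         'clf__estimator__n_neighbors': [3, 7, 10]},
    ]
    gs_switch = GridSearchCV(pipe, switch_params
                             , cv = 5
                             , scoring = 'recall'
                             , verbose = 3)
    gs_switch = gs_switch.fit(X, y)
    return gs_switch.best_params_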
#%% Custom GridSearch: IntraModel[orig]
def grid_search(input_df, target, sel_cv, var_type = ['numerical', 'categorical', 'mixed']):
    pipeline1 = Pipeline([
        ('pre', MinMaxScaler())
        , ('clf', DecisionTreeClassifier(**rs))
    ])
    pipeline2 = Pipeline([
        ('pre', MinMaxScaler())
        , ('clf', KNeighborsClassifier())
    ])
    parameters1 = {
        'clf__max_depth': [2, 4, 6, 8, 10]
        , 'clf__criterion': ['gini', 'entropy']
        , 'clf__max_features': ['auto', None]   # 'auto' is deprecated in newer sklearn; 'sqrt' is the equivalent
        , 'clf__max_leaf_nodes': [10, 20, 30, 40]
    }
    parameters2 = {
        'clf__n_neighbors': [3, 7, 10],
        'clf__weights': ['uniform', 'distance']
    }
    pips = [pipeline1
            , pipeline2
            ]
    pars = [parameters1
            , parameters2
            ]
    print("\nStarting Gridsearch")
    for i in range(len(pars)):
        print('IIIII===>', i)
        gs = GridSearchCV(pips[i], pars[i]
                          , cv = sel_cv
                          , **scoring_refit
                          #, refit = False
                          , **njobs
                          , verbose = 3)
        gs = gs.fit(input_df, target)   # was gs.fit(X, y): use the function arguments, not globals
        print("Finished Gridsearch")
        print('\nBest model:', gs.best_params_)
        print('\nBest score:', gs.best_score_)
# TODO: add
# # summarize results
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
# print("%f (%f) with: %r" % (mean, stdev, param))
# CALL: grid_search [orig]
# X, y and skf_cv are assumed to come from the driver script
grid_search(X, y, sel_cv = skf_cv)
#%% Custom grid_search: Intra-Model [with return]
def grid_search(input_df, target
                , sel_cv
                , chosen_scoreD  # scoring_refit
                #, var_type = ['numerical', 'categorical', 'mixed']
                ):
    # Pipelines and parameter grids
    pipeline1 = Pipeline([
        ('pre', MinMaxScaler())
        , ('clf', DecisionTreeClassifier(**rs))
    ])
    pipeline2 = Pipeline([
        ('pre', MinMaxScaler())
        , ('clf', KNeighborsClassifier())
    ])
    parameters1 = {
        'clf__max_depth': [2, 4, 6, 8, 10]
        , 'clf__criterion': ['gini', 'entropy']
        , 'clf__max_features': ['auto', None]
        , 'clf__max_leaf_nodes': [10, 20, 30, 40]
    }
    parameters2 = {
        'clf__n_neighbors': [3, 7, 10],
        'clf__weights': ['uniform', 'distance']
    }
    all_parameters = [parameters1
                      , parameters2
                      ]
    all_pipelines = [pipeline1
                     , pipeline2
                     ]
    print("\nStarting Gridsearch")
    # Run gridsearch for each model
    out = {}
    for i in range(len(all_parameters)):
        model_name = str(all_pipelines[i].steps[1][1])
        #model_name = str(model.steps[1][1])
        #out[model_name] = dict()
        print("\nStarting Gridsearch for model:", model_name, i)
        gs = GridSearchCV(all_pipelines[i], all_parameters[i]
                          , cv = sel_cv
                          #, **scoring_refit
                          #, refit = False
                          , **chosen_scoreD
                          , **njobs
                          , verbose = 3)
        gs = gs.fit(input_df, target)
        print('=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=')
        print("Finished Gridsearch")
        print('\nScore type used to choose best score:', chosen_scoreD)   # was chosen_score (undefined inside the function)
        best_model = gs.best_params_
        best_score = gs.best_score_
        best_params = deepcopy(gs.best_params_)
        print('\nBest score:', best_score, '\ntype: ', type(best_score))
        print('\nBest params:', best_params, '\ntype: ', type(best_params))
        out[model_name] = best_params
        out[model_name].update(chosen_scoreD.copy())
        out[model_name].update({'best_score': gs.best_score_}.copy())
    return(out)
# TODO:
# print/inspect, for each model, the mean test score and sd; sometimes they are
# identical and the "best" model just picks one! (see the sketch below)
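#%% Helper: flag near-tied candidates [sketch]
# A minimal sketch for the TODO above (an assumption, not part of the original
# script): list every candidate whose mean test score matches the winner's,
# since best_params_ silently picks just one of the tied settings.
def show_tied_candidates(grid_result, tol = 1e-6):
    means  = grid_result.cv_results_['mean_test_score']
    stds   = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    best   = max(means)
    for mean, stdev, param in zip(means, stds, params):
        if abs(mean - best) <= tol:
            print("tied at %f (sd %f): %r" % (mean, stdev, param))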
#%% call CUSTOM grid_search: INTRA model [with return]
# call
chosen_score = {'scoring': 'recall'
                , 'refit': 'recall'}
mcc_score_fn = {'chosen_scoreD': {'scoring': {'mcc': make_scorer(matthews_corrcoef)}
                                  , 'refit': 'mcc'}   # 'refit' must name a key of the 'scoring' dict
                }
intra_models = grid_search(X, y
                           , sel_cv = skf_cv            # the cv argument is named 'sel_cv'
                           , chosen_scoreD = chosen_score)
#intra_models = grid_search(X, y, sel_cv = skf_cv, **mcc_score_fn)  # MCC alternative; should work now that 'refit' sits outside 'scoring'
pp.pprint(intra_models)
# TO DO: combine other score metrics for the best model: this will tell you which hyperparams are best under each metric
# shouldn't be hard, as the dicts for a given model type will have the same shape
# TO DO: if you can, specify different scores and append them to the model output (see the multi-metric sketch below)
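#%% Sketch: multi-metric scoring for the TO DOs above [assumption]
# One hedged way to "combine other score metrics": pass several scorers to a
# single GridSearchCV and refit on one of them; cv_results_ then carries
# per-metric mean/std columns for the same candidates, so the best hyperparams
# can be compared across metrics. The grid, n_jobs and metric choice below are
# illustrative assumptions, not part of the original script.
def grid_search_multi_metric(input_df, target, sel_cv):
    multi_scoring = {'recall': 'recall'
                     , 'mcc': make_scorer(matthews_corrcoef)}
    pipe = Pipeline([
        ('pre', MinMaxScaler())
        , ('clf', DecisionTreeClassifier(random_state = 42))
    ])
    params = {'clf__max_depth': [2, 4, 6, 8, 10]}
    gs = GridSearchCV(pipe, params
                      , cv = sel_cv
                      , scoring = multi_scoring
                      , refit = 'mcc'      # best_params_/best_score_ follow MCC
                      , n_jobs = -1
                      , verbose = 3)
    gs = gs.fit(input_df, target)
    # per-metric summaries for the same candidates
    for metric in multi_scoring:
        print(metric, 'mean test scores:', gs.cv_results_['mean_test_' + metric])
    return gs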