#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 18 16:44:18 2022
@author: tanu
"""
# Custom GridSearch <intra model>
# https://stackoverflow.com/questions/23045318/grid-search-over-multiple-classifiers

from copy import deepcopy
import pprint

from sklearn.base import BaseEstimator
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

pp = pprint.PrettyPrinter(indent = 4)

# NOTE: rs (random-state kwargs), njobs (n_jobs kwargs), scoring_refit,
# skf_cv (a CV splitter) and the data (X, y) are expected to be defined by
# the driver script that runs this module.

#%% ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - the classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    #def score(self, X, y):
    #    return self.estimator.score(X, y)

    #def recall_score(self, X, y):
    #    return self.estimator.recall_score(X, y)
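#%% Example: grid search ACROSS classifiers with ClfSwitcher [sketch]
# A minimal sketch (not part of the original script) of how ClfSwitcher can be
# combined with GridSearchCV to search over several classifiers at once, as in
# the StackOverflow link above. The grids, cv = 5 and scoring = 'recall' below
# are illustrative assumptions only.
def clf_switcher_example(X, y):
    pipe = Pipeline([('pre', MinMaxScaler())
                     , ('clf', ClfSwitcher())])
    switch_params = [
        {'clf__estimator': [DecisionTreeClassifier(random_state = 42)],
         'clf__estimator__max_depth': [2, 4, 6]},
        {'clf__estimator': [KNeighborsClassifier()],
         'clf__estimator__n_neighbors': [3, 7, 10]},
    ]
    gs_switch = GridSearchCV(pipe, switch_params
                             , cv = 5
                             , scoring = 'recall'
                             , verbose = 3)
    gs_switch = gs_switch.fit(X, y)
    return gs_switch.best_params_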
#%% Custom GridSearch: IntraModel[orig]
def grid_search(input_df, target, sel_cv, var_type = ['numerical', 'categorical', 'mixed']):
    pipeline1 = Pipeline([
        ('pre', MinMaxScaler())
        , ('clf', DecisionTreeClassifier(**rs))
    ])
    pipeline2 = Pipeline([
        ('pre', MinMaxScaler())
        , ('clf', KNeighborsClassifier())
    ])
    parameters1 = {
        'clf__max_depth': [2, 4, 6, 8, 10]
        , 'clf__criterion': ['gini', 'entropy']
        , 'clf__max_features': ['auto', None]   # 'auto' is deprecated in newer sklearn; 'sqrt' is the equivalent
        , 'clf__max_leaf_nodes': [10, 20, 30, 40]
    }
    parameters2 = {
        'clf__n_neighbors': [3, 7, 10],
        'clf__weights': ['uniform', 'distance']
    }
    pips = [pipeline1
            , pipeline2
            ]
    pars = [parameters1
            , parameters2
            ]
    print("\nStarting Gridsearch")
    for i in range(len(pars)):
        print('IIIII===>', i)
        gs = GridSearchCV(pips[i], pars[i]
                          , cv = sel_cv
                          , **scoring_refit
                          #, refit = False
                          , **njobs
                          , verbose = 3)
        gs = gs.fit(input_df, target)   # was gs.fit(X, y): use the function arguments, not globals
        print("Finished Gridsearch")
        print('\nBest model:', gs.best_params_)
        print('\nBest score:', gs.best_score_)
# TODO: add
# # summarize results
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
# print("%f (%f) with: %r" % (mean, stdev, param))
# CALL: grid_search [orig]
# X, y and skf_cv are assumed to come from the driver script
grid_search(X, y, sel_cv = skf_cv)
#%% Custom grid_search: Intra-Model [with return]
def grid_search(input_df, target
                , sel_cv
                , chosen_scoreD  # scoring_refit
                #, var_type = ['numerical', 'categorical', 'mixed']
                ):
    # Pipelines and parameter grids
    pipeline1 = Pipeline([
        ('pre', MinMaxScaler())
        , ('clf', DecisionTreeClassifier(**rs))
    ])
    pipeline2 = Pipeline([
        ('pre', MinMaxScaler())
        , ('clf', KNeighborsClassifier())
    ])
    parameters1 = {
        'clf__max_depth': [2, 4, 6, 8, 10]
        , 'clf__criterion': ['gini', 'entropy']
        , 'clf__max_features': ['auto', None]
        , 'clf__max_leaf_nodes': [10, 20, 30, 40]
    }
    parameters2 = {
        'clf__n_neighbors': [3, 7, 10],
        'clf__weights': ['uniform', 'distance']
    }
    all_parameters = [parameters1
                      , parameters2
                      ]
    all_pipelines = [pipeline1
                     , pipeline2
                     ]
    print("\nStarting Gridsearch")
    # Run gridsearch for each model
    out = {}
    for i in range(len(all_parameters)):
        model_name = str(all_pipelines[i].steps[1][1])
        #model_name = str(model.steps[1][1])
        #out[model_name] = dict()
        print("\nStarting Gridsearch for model:", model_name, i)
        gs = GridSearchCV(all_pipelines[i], all_parameters[i]
                          , cv = sel_cv
                          #, **scoring_refit
                          #, refit = False
                          , **chosen_scoreD
                          , **njobs
                          , verbose = 3)
        gs = gs.fit(input_df, target)
        print('=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=')
        print("Finished Gridsearch")
        print('\nScore type used to choose best score:', chosen_scoreD)   # was chosen_score (undefined inside the function)
        best_model = gs.best_params_
        best_score = gs.best_score_
        best_params = deepcopy(gs.best_params_)
        print('\nBest score:', best_score, '\ntype: ', type(best_score))
        print('\nBest params:', best_params, '\ntype: ', type(best_params))
        out[model_name] = best_params
        out[model_name].update(chosen_scoreD.copy())
        out[model_name].update({'best_score': gs.best_score_}.copy())
    return(out)
# TODO:
# print/inspect, for each model, the mean test score and sd; sometimes they are
# identical and the "best" model just picks one! (see the sketch below)
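#%% Helper: flag near-tied candidates [sketch]
# A minimal sketch for the TODO above (an assumption, not part of the original
# script): list every candidate whose mean test score matches the winner's,
# since best_params_ silently picks just one of the tied settings.
def show_tied_candidates(grid_result, tol = 1e-6):
    means  = grid_result.cv_results_['mean_test_score']
    stds   = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    best   = max(means)
    for mean, stdev, param in zip(means, stds, params):
        if abs(mean - best) <= tol:
            print("tied at %f (sd %f): %r" % (mean, stdev, param))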
#%% call CUSTOM grid_search: INTRA model [with return]
# call
chosen_score = {'scoring': 'recall'
                , 'refit': 'recall'}
mcc_score_fn = {'chosen_scoreD': {'scoring': {'mcc': make_scorer(matthews_corrcoef)}
                                  , 'refit': 'mcc'}   # 'refit' must name a key of the 'scoring' dict
                }
intra_models = grid_search(X, y
                           , sel_cv = skf_cv            # the cv argument is named 'sel_cv'
                           , chosen_scoreD = chosen_score)
#intra_models = grid_search(X, y, sel_cv = skf_cv, **mcc_score_fn)  # MCC alternative; should work now that 'refit' sits outside 'scoring'
pp.pprint(intra_models)
# TO DO: combine other score metrics for the best model: this will tell you which hyperparams are best under each metric
# shouldn't be hard, as the dicts for a given model type will have the same shape
# TO DO: if you can, specify different scores and append them to the model output (see the multi-metric sketch below)
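#%% Sketch: multi-metric scoring for the TO DOs above [assumption]
# One hedged way to "combine other score metrics": pass several scorers to a
# single GridSearchCV and refit on one of them; cv_results_ then carries
# per-metric mean/std columns for the same candidates, so the best hyperparams
# can be compared across metrics. The grid, n_jobs and metric choice below are
# illustrative assumptions, not part of the original script.
def grid_search_multi_metric(input_df, target, sel_cv):
    multi_scoring = {'recall': 'recall'
                     , 'mcc': make_scorer(matthews_corrcoef)}
    pipe = Pipeline([
        ('pre', MinMaxScaler())
        , ('clf', DecisionTreeClassifier(random_state = 42))
    ])
    params = {'clf__max_depth': [2, 4, 6, 8, 10]}
    gs = GridSearchCV(pipe, params
                      , cv = sel_cv
                      , scoring = multi_scoring
                      , refit = 'mcc'      # best_params_/best_score_ follow MCC
                      , n_jobs = -1
                      , verbose = 3)
    gs = gs.fit(input_df, target)
    # per-metric summaries for the same candidates
    for metric in multi_scoring:
        print(metric, 'mean test scores:', gs.cv_results_['mean_test_' + metric])
    return gs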