added intra_model_gscv.py: tells me which hyperparams are best within each model, so I can pick each model's best-tuned version and then compare across models ('INTER' model comparison)
This commit is contained in:
parent
d3b6fe13a6
commit
ffd3ce6ee3
1 changed file with 170 additions and 0 deletions
170
intra_model_gscv.py
Normal file
@@ -0,0 +1,170 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 18 16:44:18 2022

@author: tanu
"""
# Custom GridSearch <intra model>
# https://stackoverflow.com/questions/23045318/grid-search-over-multiple-classifiers
from copy import deepcopy
import pprint as pp

from sklearn.base import BaseEstimator
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

# Assumption: rs and njobs are shared kwarg dicts defined once for the whole
# project; the values below are placeholders so this file runs standalone.
rs = {'random_state': 42}
njobs = {'n_jobs': -1}

#%% ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - the classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    #def score(self, X, y):
    #    return self.estimator.score(X, y)

    #def recall_score(self, X, y):
    #    return self.estimator.recall_score(X, y)
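
#%% (Added sketch) How ClfSwitcher is meant to be used.
# ClfSwitcher is never called in this file; this hedged example, following the
# Stack Overflow pattern linked above, shows a single pipeline whose 'clf'
# step is swapped between estimators by GridSearchCV. The param grids and the
# names X, y, skf_cv are assumptions, so the block is left commented out.
# switcher_pipe = Pipeline([('pre', MinMaxScaler()), ('clf', ClfSwitcher())])
# switcher_params = [
#     {'clf__estimator': [SGDClassifier(**rs)],
#      'clf__estimator__penalty': ['l2', 'l1']},
#     {'clf__estimator': [KNeighborsClassifier()],
#      'clf__estimator__n_neighbors': [3, 7, 10]},
# ]
# gs_switch = GridSearchCV(switcher_pipe, switcher_params, cv = skf_cv, **njobs)
# gs_switch.fit(X, y)
# print(gs_switch.best_params_)   # includes which estimator won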
#%% Custom GridSearch: IntraModel [orig]
# Assumption: module-level scoring/refit default for this original version;
# the later grid_search() takes the equivalent dict as a parameter instead.
scoring_refit = {'scoring': 'recall', 'refit': 'recall'}

def grid_search2(input_df, target, skf_cv, var_type = ['numerical', 'categorical', 'mixed']):

    pipeline1 = Pipeline((
        ('pre', MinMaxScaler())
        , ('clf', DecisionTreeClassifier(**rs))
    ))

    pipeline2 = Pipeline((
        ('pre', MinMaxScaler())
        , ('clf', KNeighborsClassifier())
    ))

    parameters1 = {
        'clf__max_depth': [2, 4, 6, 8, 10]
        , 'clf__criterion': ['gini', 'entropy']
        , 'clf__max_features': ['auto', None]   # note: 'auto' was removed for trees in sklearn >= 1.3; use 'sqrt'/'log2' there
        , 'clf__max_leaf_nodes': [10, 20, 30, 40]
    }

    parameters2 = {
        'clf__n_neighbors': [3, 7, 10],
        'clf__weights': ['uniform', 'distance']
    }

    pips = [pipeline1
            , pipeline2
            ]

    pars = [parameters1
            , parameters2
            ]

    print("\nStarting Gridsearch")
    for i in range(len(pars)):
        print('IIIII===>', i)
        gs = GridSearchCV(pips[i], pars[i]
                          , cv = skf_cv
                          , **scoring_refit
                          #, refit=False
                          , **njobs
                          , verbose=3)
        gs = gs.fit(input_df, target)   # was gs.fit(X, y): fit the function args, not globals
        print("Finished Gridsearch")
        print('\nBest model:', gs.best_params_)
        print('\nBest score:', gs.best_score_)
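
#%% (Added sketch) Example call for grid_search2, which only prints results.
# Unlike grid_search() below, nothing is returned; X, y and skf_cv are
# assumed to be prepared elsewhere in the project.
# grid_search2(X, y, skf_cv = skf_cv)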
#%% Custom grid_search: Intra-Model [with return]
def grid_search(input_df, target
                , skf_cv
                , chosen_scoreD   # scoring_refit
                #, var_type = ['numerical', 'categorical', 'mixed']
                ):
    # Pipeline_params
    pipeline1 = Pipeline((
        ('pre', MinMaxScaler())
        , ('clf', DecisionTreeClassifier(**rs))
    ))

    pipeline2 = Pipeline((
        ('pre', MinMaxScaler())
        , ('clf', KNeighborsClassifier())
    ))

    parameters1 = {
        'clf__max_depth': [2, 4, 6, 8, 10]
        , 'clf__criterion': ['gini', 'entropy']
        , 'clf__max_features': ['auto', None]   # see note on 'auto' above
        , 'clf__max_leaf_nodes': [10, 20, 30, 40]
    }

    parameters2 = {
        'clf__n_neighbors': [3, 7, 10],
        'clf__weights': ['uniform', 'distance']
    }

    all_parameters = [parameters1
                      , parameters2
                      ]
    all_pipelines = [pipeline1
                     , pipeline2
                     ]
    print("\nStarting Gridsearch")

    # Run gridsearch for each model
    out = {}
    for i in range(len(all_parameters)):
        model_name = str(all_pipelines[i].steps[1][1])

        print("\nStarting Gridsearch for model:", model_name, i)
        gs = GridSearchCV(all_pipelines[i], all_parameters[i]
                          , cv = skf_cv
                          #, refit=False
                          , **chosen_scoreD
                          , **njobs
                          , verbose=3)
        gs = gs.fit(input_df, target)
        print('=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=')
        print("Finished Gridsearch")
        print('\nScore type used to choose best model:', chosen_scoreD['refit'])   # was the global chosen_score

        best_score = gs.best_score_
        best_params = deepcopy(gs.best_params_)
        print('\nBest score:', best_score, '\ntype: ', type(best_score))
        print('\nBest params:', best_params, '\ntype: ', type(best_params))

        out[model_name] = best_params
        out[model_name].update(chosen_scoreD.copy())
        out[model_name].update({'best_score': gs.best_score_})
    return out

#%% call CUSTOM grid_search: INTRA model [with return]
# call
# Assumption: X, y (features, target) and skf_cv (e.g. a StratifiedKFold
# splitter) are prepared elsewhere in the project.
chosen_score = {'scoring': 'recall'
                , 'refit': 'recall'}
mcc_score_fn = {'chosen_scoreD': {'scoring': {'mcc': make_scorer(matthews_corrcoef)
                                              , 'refit': 'mcc'}   # bug: 'refit' is nested inside
                                  }                               # 'scoring', so GridSearchCV
                }                                                 # rejects these kwargs

intra_models = grid_search(X, y
                           , skf_cv = skf_cv
                           , chosen_scoreD = chosen_score
                           #, **mcc_score_fn   # doesn't work (see nesting bug above)
                           )
pp.pprint(intra_models)

# TO DO: combine other score metrics for the best model: this will show which
# hyperparams are best under each metric
# (shouldn't be hard, as the dicts for a given model type have the same shape)
# TO DO: allow specifying a different score and appending it to the model's entry
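
#%% (Added sketch) One way to tackle the first TO DO: GridSearchCV accepts a
# dict of scorers plus refit=<metric name>, after which cv_results_ carries a
# mean_test_<name> column per metric. Note that 'refit' sits BESIDE 'scoring',
# not inside it (the nesting bug in mcc_score_fn above). Names here are
# assumptions, so the block is left commented out.
# multi_scoreD = {'scoring': {'mcc': make_scorer(matthews_corrcoef)
#                             , 'recall': 'recall'}
#                 , 'refit': 'mcc'}
# intra_models_mcc = grid_search(X, y
#                                , skf_cv = skf_cv
#                                , chosen_scoreD = multi_scoreD)
# pp.pprint(intra_models_mcc)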