#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 18 16:44:18 2022

@author: tanu
"""
# Custom GridSearch <intra model>
# https://stackoverflow.com/questions/23045318/grid-search-over-multiple-classifiers

from copy import deepcopy
import pprint

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

# Globals referenced throughout this script but defined elsewhere in the
# project; the values below are assumptions so the script runs standalone.
rs = {'random_state': 42}
njobs = {'n_jobs': -1}
scoring_refit = {'scoring': 'recall', 'refit': 'recall'}
skf_cv = StratifiedKFold(n_splits = 10, shuffle = True, **rs)
pp = pprint.PrettyPrinter(indent = 4)
#%% ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    def __init__(self, estimator = SGDClassifier()):
        """
        A custom BaseEstimator that can switch between classifiers.

        :param estimator: sklearn object - the classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    # def score(self, X, y):
    #     return self.estimator.score(X, y)

    # def recall_score(self, X, y):
    #     return self.estimator.recall_score(X, y)

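# ClfSwitcher is not exercised below; a minimal usage sketch (an assumption,
# in the spirit of the StackOverflow answer linked above): putting the
# estimator itself into the parameter grid lets one GridSearchCV sweep over
# model types as well as their hyperparameters.
switcher_pipe = Pipeline([('pre', MinMaxScaler())
                          , ('clf', ClfSwitcher())])
switcher_params = [
    {'clf__estimator': [SGDClassifier(**rs)]
     , 'clf__estimator__alpha': [1e-4, 1e-3]}
    , {'clf__estimator': [DecisionTreeClassifier(**rs)]
       , 'clf__estimator__max_depth': [2, 4]}
]
#gs_switch = GridSearchCV(switcher_pipe, switcher_params, cv = skf_cv, **njobs)
#gs_switch.fit(X, y) # X, y assumed defined upstream
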
#%% Custom GridSearch: IntraModel[orig]
def grid_search(input_df, target, sel_cv, var_type = ['numerical', 'categorical', 'mixed']):

    pipeline1 = Pipeline((
        ('pre', MinMaxScaler())
        , ('clf', DecisionTreeClassifier(**rs))
    ))

    pipeline2 = Pipeline((
        ('pre', MinMaxScaler())
        , ('clf', KNeighborsClassifier())
    ))

    parameters1 = {
        'clf__max_depth': [2, 4, 6, 8, 10]
        , 'clf__criterion': ['gini', 'entropy']
        , 'clf__max_features': ['sqrt', None] # 'auto' is removed in newer scikit-learn; 'sqrt' matches its old classifier behaviour
        , 'clf__max_leaf_nodes': [10, 20, 30, 40]
    }

    parameters2 = {
        'clf__n_neighbors': [3, 7, 10],
        'clf__weights': ['uniform', 'distance']
    }

    pips = [pipeline1
            , pipeline2
            ]

    pars = [parameters1
            , parameters2
            ]

    print("\nStarting Gridsearch")
    for i in range(len(pars)):
        print('IIIII===>', i)
        gs = GridSearchCV(pips[i], pars[i]
                          , cv = sel_cv
                          , **scoring_refit
                          #, refit = False
                          , **njobs
                          , verbose = 3)
        gs = gs.fit(input_df, target)
        print("Finished Gridsearch")
        print('\nBest model:', gs.best_params_)
        print('\nBest score:', gs.best_score_)
        # summarize results for this model (implemented below)

# CALL: grid_search [orig]
# X and y (feature matrix and target) are assumed to be defined upstream
grid_search(X, y, sel_cv = skf_cv)

# #%% Custom grid_search: Intra-Model [with return]
def grid_search(input_df, target
                , sel_cv
                , chosen_scoreD # scoring_refit
                #, var_type = ['numerical', 'categorical', 'mixed']
                ):
    # Pipeline_params
    pipeline1 = Pipeline((
        ('pre', MinMaxScaler())
        , ('clf', DecisionTreeClassifier(**rs))
    ))

    pipeline2 = Pipeline((
        ('pre', MinMaxScaler())
        , ('clf', KNeighborsClassifier())
    ))

    parameters1 = {
        'clf__max_depth': [2, 4, 6, 8, 10]
        , 'clf__criterion': ['gini', 'entropy']
        , 'clf__max_features': ['sqrt', None] # 'auto' is removed in newer scikit-learn; 'sqrt' matches its old classifier behaviour
        , 'clf__max_leaf_nodes': [10, 20, 30, 40]
    }

    parameters2 = {
        'clf__n_neighbors': [3, 7, 10],
        'clf__weights': ['uniform', 'distance']
    }

    all_parameters = [parameters1
                      , parameters2
                      ]
    all_pipelines = [pipeline1
                     , pipeline2
                     ]
    print("\nStarting Gridsearch")

    # Run gridsearch for each model
    out = {}
    for i in range(len(all_parameters)):
        model_name = str(all_pipelines[i].steps[1][1])
        #model_name = str(model.steps[1][1])
        #out[model_name] = dict()

        print("\nStarting Gridsearch for model:", model_name, i)
        gs = GridSearchCV(all_pipelines[i], all_parameters[i]
                          , cv = sel_cv
                          #, **scoring_refit
                          #, refit = False
                          , **chosen_scoreD
                          , **njobs
                          , verbose = 3)
        gs = gs.fit(input_df, target)
        print('=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=')
        print("Finished Gridsearch")
        print('\nScore type used to choose best model:', chosen_scoreD)

        best_score = gs.best_score_
        best_params = deepcopy(gs.best_params_)
        print('\nBest score:', best_score, '\ntype: ', type(best_score))
        print('\nBest params:', best_params, '\ntype: ', type(best_params))

        out[model_name] = best_params
        out[model_name].update(chosen_scoreD.copy())
        out[model_name].update({'best_score': gs.best_score_})
    return out

# TODO:
# print, or see, the mean test score and sd for each model; sometimes they are
# identical across parameter settings and the "best" model is just the first
# one picked! (see the tie-reporting sketch below)

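# A minimal sketch for the TODO above (report_score_ties is an assumed helper,
# not part of the original script): flag parameter settings whose mean test
# score ties with the best one, since GridSearchCV silently keeps the first.
def report_score_ties(gs):
    # gs: any fitted GridSearchCV, e.g. the one built inside grid_search()
    means = gs.cv_results_['mean_test_score']
    stds = gs.cv_results_['std_test_score']
    params = gs.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        tie = ' <-- ties with best' if np.isclose(mean, means.max()) else ''
        print("%f (%f) with: %r%s" % (mean, stdev, param, tie))
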
#%% call CUSTOM grid_search: INTRA model [with return]
# call
chosen_score = {'scoring': 'recall'
                , 'refit': 'recall'}

# 'refit' must sit beside 'scoring', not inside it, for GridSearchCV to
# accept the unpacked kwargs
mcc_score_fn = {'chosen_scoreD': {'scoring': {'mcc': make_scorer(matthews_corrcoef)}
                                  , 'refit': 'mcc'}
                }

intra_models = grid_search(X, y # X, y assumed defined upstream
                           , sel_cv = skf_cv
                           , chosen_scoreD = chosen_score
                           #, **mcc_score_fn # alternative: MCC scoring, instead of chosen_scoreD
                           )
pp.pprint(intra_models)

# TO DO: combine other score metrics for the best model: this will tell you
# which hyperparameters are best under each metric (see the sketch below)
# shouldn't be hard, as the dicts for a given model type will have the same shape
# TO DO: see if you can specify a different score and append it to the model

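# A minimal sketch for the TO DOs above: score one candidate pipeline under
# several metrics at once with cross_validate. The pipeline and its
# max_depth value are illustrative assumptions, not the tuned best model;
# X, y and skf_cv are assumed as before.
multi_scoring = {'recall': 'recall'
                 , 'accuracy': 'accuracy'
                 , 'mcc': make_scorer(matthews_corrcoef)}
candidate_pipe = Pipeline([('pre', MinMaxScaler())
                           , ('clf', DecisionTreeClassifier(max_depth = 4, **rs))])
cv_scores = cross_validate(candidate_pipe, X, y, cv = skf_cv
                           , scoring = multi_scoring, **njobs)
for metric in multi_scoring:
    print(metric, ':', round(cv_scores['test_' + metric].mean(), 3))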