diff --git a/intra_model_gscv.py b/intra_model_gscv.py
new file mode 100644
index 0000000..40ef39e
--- /dev/null
+++ b/intra_model_gscv.py
@@ -0,0 +1,182 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Mar 18 16:44:18 2022
+
+@author: tanu
+"""
+# Custom GridSearch
+# https://stackoverflow.com/questions/23045318/grid-search-over-multiple-classifiers
+
+from copy import deepcopy
+import pprint as pp
+
+from sklearn.base import BaseEstimator
+from sklearn.linear_model import SGDClassifier
+from sklearn.metrics import make_scorer, matthews_corrcoef
+from sklearn.model_selection import GridSearchCV
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.tree import DecisionTreeClassifier
+
+# NOTE: rs, njobs, scoring_refit, skf_cv, X and y are expected to come from the
+# calling script/session (e.g. rs = {'random_state': 42}, njobs = {'n_jobs': 10}).
+
+#%% ClfSwitcher()
+class ClfSwitcher(BaseEstimator):
+    def __init__(self,
+                 estimator = SGDClassifier(),
+                 ):
+        """
+        A custom BaseEstimator that can switch between classifiers.
+        :param estimator: sklearn object - the classifier
+        """
+        self.estimator = estimator
+
+    def fit(self, X, y=None, **kwargs):
+        self.estimator.fit(X, y)
+        return self
+
+    def predict(self, X, y=None):
+        return self.estimator.predict(X)
+
+    def predict_proba(self, X):
+        return self.estimator.predict_proba(X)
+
+    # def score(self, X, y):
+    #     return self.estimator.score(X, y)
+
+    # def recall_score(self, X, y):
+    #     return self.estimator.recall_score(X, y)
+
+#%% Custom GridSearch: IntraModel [orig]
+def grid_search2(input_df, target, skf_cv, var_type = ['numerical', 'categorical', 'mixed']):
+
+    pipeline1 = Pipeline([
+        ('pre', MinMaxScaler())
+        , ('clf', DecisionTreeClassifier(**rs))
+        ])
+
+    pipeline2 = Pipeline([
+        ('pre', MinMaxScaler())
+        , ('clf', KNeighborsClassifier())
+        ])
+
+    parameters1 = {
+        'clf__max_depth': [2, 4, 6, 8, 10]
+        , 'clf__criterion': ['gini', 'entropy']
+        , 'clf__max_features': ['sqrt', None]  # 'auto' removed in newer scikit-learn
+        , 'clf__max_leaf_nodes': [10, 20, 30, 40]
+        }
+
+    parameters2 = {
+        'clf__n_neighbors': [3, 7, 10],
+        'clf__weights': ['uniform', 'distance']
+        }
+
+    pips = [pipeline1
+            , pipeline2
+            ]
+
+    pars = [parameters1
+            , parameters2
+            ]
+
+    print("\nStarting Gridsearch")
+    for i in range(len(pars)):
+        print('Model index:', i)
+        gs = GridSearchCV(pips[i], pars[i]
+                          , cv = skf_cv
+                          , **scoring_refit
+                          #, refit = False
+                          , **njobs
+                          , verbose = 3)
+        gs = gs.fit(input_df, target)  # fit on the args passed in (was the global X, y)
+        print("Finished Gridsearch")
+        print('\nBest model:', gs.best_params_)
+        print('\nBest score:', gs.best_score_)
+
+#%% Custom grid_search: Intra-Model [with return]
+def grid_search(input_df, target
+                , skf_cv
+                , chosen_scoreD  # dict of 'scoring' (and 'refit') kwargs for GridSearchCV
+                #, var_type = ['numerical', 'categorical', 'mixed']
+                ):
+    # Pipelines and parameter grids
+    pipeline1 = Pipeline([
+        ('pre', MinMaxScaler())
+        , ('clf', DecisionTreeClassifier(**rs))
+        ])
+
+    pipeline2 = Pipeline([
+        ('pre', MinMaxScaler())
+        , ('clf', KNeighborsClassifier())
+        ])
+
+    parameters1 = {
+        'clf__max_depth': [2, 4, 6, 8, 10]
+        , 'clf__criterion': ['gini', 'entropy']
+        , 'clf__max_features': ['sqrt', None]  # 'auto' removed in newer scikit-learn
+        , 'clf__max_leaf_nodes': [10, 20, 30, 40]
+        }
+    parameters2 = {
+        'clf__n_neighbors': [3, 7, 10],
+        'clf__weights': ['uniform', 'distance']
+        }
+
+    all_parameters = [parameters1
+                      , parameters2
+                      ]
+    all_pipelines = [pipeline1
+                     , pipeline2
+                     ]
+
+    # Run gridsearch for each model and collect its best params and score
+    print("\nStarting Gridsearch")
+    out = {}
+    for i in range(len(all_parameters)):
+        model_name = str(all_pipelines[i].steps[1][1])
+
+        print("\nStarting Gridsearch for model:", model_name, i)
+        gs = GridSearchCV(all_pipelines[i], all_parameters[i]
+                          , cv = skf_cv
+                          , **chosen_scoreD
+                          , **njobs
+                          , verbose = 3)
+        gs = gs.fit(input_df, target)
+        print('=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=')
+        print("Finished Gridsearch")
+        print('\nScore used to choose the best model:', chosen_scoreD)
+
+        best_score = gs.best_score_
+        best_params = deepcopy(gs.best_params_)
+        print('\nBest score:', best_score, '\ntype:', type(best_score))
+        print('\nBest params:', best_params, '\ntype:', type(best_params))
+
+        out[model_name] = best_params
+        out[model_name].update(chosen_scoreD.copy())
+        out[model_name].update({'best_score': gs.best_score_})
+    return out
+
+#%% Call custom grid_search: INTRA model [with return]
+chosen_score = {'scoring': 'recall'
+                , 'refit': 'recall'}
+
+# Alternative: select on MCC. Note that 'refit' sits next to 'scoring' (not inside
+# it) and must name one of the scoring keys.
+mcc_score_fn = {'chosen_scoreD': {'scoring': {'mcc': make_scorer(matthews_corrcoef)}
+                                  , 'refit': 'mcc'}
+                }
+
+intra_models = grid_search(X, y
+                           , skf_cv = skf_cv
+                           , chosen_scoreD = chosen_score
+                           #, **mcc_score_fn  # alternative: pass instead of chosen_scoreD = chosen_score
+                           )
+pp.pprint(intra_models)
+
+# TODO: combine other score metrics for the best model: this will show which
+#       hyperparameters are best under each metric. Should not be hard, as the dicts
+#       for a given model type have the same shape.
+# TODO: allow specifying a different score and appending it to the model output.
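The patch defines ClfSwitcher but never uses it: both grid_search functions instead loop over one pipeline per classifier. For comparison, here is a minimal, self-contained sketch (not part of the patch) of the ClfSwitcher pattern from the linked StackOverflow thread, in which a single GridSearchCV switches classifiers through the parameter grid ('clf__estimator') and tunes each one's hyperparameters ('clf__estimator__*'). The demo dataset, CV splitter and scoring below are illustrative placeholders, not the repo's X, y, skf_cv or scoring_refit.

# Sketch only: ClfSwitcher-style single grid search over multiple classifiers.
from sklearn.base import BaseEstimator
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier


class ClfSwitcher(BaseEstimator):
    """Pipeline step whose 'estimator' parameter is swapped by GridSearchCV."""

    def __init__(self, estimator=SGDClassifier()):
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X):
        return self.estimator.predict(X)


# Placeholder data standing in for the repo's input_df/target.
X_demo, y_demo = make_classification(n_samples=200, n_features=10, random_state=42)

pipe = Pipeline([('pre', MinMaxScaler()), ('clf', ClfSwitcher())])

# One grid per candidate classifier: 'clf__estimator' switches the model itself,
# 'clf__estimator__*' tunes that model's hyperparameters.
param_grid = [
    {'clf__estimator': [DecisionTreeClassifier(random_state=42)],
     'clf__estimator__max_depth': [2, 4, 6]},
    {'clf__estimator': [KNeighborsClassifier()],
     'clf__estimator__n_neighbors': [3, 7, 10]},
]

gs = GridSearchCV(pipe, param_grid,
                  cv=StratifiedKFold(n_splits=5),
                  scoring='recall',
                  n_jobs=-1,
                  verbose=1)
gs.fit(X_demo, y_demo)
print(gs.best_params_, gs.best_score_)

The trade-off versus the per-model loop in grid_search() is that a single switched search returns one overall best_params_/best_score_ across all classifiers, rather than a per-model summary dict like out.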