#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 09:50:37 2022

@author: tanu
"""
#https://stackoverflow.com/questions/50272416/gridsearch-on-model-and-classifiers
#%%
# https://github.com/davidsbatista/machine-learning-notebooks/blob/master/hyperparameter-across-models.ipynb
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.base import BaseEstimator
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


#%%
class EstimatorSelectionHelper:
    """Run one GridSearchCV per named model and collect the results.

    :param models: dict mapping a display name to an unfitted estimator
    :param params: dict mapping the same names to the parameter grid
        (dict or list of dicts) handed to GridSearchCV for that estimator
    """

    def __init__(self, models, params):
        self.models = models
        self.params = params
        self.keys = models.keys()
        # Fitted GridSearchCV objects, keyed by model name; filled by fit().
        self.grid_searches = {}

    def fit(self, X, y, **grid_kwargs):
        """Fit a GridSearchCV for every model in ``self.models``.

        :param X: training features, passed straight to GridSearchCV.fit
        :param y: training targets, passed straight to GridSearchCV.fit
        :param grid_kwargs: extra keyword arguments for the GridSearchCV
            constructor (e.g. ``scoring``, ``cv``, ``n_jobs``)
        :raises KeyError: if a model name has no entry in ``self.params``
        """
        for key in self.keys:
            print('Running GridSearchCV for %s.' % key)
            grid_search = GridSearchCV(self.models[key],
                                       self.params[key],
                                       **grid_kwargs)
            grid_search.fit(X, y)
            self.grid_searches[key] = grid_search
        print('Done.')

    def score_summary(self, sort_by='mean_test_score'):
        """Return the cv_results_ of all fitted searches as one DataFrame.

        The per-parameter ``param_*`` columns are removed (the ``params``
        dict column is kept), an ``estimator`` column with the model name is
        added as first column, and rows are sorted by ``sort_by`` in
        descending order.

        :param sort_by: name of the cv_results_ column to sort on
        :returns: pandas DataFrame with one row per parameter combination
        :raises ValueError: if no grid search has been fitted yet
        """
        if not self.grid_searches:
            raise ValueError('No fitted grid searches to summarise; '
                             'call fit() first.')

        frames = []
        for name, grid_search in self.grid_searches.items():
            frame = pd.DataFrame(grid_search.cv_results_)
            # Negative look-ahead: keep only columns without 'param_'.
            frame = frame.filter(regex='^(?!.*param_).*$')
            frames.append(frame.assign(estimator=name))

        df = pd.concat(frames)
        df = df.sort_values([sort_by], ascending=False)
        # Discard the per-search row index instead of materialising it as an
        # 'index' column and dropping it afterwards.
        df = df.reset_index(drop=True)
        # The rank is only meaningful within a single search, so it is
        # dropped. `columns=` is used because pandas 2.0 no longer accepts
        # the axis as a positional argument.
        df = df.drop(columns=['rank_test_score'])

        columns = ['estimator'] + [c for c in df.columns if c != 'estimator']
        return df[columns]


#%%
breast_cancer = datasets.load_breast_cancer()
X_cancer = breast_cancer.data
y_cancer = breast_cancer.target

models1 = {
    'ExtraTreesClassifier': ExtraTreesClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
}

params1 = {
    'ExtraTreesClassifier': {'n_estimators': [16, 32]},
    'RandomForestClassifier': [
        {'n_estimators': [16, 32]},
        {'criterion': ['gini', 'entropy'], 'n_estimators': [8, 16]},
    ],
    'AdaBoostClassifier': {'n_estimators': [16, 32]},
    'GradientBoostingClassifier': {
        'n_estimators': [16, 32],
        'learning_rate': [0.8, 1.0],
    },
}

helper1 = EstimatorSelectionHelper(models1, params1)
helper1.fit(X_cancer, y_cancer, scoring='f1', n_jobs=2)
mm_df = helper1.score_summary()
# COMMENT: 'mean_test_score' is the mean, over the CV folds, of the metric
# given via `scoring` (here 'f1'). Other columns of cv_results_ can be used
# through the `sort_by` argument of score_summary().


#%%
class ClfSwitcher(BaseEstimator):
    """A custom BaseEstimator that can switch between classifiers.

    The wrapped classifier is an ordinary constructor parameter, so inside a
    Pipeline step named ``clf`` it can be swapped with ``clf__estimator`` and
    tuned with ``clf__estimator__<param>`` keys.

    :param estimator: sklearn object - The classifier
    """

    # NOTE: the default instance is created once at class-definition time and
    # shared by every ClfSwitcher built without an explicit estimator.
    def __init__(self, estimator=SGDClassifier()):
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped estimator in place and return ``self``.

        :param kwargs: extra fit parameters, forwarded to the wrapped
            estimator's ``fit`` (they used to be silently discarded)
        """
        self.estimator.fit(X, y, **kwargs)
        return self

    def predict(self, X, y=None):
        """Return the wrapped estimator's predictions; ``y`` is ignored."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped estimator's ``predict_proba`` output."""
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped estimator's own ``score`` on ``(X, y)``."""
        return self.estimator.score(X, y)


# NOTE(review): newer scikit-learn releases spell the SGD loss 'log' as
# 'log_loss' and the LogisticRegression penalty 'none' as None - confirm
# against the installed version.
parameters = [
    {
        'clf__estimator': [SGDClassifier()],  # SVM if hinge loss / logreg if log loss
        #'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
        #'tfidf__stop_words': ['english', None],
        'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
        'clf__estimator__max_iter': [50, 80],
        'clf__estimator__tol': [1e-4],
        'clf__estimator__loss': ['hinge', 'log', 'modified_huber'],
    },
    {
        'clf__estimator': [MultinomialNB()],
        #'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
        #'tfidf__stop_words': [None],
        'clf__estimator__alpha': (1e-2, 1e-3, 1e-1),
    },
    {
        # Every key needs the 'clf__estimator__' prefix: bare names such as
        # 'C' are looked up on the Pipeline itself and rejected as invalid.
        # NOTE(review): some solver/penalty pairs in this grid look
        # incompatible; presumably those fits fail and are scored according
        # to GridSearchCV's `error_score` - verify.
        'clf__estimator': [LogisticRegression()],
        'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'],
        'clf__estimator__max_iter': list(range(100, 800, 100)),
        'clf__estimator__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    },
]

pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

gscv = GridSearchCV(pipeline,
                    parameters,
                    cv=5,
                    n_jobs=12,
                    return_train_score=False,
                    verbose=3)
#gscv.fit(train_data, train_labels)

#%% my numerical data
# NOTE(review): `num_df_wtgt` and `numerical_FN` are not defined in this
# file; presumably they come from the interactive session - confirm.
X_train, X_test, y_train, y_test = train_test_split(
    num_df_wtgt[numerical_FN],
    num_df_wtgt['mutation_class'],
    test_size=0.33,
    random_state=2,
    shuffle=True,
    stratify=num_df_wtgt['mutation_class'])

y_train.to_frame().value_counts().plot(kind='bar')
y_test.to_frame().value_counts().plot(kind='bar')

#%%
gscv.fit(X_train, y_train)
print('Best model:\n', gscv.best_params_)
print('Best models score:\n', gscv.best_score_)
gscv.score(X_test, y_test)  # see how it does on test
#===========================================
mod_pred = gscv.predict(X_test)
fscore = f1_score(y_test, mod_pred)
fscore


#%% same as above
# custom classifier
class MyClassifier(BaseEstimator):
    """A custom BaseEstimator that can switch between classifiers.

    :param classifier_type: string - The switch for different classifiers
        ('SGDClassifier' or 'MultinomialNB')
    """

    def __init__(self, classifier_type: str = 'SGDClassifier'):
        self.classifier_type = classifier_type

    def fit(self, X, y=None):
        """Build the classifier named by ``classifier_type`` and fit it.

        :returns: ``self``
        :raises ValueError: if ``classifier_type`` is not a supported name
        """
        known_classifiers = {
            'SGDClassifier': SGDClassifier,
            'MultinomialNB': MultinomialNB,
        }
        if self.classifier_type not in known_classifiers:
            raise ValueError('Unkown classifier type.')
        # A fresh, default-configured classifier is created on every fit.
        self.classifier_ = known_classifiers[self.classifier_type]()
        self.classifier_.fit(X, y)
        return self

    def predict(self, X, y=None):
        """Return the fitted classifier's predictions; ``y`` is ignored."""
        return self.classifier_.predict(X)

    def score(self, X, y):
        """Return the fitted classifier's own ``score`` on ``(X, y)``."""
        # The fitted model lives in `classifier_`; this class has no
        # `estimator` attribute.
        return self.classifier_.score(X, y)


pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    #('clf', ClfSwitcher()),
    ('clf', MyClassifier()),
])

# MyClassifier exposes only `classifier_type`, so the grid has to address
# that parameter; 'clf__estimator...' keys belong to ClfSwitcher and are
# rejected as invalid for this pipeline.
parameter_space = {
    'clf__classifier_type': ['SGDClassifier', 'MultinomialNB'],
}

search = GridSearchCV(pipeline, parameter_space, n_jobs=-1, cv=5)
search.fit(X_train, y_train)

print('Best model:\n', search.best_params_)
print('Best models score:\n', search.best_score_)