added practice and base_estimator for all the confusion in my head
This commit is contained in:
parent
e28a296d98
commit
97620c1bb0
3 changed files with 513 additions and 0 deletions
236 base_estimator.py Normal file
@@ -0,0 +1,236 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 09:50:37 2022

@author: tanu
"""
# https://stackoverflow.com/questions/50272416/gridsearch-on-model-and-classifiers

#%%
# https://github.com/davidsbatista/machine-learning-notebooks/blob/master/hyperparameter-across-models.ipynb
import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

from sklearn.base import BaseEstimator
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler  # used by the pipelines below
from sklearn.metrics import f1_score            # used on the test set below
#%%
class EstimatorSelectionHelper:

    def __init__(self, models, params):
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, **grid_kwargs):
        for key in self.keys:
            print('Running GridSearchCV for %s.' % key)
            model = self.models[key]
            params = self.params[key]
            grid_search = GridSearchCV(model, params, **grid_kwargs)
            grid_search.fit(X, y)
            self.grid_searches[key] = grid_search
            print('Done.')

    def score_summary(self, sort_by='mean_test_score'):
        frames = []
        for name, grid_search in self.grid_searches.items():
            frame = pd.DataFrame(grid_search.cv_results_)
            frame = frame.filter(regex='^(?!.*param_).*$')  # drop the param_* columns
            frame['estimator'] = len(frame) * [name]
            frames.append(frame)
        df = pd.concat(frames)

        df = df.sort_values([sort_by], ascending=False)
        df = df.reset_index()
        df = df.drop(['rank_test_score', 'index'], axis=1)  # positional axis arg is deprecated

        columns = df.columns.tolist()
        columns.remove('estimator')
        columns = ['estimator'] + columns
        df = df[columns]
        return df
#%%
breast_cancer = datasets.load_breast_cancer()
X_cancer = breast_cancer.data
y_cancer = breast_cancer.target

models1 = {
    'ExtraTreesClassifier': ExtraTreesClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier()
}

params1 = {
    'ExtraTreesClassifier': { 'n_estimators': [16, 32] },
    'RandomForestClassifier': [
        { 'n_estimators': [16, 32] },
        { 'criterion': ['gini', 'entropy'], 'n_estimators': [8, 16] }],
    'AdaBoostClassifier': { 'n_estimators': [16, 32] },
    'GradientBoostingClassifier': { 'n_estimators': [16, 32], 'learning_rate': [0.8, 1.0] }
}

helper1 = EstimatorSelectionHelper(models1, params1)
helper1.fit(X_cancer, y_cancer, scoring='f1', n_jobs=2)
helper1.score_summary()

mm_df = helper1.score_summary()
# COMMENT: unsure which scores mean_test_score averages and which scoring
# options exist; see the inspection snippet below.
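# Answering the comment above, a minimal inspection sketch (my addition, not
# part of the original notebook): cv_results_ keeps one column per CV fold
# (split0_test_score, split1_test_score, ...) and mean_test_score is their
# row-wise mean under whatever metric was passed as `scoring` ('f1' here).
gs_et = helper1.grid_searches['ExtraTreesClassifier']
cv_res = pd.DataFrame(gs_et.cv_results_)
fold_cols = [c for c in cv_res.columns if c.startswith('split') and c.endswith('test_score')]
print(cv_res[fold_cols + ['mean_test_score']])
# In sklearn >= 1.0, sklearn.metrics.get_scorer_names() lists the valid
# `scoring` strings.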
#%%
class ClfSwitcher(BaseEstimator):
    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)
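# Why the 'clf__estimator__*' keys below work: ClfSwitcher keeps its classifier
# in the __init__ arg `estimator`, so BaseEstimator.get_params(deep=True)
# recurses into it and GridSearchCV can reach nested parameters through the
# usual double-underscore syntax. A minimal sketch (my names, not in the
# original):
demo = Pipeline([('pre', MinMaxScaler()), ('clf', ClfSwitcher())])
demo.set_params(clf__estimator=SGDClassifier(), clf__estimator__penalty='l1')
print(demo.get_params()['clf__estimator__penalty'])  # -> 'l1'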
parameters = [
    {
        'clf__estimator': [SGDClassifier()],  # SVM if hinge loss / logreg if log loss
        #'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
        #'tfidf__stop_words': ['english', None],
        'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
        'clf__estimator__max_iter': [50, 80],
        'clf__estimator__tol': [1e-4],
        'clf__estimator__loss': ['hinge', 'log', 'modified_huber'],
    },
    {
        'clf__estimator': [MultinomialNB()],
        #'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
        #'tfidf__stop_words': [None],
        'clf__estimator__alpha': (1e-2, 1e-3, 1e-1),
    },
]

pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

gscv = GridSearchCV(pipeline
                    , parameters
                    , cv = 5
                    , n_jobs = 12
                    , return_train_score = False
                    , verbose = 3)

#gscv.fit(train_data, train_labels)
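# The list-of-dicts `parameters` makes GridSearchCV treat each dict as its own
# grid, so SGD and MultinomialNB hyperparameters are never crossed with each
# other. Candidate count as a sanity check (my arithmetic): SGD grid
# 1*3*2*1*3 = 18, MNB grid 1*3 = 3, i.e. 21 candidates x 5 folds = 105 fits.
from sklearn.model_selection import ParameterGrid
print(len(ParameterGrid(parameters)))  # -> 21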
#%% my numerical data
# NOTE: num_df_wtgt and numerical_FN come from my own data-loading script and
# are not defined in this file.
X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
                                                    , num_df_wtgt['mutation_class']
                                                    , test_size = 0.33
                                                    , random_state = 2
                                                    , shuffle = True
                                                    , stratify = num_df_wtgt['mutation_class'])

# quick class-balance check (pandas .plot needs matplotlib installed)
y_train.to_frame().value_counts().plot(kind = 'bar')
y_test.to_frame().value_counts().plot(kind = 'bar')
#%%
gscv.fit(X_train, y_train)
print('Best model:\n', gscv.best_params_)
print("Best model's score:\n", gscv.best_score_)
gscv.score(X_test, y_test)  # see how it does on the test set

#===========================================
mod_pred = gscv.predict(X_test)

fscore = f1_score(y_test, mod_pred)
fscore
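# Optional fuller look than a single F1 number (standard sklearn metrics; my
# addition): per-class precision/recall and the confusion matrix.
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, mod_pred))
print(confusion_matrix(y_test, mod_pred))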
#%% same as above
# custom classifier

class MyClassifier(BaseEstimator):

    def __init__(self, classifier_type: str = 'SGDClassifier'):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param classifier_type: string - The switch for different classifiers
        """
        self.classifier_type = classifier_type

    def fit(self, X, y=None):
        if self.classifier_type == 'SGDClassifier':
            self.classifier_ = SGDClassifier()
        elif self.classifier_type == 'MultinomialNB':
            self.classifier_ = MultinomialNB()
        else:
            raise ValueError('Unknown classifier type.')

        self.classifier_.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.classifier_.predict(X)

    def score(self, X, y):
        return self.classifier_.score(X, y)  # was self.estimator, which MyClassifier never sets

pipeline = Pipeline([
    ('pre', MinMaxScaler())
    #, ('clf', ClfSwitcher())
    , ('clf', MyClassifier())
])

# MyClassifier only exposes classifier_type, so the grid can only switch
# between the two hard-coded classifiers; the clf__estimator__* grids from the
# ClfSwitcher approach above would raise an invalid-parameter error here.
parameter_space = {
    'clf__classifier_type': ['SGDClassifier', 'MultinomialNB']
}

search = GridSearchCV(pipeline, parameter_space, n_jobs=-1, cv=5)
search.fit(X_train, y_train)
print('Best model:\n', search.best_params_)
print("Best model's score:\n", search.best_score_)  # was gscv.best_score_, the previous search
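#%% quick look at the whole search, tying back to score_summary() above
# (column names are the standard cv_results_ keys; this cell is my addition)
res_df = pd.DataFrame(search.cv_results_)
print(res_df[['params', 'mean_test_score', 'std_test_score']])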