ML_AI_training/base_estimator.py

244 lines
7.4 KiB
Python
Executable file

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 09:50:37 2022
@author: tanu
"""
#https://stackoverflow.com/questions/50272416/gridsearch-on-model-and-classifiers
#%%
# https://github.com/davidsbatista/machine-learning-notebooks/blob/master/hyperparameter-across-models.ipynb
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.base import BaseEstimator
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
#%%
class EstimatorSelectionHelper:
    """Run one ``GridSearchCV`` per model and tabulate all results together.

    :param models: mapping of a display name to an unfitted estimator
    :param params: mapping of the same names to the parameter grid (a dict,
        or a list of dicts) handed to ``GridSearchCV`` for that estimator
    """

    def __init__(self, models, params):
        self.models = models
        self.params = params
        self.keys = models.keys()
        # name -> fitted GridSearchCV; populated by fit()
        self.grid_searches = {}

    def fit(self, X, y, **grid_kwargs):
        """Fit a grid search for every registered model.

        :param X: training features, passed to ``GridSearchCV.fit``
        :param y: training targets, passed to ``GridSearchCV.fit``
        :param grid_kwargs: extra keyword arguments for the ``GridSearchCV``
            constructor (for example ``scoring``, ``cv`` or ``n_jobs``)
        """
        for key in self.keys:
            print('Running GridSearchCV for %s.' % key)
            grid_search = GridSearchCV(
                self.models[key], self.params[key], **grid_kwargs
            )
            grid_search.fit(X, y)
            self.grid_searches[key] = grid_search
        print('Done.')

    def score_summary(self, sort_by='mean_test_score'):
        """Return every ``cv_results_`` row of every search as one DataFrame.

        The per-parameter ``param_*`` columns and the per-search
        ``rank_test_score`` column are removed, an ``estimator`` column
        holding the model name is placed first, and rows are ordered by
        ``sort_by``, highest value first.

        :param sort_by: name of the ``cv_results_`` column to sort on
        :returns: the combined DataFrame with a fresh 0..n-1 index
        :raises ValueError: if no grid search has been fitted yet
        """
        if not self.grid_searches:
            raise ValueError(
                'No fitted grid searches to summarise; call fit() first.'
            )

        frames = [
            pd.DataFrame(grid_search.cv_results_)
            # negative look-ahead: keep only columns without 'param_'
            .filter(regex='^(?!.*param_).*$')
            .assign(estimator=name)
            for name, grid_search in self.grid_searches.items()
        ]
        df = pd.concat(frames)
        df = df.sort_values([sort_by], ascending=False)
        # drop=True discards the old per-search row numbers directly
        df = df.reset_index(drop=True)
        # Ranks are only meaningful within a single search. The keyword form
        # is required because pandas 2.x no longer accepts a positional axis;
        # errors='ignore' tolerates multi-metric results, which have no
        # plain 'rank_test_score' column.
        df = df.drop(columns=['rank_test_score'], errors='ignore')

        columns = ['estimator'] + [c for c in df.columns if c != 'estimator']
        return df[columns]
#%%
# Demo: compare four tree-ensemble classifiers on the breast-cancer data set.
breast_cancer = datasets.load_breast_cancer()
X_cancer = breast_cancer.data
y_cancer = breast_cancer.target

# One unfitted estimator per display name.
models1 = {
    'ExtraTreesClassifier': ExtraTreesClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
}

# Parameter grids keyed by the same names as models1. A list of dicts (as for
# the random forest) is searched as several independent grids.
params1 = {
    'ExtraTreesClassifier': {'n_estimators': [16, 32]},
    'RandomForestClassifier': [
        {'n_estimators': [16, 32]},
        {'criterion': ['gini', 'entropy'], 'n_estimators': [8, 16]},
    ],
    'AdaBoostClassifier': {'n_estimators': [16, 32]},
    'GradientBoostingClassifier': {
        'n_estimators': [16, 32],
        'learning_rate': [0.8, 1.0],
    },
}

helper1 = EstimatorSelectionHelper(models1, params1)
helper1.fit(X_cancer, y_cancer, scoring='f1', n_jobs=2)

# Build the summary once and keep it; the earlier extra call whose result was
# thrown away only repeated the same work.
mm_df = helper1.score_summary()
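# NOTE: it is not yet clear which scores 'mean_test_score' is the mean of, or
# which scoring options are available. TODO: check the GridSearchCV
# documentation for `scoring` and `cv_results_`.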
#%%
class ClfSwitcher(BaseEstimator):
    """A custom BaseEstimator that can switch between classifiers.

    The wrapped classifier is an ordinary constructor parameter, so a
    parameter grid can replace the classifier itself (``estimator``) as well
    as tune its settings (``estimator__<name>``). Every method simply
    delegates to the wrapped classifier.

    :param estimator: sklearn object - The classifier
    """

    def __init__(
        self,
        # NOTE(review): this default instance is created once, when the class
        # is defined, and is therefore shared by every ClfSwitcher() built
        # without an argument. Kept as-is so the default stays an
        # SGDClassifier for existing callers.
        estimator = SGDClassifier(),
    ):
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped classifier and return ``self``.

        :param X: training features
        :param y: training targets
        :param kwargs: extra fit parameters (e.g. ``sample_weight``); these
            are forwarded to the wrapped classifier rather than dropped
        """
        self.estimator.fit(X, y, **kwargs)
        return self

    def predict(self, X, y=None):
        """Return the wrapped classifier's predictions for ``X``.

        ``y`` is accepted but not used.
        """
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped classifier's ``predict_proba`` output for ``X``."""
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped classifier's own ``score`` on ``X`` and ``y``."""
        return self.estimator.score(X, y)
# One grid per candidate classifier. Every key is addressed through the
# pipeline step name ('clf') and then ClfSwitcher's 'estimator' parameter;
# keys without that prefix are rejected by GridSearchCV as invalid
# parameters of the pipeline.
parameters = [
    {
        'clf__estimator': [SGDClassifier()],  # SVM if hinge loss / logreg if log loss
        'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
        'clf__estimator__max_iter': [50, 80],
        'clf__estimator__tol': [1e-4],
        # NOTE(review): newer scikit-learn releases rename the 'log' loss to
        # 'log_loss' - adjust to the installed version.
        'clf__estimator__loss': ['hinge', 'log', 'modified_huber'],
    },
    {
        'clf__estimator': [MultinomialNB()],
        'clf__estimator__alpha': (1e-2, 1e-3, 1e-1),
    },
    {
        'clf__estimator': [LogisticRegression()],
        'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        # NOTE(review): not every penalty/solver pair is supported by
        # LogisticRegression (e.g. 'l1' with 'lbfgs'); such candidates fail
        # to fit and are handled by GridSearchCV's error_score setting.
        'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'],
        'clf__estimator__max_iter': list(range(100, 800, 100)),
        'clf__estimator__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    },
]

# Scale the features, then hand them to whichever classifier the grid selects.
pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

gscv = GridSearchCV(
    pipeline,
    parameters,
    cv=5,
    n_jobs=12,
    return_train_score=False,
    verbose=3,
)
#gscv.fit(train_data, train_labels)
#%% my numerical data
# Stratified train/test split: 33% of the rows are held out and the split is
# stratified on 'mutation_class'.
X_train, X_test, y_train, y_test = train_test_split(
    num_df_wtgt[numerical_FN],
    num_df_wtgt['mutation_class'],
    test_size=0.33,
    random_state=2,
    shuffle=True,
    stratify=num_df_wtgt['mutation_class'],
)

# Bar charts of the class counts in each partition.
y_train.to_frame().value_counts().plot(kind='bar')
y_test.to_frame().value_counts().plot(kind='bar')
#%%
# Run the grid search on the training partition and report the winner.
gscv.fit(X_train, y_train)
print('Best model:\n', gscv.best_params_)
print('Best models score:\n', gscv.best_score_)
gscv.score(X_test, y_test)  # see how it does on test
#===========================================
# F1 score of the refitted best model on the held-out partition.
mod_pred = gscv.predict(X_test)
fscore = f1_score(y_test, mod_pred)
fscore
#%% same as above
# custom classifier
class MyClassifier(BaseEstimator):
    """A custom BaseEstimator that can switch between classifiers by name.

    The classifier is chosen by a string and is only instantiated inside
    ``fit``, where it is stored as ``classifier_``.

    :param classifier_type: string - The switch for different classifiers;
        ``'SGDClassifier'`` or ``'MultinomialNB'``
    """

    def __init__(self, classifier_type: str = 'SGDClassifier'):
        self.classifier_type = classifier_type

    def fit(self, X, y=None):
        """Create the classifier named by ``classifier_type`` and fit it.

        :param X: training features
        :param y: training targets
        :returns: ``self``
        :raises ValueError: if ``classifier_type`` is not a supported name
        """
        if self.classifier_type == 'SGDClassifier':
            self.classifier_ = SGDClassifier()
        elif self.classifier_type == 'MultinomialNB':
            self.classifier_ = MultinomialNB()
        else:
            raise ValueError('Unkown classifier type.')
        self.classifier_.fit(X, y)
        return self

    def predict(self, X, y=None):
        """Return the fitted classifier's predictions for ``X``.

        ``y`` is accepted but not used.
        """
        return self.classifier_.predict(X)

    def score(self, X, y):
        """Return the fitted classifier's own ``score`` on ``X`` and ``y``."""
        # The fitted model is stored as ``classifier_``; this class never
        # sets an ``estimator`` attribute, so reading it raised
        # AttributeError.
        return self.classifier_.score(X, y)
# Same scaling step as before, but the classifier is chosen by name.
pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', MyClassifier()),
])

# MyClassifier's only constructor parameter is `classifier_type`, so that is
# the only key the grid can address. The previous 'clf__estimator...' keys
# belong to a wrapper that takes an `estimator` argument and are rejected by
# this pipeline as invalid parameters.
parameter_space = {
    'clf__classifier_type': ['SGDClassifier', 'MultinomialNB'],
}

search = GridSearchCV(pipeline, parameter_space, n_jobs=-1, cv=5)
search.fit(X_train, y_train)
print('Best model:\n', search.best_params_)
# Report this search's own score (the original printed gscv.best_score_,
# i.e. the result of a different grid search).
print('Best models score:\n', search.best_score_)