# ML_AI_training/base_estimator.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 09:50:37 2022

@author: tanu
"""
#https://stackoverflow.com/questions/50272416/gridsearch-on-model-and-classifiers

#%%
# https://github.com/davidsbatista/machine-learning-notebooks/blob/master/hyperparameter-across-models.ipynb
import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn.base import BaseEstimator
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
#%%
class EstimatorSelectionHelper:
    """Run one GridSearchCV per estimator and collate the results.

    :param models: dict mapping a display name to an estimator instance
    :param params: dict mapping the same names to the parameter grid
        (a dict, or a list of dicts) handed to GridSearchCV for that estimator

    After ``fit`` the fitted searches are available in ``grid_searches``,
    keyed by the same names.
    """

    def __init__(self, models, params):
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, **grid_kwargs):
        """Fit a GridSearchCV for every model.

        :param X: training data, passed to ``GridSearchCV.fit``
        :param y: training targets, passed to ``GridSearchCV.fit``
        :param grid_kwargs: extra keyword arguments forwarded to the
            ``GridSearchCV`` constructor (e.g. ``scoring``, ``cv``, ``n_jobs``)
        :raises KeyError: if a model has no entry in ``params``
        """
        # Fail before any search runs rather than part-way through, so a
        # forgotten grid does not waste the searches that precede it.
        missing = [key for key in self.keys if key not in self.params]
        if missing:
            raise KeyError('No parameter grid given for: %s' % missing)

        for key in self.keys:
            print('Running GridSearchCV for %s.' % key)
            grid_search = GridSearchCV(
                self.models[key], self.params[key], **grid_kwargs)
            grid_search.fit(X, y)
            self.grid_searches[key] = grid_search
        print('Done.')

    def score_summary(self, sort_by='mean_test_score'):
        """Return the cv results of all fitted searches as one DataFrame.

        The per-parameter ``param_*`` columns and the per-search rank columns
        are removed, an ``estimator`` column is placed first, and the rows
        are ordered by ``sort_by`` in descending order.

        :param sort_by: name of the ``cv_results_`` column to sort on
        :return: pandas DataFrame with one row per evaluated candidate
        :raises ValueError: if no search has been fitted yet
        """
        if not self.grid_searches:
            raise ValueError(
                'No grid searches to summarise; call fit() first.')

        frames = []
        for name, grid_search in self.grid_searches.items():
            frame = pd.DataFrame(grid_search.cv_results_)
            # Keep every column whose name does not contain 'param_'.
            frame = frame.filter(regex='^(?!.*param_).*$')
            frames.append(frame.assign(estimator=name))

        df = pd.concat(frames)
        df = df.sort_values([sort_by], ascending=False)
        # drop=True avoids materialising the old index as an 'index' column.
        df = df.reset_index(drop=True)

        # Ranks are relative to a single search, so they are dropped from the
        # combined table. `columns=` is used because recent pandas no longer
        # accepts `axis` positionally.
        rank_columns = [col for col in df.columns
                        if str(col).startswith('rank_test_')]
        df = df.drop(columns=rank_columns)

        columns = ['estimator'] + [col for col in df.columns
                                   if col != 'estimator']
        return df[columns]

#%%
# Demo: grid-search four ensemble classifiers on the breast-cancer dataset
# and collect every candidate's cross-validation results in one table.
breast_cancer = datasets.load_breast_cancer()
X_cancer = breast_cancer.data
y_cancer = breast_cancer.target


models1 = {
    'ExtraTreesClassifier': ExtraTreesClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier()
}

# One grid per model; a list of dicts means several independent grids are
# searched for that model.
params1 = {
    'ExtraTreesClassifier': { 'n_estimators': [16, 32] },
    'RandomForestClassifier': [
        { 'n_estimators': [16, 32] },
        {'criterion': ['gini', 'entropy'], 'n_estimators': [8, 16]}],
    'AdaBoostClassifier':  { 'n_estimators': [16, 32] },
    'GradientBoostingClassifier': { 'n_estimators': [16, 32], 'learning_rate': [0.8, 1.0] }
}


helper1 = EstimatorSelectionHelper(models1, params1)
helper1.fit(X_cancer, y_cancer, scoring='f1', n_jobs=2)

# Build the summary once and keep it; a bare `helper1.score_summary()` call
# would compute the same table and throw it away.
mm_df = helper1.score_summary()
# NOTE: 'mean_test_score' is the mean over the cross-validation folds of the
# metric selected with `scoring` (F1 here). The accepted `scoring` values are
# listed in the scikit-learn "scoring parameter" documentation.

#%%

class ClfSwitcher(BaseEstimator):
    """
    A custom BaseEstimator that can switch between classifiers.

    Every method delegates to the wrapped ``estimator``, so a grid search can
    swap the classifier itself (``<step>__estimator``) and tune its
    hyper-parameters (``<step>__estimator__<param>``).

    :param estimator: sklearn object - The classifier
    """

    # NOTE: the default SGDClassifier() is created once, when the class is
    # defined, and is therefore shared by every ClfSwitcher built without an
    # explicit estimator. Kept as-is so the default stays unchanged.
    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        self.estimator = estimator


    def fit(self, X, y=None, **kwargs):
        """
        Fit the wrapped estimator.

        :param X: training data
        :param y: training targets
        :param kwargs: extra fit parameters, forwarded to the wrapped
            estimator's ``fit`` (previously accepted but ignored)
        :return: self
        """
        self.estimator.fit(X, y, **kwargs)
        return self

    def predict(self, X, y=None):
        """Return the wrapped estimator's predictions for X (y is unused)."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped estimator's ``predict_proba`` output for X."""
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped estimator's own ``score`` on (X, y)."""
        return self.estimator.score(X, y)

# Parameter grids for a pipeline whose classifier step is named 'clf' and is
# a ClfSwitcher: 'clf__estimator' selects the classifier, and every
# hyper-parameter of that classifier must carry the 'clf__estimator__' prefix
# so that it is routed through the pipeline to the wrapped estimator.
parameters = [
    {
        'clf__estimator': [SGDClassifier()], # SVM if hinge loss / logreg if log loss
        #'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
        #'tfidf__stop_words': ['english', None],
        'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
        'clf__estimator__max_iter': [50, 80],
        'clf__estimator__tol': [1e-4],
        # NOTE(review): newer scikit-learn releases spell this loss
        # 'log_loss' - confirm against the installed version.
        'clf__estimator__loss': ['hinge', 'log', 'modified_huber'],
    },
    {
        'clf__estimator': [MultinomialNB()],
        #'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
        #'tfidf__stop_words': [None],
        'clf__estimator__alpha': (1e-2, 1e-3, 1e-1),
    },

    {
        'clf__estimator': [LogisticRegression()],
        # These keys were previously bare ('C', 'penalty', ...), which are
        # not parameters of the pipeline itself.
        'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        # NOTE(review): not every penalty/solver pair is supported by
        # LogisticRegression, and newer releases expect None instead of
        # 'none' - confirm which combinations are intended.
        'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'],
        'clf__estimator__max_iter': list(range(100,800,100)),
        'clf__estimator__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    },
]

# Pipeline: rescale the features with MinMaxScaler, then classify with the
# switchable wrapper so the grid search can swap the classifier itself.
pipeline = Pipeline(steps=[
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])


# 5-fold grid search over every grid in `parameters`, using 12 parallel jobs.
gscv = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=5,
    n_jobs=12,
    return_train_score=False,
    verbose=3,
)

#gscv.fit(train_data, train_labels)

#%% my numerical data
# Stratified 67/33 split on 'mutation_class' with a fixed seed.
# NOTE(review): `num_df_wtgt` and `numerical_FN` are not defined in this
# file - presumably they come from an earlier session or script; confirm.
X_train, X_test, y_train, y_test = train_test_split(
    num_df_wtgt[numerical_FN],
    num_df_wtgt['mutation_class'],
    test_size=0.33,
    random_state=2,
    shuffle=True,
    stratify=num_df_wtgt['mutation_class'],
)

# Bar charts of the class counts in each partition.
y_train.to_frame().value_counts().plot(kind='bar')
y_test.to_frame().value_counts().plot(kind='bar')
#%%
# Run the search on the training split and report the winning candidate.
gscv.fit(X_train, y_train)
print('Best model:\n', gscv.best_params_)
print('Best models score:\n', gscv.best_score_)

# Held-out performance of the refitted best pipeline. Printed explicitly:
# as a bare expression the value is discarded when the file runs as a script.
print('Test score:\n', gscv.score(X_test, y_test))

#===========================================
mod_pred = gscv.predict(X_test)

# NOTE(review): f1_score is called with its default averaging, which assumes
# 'mutation_class' is a binary target - confirm.
fscore  = f1_score(y_test, mod_pred)
print('Test F1 score:\n', fscore)
#%% same as above
# custom classifier

class MyClassifier(BaseEstimator):
    """
    A custom BaseEstimator that can switch between classifiers by name.

    The classifier is instantiated with its default settings inside ``fit``
    and stored as ``classifier_``.

    :param classifier_type: string - The switch for different classifiers;
        one of 'SGDClassifier' or 'MultinomialNB'
    """

    def __init__(self, classifier_type: str = 'SGDClassifier'):
        self.classifier_type = classifier_type


    def fit(self, X, y=None):
        """
        Build the classifier named by ``classifier_type`` and fit it.

        :param X: training data
        :param y: training targets
        :return: self
        :raises ValueError: if ``classifier_type`` is not a supported name
        """
        if self.classifier_type == 'SGDClassifier':
            self.classifier_ = SGDClassifier()
        elif self.classifier_type == 'MultinomialNB':
            self.classifier_ = MultinomialNB()
        else:
            raise ValueError('Unkown classifier type.')

        self.classifier_.fit(X, y)
        return self

    def predict(self, X, y=None):
        """Return the fitted classifier's predictions for X (y is unused)."""
        return self.classifier_.predict(X)

    def score(self, X, y):
        """Return the fitted classifier's own ``score`` on (X, y)."""
        # The fitted model lives in `classifier_`; this class never sets an
        # `estimator` attribute, so the previous lookup always failed.
        return self.classifier_.score(X, y)

# Same pipeline shape as before, but with the name-switched classifier.
pipeline = Pipeline([
     ('pre', MinMaxScaler())
    , ('clf', MyClassifier())
])

# MyClassifier's only constructor parameter is `classifier_type`, so that is
# the only thing the grid can vary. The 'clf__estimator*' keys belong to
# ClfSwitcher and are not valid parameters for this pipeline.
parameter_space = {
    'clf__classifier_type': ['SGDClassifier', 'MultinomialNB'],
}

search = GridSearchCV(pipeline , parameter_space, n_jobs=-1, cv=5)
search.fit(X_train, y_train)
print('Best model:\n', search.best_params_)
# Report this search's score (the earlier `gscv` object was printed before).
print('Best models score:\n', search.best_score_)