# ML_AI_training/base_estimator3.py

#%% Import libs
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.base import BaseEstimator
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.metrics import (make_scorer, accuracy_score, f1_score,
                             matthews_corrcoef, precision_score, recall_score,
                             roc_auc_score, jaccard_score)
from xgboost import XGBClassifier

# X_train, y_train, y and blind_test_df are expected to come from the upstream
# data-loading/split script and are not defined in this file.
# Assumption: rs holds the shared random-state kwargs passed to the estimators below.
rs = {'random_state': 42}
#######################################################
#%% Quick look at class balance in the training target and the blind test set
y.to_frame().value_counts().plot(kind = 'bar')
blind_test_df['dst_mode'].to_frame().value_counts().plot(kind = 'bar')

# Multi-metric scoring dict for CV (all scorers are imported above)
scoring_fn = ({'accuracy' : make_scorer(accuracy_score)
, 'fscore' : make_scorer(f1_score)
, 'mcc' : make_scorer(matthews_corrcoef)
, 'precision' : make_scorer(precision_score)
, 'recall' : make_scorer(recall_score)
, 'roc_auc' : make_scorer(roc_auc_score)
#, 'jaccard' : make_scorer(jaccard_score)
})
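# A minimal sketch (assuming X_train/y_train are defined upstream, as in the rest
# of this script) of how scoring_fn can be used: cross_validate() evaluates every
# scorer in the dict in one pass and returns one 'test_<name>' column per metric.
from sklearn.model_selection import cross_validate
cv_multi = cross_validate(LogisticRegression(**rs), X_train, y_train,
                          scoring = scoring_fn, cv = 10)
print(pd.DataFrame(cv_multi)[['test_accuracy', 'test_fscore', 'test_mcc']].mean())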
#%% Logistic Regression + hyperparam: GridSearch
# Note: estimator (step) names must not contain '__'
# '__' separates the step name from its param names (e.g. clf__max_iter)
# when using clf_switcher, '__' appears twice (e.g. clf__estimator__max_iter)
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
# FIXME: not all solver/penalty combinations are compatible; see the compatible-grid sketch just after this grid
grid_params_log_reg = [{
#'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
#'clf__penalty': ['none', 'l1', 'l2', 'elasticnet'],
'clf__max_iter': list(range(100,800,100)),
'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
}]
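# Sketch for the FIXME above: splitting the grid into compatible blocks avoids
# invalid penalty/solver pairs ('l1' is only supported by liblinear/saga, while all
# the listed solvers support 'l2'). This alternative grid is illustrative only and
# is not wired into the search below.
grid_params_log_reg_compat = [
    {'clf__penalty': ['l2'],
     'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
     'clf__C': [0.01, 0.1, 1, 10],
     'clf__max_iter': list(range(100, 800, 100))},
    {'clf__penalty': ['l1'],
     'clf__solver': ['liblinear', 'saga'],
     'clf__C': [0.01, 0.1, 1, 10],
     'clf__max_iter': list(range(100, 800, 100))},
]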
pipe_log_reg = Pipeline([
('pre', MinMaxScaler())
,('clf', LogisticRegression(**rs))])
gs_log_reg = GridSearchCV(pipe_log_reg
, param_grid = grid_params_log_reg
, scoring ='f1' , refit = 'f1' # works
#, scoring = mcc_score_fn, refit = 'mcc'
#, scoring = scoring_fn, refit = False # with refit = False no best_estimator_ is refit, so the search cannot predict
, cv = 10
, n_jobs = 10 # based on /proc/cpuinfo
, return_train_score = False
, verbose = 3)
gs_log_reg.fit(X_train, y_train)
#gs_log_reg_fit = gs_log_reg.fit(X_train, y_train)
#gs_log_reg_fit_res = gs_log_reg.cv_results_ # see the cv_results_ sketch below for how to inspect this
#pp.pprint(gs_log_reg_fit_res)
#y_predict = gs_log_reg.predict(X_test)
#print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_predict))
print('Best model:\n', gs_log_reg.best_params_)
print("Best model's score:\n", gs_log_reg.best_score_)
#GridSearchCV giving score from the best estimator different from the one indicated in refit parameter
#https://stackoverflow.com/questions/66116996/gridsearchcv-giving-score-from-the-best-estimator-different-from-the-one-indicat
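# A short sketch of how cv_results_ can be inspected (see the commented lines above):
# it is a dict of arrays with one entry per parameter combination, so it converts
# cleanly to a DataFrame; the column names below are standard GridSearchCV output.
cv_res_df = pd.DataFrame(gs_log_reg.cv_results_)
print(cv_res_df[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]
      .sort_values('rank_test_score')
      .head())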
#%% Logistic Regression + hyperparam: BaseEstimator: ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - the classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)
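# Minimal usage sketch for ClfSwitcher on its own (X_train/y_train assumed upstream,
# variable name illustrative): it simply delegates fit/predict/score to whichever
# estimator it wraps.
switcher = ClfSwitcher(estimator = LogisticRegression(**rs))
switcher.fit(X_train, y_train)
print('ClfSwitcher training score:', round(switcher.score(X_train, y_train), 2))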
parameters = [
{
'clf__estimator': [LogisticRegression(**rs)],
#'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
#'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'],
'clf__estimator__max_iter': list(range(100,800,100)),
'clf__estimator__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}
]
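# Sketch of what ClfSwitcher enables: one search over several estimator families.
# RandomForestClassifier is already imported above; this extended grid is purely
# illustrative and is not the grid passed to gscv below.
parameters_multi = parameters + [
    {'clf__estimator': [RandomForestClassifier(**rs)],
     'clf__estimator__n_estimators': [100, 300],
     'clf__estimator__max_depth': [None, 10]},
]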
pipeline = Pipeline([
('pre', MinMaxScaler()),
('clf', ClfSwitcher()),
])
gscv = GridSearchCV(pipeline
, parameters
, scoring = 'f1', refit = 'f1'
, cv = 10
, n_jobs = 10 # based on /proc/cpuinfo
, return_train_score = False
, verbose = 3)
# Fit
gscv.fit(X_train, y_train)
print('Best model:\n', gscv.best_params_)
print("Best model's score:\n", gscv.best_score_, ':', round(gscv.best_score_, 2))
# gscv.score(X_test, y_test) # see how it does on test
# check_score = f1_score(y_train, gscv.predict(X_train))
# check_score # should be the same as the best score when the same metric used!
# mod_pred = gscv.predict(X_test)
# fscore = f1_score(y_test, mod_pred)
# fscore
# gscv is already fitted above, so cv_results_ can be read directly without refitting
gscv_fit_be_res = gscv.cv_results_
print('\nMean test score from fit results:', round(np.mean(gscv_fit_be_res['mean_test_score']), 2))
best_model = gscv.best_params_
print('Best params keys  :', list(best_model.keys()))
print('Best params values:', list(best_model.values()))
# Sanity check: 10-fold CV of a plain LogisticRegression with the chosen solver
lr_cval_scores = cross_val_score(LogisticRegression(random_state = 42
                                                    , solver = 'liblinear'
                                                    , max_iter = 100)
                                 , X_train
                                 , y_train
                                 , cv = 10)
print('Per-fold CV scores:', lr_cval_scores)
cval = round(np.mean(lr_cval_scores), 2)
######## Check
print("Best model's score:", round(gscv.best_score_, 2))
print('Mean test score from fit results:', round(np.mean(gscv_fit_be_res['mean_test_score']), 2))
print("Best model's cval:", cval)
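# A hedged sketch of a final check on held-out data: X_test/y_test (or labels from
# blind_test_df loaded upstream) are not defined in this script, so the names below
# mirror the commented-out checks above and are illustrative only.
# test_pred = gscv.predict(X_test)
# print('Held-out F1 :', round(f1_score(y_test, test_pred), 2))
# print('Held-out MCC:', round(matthews_corrcoef(y_test, test_pred), 2))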