added scripts for playing base_estimator

2022-03-17 18:20:19 +00:00 · 2022-03-17 18:20:19 +00:00 · de05652ef6
commit de05652ef6
parent 5138036d8b
2 changed files with 364 additions and 0 deletions
--- a/base_estimator2.py
+++ b/base_estimator2.py
@ -0,0 +1,195 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 Created on Tue Mar 15 09:50:37 2022
@author: tanu
 """
 #https://stackoverflow.com/questions/50272416/gridsearch-on-model-and-classifiers
 #%%
 # https://github.com/davidsbatista/machine-learning-notebooks/blob/master/hyperparameter-across-models.ipynb
 import numpy as np
 import pandas as pd
 from sklearn.model_selection import GridSearchCV
 from sklearn import datasets
 from sklearn.ensemble import ExtraTreesClassifier
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.ensemble import AdaBoostClassifier
 from sklearn.ensemble import GradientBoostingClassifier
 from sklearn.svm import SVC
 from sklearn.base import BaseEstimator
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.linear_model import SGDClassifier
 from sklearn.pipeline import Pipeline
 from sklearn.model_selection import GridSearchCV
 from sklearn.linear_model import LogisticRegression
 from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
 from xgboost import XGBClassifier
 #%%
 #%% my numerical data
 # Local imports: this cell uses these names, but the file's import section
 # never brings them into scope.
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import (accuracy_score, f1_score, make_scorer,
                              matthews_corrcoef, precision_score,
                              recall_score, roc_auc_score)
 # Stratified 67/33 train/test split of the numerical features.
 # NOTE(review): `num_df_wtgt` and `numerical_FN` are not defined in this file;
 # presumably they come from the interactive session -- confirm before running
 # this script on its own.
 X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
                                                    , num_df_wtgt['mutation_class']
                                                    , test_size    = 0.33
                                                    , random_state = 2
                                                    , shuffle      = True
                                                    , stratify     = num_df_wtgt['mutation_class'])
 # Bar charts of the class counts in each partition.
 y_train.to_frame().value_counts().plot(kind = 'bar')
 y_test.to_frame().value_counts().plot(kind = 'bar')
 # Scorers keyed by metric name, intended for multi-metric GridSearchCV runs.
 # NOTE(review): make_scorer() with no extra arguments scores the output of
 # predict(), so 'roc_auc' here is computed from hard class labels rather than
 # probabilities -- confirm this is what is wanted.
 scoring_fn =  ({'accuracy'      : make_scorer(accuracy_score)
                 , 'fscore'     : make_scorer(f1_score)
                 , 'mcc'        : make_scorer(matthews_corrcoef)
                 ,  'precision' : make_scorer(precision_score)
                 ,  'recall'    : make_scorer(recall_score)
                 ,  'roc_auc'   : make_scorer(roc_auc_score)
                 #,  'jaccard'   : make_scorer(jaccard_score)
            })
 #%% ClfSwitcher()
 class ClfSwitcher(BaseEstimator):
    """
    Wrapper estimator that forwards every call to a swappable inner classifier.
    Because the classifier is an ordinary constructor parameter, a grid search
    can treat the model itself as a hyperparameter ('clf__estimator') and tune
    the inner model's settings through 'clf__estimator__<param>'.
    """
    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier to delegate to.
            NOTE: the default instance is created once, when the class is
            defined, and is shared by every ClfSwitcher() built without an
            explicit estimator.
        """
        self.estimator = estimator
    def fit(self, X, y=None, **kwargs):
        """
        Fit the wrapped classifier.
        :param X: training features
        :param y: training labels
        :param kwargs: extra fit parameters (e.g. sample_weight), passed
            through unchanged to the wrapped classifier's fit()
        :return: self, so calls can be chained
        """
        # Forward the fit parameters; previously they were accepted and then
        # silently dropped.
        self.estimator.fit(X, y, **kwargs)
        return self
    def predict(self, X, y=None):
        """Return the wrapped classifier's predictions for X (y is accepted but unused)."""
        return self.estimator.predict(X)
    def predict_proba(self, X):
        """Return the wrapped classifier's predict_proba(X)."""
        return self.estimator.predict_proba(X)
    def score(self, X, y):
        """Return the wrapped classifier's own score(X, y)."""
        return self.estimator.score(X, y)
 # One search space per candidate classifier. Every key is addressed through
 # the 'clf' pipeline step and its 'estimator' parameter.
 # SGDClassifier: hinge loss gives a linear SVM, log loss a logistic regression.
 sgd_grid = dict(
     clf__estimator = [SGDClassifier()],
     clf__estimator__penalty = ('l2', 'elasticnet', 'l1'),
     clf__estimator__max_iter = [50, 80],
     clf__estimator__tol = [1e-4],
     clf__estimator__loss = ['hinge', 'log', 'modified_huber'],
 )
 # MultinomialNB: only the smoothing strength is tuned.
 nb_grid = dict(
     clf__estimator = [MultinomialNB()],
     clf__estimator__alpha = (1e-2, 1e-3, 1e-1),
 )
 # Further classifiers (e.g. LogisticRegression) can be added as extra dicts
 # that use the same 'clf__estimator__<param>' key pattern.
 parameters = [sgd_grid, nb_grid]
 # Scale features to [0, 1], then hand them to the switchable classifier.
 pipeline = Pipeline(steps = [('pre', MinMaxScaler()),
                              ('clf', ClfSwitcher())])
 # Local import: f1_score is used below but never imported by this file.
 from sklearn.metrics import f1_score
 # 5-fold grid search over every classifier/parameter combination in
 # `parameters`. No `scoring` is given, so candidates are ranked with the
 # pipeline's own score() method.
 gscv = GridSearchCV(pipeline
                     , parameters
                     , cv=5
                     , n_jobs=12
                     , return_train_score=False
                     , verbose=3)
 # Fit
 gscv.fit(X_train, y_train)
 print('Best model:\n', gscv.best_params_)
 print('Best models score:\n', gscv.best_score_)
 # Held-out performance of the refit best model. These values used to be
 # computed and then discarded; print them so they show up in a script run.
 test_score = gscv.score(X_test, y_test)
 print('Test set score for best params:\n', test_score)
 mod_pred = gscv.predict(X_test)
 fscore  = f1_score(y_test, mod_pred)
 print('Test set f1 score for best params:\n', fscore)
 #%% GridSearchCV: single model
 #https://stackoverflow.com/questions/71079357/invalid-parameter-clf-learning-rate-for-estimator-pipeline
 # Local import: accuracy_score is used below but never imported by this file.
 from sklearn.metrics import accuracy_score
 pipe_xgb = Pipeline([('clf', XGBClassifier(random_state=42, use_label_encoder=False) )])
 # Grid keys are '<pipeline step>__<parameter name>'. The double underscore is
 # only the separator; the XGBClassifier parameters themselves are spelled
 # max_depth, n_estimators and learning_rate. The earlier 'max__depth' style
 # keys did not name real parameters, so they were not tuning these settings.
 grid_params_xgb = [{'clf__max_depth': [2, 4],
                     'clf__n_estimators': [50, 100],
                     'clf__learning_rate': [0.0001, 0.001]}]
 # 10-fold grid search, ranked by accuracy.
 gs_xgb = GridSearchCV(estimator = pipe_xgb,
                param_grid = grid_params_xgb,
                scoring='accuracy',
                cv=10,
                n_jobs=5)
 gs_xgb.fit(X_train, y_train)
 # predict() uses the best parameter set refit on the whole training data.
 y_predict = gs_xgb.predict(X_test)
 print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_predict))
 print('Best model:\n', gs_xgb.best_params_)
 print('Best models score:\n', gs_xgb.best_score_)
 # Best model recorded from an earlier run that still used the mis-spelled keys
 # (re-run to get the result for the corrected grid):
 # {'clf__learning__rate': 0.0001, 'clf__max__depth': 2, 'clf__n__estimators': 50}
 #NOTE: takes time to run!
 #%% model
 # Note: parameter keys cannot contain '___' (three underscores)
 # '__' is used only between the pipeline step name and the parameter name
 # '__' is used in both places ('clf__estimator__<param>') when using ClfSwitcher
 # Local import: accuracy_score is used below but never imported by this file.
 from sklearn.metrics import accuracy_score
 pipe_log_reg = Pipeline([('clf', LogisticRegression(random_state=42) )])
 # NOTE(review): not every solver supports every penalty, so some of these
 # combinations are expected to fail to fit and be scored as NaN -- consider
 # splitting the grid into compatible penalty/solver groups.
 grid_params_log_reg = [{
        #'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'clf__penalty': ['none', 'l1', 'l2', 'elasticnet'],
        'clf__max_iter': list(range(100,800,100)),
        'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    }]
 # Single-metric search. For multi-metric scoring (e.g. scoring = scoring_fn)
 # `refit` must name one of the scorer keys; with refit = False the search
 # cannot predict() and the best_* attributes are not available.
 gs_log_reg = GridSearchCV(estimator = pipe_log_reg
               , param_grid = grid_params_log_reg
               , scoring='accuracy'
               , cv=10
               , n_jobs=5)
 # Fit once: fit() returns the search object itself, so the old second call
 # only repeated the whole 10-fold search.
 gs_log_reg_fit = gs_log_reg.fit(X_train, y_train)
 # cv_results_ is a dict of per-candidate arrays; pd.DataFrame(gs_log_reg_fit_res)
 # turns it into a table with one row per parameter combination.
 gs_log_reg_fit_res = gs_log_reg.cv_results_
 # Predict with this model. Previously y_predict was left over from the
 # XGBoost cell, so the accuracy printed below was not the logistic regression's.
 y_predict = gs_log_reg.predict(X_test)
 print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_predict))
 print('Best model:\n', gs_log_reg.best_params_)
 print('Best models score:\n', gs_log_reg.best_score_)
 # note:  For multi-metric scoring, the parameter refit must be set to a scorer key or a callable to refit an estimator with the best parameter setting on the whole data and make the best_* attributes available for that metric.
 # If this is not needed, refit should be set to False explicitly. True was passed.
 #refit : boolean, string, or callable, default=True
 #Refit an estimator using the best found parameters on the whole dataset. For multiple metric evaluation, this needs to be a string denoting the scorer that would be used to find the best parameters for refitting the estimator at the end. Where there are considerations other than maximum score in choosing a best estimator, refit can be set to a function which returns the selected best_index_ given cv_results_. The refitted estimator is made available at the best_estimator_ attribute and permits using predict directly on this GridSearchCV instance. Also for multiple metric evaluation, the attributes best_index_, best_score_ and best_params_ will only be available if refit is set and all of them will be determined w.r.t this specific scorer. best_score_ is not 
 # returned if refit is callable. See scoring parameter to know more about multiple metric evaluation.
 # This GridSearchCV instance was initialized with `refit=False`. predict is available only after refitting on the best parameters. You can refit an estimator manually using the `best_params_` attribute
 #https://stackoverflow.com/questions/57986374/how-to-fix-the-error-for-multi-metric-scoring-for-oneclasssvm-and-gridsearchcv
 # PROBLEM: using multiple scoring metrics with GridSearchCV
 #https://stackoverflow.com/questions/53973563/using-multiple-metric-evaluation-with-gridsearchcv
--- a/base_estimator3.py
+++ b/base_estimator3.py
@ -0,0 +1,169 @@
 #%% Import libs
 import numpy as np
 import pandas as pd
 from sklearn.model_selection import GridSearchCV
 from sklearn import datasets
 from sklearn.ensemble import ExtraTreesClassifier
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.ensemble import AdaBoostClassifier
 from sklearn.ensemble import GradientBoostingClassifier
 from sklearn.svm import SVC
 from sklearn.base import BaseEstimator
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.linear_model import SGDClassifier
 from sklearn.pipeline import Pipeline
 from sklearn.model_selection import GridSearchCV
 from sklearn.linear_model import LogisticRegression
 from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
 from xgboost import XGBClassifier
 #%% Get train-test split and scoring functions
 # Local imports: this cell uses these names, but the file's import section
 # never brings them into scope.
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import (accuracy_score, f1_score, make_scorer,
                              matthews_corrcoef, precision_score,
                              recall_score, roc_auc_score)
 # Stratified 67/33 train/test split of the numerical features.
 # NOTE(review): `num_df_wtgt` and `numerical_FN` are not defined in this file;
 # presumably they come from the interactive session -- confirm before running
 # this script on its own.
 X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
                                                    , num_df_wtgt['mutation_class']
                                                    , test_size    = 0.33
                                                    , random_state = 2
                                                    , shuffle      = True
                                                    , stratify     = num_df_wtgt['mutation_class'])
 # Bar charts of the class counts in each partition.
 y_train.to_frame().value_counts().plot(kind = 'bar')
 y_test.to_frame().value_counts().plot(kind = 'bar')
 # Scorers keyed by metric name, intended for multi-metric GridSearchCV runs.
 # NOTE(review): make_scorer() with no extra arguments scores the output of
 # predict(), so 'roc_auc' here is computed from hard class labels rather than
 # probabilities -- confirm this is what is wanted.
 scoring_fn =  ({'accuracy'      : make_scorer(accuracy_score)
                 , 'fscore'     : make_scorer(f1_score)
                 , 'mcc'        : make_scorer(matthews_corrcoef)
                 ,  'precision' : make_scorer(precision_score)
                 ,  'recall'    : make_scorer(recall_score)
                 ,  'roc_auc'   : make_scorer(roc_auc_score)
                 #,  'jaccard'   : make_scorer(jaccard_score)
            })
 #%% Logistic Regression + hyperparam: GridSearch
 # Note: parameter keys cannot contain '___' (three underscores)
 # '__' is used only between the pipeline step name and the parameter name
 # '__' is used in both places ('clf__estimator__<param>') when using ClfSwitcher
 # Scorer dict for the alternative setup `scoring = mcc_score_fn, refit = 'mcc'`;
 # the search below does not use it.
 mcc_score_fn = dict(mcc = make_scorer(matthews_corrcoef))
 # FIXME: solvers and penalties conflict, so the penalty is left at its default
 # and only the solver and the iteration budget are searched.
 grid_params_log_reg = [
     dict(clf__max_iter = [*range(100, 800, 100)],
          clf__solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'])
 ]
 # Scale features to [0, 1] before the logistic regression.
 # NOTE(review): `rs` is not defined in this file; presumably a dict of keyword
 # arguments such as the random seed -- confirm where it comes from.
 pipe_log_reg = Pipeline(steps = [('pre', MinMaxScaler()),
                                  ('clf', LogisticRegression(**rs))])
 # 10-fold search ranked, and refit, on f1. Multi-metric scoring with
 # refit = False would leave the search unable to predict.
 gs_log_reg = GridSearchCV(
     estimator = pipe_log_reg,
     param_grid = grid_params_log_reg,
     scoring = 'f1',
     refit = 'f1',
     cv = 10,
     n_jobs = 10,  # based on /proc/cpuinfo
     return_train_score = False,
     verbose = 3,
 )
 gs_log_reg.fit(X_train, y_train)
 # Report the winning parameter set and its mean cross-validated f1.
 print('Best model:\n', gs_log_reg.best_params_)
 print('Best models score:\n', gs_log_reg.best_score_)
 #GridSearchCV giving score from the best estimator different from the one indicated in refit parameter
 #https://stackoverflow.com/questions/66116996/gridsearchcv-giving-score-from-the-best-estimator-different-from-the-one-indicat
 #%% Logistic Regression + hyperparam: BaseEstimator: ClfSwitcher()
 class ClfSwitcher(BaseEstimator):
    """
    Wrapper estimator that forwards every call to a swappable inner classifier.
    Because the classifier is an ordinary constructor parameter, a grid search
    can treat the model itself as a hyperparameter ('clf__estimator') and tune
    the inner model's settings through 'clf__estimator__<param>'.
    """
    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier to delegate to.
            NOTE: the default instance is created once, when the class is
            defined, and is shared by every ClfSwitcher() built without an
            explicit estimator.
        """
        self.estimator = estimator
    def fit(self, X, y=None, **kwargs):
        """
        Fit the wrapped classifier.
        :param X: training features
        :param y: training labels
        :param kwargs: extra fit parameters (e.g. sample_weight), passed
            through unchanged to the wrapped classifier's fit()
        :return: self, so calls can be chained
        """
        # Forward the fit parameters; previously they were accepted and then
        # silently dropped.
        self.estimator.fit(X, y, **kwargs)
        return self
    def predict(self, X, y=None):
        """Return the wrapped classifier's predictions for X (y is accepted but unused)."""
        return self.estimator.predict(X)
    def predict_proba(self, X):
        """Return the wrapped classifier's predict_proba(X)."""
        return self.estimator.predict_proba(X)
    def score(self, X, y):
        """Return the wrapped classifier's own score(X, y)."""
        return self.estimator.score(X, y)
 # Search space for the switchable classifier: a single LogisticRegression
 # candidate, varying the iteration budget and the solver. The penalty is left
 # at its default.
 # NOTE(review): `rs` is not defined in this file; presumably a dict of keyword
 # arguments such as the random seed -- confirm where it comes from.
 parameters = [
     dict(clf__estimator = [LogisticRegression(**rs)],
          clf__estimator__max_iter = [*range(100, 800, 100)],
          clf__estimator__solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'])
 ]
 # Scale features to [0, 1], then hand them to the switchable classifier.
 pipeline = Pipeline(steps = [('pre', MinMaxScaler()),
                              ('clf', ClfSwitcher())])
 # 10-fold search ranked, and refit, on f1.
 gscv = GridSearchCV(
     pipeline,
     parameters,
     scoring = 'f1',
     refit = 'f1',
     cv = 10,
     n_jobs = 10,  # based on /proc/cpuinfo
     return_train_score = False,
     verbose = 3,
 )
 # Local import: cross_val_score is used below but never imported by this file.
 from sklearn.model_selection import cross_val_score
 # Fit
 gscv.fit(X_train, y_train)
 print('Best model:\n', gscv.best_params_)
 print('Best models score:\n', gscv.best_score_, ':' ,round(gscv.best_score_, 2))
 # fit() returns the search object itself, so reuse the fitted search instead
 # of repeating the whole 10-fold grid search a second time.
 gscv_fit_be = gscv
 gscv_fit_be_res = gscv_fit_be.cv_results_
 # Average of the per-candidate mean CV scores. This is a mean over ALL
 # parameter combinations, not only the best one. np.mean replaces the bare
 # `mean`, which this file never defines.
 mean_cv_score = round(np.mean(gscv_fit_be_res['mean_test_score']), 2)
 print('\nMean test score from fit results:', mean_cv_score)
 best_model = gscv.best_params_
 # Independent check with a hand-configured LogisticRegression. It is
 # evaluated once; the earlier duplicate call discarded its result.
 # NOTE(review): this call passes no `scoring` and no MinMaxScaler step, while
 # the grid search above uses scoring = 'f1' on scaled features, so `cval` is
 # not directly comparable with gscv.best_score_ -- confirm the intended check.
 cv_scores = cross_val_score(LogisticRegression(random_state=42
                                    , solver='liblinear'
                                    , max_iter = 100)
                              , X_train
                              , y_train
                              , cv = 10)
 cval = round(np.mean(cv_scores), 2)
 ########check
 print('Best models score:', round(gscv.best_score_, 2))
 print('Mean test score from fit results:', mean_cv_score)
 print('Best models cval:', cval)