added practice and base_estimator for all the confusion in my head

2022-03-16 10:12:59 +00:00 · 2022-03-16 10:12:59 +00:00 · 97620c1bb0
commit 97620c1bb0
parent e28a296d98
3 changed files with 513 additions and 0 deletions
--- a/MultClassPipe3_CALL.py
+++ b/MultClassPipe3_CALL.py
@ -0,0 +1,33 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 Created on Tue Mar 15 11:09:50 2022
@author: tanu
 """
 # stratified shuffle split
 # NOTE(review): train_test_split, num_df_wtgt, numerical_FN, rs, pp, pd and
 # MultClassPipelineCV are neither imported nor defined in this script;
 # presumably they are provided by a previously-run script/session -- confirm.
 # Hold out 33% of the rows, stratified on the 'mutation_class' column.
 # `rs` is unpacked as keyword arguments (presumably {'random_state': ...}).
 X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
                                                    , num_df_wtgt['mutation_class']
                                                    , test_size = 0.33
                                                    , **rs
                                                    , shuffle = True
                                                    , stratify = num_df_wtgt['mutation_class'])
 # Bar plots of the class counts in the train and test partitions.
 y_train.to_frame().value_counts().plot(kind = 'bar')
 y_test.to_frame().value_counts().plot(kind = 'bar')
 # NOTE(review): the return value of this first call is discarded and the
 # identical call is repeated right below -- looks redundant unless it is
 # wanted for its side effects; confirm.
 MultClassPipelineCV(X_train, X_test, y_train, y_test
         , input_df = num_df_wtgt[numerical_FN]
         , var_type = 'numerical')
 skf_cv_scores = MultClassPipelineCV(X_train, X_test, y_train, y_test
         , input_df = num_df_wtgt[numerical_FN]
         , var_type = 'numerical')
 pp.pprint(skf_cv_scores)
 # construct a df
 skf_cv_scores_df = pd.DataFrame(skf_cv_scores)
 # Bare expression: only displays something when run interactively.
 skf_cv_scores_df
 # Keep only the rows whose index label contains 'test_' / 'train_'.
 skf_cv_scores_df_test = skf_cv_scores_df.filter(like='test_', axis=0)
 skf_cv_scores_df_train = skf_cv_scores_df.filter(like='train_', axis=0)
--- a/base_estimator.py
+++ b/base_estimator.py
@ -0,0 +1,236 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 Created on Tue Mar 15 09:50:37 2022
@author: tanu
 """
 #https://stackoverflow.com/questions/50272416/gridsearch-on-model-and-classifiers
 #%%
 # https://github.com/davidsbatista/machine-learning-notebooks/blob/master/hyperparameter-across-models.ipynb
 import numpy as np
 import pandas as pd
 from sklearn import datasets
 from sklearn.base import BaseEstimator
 from sklearn.ensemble import AdaBoostClassifier
 from sklearn.ensemble import ExtraTreesClassifier
 from sklearn.ensemble import GradientBoostingClassifier
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.linear_model import SGDClassifier
 from sklearn.metrics import f1_score
 from sklearn.model_selection import GridSearchCV
 from sklearn.model_selection import train_test_split
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import MinMaxScaler
 from sklearn.svm import SVC
 #%%
 class EstimatorSelectionHelper:
    """Run one GridSearchCV per estimator and tabulate all results together.
    :param models: dict mapping a display name to an unfitted estimator
    :param params: dict mapping the same names to the parameter grid (a dict
        or a list of dicts) that is handed to GridSearchCV for that estimator
    :raises ValueError: if an estimator in ``models`` has no entry in ``params``
    """
    def __init__(self, models, params):
        # Fail early with a readable message rather than with a KeyError
        # halfway through fit(), after earlier searches have already run.
        missing = [name for name in models if name not in params]
        if missing:
            raise ValueError('Some estimators are missing parameters: %s' % missing)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}
    def fit(self, X, y, **grid_kwargs):
        """Fit a GridSearchCV for every registered estimator.
        :param X: training features, passed to GridSearchCV.fit unchanged
        :param y: training target, passed to GridSearchCV.fit unchanged
        :param grid_kwargs: extra keyword arguments for the GridSearchCV
            constructor (e.g. scoring, cv, n_jobs)
        """
        for key in self.keys:
            print('Running GridSearchCV for %s.' % key)
            grid_search = GridSearchCV(self.models[key], self.params[key], **grid_kwargs)
            grid_search.fit(X, y)
            self.grid_searches[key] = grid_search
        print('Done.')
    def score_summary(self, sort_by='mean_test_score'):
        """Return every search's cv_results_ as one DataFrame.
        One row per evaluated parameter combination, sorted by ``sort_by`` in
        descending order, with 'estimator' as the first column. Columns whose
        name contains 'param_' and the 'rank_test_score' column are dropped.
        :param sort_by: name of the cv_results_ column to sort on
        :raises ValueError: if no search has been fitted yet
        """
        if not self.grid_searches:
            raise ValueError('No grid search results available; call fit() first.')
        frames = []
        for name, grid_search in self.grid_searches.items():
            frame = pd.DataFrame(grid_search.cv_results_)
            # Negative look-ahead: keep every column except the per-parameter
            # 'param_*' ones.
            frame = frame.filter(regex='^(?!.*param_).*$')
            frame['estimator'] = [name] * len(frame)
            frames.append(frame)
        df = pd.concat(frames)
        df = df.sort_values([sort_by], ascending=False)
        # drop=True discards the old per-search row numbers instead of adding
        # them as an 'index' column that would have to be removed again.
        df = df.reset_index(drop=True)
        # Keyword form: the positional axis argument of DataFrame.drop was
        # removed in pandas 2.0. The rank is only meaningful within a single
        # search, and is absent under multi-metric scoring, hence 'ignore'.
        df = df.drop(columns=['rank_test_score'], errors='ignore')
        columns = ['estimator'] + [col for col in df.columns if col != 'estimator']
        return df[columns]
 #%%
 # Demo: grid-search four tree-ensemble classifiers on the breast-cancer data
 # set and collect every result in a single table.
 breast_cancer = datasets.load_breast_cancer()
 X_cancer = breast_cancer.data
 y_cancer = breast_cancer.target
 models1 = {
    'ExtraTreesClassifier': ExtraTreesClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
 }
 # A grid may be a single dict or a list of dicts (several sub-grids).
 params1 = {
    'ExtraTreesClassifier': {'n_estimators': [16, 32]},
    'RandomForestClassifier': [
        {'n_estimators': [16, 32]},
        {'criterion': ['gini', 'entropy'], 'n_estimators': [8, 16]}],
    'AdaBoostClassifier': {'n_estimators': [16, 32]},
    'GradientBoostingClassifier': {'n_estimators': [16, 32], 'learning_rate': [0.8, 1.0]},
 }
 helper1 = EstimatorSelectionHelper(models1, params1)
 helper1.fit(X_cancer, y_cancer, scoring='f1', n_jobs=2)
 # score_summary() only reads the stored cv_results_, so a single call is
 # enough (the earlier extra call threw its result away).
 mm_df = helper1.score_summary()
 # COMMENT: 'mean_test_score' is the metric given as `scoring` (here 'f1')
 # averaged over the cross-validation folds of each parameter combination.
 # `scoring` accepts any scikit-learn scorer name or a make_scorer() object.
 #%%
 class ClfSwitcher(BaseEstimator):
    """Thin wrapper that delegates every call to a swappable classifier.
    Because the wrapped classifier is an ordinary constructor parameter, a
    parameter grid can replace it ('<step>__estimator') and tune its own
    settings ('<step>__estimator__<param>').
    """
    def __init__(
        self,
        # The default instance is created once, when the class is defined,
        # and is shared by every ClfSwitcher() built without an argument.
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """
        self.estimator = estimator
    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped classifier and return self.
        :param kwargs: extra fit parameters (e.g. sample_weight); they are
            forwarded to the wrapped classifier instead of being dropped
        """
        self.estimator.fit(X, y, **kwargs)
        return self
    def predict(self, X, y=None):
        """Return the wrapped classifier's predictions for X (y is ignored)."""
        return self.estimator.predict(X)
    def predict_proba(self, X):
        """Return the wrapped classifier's predict_proba output for X."""
        return self.estimator.predict_proba(X)
    def score(self, X, y):
        """Return the wrapped classifier's own score for X against y."""
        return self.estimator.score(X, y)
 # Parameter grids for the GridSearchCV built below. Each dict is one
 # sub-grid: 'clf__estimator' replaces the classifier held by the 'clf'
 # pipeline step (a ClfSwitcher) and the 'clf__estimator__*' keys set that
 # classifier's own parameters.
 parameters = [
    {
        'clf__estimator': [SGDClassifier()], # SVM if hinge loss / logreg if log loss
        #'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
        #'tfidf__stop_words': ['english', None],
        'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
        'clf__estimator__max_iter': [50, 80],
        'clf__estimator__tol': [1e-4],
        # NOTE(review): newer scikit-learn releases renamed the 'log' loss to
        # 'log_loss' -- confirm against the installed version.
        'clf__estimator__loss': ['hinge', 'log', 'modified_huber'],
    },
    {
        'clf__estimator': [MultinomialNB()],
        #'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
        #'tfidf__stop_words': [None],
        'clf__estimator__alpha': (1e-2, 1e-3, 1e-1),
    },
 ]
 # Two-step pipeline: min-max scaling followed by the switchable classifier.
 pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
 ])
 # 5-fold grid search over both sub-grids; train scores are not recorded.
 # n_jobs is hard-coded to 12 workers.
 gscv = GridSearchCV(pipeline
                    , parameters
                    , cv=5
                    , n_jobs=12
                    , return_train_score=False
                    , verbose=3)
 #gscv.fit(train_data, train_labels)
 #%% my numerical data
 # NOTE(review): num_df_wtgt and numerical_FN are not defined in this file;
 # presumably they come from a previously-run script/session -- confirm.
 # Hold out 33% of the rows, stratified on 'mutation_class', with a fixed seed.
 X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
                                                    , num_df_wtgt['mutation_class']
                                                    , test_size    = 0.33
                                                    , random_state = 2
                                                    , shuffle      = True
                                                    , stratify     = num_df_wtgt['mutation_class'])
 # Bar plots of the class counts in the train and test partitions.
 y_train.to_frame().value_counts().plot(kind = 'bar')
 y_test.to_frame().value_counts().plot(kind = 'bar')
 #%%
 # Run the grid search on the training partition only.
 gscv.fit(X_train, y_train)
 print('Best model:\n', gscv.best_params_)
 print('Best models score:\n', gscv.best_score_)
 # Bare expression: the value is only shown when run interactively.
 gscv.score(X_test, y_test) # see how it does on test
 #===========================================
 mod_pred = gscv.predict(X_test)
 # NOTE(review): f1_score is called with its default averaging, which
 # presumably requires 'mutation_class' to be binary -- confirm.
 fscore  = f1_score(y_test, mod_pred)
 fscore
 #%% same as above
 # custom classifier
 class MyClassifier(BaseEstimator):
    """Classifier that is selected by name through a single string parameter.
    The concrete model is built inside fit() and stored in ``classifier_``.
    """
    def __init__(self, classifier_type: str = 'SGDClassifier'):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param classifier_type: string - The switch for different classifiers
        """
        self.classifier_type = classifier_type
    def fit(self, X, y=None):
        """Build the classifier named by ``classifier_type`` and fit it.
        :raises ValueError: if ``classifier_type`` is not a supported name
        """
        # Name -> class lookup; a fresh, default-configured instance is
        # created on every fit.
        available = {
            'SGDClassifier': SGDClassifier,
            'MultinomialNB': MultinomialNB,
        }
        if self.classifier_type not in available:
            raise ValueError('Unkown classifier type.')
        self.classifier_ = available[self.classifier_type]()
        self.classifier_.fit(X, y)
        return self
    def predict(self, X, y=None):
        """Return the fitted classifier's predictions for X (y is ignored)."""
        return self.classifier_.predict(X)
    def score(self, X, y):
        """Return the fitted classifier's own score for X against y."""
        # The fitted model is stored in classifier_; this class never sets an
        # `estimator` attribute, so reading one raised AttributeError.
        return self.classifier_.score(X, y)
 # Same experiment with MyClassifier, which picks its model from a string.
 pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', MyClassifier()),
 ])
 # MyClassifier has one constructor parameter, `classifier_type`, so that is
 # the only thing this grid can vary. The 'clf__estimator*' keys belong to
 # ClfSwitcher and are rejected as invalid parameters for MyClassifier. To
 # tune the nested hyper-parameters, use ClfSwitcher as the 'clf' step.
 parameter_space = {
    'clf__classifier_type': ['SGDClassifier', 'MultinomialNB'],
 }
 search = GridSearchCV(pipeline, parameter_space, n_jobs=-1, cv=5)
 search.fit(X_train, y_train)
 print('Best model:\n', search.best_params_)
 # Report this search's own score (the original printed gscv.best_score_,
 # i.e. the score of the earlier ClfSwitcher search).
 print('Best models score:\n', search.best_score_)
--- a/practice_cv.py
+++ b/practice_cv.py
@ -0,0 +1,244 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 Created on Tue Mar 15 11:09:50 2022
@author: tanu
 """
 from sklearn.datasets import load_wine
 from sklearn.model_selection import KFold
 from sklearn.model_selection import cross_validate
 from sklearn.neighbors import KNeighborsClassifier
 # Pipeline and StandardScaler are used below but were never imported.
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import StandardScaler
 # Baseline: standardise the wine features, then k-nearest-neighbours.
 wine = load_wine()
 X_train, y_train = wine.data, wine.target
 model = Pipeline([
    ('pre', StandardScaler()),
    ('knn', KNeighborsClassifier()),
 ])
 model.fit(X_train, y_train)
 # 10-fold cross-validation with the estimator's default score.
 val = cross_validate(model, X_train, y_train, cv = 10)
 # Bare expression: the mean is only shown when run interactively.
 val['test_score'].mean()
 # The metric functions wrapped below were never imported in this script.
 from sklearn.metrics import accuracy_score
 from sklearn.metrics import f1_score
 from sklearn.metrics import make_scorer
 from sklearn.metrics import matthews_corrcoef
 from sklearn.metrics import precision_score
 from sklearn.metrics import recall_score
 from sklearn.metrics import roc_auc_score
 # Single scorer for the Matthews correlation coefficient. (The original line
 # had mismatched brackets -- a SyntaxError -- and passed a dict to
 # make_scorer, which expects the metric function itself.)
 my_mcc = make_scorer(matthews_corrcoef)
 # Multi-metric scoring dict: cross_validate reports each entry under the
 # key 'test_<name>'.
 # NOTE(review): per the make_scorer documentation, a scorer built with
 # default arguments scores the labels returned by predict(), so the
 # 'roc_auc' entry is computed from hard predictions -- confirm this is
 # what is wanted.
 scoring_fn = {
    'accuracy'    : make_scorer(accuracy_score),
    'fscore'      : make_scorer(f1_score),
    'mcc'         : make_scorer(matthews_corrcoef),
    'precision'   : make_scorer(precision_score),
    'recall'      : make_scorer(recall_score),
    'roc_auc'     : make_scorer(roc_auc_score),
    #'jaccard'    : make_scorer(jaccard_score),
 }
 # 10-fold CV with several built-in scorers selected by name; results are
 # read back under the keys 'test_<scorer name>'.
 # NOTE(review): `mean` is not imported in this script -- presumably
 # provided by the session; confirm.
 # NOTE(review): X_train / y_train were assigned from the wine data above,
 # which scikit-learn documents as a 3-class problem, while 'f1',
 # 'precision', 'recall' and 'roc_auc' are the binary variants -- expect
 # errors or NaN; the '*_macro' / 'roc_auc_ovr' names may be needed. Confirm.
 val2 = cross_validate(model,X_train,y_train, cv = 10
               , scoring=('accuracy', 'f1', 'precision', 'recall', 'roc_auc' )
               #, scoring=scoring_fn
               , return_train_score=False)
 val2
 print(val2['test_f1'])
 print(mean(val2['test_accuracy']))
 print(mean(val2['test_f1']))
 #print(mean(val2['train_f1']))
 print(mean(val2['test_precision']))
 #print(mean(val2['train_precision']))
 print(mean(val2['test_recall']))
 print(mean(val2['test_roc_auc']))
 #%%
 # Same cross-validation, but scored with the make_scorer() dict scoring_fn;
 # results are read back under 'test_<dict key>'.
 val3 = cross_validate(model
                      , X_train
                      , y_train
                      , cv = 10
                      , scoring = scoring_fn
                      , return_train_score=False)
 val3
 print(mean(val3['test_accuracy']))
 print(mean(val3['test_fscore']))
 print(mean(val3['test_mcc']))
 print(mean(val3['test_precision']))
 print(mean(val3['test_recall']))
 # NOTE(review): likely differs from val2 because make_scorer(roc_auc_score)
 # scores predict() labels whereas the 'roc_auc' name scores continuous
 # outputs -- confirm.
 print(mean(val3['test_roc_auc'])) # differs
 #======================
 # with CV.split
 # Manual cross-validation: fit on each training fold, score on the held-out
 # fold, and average the per-fold scores.
 # NOTE(review): StratifiedKFold, mean, num_df_wtgt and numerical_FN are not
 # imported/defined in this script -- presumably from the session; confirm.
 scores = []
 scores
 #best_svr = SVR(kernel='rbf')
 model = Pipeline([
 ('pre', StandardScaler()),
 ('knn', KNeighborsClassifier())
 ])
 # shuffle / random_state are commented out, so KFold's defaults apply.
 cv = KFold(n_splits=10
           #, random_state=42
           #, shuffle=True)
           )
 for train_index, test_index in cv.split(num_df_wtgt[numerical_FN]
                                                    , num_df_wtgt['mutation_class']):
    #print("Train Index: ", train_index, "\n")
    #print("Test Index: ", test_index)
    # Positional row selection; this rebinds the module-level X_train, X_test,
    # y_train and y_test on every iteration.
    X_train, X_test, y_train, y_test = num_df_wtgt[numerical_FN].iloc[train_index], num_df_wtgt[numerical_FN].iloc[test_index], num_df_wtgt['mutation_class'].iloc[train_index], num_df_wtgt['mutation_class'].iloc[test_index]
    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))
 mean(scores)
 ################
 # Same loop with stratified folds.
 scores_skf = []
 skf = StratifiedKFold(n_splits = 10
                          #, shuffle = True
                          #, **r
                          )
 for train_index, test_index in skf.split(num_df_wtgt[numerical_FN]
                                                    , num_df_wtgt['mutation_class']):
    #print("Train Index: ", train_index, "\n")
    #print("Test Index: ", test_index)
    X_train, X_test, y_train, y_test = num_df_wtgt[numerical_FN].iloc[train_index], num_df_wtgt[numerical_FN].iloc[test_index], num_df_wtgt['mutation_class'].iloc[train_index], num_df_wtgt['mutation_class'].iloc[test_index]
    model.fit(X_train, y_train)
    scores_skf.append(model.score(X_test, y_test))
 mean(scores_skf)
 # NOTE(review): at this point X_train / y_train hold only the training part
 # of the LAST stratified fold, so this cross-validates a subset of the data
 # and is not directly comparable with the loops above -- confirm intent.
 val = cross_validate(model, X_train,y_train , cv = 10)
 val['test_score'].mean()
 #%% compare loopity loop vs CV with SKF
 # NOTE(review): train_test_split, LogisticRegression, BernoulliNB, SVC,
 # MinMaxScaler, mean, num_df_wtgt and numerical_FN are not imported/defined
 # in this script -- presumably from the session; confirm.
 # Shared seed, unpacked as keyword arguments wherever a random_state is taken.
 rs = {'random_state': 42}
 X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
                                                    , num_df_wtgt['mutation_class']
                                                    , test_size    = 0.33
                                                    , **rs
                                                    , shuffle      = True
                                                    , stratify     = num_df_wtgt['mutation_class'])
 # Candidate models; only log_reg is wired into the pipeline below.
 log_reg = LogisticRegression(**rs)
 nb      = BernoulliNB()
 knn     = KNeighborsClassifier()
 svm     = SVC(**rs)
 model_single_pipeline = Pipeline([
 ('pre', MinMaxScaler())
 , ('model', log_reg)
 #, ('model', nb)
 #, ('model', knn)
 ])
 # 10-fold CV on the FULL data set (the train split is commented out), scored
 # with every metric in scoring_fn; train scores are recorded as well.
 skf_cv = cross_validate(model_single_pipeline
                     #, X_train
                     #, y_train
                      , num_df_wtgt[numerical_FN]
                      , num_df_wtgt['mutation_class']
                      , cv = 10
                      , scoring = scoring_fn
                      , return_train_score=True)
 skf_cv
 # Mean of each test metric over the folds, rounded to 2 decimals.
 print(round(mean(skf_cv['test_accuracy']),2))
 print(round(mean(skf_cv['test_fscore']),2))
 print(round(mean(skf_cv['test_mcc']),2))
 print(round(mean(skf_cv['test_precision']),2))
 print(round(mean(skf_cv['test_recall']),2))
 print(round(mean(skf_cv['test_roc_auc']),2)) # differs
 # %% Extracting skf_cv mean values and assiging to a dict
 # Builds {model name: {result key: rounded mean}} from skf_cv.
 # NOTE(review): pp, pd, mean and cross_val_score are not imported in this
 # script -- presumably from the session; confirm.
 models_single = [
         ('Logistic Regression'  , log_reg) 
         #, ('Naive Bayes'        , nb)
         #, ('K-Nearest Neighbors', knn) 
         # , ('SVM'                , svm) 
         ]
 foo_single = {}
 # NOTE(review): skf_cv is not recomputed inside this loop, so every model
 # name would receive the same numbers if more entries were enabled.
 # NOTE(review): the loop variable `model` rebinds the module-level `model`.
 for model_name, model in models_single:
    print(model_name)
    #model_name_dict = {'model_name': model_name}
    foo_single[model_name] = {}
    for key, value in skf_cv.items():
        print('\nkey:', key, '\nvalue:', value)
        print('\nmean value:', mean(value))
        foo_single[model_name][key] = round(mean(value),2)
        # NOTE(review): printed once per key, showing the dict as it grows --
        # possibly meant to run once after the loops; confirm.
        pp.pprint(foo_single)
 foo_single_df = pd.DataFrame(foo_single)
 foo_single_df
 # Rows whose label contains 'test_' (result not stored).
 foo_single_df.filter(like='test_', axis=0)
 # ONLY for a single score
 # NOTE(review): after the loop above `model` is the bare log_reg estimator
 # (no scaling step), not a Pipeline -- confirm this is intended.
 cval_score = cross_val_score(model
                      , num_df_wtgt[numerical_FN]
                      , num_df_wtgt['mutation_class']
                      , scoring = 'f1_macro'
                      , cv=10)
 print(cval_score)
 print(round(mean(cval_score), 2))
 # %% Running multiple model with CV
 # For every candidate model: wrap it in a min-max-scaling pipeline, run
 # 10-fold cross-validation with the scoring_fn metrics, and store the
 # rounded mean of every result key under the model's name.
 # NOTE(review): LogisticRegression, BernoulliNB, SVC, MinMaxScaler, mean,
 # pp and pd are not imported in this script -- presumably from the session;
 # confirm.
 log_reg = LogisticRegression(**rs)
 nb      = BernoulliNB()
 knn     = KNeighborsClassifier()
 svm     = SVC(**rs)
 models = [
         ('Logistic Regression'  , log_reg) 
         , ('Naive Bayes'        , nb)
         , ('K-Nearest Neighbors', knn) 
          , ('SVM'                , svm) 
         ]
 foo = {}
 for model_name, model_fn in models:
    # print('\nModel_name:', model_name
    #       , '\nModel func:', model_fn
    #       , '\nList of models:', models)
    model_pipeline = Pipeline([
        ('pre'     , MinMaxScaler())
        , ('model' , model_fn)])
    print('Running model pipeline:', model_pipeline)
    # Uses whatever X_train / y_train currently hold from earlier code.
    skf_cv = cross_validate(model_pipeline
                          , X_train
                          , y_train
                          , cv = 10
                          , scoring = scoring_fn
                          , return_train_score = True)
    foo[model_name] = {}
    for key, value in skf_cv.items():
        print('\nkey:', key, '\nvalue:', value)
        print('\nmean value:', mean(value))
        foo[model_name][key] = round(mean(value),2)
 pp.pprint(foo)
 # construct df 
 # One column per model, one row per result key.
 foo_df = pd.DataFrame(foo)
 foo_df
 # Rows whose label contains 'test_'.
 scores_df = foo_df.filter(like='test_', axis=0)
 # Three ways of building a frame from the same dict, kept side by side for
 # comparison.
 a = pd.DataFrame(foo)
 b = pd.DataFrame.from_dict(foo)
 c = pd.DataFrame.from_records(foo)