added practice and base_estimator for all the confusion in my head

Tanushree Tunstall 2022-03-16 10:12:59 +00:00
parent e28a296d98
commit 97620c1bb0
3 changed files with 513 additions and 0 deletions

MultClassPipe3_CALL.py (new file, 33 lines)

@@ -0,0 +1,33 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 11:09:50 2022
@author: tanu
"""
# stratified shuffle split
# NOTE: this is a call script; it assumes the pipeline script defining MultClassPipelineCV
# has already been run and that rs = {'random_state': 42} (see practice_cv.py) is in scope
X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
                                                    , num_df_wtgt['mutation_class']
                                                    , test_size = 0.33
                                                    , **rs
                                                    , shuffle = True
                                                    , stratify = num_df_wtgt['mutation_class'])
y_train.to_frame().value_counts().plot(kind = 'bar')
y_test.to_frame().value_counts().plot(kind = 'bar')
# run the CV pipeline once and keep the returned scores (the earlier duplicate call was redundant)
skf_cv_scores = MultClassPipelineCV(X_train, X_test, y_train, y_test
                                    , input_df = num_df_wtgt[numerical_FN]
                                    , var_type = 'numerical')
pp.pprint(skf_cv_scores)
# construct a df
skf_cv_scores_df = pd.DataFrame(skf_cv_scores)
skf_cv_scores_df
skf_cv_scores_df_test = skf_cv_scores_df.filter(like='test_', axis=0)
skf_cv_scores_df_train = skf_cv_scores_df.filter(like='train_', axis=0)
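# note (not in the original): filter(..., axis=0) matches on the row index,
# so these two lines keep the test_* and train_* metric rows respectively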

base_estimator.py (new file, 236 lines)

@@ -0,0 +1,236 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 09:50:37 2022
@author: tanu
"""
#https://stackoverflow.com/questions/50272416/gridsearch-on-model-and-classifiers
#%%
# https://github.com/davidsbatista/machine-learning-notebooks/blob/master/hyperparameter-across-models.ipynb
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.base import BaseEstimator
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
#%%
class EstimatorSelectionHelper:
    def __init__(self, models, params):
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, **grid_kwargs):
        for key in self.keys:
            print('Running GridSearchCV for %s.' % key)
            model = self.models[key]
            params = self.params[key]
            grid_search = GridSearchCV(model, params, **grid_kwargs)
            grid_search.fit(X, y)
            self.grid_searches[key] = grid_search
        print('Done.')

    def score_summary(self, sort_by='mean_test_score'):
        frames = []
        for name, grid_search in self.grid_searches.items():
            frame = pd.DataFrame(grid_search.cv_results_)
            frame = frame.filter(regex='^(?!.*param_).*$')
            frame['estimator'] = len(frame)*[name]
            frames.append(frame)
        df = pd.concat(frames)
        df = df.sort_values([sort_by], ascending=False)
        df = df.reset_index()
        df = df.drop(['rank_test_score', 'index'], axis=1)  # axis keyword: positional axis is removed in newer pandas
        columns = df.columns.tolist()
        columns.remove('estimator')
        columns = ['estimator'] + columns
        df = df[columns]
        return df
#%%
breast_cancer = datasets.load_breast_cancer()
X_cancer = breast_cancer.data
y_cancer = breast_cancer.target
models1 = {
'ExtraTreesClassifier': ExtraTreesClassifier(),
'RandomForestClassifier': RandomForestClassifier(),
'AdaBoostClassifier': AdaBoostClassifier(),
'GradientBoostingClassifier': GradientBoostingClassifier()
}
params1 = {
'ExtraTreesClassifier': { 'n_estimators': [16, 32] },
'RandomForestClassifier': [
{ 'n_estimators': [16, 32] },
{'criterion': ['gini', 'entropy'], 'n_estimators': [8, 16]}],
'AdaBoostClassifier': { 'n_estimators': [16, 32] },
'GradientBoostingClassifier': { 'n_estimators': [16, 32], 'learning_rate': [0.8, 1.0] }
}
helper1 = EstimatorSelectionHelper(models1, params1)
helper1.fit(X_cancer, y_cancer, scoring='f1', n_jobs=2)
helper1.score_summary()
mm_df = helper1.score_summary()
# COMMENT: not sure which score this is the mean of, or which scoring options are available
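# Sketch (not in the original): mean_test_score in score_summary() is the CV mean of
# whatever metric was passed via `scoring` in helper1.fit() (here scoring='f1', so it
# is the mean F1 across folds). The built-in string scorers can be listed with
# (assumes sklearn >= 1.0 is installed):
from sklearn.metrics import get_scorer_names
print(sorted(get_scorer_names()))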
#%%
class ClfSwitcher(BaseEstimator):
    def __init__(self, estimator = SGDClassifier()):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)
parameters = [
{
'clf__estimator': [SGDClassifier()], # SVM if hinge loss / logreg if log loss
#'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
#'tfidf__stop_words': ['english', None],
'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
'clf__estimator__max_iter': [50, 80],
'clf__estimator__tol': [1e-4],
'clf__estimator__loss': ['hinge', 'log', 'modified_huber'],
},
{
'clf__estimator': [MultinomialNB()],
#'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
#'tfidf__stop_words': [None],
'clf__estimator__alpha': (1e-2, 1e-3, 1e-1),
},
]
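# note (not in the original): each dict above pins 'clf__estimator' to one model instance,
# which is what lets GridSearchCV swap whole classifiers in and out of the pipeline;
# the nested clf__estimator__<param> names then tune that model's own hyperparameters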
pipeline = Pipeline([
('pre', MinMaxScaler()),
('clf', ClfSwitcher()),
])
gscv = GridSearchCV(pipeline
, parameters
, cv=5
, n_jobs=12
, return_train_score=False
, verbose=3)
#gscv.fit(train_data, train_labels)
#%% my numerical data
X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
, num_df_wtgt['mutation_class']
, test_size = 0.33
, random_state = 2
, shuffle = True
, stratify = num_df_wtgt['mutation_class'])
y_train.to_frame().value_counts().plot(kind = 'bar')
y_test.to_frame().value_counts().plot(kind = 'bar')
#%%
gscv.fit(X_train, y_train)
print('Best model:\n', gscv.best_params_)
print("Best model's score:\n", gscv.best_score_)
gscv.score(X_test, y_test) # see how it does on test
#===========================================
mod_pred = gscv.predict(X_test)
fscore = f1_score(y_test, mod_pred)
fscore
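# A minimal sketch (not in the original) of a fuller test-set report for the refit best
# estimator, using the standard sklearn metrics:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, mod_pred))
print(classification_report(y_test, mod_pred))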
#%% same as above
# custom classifier
class MyClassifier(BaseEstimator):
    def __init__(self, classifier_type: str = 'SGDClassifier'):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param classifier_type: string - The switch for different classifiers
        """
        self.classifier_type = classifier_type

    def fit(self, X, y=None):
        if self.classifier_type == 'SGDClassifier':
            self.classifier_ = SGDClassifier()
        elif self.classifier_type == 'MultinomialNB':
            self.classifier_ = MultinomialNB()
        else:
            raise ValueError('Unknown classifier type.')
        self.classifier_.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.classifier_.predict(X)

    def score(self, X, y):
        return self.classifier_.score(X, y)  # was self.estimator, which MyClassifier never sets
pipeline = Pipeline([
('pre', MinMaxScaler())
#, ('clf', ClfSwitcher()
, ('clf', MyClassifier())
])
# For MyClassifier the only tunable parameter is the classifier_type switch;
# the 'clf__estimator__*' keys from the ClfSwitcher grid above do not exist on
# MyClassifier and would make GridSearchCV raise an invalid-parameter error.
parameter_space = {
    'clf__classifier_type': ['SGDClassifier', 'MultinomialNB']
}
search = GridSearchCV(pipeline , parameter_space, n_jobs=-1, cv=5)
search.fit(X_train, y_train)
print('Best model:\n', search.best_params_)
print("Best model's score:\n", search.best_score_)

practice_cv.py (new file, 244 lines)

@@ -0,0 +1,244 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 11:09:50 2022
@author: tanu
"""
# imports cover the names used later in this script
from numpy import mean
import pandas as pd
import pprint as pp
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_wine
from sklearn.model_selection import (KFold, StratifiedKFold, train_test_split,
                                     cross_validate, cross_val_score)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.metrics import (make_scorer, accuracy_score, f1_score, matthews_corrcoef,
                             precision_score, recall_score, roc_auc_score, jaccard_score)

wine = load_wine()
X_train, y_train = wine.data, wine.target

model = Pipeline([
    ('pre', StandardScaler()),
    ('knn', KNeighborsClassifier())
])
model.fit(X_train,y_train)
from sklearn.model_selection import cross_validate
val = cross_validate(model,X_train,y_train, cv = 10)
val['test_score'].mean()
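# note (not in the original): with no `scoring` argument cross_validate falls back on the
# estimator's own .score(), i.e. mean accuracy for classifiers, so the line above is the
# 10-fold CV accuracy of the scaled KNN pipeline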
my_mcc = make_scorer(matthews_corrcoef)
# for scoring in ({'accuracy' : make_scorer(accuracy_score)
# , 'fscore' : make_scorer(f1_score)
# , 'mcc' : make_scorer(matthews_corrcoef)
# , 'precision' : make_scorer(precision_score)
# , 'recall' : make_scorer(recall_score)
# , 'roc_auc' : make_scorer(roc_auc_score)
# , 'jaccard' : make_scorer(jaccard_score)
# }
# ,'accuracy', 'fscore', 'MCC', 'Precision', 'Recall', 'ROC_AUC', 'jaccard'):
scoring_fn = ({'accuracy' : make_scorer(accuracy_score)
, 'fscore' : make_scorer(f1_score)
, 'mcc' : make_scorer(matthews_corrcoef)
, 'precision' : make_scorer(precision_score)
, 'recall' : make_scorer(recall_score)
, 'roc_auc' : make_scorer(roc_auc_score)
#, 'jaccard' : make_scorer(jaccard_score)
})
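# Note (not in the original): the scorers above assume a binary target. Against a
# multiclass target such as load_wine (3 classes), f1/precision/recall need an explicit
# average; a sketch:
scoring_fn_multiclass = ({'accuracy'  : make_scorer(accuracy_score)
                          , 'fscore'    : make_scorer(f1_score, average = 'macro')
                          , 'precision' : make_scorer(precision_score, average = 'macro')
                          , 'recall'    : make_scorer(recall_score, average = 'macro')
                          })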
val2 = cross_validate(model,X_train,y_train, cv = 10
, scoring=('accuracy', 'f1', 'precision', 'recall', 'roc_auc' )
#, scoring=scoring_fn
, return_train_score=False)
val2
print(val2['test_f1'])
print(mean(val2['test_accuracy']))
print(mean(val2['test_f1']))
#print(mean(val2['train_f1']))
print(mean(val2['test_precision']))
#print(mean(val2['train_precision']))
print(mean(val2['test_recall']))
print(mean(val2['test_roc_auc']))
#%%
val3 = cross_validate(model
, X_train
, y_train
, cv = 10
, scoring = scoring_fn
, return_train_score=False)
val3
print(mean(val3['test_accuracy']))
print(mean(val3['test_fscore']))
print(mean(val3['test_mcc']))
print(mean(val3['test_precision']))
print(mean(val3['test_recall']))
print(mean(val3['test_roc_auc'])) # differs
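# Why 'roc_auc' differs here (a sketch, not from the original): the built-in 'roc_auc'
# string scorer ranks predictions with predict_proba / decision_function, whereas
# make_scorer(roc_auc_score) without needs_proba scores the hard class predictions,
# so the two numbers will not match.
roc_auc_proba = make_scorer(roc_auc_score, needs_proba = True)  # closer to the built-in 'roc_auc'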
#======================
# with CV.split
scores = []
scores
#best_svr = SVR(kernel='rbf')
model = Pipeline([
('pre', StandardScaler()),
('knn', KNeighborsClassifier())
])
cv = KFold(n_splits=10
#, random_state=42
#, shuffle=True)
)
for train_index, test_index in cv.split(num_df_wtgt[numerical_FN]
                                        , num_df_wtgt['mutation_class']):
    #print("Train Index: ", train_index, "\n")
    #print("Test Index: ", test_index)
    X_train, X_test = num_df_wtgt[numerical_FN].iloc[train_index], num_df_wtgt[numerical_FN].iloc[test_index]
    y_train, y_test = num_df_wtgt['mutation_class'].iloc[train_index], num_df_wtgt['mutation_class'].iloc[test_index]
    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))
mean(scores)
################
scores_skf = []
skf = StratifiedKFold(n_splits = 10
#, shuffle = True
#, **r
)
for train_index, test_index in skf.split(num_df_wtgt[numerical_FN]
                                         , num_df_wtgt['mutation_class']):
    #print("Train Index: ", train_index, "\n")
    #print("Test Index: ", test_index)
    X_train, X_test = num_df_wtgt[numerical_FN].iloc[train_index], num_df_wtgt[numerical_FN].iloc[test_index]
    y_train, y_test = num_df_wtgt['mutation_class'].iloc[train_index], num_df_wtgt['mutation_class'].iloc[test_index]
    model.fit(X_train, y_train)
    scores_skf.append(model.score(X_test, y_test))
mean(scores_skf)
val = cross_validate(model, X_train,y_train , cv = 10)
val['test_score'].mean()
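# Sketch (not in the original): for classifiers, cross_validate with cv=10 already uses
# StratifiedKFold(10) under the hood, so passing the skf object explicitly should give
# the same splits as the manual loop above:
val_skf = cross_validate(model
                         , num_df_wtgt[numerical_FN]
                         , num_df_wtgt['mutation_class']
                         , cv = skf)
print(round(val_skf['test_score'].mean(), 2))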
#%% compare loopity loop vs CV with SKF
rs = {'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
, num_df_wtgt['mutation_class']
, test_size = 0.33
, **rs
, shuffle = True
, stratify = num_df_wtgt['mutation_class'])
log_reg = LogisticRegression(**rs)
nb = BernoulliNB()
knn = KNeighborsClassifier()
svm = SVC(**rs)
model_single_pipeline = Pipeline([
('pre', MinMaxScaler())
, ('model', log_reg)
#, ('model', nb)
#, ('model', knn)
])
skf_cv = cross_validate(model_single_pipeline
#, X_train
#, y_train
, num_df_wtgt[numerical_FN]
, num_df_wtgt['mutation_class']
, cv = 10
, scoring = scoring_fn
, return_train_score=True)
skf_cv
print(round(mean(skf_cv['test_accuracy']),2))
print(round(mean(skf_cv['test_fscore']),2))
print(round(mean(skf_cv['test_mcc']),2))
print(round(mean(skf_cv['test_precision']),2))
print(round(mean(skf_cv['test_recall']),2))
print(round(mean(skf_cv['test_roc_auc']),2)) # differs
# %% Extracting skf_cv mean values and assigning to a dict
models_single = [
('Logistic Regression' , log_reg)
#, ('Naive Bayes' , nb)
#, ('K-Nearest Neighbors', knn)
# , ('SVM' , svm)
]
foo_single = {}
for model_name, model in models_single:
    print(model_name)
    #model_name_dict = {'model_name': model_name}
    foo_single[model_name] = {}
    for key, value in skf_cv.items():
        print('\nkey:', key, '\nvalue:', value)
        print('\nmean value:', mean(value))
        foo_single[model_name][key] = round(mean(value), 2)
pp.pprint(foo_single)
foo_single_df = pd.DataFrame(foo_single)
foo_single_df
foo_single_df.filter(like='test_', axis=0)
# ONLY for a single score
cval_score = cross_val_score(model
, num_df_wtgt[numerical_FN]
, num_df_wtgt['mutation_class']
, scoring = 'f1_macro'
, cv=10)
print(cval_score)
print(round(mean(cval_score), 2))
# %% Running multiple model with CV
log_reg = LogisticRegression(**rs)
nb = BernoulliNB()
knn = KNeighborsClassifier()
svm = SVC(**rs)
models = [
('Logistic Regression' , log_reg)
, ('Naive Bayes' , nb)
, ('K-Nearest Neighbors', knn)
, ('SVM' , svm)
]
foo = {}
for model_name, model_fn in models:
    # print('\nModel_name:', model_name
    #       , '\nModel func:', model_fn
    #       , '\nList of models:', models)
    model_pipeline = Pipeline([
        ('pre'   , MinMaxScaler())
        , ('model' , model_fn)])
    print('Running model pipeline:', model_pipeline)
    skf_cv = cross_validate(model_pipeline
                            , X_train
                            , y_train
                            , cv = 10
                            , scoring = scoring_fn
                            , return_train_score = True)
    foo[model_name] = {}
    for key, value in skf_cv.items():
        print('\nkey:', key, '\nvalue:', value)
        print('\nmean value:', mean(value))
        foo[model_name][key] = round(mean(value), 2)
pp.pprint(foo)
# construct df
foo_df = pd.DataFrame(foo)
foo_df
scores_df = foo_df.filter(like='test_', axis=0)
a = pd.DataFrame(foo)
b = pd.DataFrame.from_dict(foo)
c = pd.DataFrame.from_records(foo)
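# Note (not in the original): pd.DataFrame(foo) and pd.DataFrame.from_dict(foo) build the
# same frame (model names as columns, CV metric names as the index); a sketch of getting
# the models as rows instead:
d = pd.DataFrame.from_dict(foo, orient = 'index')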