Finally made feature selection (fs) work both within the class and without it

2022-05-21 13:30:04 +01:00 · 2022-05-21 13:30:04 +01:00 · 39cd7b4259
commit 39cd7b4259
parent 4a9e9dfedf
3 changed files with 95 additions and 110 deletions
--- a/UQ_FS_eg.py
+++ b/UQ_FS_eg.py
@@ -25,6 +25,7 @@ X_eg, y_eg = make_classification(n_samples=1000,
 pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_classif, k=9)),
                 ('classifier', LogisticRegression())])
 search_space = [{'selector__k': [5, 6, 7, 10]},
--- a/UQ_LR_FS_p2.py
+++ b/UQ_LR_FS_p2.py
@@ -21,18 +21,25 @@ class ClfSwitcher(BaseEstimator):
    def __init__(
        self, 
        estimator = SGDClassifier(),
-        #feature = RFECV()
+        #feature = RFECV(SGDClassifier())
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """ 
        self.estimator = estimator
        #self.feature = feature
    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        #self.feature.fit(X, y)
        return self
    # def transform(self, X, y=None):
    #     #self.estimator.transform(X, y)
    #     self.feature.transform(X)
    #     return self
    def predict(self, X, y=None):
        return self.estimator.predict(X)
@@ -42,23 +49,26 @@ class ClfSwitcher(BaseEstimator):
    def score(self, X, y):
        return self.estimator.score(X, y)
 #%%
 parameters = [
-    # {'feature__fs__estimator': LogisticRegression(**rs)
+    
-    #  , 'feature__fs__cv': [10]
+    # {'fs__feature__min_features_to_select': [1]
-    #  , 'feature__fs__scoring': ['matthews_corrcoef']
+    #  , 'fs__feature__scoring': ['matthews_corrcoef']
-    #  },
+    #  , 'fs__feature__cv': [skf_cv]},
    {'fs__min_features_to_select': [1]
     #, 'fs__scoring': ['matthews_corrcoef']
     , 'fs__cv': [skf_cv]},
    {
-        'clf__estimator': [LogisticRegression(**rs)],
+     'clf__estimator': [LogisticRegression(**rs)],
-        'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
+      #'clf__estimator__C': np.logspace(0, 4, 10),
-        #'clf__estimator__C': np.logspace(0, 4, 10),
+      'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'],
-        'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'],
+      'clf__estimator__max_iter': list(range(100,800,100)),
-        'clf__estimator__max_iter': list(range(100,800,100)),
+      'clf__estimator__solver': ['saga']
        'clf__estimator__solver': ['saga']
    }#,
    # {
    #     'clf__estimator': [MODEL2(**rs)],
    #     #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    #     'clf__estimator__C': np.logspace(0, 4, 10),
    #     'clf__estimator__penalty': ['l2', 'none'],
    #     'clf__estimator__max_iter': list(range(100,800,100)),
@@ -68,13 +78,14 @@ parameters = [
 #%% Create pipeline
 pipeline = Pipeline([
    ('pre', MinMaxScaler())
-#   , ('fs',  RFECV(LogisticRegression(**rs), cv = rskf_cv, scoring = 'matthews_corrcoef'))
+    , ('fs', RFECV(LogisticRegression(**rs), scoring = 'matthews_corrcoef'))#cant be my mcc_fn
-   , ('selector', SelectKBest(mutual_info_classif, k=6))
+#    , ('fs', ClfSwitcher())
-   , ('clf', ClfSwitcher())
+    , ('clf',  ClfSwitcher())
-])
+    ])
-#%% Grid search i.e hyperparameter tuning and refitting on mcc
+#%%
-mod_fs = GridSearchCV(pipeline
+# Grid search i.e hyperparameter tuning and refitting on mcc
 gscv_lr = GridSearchCV(pipeline
                    , parameters
                    , scoring = mcc_score_fn, refit = 'mcc'
                    , cv = skf_cv
@@ -82,6 +93,66 @@ mod_fs = GridSearchCV(pipeline
                    , return_train_score = False
                    , verbose = 3)
 # Fit 
 gscv_lr.fit(X, y)
 ####
 gscv_lr_fit = gscv_lr.fit(X, y)
 gscv_lr_fit_be_mod = gscv_lr_fit.best_params_
 gscv_lr_fit_be_res = gscv_lr_fit.cv_results_
 #%% Grid search i.e hyperparameter tuning and refitting on mcc
 param_grid2 = [
    {'fs__min_features_to_select': [1]
    , 'fs__cv': [skf_cv]
    },
    { 
        #'clf__estimator': [LogisticRegression(**rs)],
        'clf__C': np.logspace(0, 4, 10),
        'clf__penalty': ['l2'],
        'clf__max_iter': list(range(100,200,100)),
        #'clf__solver': ['newton-cg', 'lbfgs', 'sag']
        'clf__solver': ['sag']
    }, 
    {
        #'clf__estimator': [LogisticRegression(**rs)],
        'clf__C': np.logspace(0, 4, 10),
        'clf__penalty': ['l1', 'l2'],
        'clf__max_iter': list(range(100,200,100)),
        'clf__solver': ['liblinear']
    }
 ]
 # step 4: create  pipeline
 pipeline = Pipeline([
    ('pre', MinMaxScaler())
    #, ('fs', model_rfecv)
    , ('fs', RFECV(LogisticRegression(**rs), scoring = 'matthews_corrcoef'))
    , ('clf',  LogisticRegression(**rs))])
 # step 5: Perform Gridsearch CV
 gs_final = GridSearchCV(pipeline
                        , param_grid2
                        , cv = skf_cv
                        , scoring = mcc_score_fn, refit = 'mcc'
                        , verbose = 1
                        , return_train_score = False
                        , **njobs)
 #%% Fit 
 mod_fs_fit = mod_fs.fit(X, y)
 mod_fs_fbm = mod_fs_fit.best_params_
--- a/uq_ml_models/UQ_LR_FS.py
+++ b/uq_ml_models/UQ_LR_FS.py
@@ -12,85 +12,6 @@ Created on Tue Mar 15 11:09:50 2022
@author: tanu
 """
 #%% Import libs
 rs = {'random_state': 42}
 njobs = {'n_jobs': 10}
 scoring_fn =  ({'accuracy'      : make_scorer(accuracy_score)
                 , 'fscore'     : make_scorer(f1_score)
                 , 'mcc'        : make_scorer(matthews_corrcoef)
                 ,  'precision' : make_scorer(precision_score)
                 ,  'recall'    : make_scorer(recall_score)
                 ,  'roc_auc'   : make_scorer(roc_auc_score)
                 ,  'jaccard'   : make_scorer(jaccard_score)
            })    
 mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
 jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
 #%% Get data
 y.to_frame().value_counts().plot(kind = 'bar')
 blind_test_df['dst_mode'].to_frame().value_counts().plot(kind = 'bar')
 # %% Logistic Regression + FS + hyperparameter
 # https://www.tomasbeuzen.com/post/scikit-learn-gridsearch-pipelines/
 # from sklearn.feature_selection import SelectKBest, mutual_info_classif
 # # Create pipeline
 # pipe = Pipeline([
 #     ('pre', MinMaxScaler())
 #     , ('fs', RFECV( LogisticRegression(**rs), cv = skf_cv, scoring = 'matthews_corrcoef', **njobs,verbose = 3))
 #     #, ('fs', SelectKBest(mutual_info_classif, k=5))
 #     , ('clf', LogisticRegression(**rs))
 # ])
 # # Create search space
 # param_grid = [{'fs__step': [1]},
 #     {
 #         'clf': [LogisticRegression(**rs)],
 #         #'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
 #         'clf__C': np.logspace(0, 4, 10),
 #         'clf__penalty': ['none', 'l1', 'l2', 'elasticnet'],
 #         'clf__max_iter': list(range(100,800,100)),
 #         'clf__solver': ['saga']
 #     },
 #     {
 #         'clf': [LogisticRegression(**rs)],
 #         #'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
 #         'clf__C': np.logspace(0, 4, 10),
 #         'clf__penalty': ['l2', 'none'],
 #         'clf__max_iter': list(range(100,800,100)),
 #         'clf__solver': ['newton-cg', 'lbfgs', 'sag']
 #     }, 
 #     {
 #         'clf': [LogisticRegression(**rs)],
 #         #'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
 #         'clf__C': np.logspace(0, 4, 10),
 #         'clf__penalty': ['l1', 'l2'],
 #         'clf__max_iter': list(range(100,800,100)),
 #         'clf__solver': ['liblinear']
 #     }]
 # # Run Grid search
 # gscv_fs_lr = GridSearchCV(pipe
 #                           , param_grid
 #                           , cv = skf_cv
 #                           , scoring = mcc_score_fn, refit = 'mcc'
 #                           , verbose = 3)
 # gscv_fs_lr_fit = gscv_fs_lr.fit(X, y)
 # gscv_fs_lr_fit_be_mod = gscv_fs_lr_fit.best_params_
 # gscv_fs_lr_fit_be_res = gscv_fs_lr_fit.cv_results_
 # print('Best model:\n', gscv_fs_lr_fit_be_mod)
 # print('Best models score:\n', gscv_fs_lr_fit.best_score_, ':' , round(gscv_fs_lr_fit.best_score_, 2))
 # #print('\nMean test score from fit results:', round(mean(gscv_fs_lr_fit_be_res['mean_test_mcc']),2))
 # print('\nMean test score from fit results:', round(np.nanmean(gscv_fs_lr_fit_be_res['mean_test_mcc']),2))
 ##############################################################################
 #MANUAL
 #%% Logistic Regression + hyperparam + FS: BaseEstimator: ClfSwitcher()
 model_lr = LogisticRegression(**rs)
 model_rfecv = RFECV(estimator = model_lr
@@ -99,15 +20,6 @@ model_rfecv = RFECV(estimator = model_lr
                    , scoring = 'matthews_corrcoef'
                    )
 # model_rfecv = SequentialFeatureSelector(estimator = model_lr
 #                                           , n_features_to_select = 'auto'
 #                                           , tol = None
 # #                                         , cv = 10
 #                                           , cv = rskf_cv
 # #                                          , direction ='backward'
 #                                           , direction ='forward'
 #                                           , **njobs)
 param_grid2 = [
    {
        #'clf__estimator': [LogisticRegression(**rs)],
@@ -155,14 +67,15 @@ pipeline = Pipeline([('pre', MinMaxScaler())
 # Fit
 lr_fs_fit = pipeline.fit(X,y)
-lr_fs_fit_be_mod = lr_fs_fit.best_params_
+#lr_fs_fit_be_mod = lr_fs_fit.best_params_
-lr_fs_fit_be_res = lr_fs_fit.cv_results_
+#lr_fs_fit_be_res = lr_fs_fit.cv_results_
 dir(lr_fs_fit)
 print('Best model:\n', lr_fs_fit_be_mod)
 print('Best models score:\n', lr_fs_fit.best_score_, ':' , round(lr_fs_fit.best_score_, 2))
 pipeline.predict(X_bts)
-lr_fs.predict(X_bts)
+lr_fs_fit.predict(X_bts)
 test_predict = pipeline.predict(X_bts)
 print(test_predict)
@@ -238,4 +151,4 @@ lr_df
 #FIXME: tidy the index of the formatted df
-###############################################################################
+###############################################################################