Finally made feature selection (fs) work both within the class and without it

2022-05-21 13:30:04 +01:00 · 2022-05-21 13:30:04 +01:00 · 39cd7b4259
commit 39cd7b4259
parent 4a9e9dfedf
3 changed files with 95 additions and 110 deletions
--- a/UQ_FS_eg.py
+++ b/UQ_FS_eg.py
@ -25,6 +25,7 @@ X_eg, y_eg = make_classification(n_samples=1000,

 pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_classif, k=9)),
+                 
                 ('classifier', LogisticRegression())])

 search_space = [{'selector__k': [5, 6, 7, 10]},
--- a/UQ_LR_FS_p2.py
+++ b/UQ_LR_FS_p2.py
@ -21,18 +21,25 @@ class ClfSwitcher(BaseEstimator):
    def __init__(
        self, 
        estimator = SGDClassifier(),
-        #feature = RFECV()
+        #feature = RFECV(SGDClassifier())
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """ 
        self.estimator = estimator
+        #self.feature = feature
    
    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
+        #self.feature.fit(X, y)
        return self

+    # def transform(self, X, y=None):
+    #     #self.estimator.transform(X, y)
+    #     self.feature.transform(X)
+    #     return self
+
    def predict(self, X, y=None):
        return self.estimator.predict(X)
    
@ -42,23 +49,26 @@ class ClfSwitcher(BaseEstimator):
    def score(self, X, y):
        return self.estimator.score(X, y)
    
+#%%
 parameters = [
-    # {'feature__fs__estimator': LogisticRegression(**rs)
-    #  , 'feature__fs__cv': [10]
-    #  , 'feature__fs__scoring': ['matthews_corrcoef']
-    #  },
+    
+    # {'fs__feature__min_features_to_select': [1]
+    #  , 'fs__feature__scoring': ['matthews_corrcoef']
+    #  , 'fs__feature__cv': [skf_cv]},
+    
+    {'fs__min_features_to_select': [1]
+     #, 'fs__scoring': ['matthews_corrcoef']
+     , 'fs__cv': [skf_cv]},
    
    {
-        'clf__estimator': [LogisticRegression(**rs)],
-        'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
-        #'clf__estimator__C': np.logspace(0, 4, 10),
-        'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'],
-        'clf__estimator__max_iter': list(range(100,800,100)),
-        'clf__estimator__solver': ['saga']
+     'clf__estimator': [LogisticRegression(**rs)],
+      #'clf__estimator__C': np.logspace(0, 4, 10),
+      'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'],
+      'clf__estimator__max_iter': list(range(100,800,100)),
+      'clf__estimator__solver': ['saga']
    }#,
    # {
    #     'clf__estimator': [MODEL2(**rs)],
-    #     #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    #     'clf__estimator__C': np.logspace(0, 4, 10),
    #     'clf__estimator__penalty': ['l2', 'none'],
    #     'clf__estimator__max_iter': list(range(100,800,100)),
@ -68,13 +78,14 @@ parameters = [
 #%% Create pipeline
 pipeline = Pipeline([
    ('pre', MinMaxScaler())
-#   , ('fs',  RFECV(LogisticRegression(**rs), cv = rskf_cv, scoring = 'matthews_corrcoef'))
-   , ('selector', SelectKBest(mutual_info_classif, k=6))
-   , ('clf', ClfSwitcher())
-])
+    , ('fs', RFECV(LogisticRegression(**rs), scoring = 'matthews_corrcoef'))#cant be my mcc_fn
+#    , ('fs', ClfSwitcher())
+    , ('clf',  ClfSwitcher())
+    ])

-#%% Grid search i.e hyperparameter tuning and refitting on mcc
-mod_fs = GridSearchCV(pipeline
+#%%
+# Grid search i.e hyperparameter tuning and refitting on mcc
+gscv_lr = GridSearchCV(pipeline
                    , parameters
                    , scoring = mcc_score_fn, refit = 'mcc'
                    , cv = skf_cv
@ -82,6 +93,66 @@ mod_fs = GridSearchCV(pipeline
                    , return_train_score = False
                    , verbose = 3)

+# Fit 
+gscv_lr.fit(X, y)
+
+####
+gscv_lr_fit = gscv_lr.fit(X, y)
+gscv_lr_fit_be_mod = gscv_lr_fit.best_params_
+gscv_lr_fit_be_res = gscv_lr_fit.cv_results_
+
+#%% Grid search i.e hyperparameter tuning and refitting on mcc
+
+param_grid2 = [
+    
+    {'fs__min_features_to_select': [1]
+    , 'fs__cv': [skf_cv]
+    },
+    
+    
+    { 
+        #'clf__estimator': [LogisticRegression(**rs)],
+        'clf__C': np.logspace(0, 4, 10),
+        'clf__penalty': ['l2'],
+        'clf__max_iter': list(range(100,200,100)),
+        #'clf__solver': ['newton-cg', 'lbfgs', 'sag']
+        'clf__solver': ['sag']
+
+    }, 
+    {
+        #'clf__estimator': [LogisticRegression(**rs)],
+        'clf__C': np.logspace(0, 4, 10),
+        'clf__penalty': ['l1', 'l2'],
+        'clf__max_iter': list(range(100,200,100)),
+        'clf__solver': ['liblinear']
+    }
+
+]
+# step 4: create  pipeline
+pipeline = Pipeline([
+    ('pre', MinMaxScaler())
+    #, ('fs', model_rfecv)
+    , ('fs', RFECV(LogisticRegression(**rs), scoring = 'matthews_corrcoef'))
+    , ('clf',  LogisticRegression(**rs))])
+   
+# step 5: Perform Gridsearch CV
+gs_final = GridSearchCV(pipeline
+                        , param_grid2
+                        , cv = skf_cv
+                        , scoring = mcc_score_fn, refit = 'mcc'
+                        , verbose = 1
+                        , return_train_score = False
+                        , **njobs)
+
+
+
+
+
+
+
+
+
+
 #%% Fit 
 mod_fs_fit = mod_fs.fit(X, y)
 mod_fs_fbm = mod_fs_fit.best_params_
--- a/uq_ml_models/UQ_LR_FS.py
+++ b/uq_ml_models/UQ_LR_FS.py
@ -12,85 +12,6 @@ Created on Tue Mar 15 11:09:50 2022

@author: tanu
 """
-#%% Import libs
-rs = {'random_state': 42}
-njobs = {'n_jobs': 10}
-
-scoring_fn =  ({'accuracy'      : make_scorer(accuracy_score)
-                 , 'fscore'     : make_scorer(f1_score)
-                 , 'mcc'        : make_scorer(matthews_corrcoef)
-                 ,  'precision' : make_scorer(precision_score)
-                 ,  'recall'    : make_scorer(recall_score)
-                 ,  'roc_auc'   : make_scorer(roc_auc_score)
-                 ,  'jaccard'   : make_scorer(jaccard_score)
-            })    
-
-mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
-jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
-#%% Get data
-y.to_frame().value_counts().plot(kind = 'bar')
-blind_test_df['dst_mode'].to_frame().value_counts().plot(kind = 'bar')
-
-# %% Logistic Regression + FS + hyperparameter
-# https://www.tomasbeuzen.com/post/scikit-learn-gridsearch-pipelines/
-# from sklearn.feature_selection import SelectKBest, mutual_info_classif
-
-# # Create pipeline
-# pipe = Pipeline([
-#     ('pre', MinMaxScaler())
-#     , ('fs', RFECV( LogisticRegression(**rs), cv = skf_cv, scoring = 'matthews_corrcoef', **njobs,verbose = 3))
-#     #, ('fs', SelectKBest(mutual_info_classif, k=5))
-#     , ('clf', LogisticRegression(**rs))
-# ])
-
-# # Create search space
-# param_grid = [{'fs__step': [1]},
-
-#     {
-#         'clf': [LogisticRegression(**rs)],
-#         #'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
-#         'clf__C': np.logspace(0, 4, 10),
-#         'clf__penalty': ['none', 'l1', 'l2', 'elasticnet'],
-#         'clf__max_iter': list(range(100,800,100)),
-#         'clf__solver': ['saga']
-#     },
-#     {
-#         'clf': [LogisticRegression(**rs)],
-#         #'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
-#         'clf__C': np.logspace(0, 4, 10),
-#         'clf__penalty': ['l2', 'none'],
-#         'clf__max_iter': list(range(100,800,100)),
-#         'clf__solver': ['newton-cg', 'lbfgs', 'sag']
-#     }, 
-#     {
-#         'clf': [LogisticRegression(**rs)],
-#         #'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
-#         'clf__C': np.logspace(0, 4, 10),
-#         'clf__penalty': ['l1', 'l2'],
-#         'clf__max_iter': list(range(100,800,100)),
-#         'clf__solver': ['liblinear']
-#     }]
-
-# # Run Grid search
-# gscv_fs_lr = GridSearchCV(pipe
-#                           , param_grid
-#                           , cv = skf_cv
-#                           , scoring = mcc_score_fn, refit = 'mcc'
-#                           , verbose = 3)
-
-# gscv_fs_lr_fit = gscv_fs_lr.fit(X, y)
-# gscv_fs_lr_fit_be_mod = gscv_fs_lr_fit.best_params_
-# gscv_fs_lr_fit_be_res = gscv_fs_lr_fit.cv_results_
-
-# print('Best model:\n', gscv_fs_lr_fit_be_mod)
-# print('Best models score:\n', gscv_fs_lr_fit.best_score_, ':' , round(gscv_fs_lr_fit.best_score_, 2))
-
-# #print('\nMean test score from fit results:', round(mean(gscv_fs_lr_fit_be_res['mean_test_mcc']),2))
-# print('\nMean test score from fit results:', round(np.nanmean(gscv_fs_lr_fit_be_res['mean_test_mcc']),2))
-
-##############################################################################
-#MANUAL
-
 #%% Logistic Regression + hyperparam + FS: BaseEstimator: ClfSwitcher()
 model_lr = LogisticRegression(**rs)
 model_rfecv = RFECV(estimator = model_lr
@ -99,15 +20,6 @@ model_rfecv = RFECV(estimator = model_lr
                    , scoring = 'matthews_corrcoef'
                    )

-# model_rfecv = SequentialFeatureSelector(estimator = model_lr
-#                                           , n_features_to_select = 'auto'
-#                                           , tol = None
-# #                                         , cv = 10
-#                                           , cv = rskf_cv
-# #                                          , direction ='backward'
-#                                           , direction ='forward'
-#                                           , **njobs)
-
 param_grid2 = [
    {
        #'clf__estimator': [LogisticRegression(**rs)],
@ -155,14 +67,15 @@ pipeline = Pipeline([('pre', MinMaxScaler())
  
 # Fit
 lr_fs_fit = pipeline.fit(X,y)
-lr_fs_fit_be_mod = lr_fs_fit.best_params_
-lr_fs_fit_be_res = lr_fs_fit.cv_results_
+#lr_fs_fit_be_mod = lr_fs_fit.best_params_
+#lr_fs_fit_be_res = lr_fs_fit.cv_results_
+dir(lr_fs_fit)

 print('Best model:\n', lr_fs_fit_be_mod)
 print('Best models score:\n', lr_fs_fit.best_score_, ':' , round(lr_fs_fit.best_score_, 2))

 pipeline.predict(X_bts)
-lr_fs.predict(X_bts)
+lr_fs_fit.predict(X_bts)

 test_predict = pipeline.predict(X_bts)
 print(test_predict)
@ -238,4 +151,4 @@ lr_df

 #FIXME: tidy the index of the formatted df

-###############################################################################
+###############################################################################