From 39cd7b4259de689f9a30d403f10e28dcac38ea8b Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Sat, 21 May 2022 13:30:04 +0100
Subject: [PATCH] finally made feature selection (fs) work both with and without the ClfSwitcher class

---
 UQ_FS_eg.py              |   1 +
 UQ_LR_FS_p2.py           | 107 ++++++++++++++++++++++++++++++++-------
 uq_ml_models/UQ_LR_FS.py |  97 ++---------------------------------
 3 files changed, 95 insertions(+), 110 deletions(-)

diff --git a/UQ_FS_eg.py b/UQ_FS_eg.py
index 80fa6a3..1c590ba 100644
--- a/UQ_FS_eg.py
+++ b/UQ_FS_eg.py
@@ -25,6 +25,7 @@ X_eg, y_eg = make_classification(n_samples=1000,
 
 pipe = Pipeline([('scaler', StandardScaler()),
                  ('selector', SelectKBest(mutual_info_classif, k=9)),
+                 
                  ('classifier', LogisticRegression())])
 
 search_space = [{'selector__k': [5, 6, 7, 10]},
diff --git a/UQ_LR_FS_p2.py b/UQ_LR_FS_p2.py
index 05ea68a..f399795 100644
--- a/UQ_LR_FS_p2.py
+++ b/UQ_LR_FS_p2.py
@@ -21,18 +21,25 @@ class ClfSwitcher(BaseEstimator):
     def __init__(
         self, 
         estimator = SGDClassifier(),
-        #feature = RFECV()
+        #feature = RFECV(SGDClassifier())
     ):
         """
         A Custom BaseEstimator that can switch between classifiers.
         :param estimator: sklearn object - The classifier
         """ 
         self.estimator = estimator
+        #self.feature = feature
     
     def fit(self, X, y=None, **kwargs):
         self.estimator.fit(X, y)
+        #self.feature.fit(X, y)
         return self
 
+    # def transform(self, X, y=None):
+    #     #self.estimator.transform(X, y)
+    #     self.feature.transform(X)
+    #     return self
+
     def predict(self, X, y=None):
         return self.estimator.predict(X)
     
@@ -42,23 +49,26 @@ class ClfSwitcher(BaseEstimator):
     def score(self, X, y):
         return self.estimator.score(X, y)
     
+#%%
 parameters = [
-    # {'feature__fs__estimator': LogisticRegression(**rs)
-    #  , 'feature__fs__cv': [10]
-    #  , 'feature__fs__scoring': ['matthews_corrcoef']
-    #  },
+    
+    # {'fs__feature__min_features_to_select': [1]
+    #  , 'fs__feature__scoring': ['matthews_corrcoef']
+    #  , 'fs__feature__cv': [skf_cv]},
+    
+    {'fs__min_features_to_select': [1]
+     #, 'fs__scoring': ['matthews_corrcoef']
+     , 'fs__cv': [skf_cv]},
     
     {
-        'clf__estimator': [LogisticRegression(**rs)],
-        'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
-        #'clf__estimator__C': np.logspace(0, 4, 10),
-        'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'],
-        'clf__estimator__max_iter': list(range(100,800,100)),
-        'clf__estimator__solver': ['saga']
+     'clf__estimator': [LogisticRegression(**rs)],
+      #'clf__estimator__C': np.logspace(0, 4, 10),
+      'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'],
+      'clf__estimator__max_iter': list(range(100,800,100)),
+      'clf__estimator__solver': ['saga']
     }#,
     # {
     #     'clf__estimator': [MODEL2(**rs)],
-    #     #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
     #     'clf__estimator__C': np.logspace(0, 4, 10),
     #     'clf__estimator__penalty': ['l2', 'none'],
     #     'clf__estimator__max_iter': list(range(100,800,100)),
@@ -68,13 +78,14 @@ parameters = [
 #%% Create pipeline
 pipeline = Pipeline([
     ('pre', MinMaxScaler())
-#   , ('fs',  RFECV(LogisticRegression(**rs), cv = rskf_cv, scoring = 'matthews_corrcoef'))
-   , ('selector', SelectKBest(mutual_info_classif, k=6))
-   , ('clf', ClfSwitcher())
-])
+    , ('fs', RFECV(LogisticRegression(**rs), scoring = 'matthews_corrcoef'))  # scoring here can't be my mcc_fn
+#    , ('fs', ClfSwitcher())
+    , ('clf',  ClfSwitcher())
+    ])
 
-#%% Grid search i.e hyperparameter tuning and refitting on mcc
-mod_fs = GridSearchCV(pipeline
+#%%
+# Grid search, i.e. hyperparameter tuning and refitting on MCC
+gscv_lr = GridSearchCV(pipeline
                     , parameters
                     , scoring = mcc_score_fn, refit = 'mcc'
                     , cv = skf_cv
@@ -82,6 +93,66 @@ mod_fs = GridSearchCV(pipeline
                     , return_train_score = False
                     , verbose = 3)
 
+# Fit once: GridSearchCV.fit() returns the fitted search object itself, so a
+# separate bare gscv_lr.fit(X, y) call would only run the whole search twice.
+
+#### Keep the fitted search plus its best parameters and CV results
+gscv_lr_fit = gscv_lr.fit(X, y)
+gscv_lr_fit_be_mod = gscv_lr_fit.best_params_
+gscv_lr_fit_be_res = gscv_lr_fit.cv_results_
+
+#%% Grid search, i.e. hyperparameter tuning and refitting on MCC
+
+param_grid2 = [
+    
+    {'fs__min_features_to_select': [1]
+    , 'fs__cv': [skf_cv]
+    },
+    
+    
+    { 
+        #'clf__estimator': [LogisticRegression(**rs)],
+        'clf__C': np.logspace(0, 4, 10),
+        'clf__penalty': ['l2'],
+        'clf__max_iter': list(range(100,200,100)),
+        #'clf__solver': ['newton-cg', 'lbfgs', 'sag']
+        'clf__solver': ['sag']
+
+    }, 
+    {
+        #'clf__estimator': [LogisticRegression(**rs)],
+        'clf__C': np.logspace(0, 4, 10),
+        'clf__penalty': ['l1', 'l2'],
+        'clf__max_iter': list(range(100,200,100)),
+        'clf__solver': ['liblinear']
+    }
+
+]
+# Step 4: create pipeline
+pipeline = Pipeline([
+    ('pre', MinMaxScaler())
+    #, ('fs', model_rfecv)
+    , ('fs', RFECV(LogisticRegression(**rs), scoring = 'matthews_corrcoef'))
+    , ('clf',  LogisticRegression(**rs))])
+   
+# step 5: Perform Gridsearch CV
+gs_final = GridSearchCV(pipeline
+                        , param_grid2
+                        , cv = skf_cv
+                        , scoring = mcc_score_fn, refit = 'mcc'
+                        , verbose = 1
+                        , return_train_score = False
+                        , **njobs)
+
+
+
+
+
+
+
+
+
+
 #%% Fit 
 mod_fs_fit = mod_fs.fit(X, y)
 mod_fs_fbm = mod_fs_fit.best_params_
diff --git a/uq_ml_models/UQ_LR_FS.py b/uq_ml_models/UQ_LR_FS.py
index 0346809..6910fab 100644
--- a/uq_ml_models/UQ_LR_FS.py
+++ b/uq_ml_models/UQ_LR_FS.py
@@ -12,85 +12,6 @@ Created on Tue Mar 15 11:09:50 2022
 
 @author: tanu
 """
-#%% Import libs
-rs = {'random_state': 42}
-njobs = {'n_jobs': 10}
-
-scoring_fn =  ({'accuracy'      : make_scorer(accuracy_score)
-                 , 'fscore'     : make_scorer(f1_score)
-                 , 'mcc'        : make_scorer(matthews_corrcoef)
-                 ,  'precision' : make_scorer(precision_score)
-                 ,  'recall'    : make_scorer(recall_score)
-                 ,  'roc_auc'   : make_scorer(roc_auc_score)
-                 ,  'jaccard'   : make_scorer(jaccard_score)
-            })    
-
-mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
-jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
-#%% Get data
-y.to_frame().value_counts().plot(kind = 'bar')
-blind_test_df['dst_mode'].to_frame().value_counts().plot(kind = 'bar')
-
-# %% Logistic Regression + FS + hyperparameter
-# https://www.tomasbeuzen.com/post/scikit-learn-gridsearch-pipelines/
-# from sklearn.feature_selection import SelectKBest, mutual_info_classif
-
-# # Create pipeline
-# pipe = Pipeline([
-#     ('pre', MinMaxScaler())
-#     , ('fs', RFECV( LogisticRegression(**rs), cv = skf_cv, scoring = 'matthews_corrcoef', **njobs,verbose = 3))
-#     #, ('fs', SelectKBest(mutual_info_classif, k=5))
-#     , ('clf', LogisticRegression(**rs))
-# ])
-
-# # Create search space
-# param_grid = [{'fs__step': [1]},
-
-#     {
-#         'clf': [LogisticRegression(**rs)],
-#         #'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
-#         'clf__C': np.logspace(0, 4, 10),
-#         'clf__penalty': ['none', 'l1', 'l2', 'elasticnet'],
-#         'clf__max_iter': list(range(100,800,100)),
-#         'clf__solver': ['saga']
-#     },
-#     {
-#         'clf': [LogisticRegression(**rs)],
-#         #'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
-#         'clf__C': np.logspace(0, 4, 10),
-#         'clf__penalty': ['l2', 'none'],
-#         'clf__max_iter': list(range(100,800,100)),
-#         'clf__solver': ['newton-cg', 'lbfgs', 'sag']
-#     }, 
-#     {
-#         'clf': [LogisticRegression(**rs)],
-#         #'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
-#         'clf__C': np.logspace(0, 4, 10),
-#         'clf__penalty': ['l1', 'l2'],
-#         'clf__max_iter': list(range(100,800,100)),
-#         'clf__solver': ['liblinear']
-#     }]
-
-# # Run Grid search
-# gscv_fs_lr = GridSearchCV(pipe
-#                           , param_grid
-#                           , cv = skf_cv
-#                           , scoring = mcc_score_fn, refit = 'mcc'
-#                           , verbose = 3)
-
-# gscv_fs_lr_fit = gscv_fs_lr.fit(X, y)
-# gscv_fs_lr_fit_be_mod = gscv_fs_lr_fit.best_params_
-# gscv_fs_lr_fit_be_res = gscv_fs_lr_fit.cv_results_
-
-# print('Best model:\n', gscv_fs_lr_fit_be_mod)
-# print('Best models score:\n', gscv_fs_lr_fit.best_score_, ':' , round(gscv_fs_lr_fit.best_score_, 2))
-
-# #print('\nMean test score from fit results:', round(mean(gscv_fs_lr_fit_be_res['mean_test_mcc']),2))
-# print('\nMean test score from fit results:', round(np.nanmean(gscv_fs_lr_fit_be_res['mean_test_mcc']),2))
-
-##############################################################################
-#MANUAL
-
 #%% Logistic Regression + hyperparam + FS: BaseEstimator: ClfSwitcher()
 model_lr = LogisticRegression(**rs)
 model_rfecv = RFECV(estimator = model_lr
@@ -99,15 +20,6 @@ model_rfecv = RFECV(estimator = model_lr
                     , scoring = 'matthews_corrcoef'
                     )
 
-# model_rfecv = SequentialFeatureSelector(estimator = model_lr
-#                                           , n_features_to_select = 'auto'
-#                                           , tol = None
-# #                                         , cv = 10
-#                                           , cv = rskf_cv
-# #                                          , direction ='backward'
-#                                           , direction ='forward'
-#                                           , **njobs)
-
 param_grid2 = [
     {
         #'clf__estimator': [LogisticRegression(**rs)],
@@ -155,14 +67,15 @@ pipeline = Pipeline([('pre', MinMaxScaler())
   
 # Fit
 lr_fs_fit = pipeline.fit(X,y)
-lr_fs_fit_be_mod = lr_fs_fit.best_params_
-lr_fs_fit_be_res = lr_fs_fit.cv_results_
+#lr_fs_fit_be_mod = lr_fs_fit.best_params_
+#lr_fs_fit_be_res = lr_fs_fit.cv_results_
+dir(lr_fs_fit)
 
 print('Best model:\n', lr_fs_fit_be_mod)
 print('Best models score:\n', lr_fs_fit.best_score_, ':' , round(lr_fs_fit.best_score_, 2))
 
 pipeline.predict(X_bts)
-lr_fs.predict(X_bts)
+lr_fs_fit.predict(X_bts)
 
 test_predict = pipeline.predict(X_bts)
 print(test_predict)
@@ -238,4 +151,4 @@ lr_df
 
 #FIXME: tidy the index of the formatted df
 
-###############################################################################
\ No newline at end of file
+###############################################################################