Added UQ_LR_FS2.py, which runs feature selection (FS) with a logistic regression (LR) model as part of a pipeline and grid search

2022-05-21 13:30:45 +01:00 · 2022-05-21 13:30:45 +01:00 · 52cc16f3fa
commit 52cc16f3fa
parent 39cd7b4259
1 changed files with 182 additions and 0 deletions
--- a/uq_ml_models/UQ_LR_FS2.py
+++ b/uq_ml_models/UQ_LR_FS2.py
@ -0,0 +1,182 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon May 16 05:59:12 2022
+
+@author: tanu
+"""
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Mar 15 11:09:50 2022
+
+@author: tanu
+"""
+#%% Logistic Regression + hyperparam + FS: BaseEstimator: ClfSwitcher()
+# Base classifier; `rs` is not defined in this section and is unpacked as
+# keyword arguments.
+model_lr = LogisticRegression(**rs)
+
+# Recursive feature elimination with cross-validation wrapped around the
+# base classifier, using the skf_cv splitter and the 'matthews_corrcoef'
+# scorer to choose the number of features.
+model_rfecv = RFECV(
+    estimator=model_lr,
+    cv=skf_cv,
+    min_features_to_select=1,  # default
+    scoring='matthews_corrcoef',
+)
+
+# Hyperparameter grid for the stand-alone LogisticRegression search (gscv_lr).
+# One dict per solver family so that only penalty/solver combinations the
+# solver accepts are generated.
+param_grid2 = [
+    {
+        'C': np.logspace(0, 4, 10),
+        'penalty': ['none', 'l1', 'l2'],
+        'max_iter': list(range(100,800,100)),
+        'solver': ['saga']
+    },
+    {
+        # 'elasticnet' needs an explicit l1_ratio in [0, 1]: LogisticRegression
+        # raises ValueError at fit time when it is left unset, so listing
+        # 'elasticnet' without l1_ratio only produces failed (NaN) candidates.
+        # It therefore gets its own sub-grid with l1_ratio supplied.
+        # l1_ratio = 0 and 1 are left out because they are equivalent to the
+        # 'l2' and 'l1' penalties already searched above.
+        # NOTE(review): assumes `rs` does not already set l1_ratio -- confirm.
+        'C': np.logspace(0, 4, 10),
+        'penalty': ['elasticnet'],
+        'l1_ratio': [0.25, 0.5, 0.75],
+        'max_iter': list(range(100,800,100)),
+        'solver': ['saga']
+    },
+    {
+        'C': np.logspace(0, 4, 10),
+        'penalty': ['l2', 'none'],
+        'max_iter': list(range(100,800,100)),
+        'solver': ['newton-cg', 'lbfgs', 'sag']
+    },
+    {
+        'C': np.logspace(0, 4, 10),
+        'penalty': ['l1', 'l2'],
+        'max_iter': list(range(100,800,100)),
+        'solver': ['liblinear']
+    }
+
+]
+#-------------------------------------------------------------------------------
+# Grid search CV + FS
+# Cross-validated grid search over param_grid2 for the bare logistic
+# regression model. mcc_score_fn and njobs are defined outside this section.
+# NOTE(review): refit = 'mcc' presumably names an entry of mcc_score_fn --
+# confirm where mcc_score_fn is defined.
+gscv_lr = GridSearchCV(
+    estimator=model_lr,
+    param_grid=param_grid2,
+    scoring=mcc_score_fn,
+    refit='mcc',
+    cv=skf_cv,
+    return_train_score=False,
+    verbose=3,
+    **njobs,
+)
+
+#------------------------------------------------------------------------------
+################
+# NOTE: here the grid search is the LAST STEP of the pipeline, so the tuned
+# model must be read from the 'clf' step. The Pipeline object itself has no
+# best_params_ / best_estimator_ attributes.
+################
+# Create pipeline: scaling -> RFECV feature selection -> grid-searched LR
+pipeline = Pipeline([('pre', MinMaxScaler())
+                     #, ('fs', sfs_selector)
+                     , ('fs', model_rfecv )
+                     , ('clf', gscv_lr)])
+
+# Fit # dont assign fit
+#lr_fs_fit = pipeline.fit(X,y)
+pipeline.fit(X,y)
+
+# Best hyperparameters live on the fitted GridSearchCV step, not on the
+# pipeline (pipeline.best_params_ raises AttributeError).
+pipeline.named_steps['clf'].best_params_
+
+#https://github.com/scikit-learn/scikit-learn/issues/7536
+# Number of features the refitted best model was trained on, i.e. the number
+# of columns that came out of the 'fs' step.
+n_fs = gscv_lr.best_estimator_.n_features_in_
+n_fs
+
+# Column names kept by the fitted RFECV step
+sel_features = X.columns[pipeline.named_steps['fs'].get_support()]
+print('\nNo. of features selected with RFECV for model'
+      , pipeline.named_steps['clf'].estimator
+      , ':', n_fs
+      , '\nThese are:', sel_features
+      )
+##############################################################
+# THIS ONE
+#########
+# Make Pipeline go into GS with FS
+#########
+# In this variant the whole pipeline is the estimator handed to the grid
+# search, so every parameter name carries its step prefix ('fs__', 'clf__').
+#
+# step 1: specify model      -> built inline when the pipeline is created
+# step 2: specify fs (RFECV) -> built inline when the pipeline is created
+# step 3: specify param grid as a list of dicts
+# NOTE(review): each dict in the list is searched as a separate sub-grid, so
+# the fs__ settings in the first dict are NOT combined with the clf__
+# settings in the other two -- confirm this is intended.
+param_grid2 = [
+    # feature-selection settings
+    {
+        'fs__min_features_to_select': [1],
+        'fs__cv': [skf_cv],
+    },
+    # 'sag' solver, l2 penalty only
+    {
+        'clf__C': np.logspace(0, 4, 10),
+        'clf__penalty': ['l2'],
+        'clf__max_iter': [100],
+        'clf__solver': ['sag'],
+    },
+    # 'liblinear' solver, l1 or l2 penalty
+    {
+        'clf__C': np.logspace(0, 4, 10),
+        'clf__penalty': ['l1', 'l2'],
+        'clf__max_iter': [100],
+        'clf__solver': ['liblinear'],
+    },
+]
+# step 4: create pipeline (scaling -> RFECV feature selection -> classifier)
+pipeline = Pipeline(
+    steps=[
+        ('pre', MinMaxScaler()),
+        ('fs', RFECV(LogisticRegression(**rs), scoring='matthews_corrcoef')),
+        ('clf', LogisticRegression(**rs)),
+    ]
+)
+
+# step 5: Perform Gridsearch CV with the pipeline as the estimator
+gs_final = GridSearchCV(
+    estimator=pipeline,
+    param_grid=param_grid2,
+    cv=skf_cv,
+    scoring=mcc_score_fn,
+    refit='mcc',
+    verbose=1,
+    return_train_score=False,
+    **njobs,
+)
+
+# Fit the grid search ONCE and keep a handle to it. fit() returns the
+# estimator itself, so gsfit and gs_final refer to the same fitted object;
+# calling fit a second time would only repeat the whole search.
+gsfit = gs_final.fit(X,y)
+#gsfit.best_estimator_
+gsfit.best_params_
+gsfit.best_score_
+
+# Now get the features out
+# All input feature names seen during fit
+all_features = gs_final.feature_names_in_
+
+# Columns kept by the RFECV step of the refitted best pipeline, and how many
+sel_features = X.columns[gs_final.best_estimator_.named_steps['fs'].get_support()]
+n_sf = gs_final.best_estimator_.named_steps['fs'].n_features_
+
+
+# get model name (this is the fitted classifier step itself, printed below)
+model_name  = gs_final.best_estimator_.named_steps['clf']
+b_model_params = gs_final.best_params_
+
+print('\n========================================'
+      , '\nRunning model:'
+      , '\nModel name:', model_name
+      , '\n==============================================='
+      , '\nRunning feature selection with RFECV for model'
+      , '\nTotal no. of features in model:', len(all_features)
+      , '\nThese are:\n',  all_features, '\n\n'
+      , '\nNo of features for best model: ', n_sf
+      , '\nThese are:', sel_features, '\n\n'
+      , '\nBest Model hyperparams:', b_model_params
+
+      )