Added UQ_LR_FS2.py, which runs feature selection (FS) with the LR model as part of a pipeline and grid search

2022-05-21 13:30:45 +01:00 · 2022-05-21 13:30:45 +01:00 · 52cc16f3fa
commit 52cc16f3fa
parent 39cd7b4259
1 changed files with 182 additions and 0 deletions
--- a/uq_ml_models/UQ_LR_FS2.py
+++ b/uq_ml_models/UQ_LR_FS2.py
@ -0,0 +1,182 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 Created on Mon May 16 05:59:12 2022
@author: tanu
 """
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 Created on Tue Mar 15 11:09:50 2022
@author: tanu
 """
 #%% Logistic Regression + hyperparam + FS: BaseEstimator: ClfSwitcher()
 # Base classifier. `rs` is unpacked as keyword arguments; it is not defined
 # in this section of the script.
 model_lr = LogisticRegression(**rs)
 # Recursive feature elimination with cross-validation, wrapped around the
 # base classifier and scored with the Matthews correlation coefficient.
 model_rfecv = RFECV(
     estimator=model_lr,
     cv=skf_cv,
     min_features_to_select=1,  # default
     scoring='matthews_corrcoef',
 )
 # Hyperparameter grids for the bare LogisticRegression (no 'clf__' prefix).
 # GridSearchCV treats each dict as an independent grid; they are split by
 # solver because each solver supports a different set of penalties.
 # The C and max_iter candidates are shared by every grid.
 lr_c_grid = np.logspace(0, 4, 10)
 lr_max_iter_grid = list(range(100, 800, 100))
 param_grid2 = [
     {
         'C': lr_c_grid,
         'penalty': ['none', 'l1', 'l2'],
         'max_iter': lr_max_iter_grid,
         'solver': ['saga'],
     },
     {
         # 'elasticnet' is only valid together with an explicit l1_ratio;
         # without one every elasticnet candidate fails to fit and is scored
         # as NaN. It therefore gets its own grid so that l1_ratio is not
         # passed alongside the other penalties.
         # NOTE(review): the l1_ratio candidates are a starting choice of
         # interior mixes (0 and 1 correspond to plain l2 / l1, which the
         # saga grid above already covers) -- adjust as needed.
         'C': lr_c_grid,
         'penalty': ['elasticnet'],
         'l1_ratio': [0.25, 0.5, 0.75],
         'max_iter': lr_max_iter_grid,
         'solver': ['saga'],
     },
     {
         'C': lr_c_grid,
         'penalty': ['l2', 'none'],
         'max_iter': lr_max_iter_grid,
         'solver': ['newton-cg', 'lbfgs', 'sag'],
     },
     {
         'C': lr_c_grid,
         'penalty': ['l1', 'l2'],
         'max_iter': lr_max_iter_grid,
         'solver': ['liblinear'],
     },
 ]
 #-------------------------------------------------------------------------------
 # Grid search CV + FS
 # Search over param_grid2 for the bare LR model using the skf_cv splitter.
 # NOTE(review): refit='mcc' presumably names a key of mcc_score_fn (a dict of
 # scorers defined elsewhere) -- confirm against its definition.
 gscv_lr = GridSearchCV(
     estimator=model_lr,
     param_grid=param_grid2,
     scoring=mcc_score_fn,
     refit='mcc',
     cv=skf_cv,
     return_train_score=False,
     verbose=3,
     **njobs,
 )
 #------------------------------------------------------------------------------
 ################
 # NOTE: here the grid search is the LAST STEP of the pipeline, so the tuned
 # results (best_params_, best_estimator_, ...) live on the 'clf' step and
 # not on the Pipeline object itself.
 ################
 # Create pipeline: scaling -> RFECV feature selection -> grid-searched LR
 pipeline = Pipeline([
     ('pre', MinMaxScaler()),
     ('fs', model_rfecv),
     ('clf', gscv_lr),
 ])
 # Fit in place (the return value is deliberately not assigned)
 pipeline.fit(X, y)
 # Pipeline has no best_params_ attribute (accessing it raises
 # AttributeError); read it from the fitted grid-search step instead.
 fitted_gscv = pipeline.named_steps['clf']
 fitted_gscv.best_params_
 #https://github.com/scikit-learn/scikit-learn/issues/7536
 # Number of features the best LR model was trained on, i.e. what the 'fs'
 # step passed on to the 'clf' step.
 n_fs = fitted_gscv.best_estimator_.n_features_in_
 n_fs
 # Boolean support mask from RFECV mapped back onto the column names of X
 sel_features = X.columns[pipeline.named_steps['fs'].get_support()]
 print('\nNo. of features selected with RFECV for model'
       , fitted_gscv.estimator
       , ':', n_fs
       , '\nThese are:', sel_features
       )
 ##############################################################
 # THIS ONE
 #########
 # Make Pipeline go into GS with FS
 #########
 # step 1 + 2: the LR model and the RFECV selector are constructed inline
 # when the pipeline is created (step 4).
 # step 3: specify param grid as a list of dicts
 # GridSearchCV treats every dict in the list as a SEPARATE grid. The feature
 # selection settings are therefore merged into each classifier grid; kept in
 # a dict of their own they would apply only to one extra candidate and the
 # classifier grids would silently run RFECV with its default cv.
 fs_grid = {
     'fs__min_features_to_select': [1],
     'fs__cv': [skf_cv],
 }
 param_grid2 = [
     {
         **fs_grid,
         'clf__C': np.logspace(0, 4, 10),
         'clf__penalty': ['l2'],
         'clf__max_iter': list(range(100, 200, 100)),  # currently just [100]
         # other solvers tried previously: 'newton-cg', 'lbfgs'
         'clf__solver': ['sag'],
     },
     {
         **fs_grid,
         'clf__C': np.logspace(0, 4, 10),
         'clf__penalty': ['l1', 'l2'],
         'clf__max_iter': list(range(100, 200, 100)),  # currently just [100]
         'clf__solver': ['liblinear'],
     },
 ]
 # step 4: create pipeline
 # Scaling, then RFECV feature selection driven by its own LR instance, then
 # a separate LR instance as the final classifier.
 pipeline = Pipeline(
     steps=[
         ('pre', MinMaxScaler()),
         ('fs', RFECV(estimator=LogisticRegression(**rs),
                      scoring='matthews_corrcoef')),
         ('clf', LogisticRegression(**rs)),
     ]
 )
 # step 5: Perform Gridsearch CV
 # The whole pipeline is the estimator here, so the 'fs__' / 'clf__' keys in
 # param_grid2 address the corresponding pipeline steps.
 gs_final = GridSearchCV(
     estimator=pipeline,
     param_grid=param_grid2,
     cv=skf_cv,
     scoring=mcc_score_fn,
     refit='mcc',
     verbose=1,
     return_train_score=False,
     **njobs,
 )
 # Fit ONCE and keep the result: fit() returns the fitted search object, so
 # gsfit and gs_final can both be used below. Calling fit() a second time
 # would repeat the entire grid search.
 gsfit = gs_final.fit(X, y)
 # Bare expressions kept for interactive inspection
 gsfit.best_params_
 gsfit.best_score_
 # Now get the features out
 best_pipeline = gs_final.best_estimator_
 fitted_fs = best_pipeline.named_steps['fs']
 all_features = gs_final.feature_names_in_
 # Boolean support mask from RFECV mapped back onto the column names of X
 sel_features = X.columns[fitted_fs.get_support()]
 n_sf = fitted_fs.n_features_
 # get model name (this is the fitted classifier object; printing it shows
 # its repr)
 model_name = best_pipeline.named_steps['clf']
 b_model_params = gs_final.best_params_
 print('\n========================================'
       , '\nRunning model:'
       , '\nModel name:', model_name
       , '\n==============================================='
       , '\nRunning feature selection with RFECV for model'
       , '\nTotal no. of features in model:', len(all_features)
       , '\nThese are:\n',  all_features, '\n\n'
       , '\nNo of features for best model: ', n_sf
       , '\nThese are:', sel_features, '\n\n'
       , '\nBest Model hyperparams:', b_model_params
       )