# ML_AI_training/uq_ml_models/UQ_LR_FS2.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon May 16 05:59:12 2022

@author: tanu
"""
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 11:09:50 2022

@author: tanu
"""
#%% Logistic Regression + hyperparam + FS: BaseEstimator: ClfSwitcher()
# model_lr = LogisticRegression(**rs)
# model_rfecv = RFECV(estimator = model_lr
#                     , cv = skf_cv
#                     #, cv = 10
#                     , min_features_to_select = 1 # default
#                     , scoring = 'matthews_corrcoef'
#                     )

# param_grid2 = [
#     {
#         #'clf': [LogisticRegression(**rs)],
#         #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
#         'C': np.logspace(0, 4, 10),
#         'penalty': ['none', 'l1', 'l2', 'elasticnet'],
#         'max_iter': list(range(100,800,100)),
#         'solver': ['saga']
#     },
#     {
#         #'clf': [LogisticRegression(**rs)],
#         #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
#         'C': np.logspace(0, 4, 10),
#         'penalty': ['l2', 'none'],
#         'max_iter': list(range(100,800,100)),
#         'solver': ['newton-cg', 'lbfgs', 'sag']
#     },
#     {
#         #'clf': [LogisticRegression(**rs)],
#         #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
#         'C': np.logspace(0, 4, 10),
#         'penalty': ['l1', 'l2'],
#         'max_iter': list(range(100,800,100)),
#         'solver': ['liblinear']
#     }

# ]
# #-------------------------------------------------------------------------------
# # Grid search CV + FS
# gscv_lr = GridSearchCV(estimator = model_lr
#                     , param_grid = param_grid2
#                     , scoring = mcc_score_fn, refit = 'mcc'
#                     , cv = skf_cv
#                     , return_train_score = False
#                     , verbose = 3
#                     , **njobs)

#------------------------------------------------------------------------------
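################
# NOTE: in this variant the GridSearchCV object is itself a step of the
# Pipeline, so the best model cannot be retrieved from the pipeline
# (a Pipeline has no best_params_ / best_estimator_ attributes).
# https://stackoverflow.com/questions/55609339/how-to-perform-feature-selection-with-gridsearchcv-in-sklearn-in-python
################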
# Create pipeline
# pipeline = Pipeline([('pre', MinMaxScaler())
#                      #, ('fs', sfs_selector)
#                      , ('fs', model_rfecv )
#                      , ('clf', gscv_lr)])

# # Fit # dont assign fit
# #lr_fs_fit = pipeline.fit(X,y)
# pipeline.fit(X,y)

# pipeline.best_params_

# #https://github.com/scikit-learn/scikit-learn/issues/7536
# n_fs = gscv_lr.best_estimator_.n_features_in_
# n_fs

# sel_features = X.columns[pipeline.named_steps['fs'].get_support()]
# print('\nNo. of features selected with RFECV for model'
#       , pipeline.named_steps['clf'].estimator
#       , ':', n_fs
#       , '\nThese are:', sel_features
#       )
##############################################################
# THIS ONE (the approach actually used below)
#########
# Put the whole Pipeline (including feature selection) inside GridSearchCV
#########

# step 1: specify model
#modLR = LogisticRegression(**rs)

# step 2: specify fs
#model_rfecv = RFECV(estimator = model_lr
                   # , cv = skf_cv
                    #, min_features_to_select = 1 # default
                    #, scoring = 'matthews_corrcoef'
                    #)

# step 3: specify param grid as dict
# Parameter grids for GridSearchCV over the pipeline steps 'fs' and 'clf'
# (keys are '<step name>__<parameter>').
#
# GridSearchCV explores every dict of a list-valued param_grid as an
# independent grid. RFECV settings kept in a dict of their own would only be
# tried together with the classifier defaults, and the clf__* candidates
# would run feature selection with RFECV's default CV instead of skf_cv.
# The fs__* settings are therefore repeated in each grid so that every
# candidate uses the same feature-selection configuration.
#
# Disabled wider grids, kept for reference; all of them use
# C = np.logspace(0, 4, 10) and max_iter = list(range(100, 800, 100)):
#   solver 'saga'                      : penalty 'none', 'l1', 'l2', 'elasticnet'
#   solver 'newton-cg', 'lbfgs', 'sag' : penalty 'l2', 'none'
#   solver 'liblinear'                 : penalty 'l1', 'l2'
param_grid2 = [
    {
        # feature selection step
        'fs__min_features_to_select': [1],
        'fs__cv': [skf_cv],
        # classifier step
        'clf__C': np.logspace(0, 4, 10),
        'clf__penalty': ['l2'],
        'clf__max_iter': [100],
        'clf__solver': ['liblinear'],
    },
    {
        # feature selection step
        'fs__min_features_to_select': [1],
        'fs__cv': [skf_cv],
        # classifier step
        'clf__C': np.logspace(0, 4, 10),
        'clf__penalty': ['l2'],
        'clf__max_iter': [100],
        'clf__solver': ['saga'],
    },
]
# step 4: create  pipeline
# Three named steps, run in order: 'pre' (MinMaxScaler), 'fs' (RFECV around a
# LogisticRegression, scored with 'matthews_corrcoef') and 'clf' (the final
# LogisticRegression). The step names are the prefixes used by the
# fs__* / clf__* keys of the parameter grid.
pipeline = Pipeline(steps=[
    ('pre', MinMaxScaler()),
    ('fs', RFECV(estimator=LogisticRegression(**rs),
                 scoring='matthews_corrcoef')),
    ('clf', LogisticRegression(**rs)),
])

# step 5: Perform Gridsearch CV
# Grid search over the whole pipeline, using the grids in param_grid2 and the
# CV splitter skf_cv.
gs_final = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid2,
    cv=skf_cv,
    scoring=mcc_score_fn,
    refit='mcc',  # presumably a key of mcc_score_fn (defined elsewhere) - confirm
    verbose=1,
    return_train_score=False,
    **njobs,
)

# Fit in place; the return value of fit() is deliberately not assigned.
gs_final.fit(X, y)

# Bare expressions: only displayed when run interactively, otherwise no-ops.
gs_final.best_params_
gs_final.best_score_
gs_final.best_estimator_

# assign the fit
#gsfit = gs_final.fit(X,y)
#gsfit.best_estimator_
#gsfit.best_params_
#gsfit.best_score_

# Predict the blind-test set with the fitted search object, then report MCC
# and accuracy against y_bts, each rounded to 2 decimal places.
test_predict = gs_final.predict(X_bts)
print(test_predict)
for heading, metric_fn in (
    ('\nMCC on Blind test:', matthews_corrcoef),
    ('\nAccuracy on Blind test:', accuracy_score),
):
    print(heading, round(metric_fn(y_bts, test_predict), 2))

# Now get the features out
# Feature names recorded on the fitted search object.
all_features = gs_final.feature_names_in_

# Look up the refitted best pipeline and its feature-selection step once.
best_pipe = gs_final.best_estimator_
fs_step = best_pipe.named_steps['fs']

# Columns of X kept by the selector's support mask, and how many there are.
sel_features = X.columns[fs_step.get_support()]
n_sf = fs_step.n_features_

# NOTE: despite its name, model_name holds the 'clf' step object itself
# (printed via its repr), not a string.
model_name = best_pipe.named_steps['clf']
b_model_params = gs_final.best_params_

# Single summary print; arguments are joined by print's default separator.
print(
    '\n========================================',
    '\nRunning model:',
    '\nModel name:', model_name,
    '\n===============================================',
    '\nRunning feature selection with RFECV for model',
    '\nTotal no. of features in model:', len(all_features),
    '\nThese are:\n', all_features, '\n\n',
    '\nNo of features for best model: ', n_sf,
    '\nThese are:', sel_features, '\n\n',
    '\nBest Model hyperparams:', b_model_params,
)


######################################
# Blind test
######################################
# See how it does on the BLIND test
#print('\nBlind test score, mcc:', ))

#test_predict = gscv_lr_fit.predict(X_bts)
# Blind-test evaluation: predict X_bts with the fitted search object and
# collect the scores against y_bts in lr_bts_dict.
test_predict = gs_final.predict(X_bts)
print(test_predict)

print(accuracy_score(y_bts, test_predict))
print(matthews_corrcoef(y_bts, test_predict))

# ROC AUC measures how well continuous scores rank the samples. Feeding it
# the hard class labels from predict() collapses the curve to a single
# threshold, so it is computed from the positive-class probability instead.
# Column 1 of predict_proba is the larger class label, which is the class
# roc_auc_score treats as positive for a binary target.
test_proba = gs_final.predict_proba(X_bts)[:, 1]

# All blind-test scores, rounded to 2 decimal places.
lr_bts_dict = {
    'bts_fscore': round(f1_score(y_bts, test_predict), 2),
    'bts_mcc': round(matthews_corrcoef(y_bts, test_predict), 2),
    'bts_precision': round(precision_score(y_bts, test_predict), 2),
    'bts_recall': round(recall_score(y_bts, test_predict), 2),
    'bts_accuracy': round(accuracy_score(y_bts, test_predict), 2),
    'bts_roc_auc': round(roc_auc_score(y_bts, test_proba), 2),
    'bts_jaccard': round(jaccard_score(y_bts, test_predict), 2),
}
# Bare expression: only displayed when run interactively.
lr_bts_dict