added UQ_LR FS2.py that has the FS run with LR model as part of pipeline and gridsearch
This commit is contained in:
parent
39cd7b4259
commit
52cc16f3fa
1 changed files with 182 additions and 0 deletions
182
uq_ml_models/UQ_LR_FS2.py
Normal file
182
uq_ml_models/UQ_LR_FS2.py
Normal file
|
@ -0,0 +1,182 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Mon May 16 05:59:12 2022
|
||||||
|
|
||||||
|
@author: tanu
|
||||||
|
"""
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Tue Mar 15 11:09:50 2022
|
||||||
|
|
||||||
|
@author: tanu
|
||||||
|
"""
|
||||||
|
#%% Logistic Regression + hyperparam + FS: BaseEstimator: ClfSwitcher()
|
||||||
|
# Base classifier; `rs` is unpacked as keyword arguments
# (presumably the shared random_state settings -- defined outside this file).
model_lr = LogisticRegression(**rs)

# Feature selector wrapped around the base classifier.
# Folds come from `skf_cv` and candidate feature subsets are ranked by MCC.
model_rfecv = RFECV(
    estimator=model_lr,
    cv=skf_cv,
    min_features_to_select=1,  # default
    scoring='matthews_corrcoef',
)
|
||||||
|
|
||||||
|
# Hyperparameter grids for a bare LogisticRegression (no pipeline prefix).
# Each dict is searched independently, so every dict only pairs a solver with
# the penalties that solver accepts.
param_grid2 = [
    {
        # saga: penalties that need no extra hyperparameter
        'C': np.logspace(0, 4, 10),
        'penalty': ['none', 'l1', 'l2'],
        'max_iter': list(range(100, 800, 100)),
        'solver': ['saga'],
    },
    {
        # saga + elasticnet: LogisticRegression requires `l1_ratio` for this
        # penalty; without it every elasticnet candidate fails during fit.
        # A single midpoint keeps the number of candidates unchanged; widen
        # the list to tune the L1/L2 mix.
        'C': np.logspace(0, 4, 10),
        'penalty': ['elasticnet'],
        'l1_ratio': [0.5],
        'max_iter': list(range(100, 800, 100)),
        'solver': ['saga'],
    },
    {
        'C': np.logspace(0, 4, 10),
        'penalty': ['l2', 'none'],
        'max_iter': list(range(100, 800, 100)),
        'solver': ['newton-cg', 'lbfgs', 'sag'],
    },
    {
        'C': np.logspace(0, 4, 10),
        'penalty': ['l1', 'l2'],
        'max_iter': list(range(100, 800, 100)),
        'solver': ['liblinear'],
    },
]
|
||||||
|
#-------------------------------------------------------------------------------
|
||||||
|
# Grid search CV + FS
|
||||||
|
# Grid search over the LogisticRegression hyperparameters in `param_grid2`.
# refit='mcc' names the scorer used to pick the best candidate
# (assumes `mcc_score_fn` is a scorer mapping with an 'mcc' key -- confirm
# where it is defined).
gscv_lr = GridSearchCV(
    estimator=model_lr,
    param_grid=param_grid2,
    scoring=mcc_score_fn,
    refit='mcc',
    cv=skf_cv,
    return_train_score=False,
    verbose=3,
    **njobs,
)
|
||||||
|
|
||||||
|
#------------------------------------------------------------------------------
|
||||||
|
################
|
||||||
|
# NOTE: GS is going into pipeline,
|
||||||
|
# Cannot get BEST model out
|
||||||
|
################
|
||||||
|
# Assemble the pipeline: scaling -> RFECV feature selection -> grid search.
# Here the grid search is the *last step* of the pipeline, so it only ever
# sees the features that the 'fs' step kept.
pipeline = Pipeline(
    steps=[
        ('pre', MinMaxScaler()),
        ('fs', model_rfecv),
        ('clf', gscv_lr),
    ]
)
|
||||||
|
|
||||||
|
# Fit in place; the return value is not needed because the steps are
# inspected through `pipeline.named_steps` below.
pipeline.fit(X, y)

# A Pipeline has no `best_params_`; the tuned hyperparameters live on the
# GridSearchCV object that sits in the 'clf' step.
pipeline.named_steps['clf'].best_params_

# https://github.com/scikit-learn/scikit-learn/issues/7536
# Number of features the refitted best classifier was trained on, i.e. the
# number of columns that survived the 'fs' step. Read from the pipeline step
# rather than the outer `gscv_lr` variable so it always refers to the search
# object that was actually fitted.
n_fs = pipeline.named_steps['clf'].best_estimator_.n_features_in_
n_fs

# Boolean support mask from RFECV -> names of the retained columns
# (assumes X is a pandas DataFrame, as `.columns` is used).
sel_features = X.columns[pipeline.named_steps['fs'].get_support()]
print('\nNo. of features selected with RFECV for model'
      , pipeline.named_steps['clf'].estimator
      , ':', n_fs
      , '\nThese are:', sel_features
      )
|
||||||
|
##############################################################
|
||||||
|
# THIS ONE
|
||||||
|
#########
|
||||||
|
# Make Pipeline go into GS with FS
|
||||||
|
#########
|
||||||
|
|
||||||
|
# step 1: specify model
|
||||||
|
#modLR = LogisticRegression(**rs)
|
||||||
|
|
||||||
|
# step 2: specify fs
|
||||||
|
#model_rfecv = RFECV(estimator = model_lr
|
||||||
|
# , cv = skf_cv
|
||||||
|
#, min_features_to_select = 1 # default
|
||||||
|
#, scoring = 'matthews_corrcoef'
|
||||||
|
#)
|
||||||
|
|
||||||
|
# step 3: specify param grid as dict
|
||||||
|
# step 3: specify param grid
# GridSearchCV treats every dict in the list as an independent grid, so the
# RFECV ('fs') settings must be repeated in each dict. Otherwise the
# classifier grids run feature selection with RFECV's constructor defaults
# instead of `skf_cv`.
fs_grid = {
    'fs__min_features_to_select': [1],
    'fs__cv': [skf_cv],
}

param_grid2 = [
    # Baseline candidate: feature-selection settings only, with the 'clf'
    # step left exactly as it is configured in the pipeline.
    dict(fs_grid),
    {
        **fs_grid,
        'clf__C': np.logspace(0, 4, 10),
        'clf__penalty': ['l2'],
        'clf__max_iter': list(range(100, 200, 100)),
        'clf__solver': ['sag'],
    },
    {
        **fs_grid,
        'clf__C': np.logspace(0, 4, 10),
        'clf__penalty': ['l1', 'l2'],
        'clf__max_iter': list(range(100, 200, 100)),
        'clf__solver': ['liblinear'],
    },
]
|
||||||
|
# step 4: create pipeline
# Scaling, then RFECV around its own LogisticRegression, then a separate
# LogisticRegression as the final classifier. The 'fs' and 'clf' step names
# are what the `fs__*` / `clf__*` grid keys refer to.
pipeline = Pipeline(
    steps=[
        ('pre', MinMaxScaler()),
        ('fs', RFECV(LogisticRegression(**rs), scoring='matthews_corrcoef')),
        ('clf', LogisticRegression(**rs)),
    ]
)
|
||||||
|
|
||||||
|
# step 5: Perform Gridsearch CV
# The whole pipeline is the estimator here, so scaling and feature selection
# are redone for every candidate and fold.
gs_final = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid2,
    cv=skf_cv,
    scoring=mcc_score_fn,
    refit='mcc',
    verbose=1,
    return_train_score=False,
    **njobs,
)
|
||||||
|
|
||||||
|
# Fit once and keep a handle on the result. `fit` returns the search object
# itself, so `gsfit` and `gs_final` refer to the same fitted instance; a
# second `fit` call would only repeat the entire grid search.
gsfit = gs_final.fit(X, y)

gs_final.best_params_
gs_final.best_score_

# Same values, read through the assigned name
gsfit.best_params_
gsfit.best_score_
|
||||||
|
|
||||||
|
# Pull the feature-selection results out of the refitted best pipeline.
best_pipeline = gs_final.best_estimator_
best_fs = best_pipeline.named_steps['fs']

# Every input column seen at fit time vs. the subset kept by RFECV
# (assumes X is a pandas DataFrame, as `.columns` is used).
all_features = gs_final.feature_names_in_
sel_features = X.columns[best_fs.get_support()]
n_sf = best_fs.n_features_

# Final classifier step and the winning hyperparameter combination
model_name = best_pipeline.named_steps['clf']
b_model_params = gs_final.best_params_

print(
    '\n========================================',
    '\nRunning model:',
    '\nModel name:', model_name,
    '\n===============================================',
    '\nRunning feature selection with RFECV for model',
    '\nTotal no. of features in model:', len(all_features),
    '\nThese are:\n', all_features, '\n\n',
    '\nNo of features for best model: ', n_sf,
    '\nThese are:', sel_features, '\n\n',
    '\nBest Model hyperparams:', b_model_params,
)
|
Loading…
Add table
Add a link
Reference in a new issue