copy of ML dir to an FS-only version
This commit is contained in:
parent
52cc16f3fa
commit
80e6b3af96
23 changed files with 3115 additions and 243 deletions
|
@ -13,50 +13,50 @@ Created on Tue Mar 15 11:09:50 2022
|
|||
@author: tanu
|
||||
"""
|
||||
#%% Logistic Regression + hyperparam + FS: BaseEstimator: ClfSwitcher()
|
||||
model_lr = LogisticRegression(**rs)
|
||||
model_rfecv = RFECV(estimator = model_lr
|
||||
, cv = skf_cv
|
||||
#, cv = 10
|
||||
, min_features_to_select = 1 # default
|
||||
, scoring = 'matthews_corrcoef'
|
||||
)
|
||||
# model_lr = LogisticRegression(**rs)
|
||||
# model_rfecv = RFECV(estimator = model_lr
|
||||
# , cv = skf_cv
|
||||
# #, cv = 10
|
||||
# , min_features_to_select = 1 # default
|
||||
# , scoring = 'matthews_corrcoef'
|
||||
# )
|
||||
|
||||
param_grid2 = [
|
||||
{
|
||||
#'clf__estimator': [LogisticRegression(**rs)],
|
||||
#'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
|
||||
'C': np.logspace(0, 4, 10),
|
||||
'penalty': ['none', 'l1', 'l2', 'elasticnet'],
|
||||
'max_iter': list(range(100,800,100)),
|
||||
'solver': ['saga']
|
||||
},
|
||||
{
|
||||
#'clf__estimator': [LogisticRegression(**rs)],
|
||||
#'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
|
||||
'C': np.logspace(0, 4, 10),
|
||||
'penalty': ['l2', 'none'],
|
||||
'max_iter': list(range(100,800,100)),
|
||||
'solver': ['newton-cg', 'lbfgs', 'sag']
|
||||
},
|
||||
{
|
||||
#'clf__estimator': [LogisticRegression(**rs)],
|
||||
#'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
|
||||
'C': np.logspace(0, 4, 10),
|
||||
'penalty': ['l1', 'l2'],
|
||||
'max_iter': list(range(100,800,100)),
|
||||
'solver': ['liblinear']
|
||||
}
|
||||
# param_grid2 = [
|
||||
# {
|
||||
# #'clf': [LogisticRegression(**rs)],
|
||||
# #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
|
||||
# 'C': np.logspace(0, 4, 10),
|
||||
# 'penalty': ['none', 'l1', 'l2', 'elasticnet'],
|
||||
# 'max_iter': list(range(100,800,100)),
|
||||
# 'solver': ['saga']
|
||||
# },
|
||||
# {
|
||||
# #'clf': [LogisticRegression(**rs)],
|
||||
# #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
|
||||
# 'C': np.logspace(0, 4, 10),
|
||||
# 'penalty': ['l2', 'none'],
|
||||
# 'max_iter': list(range(100,800,100)),
|
||||
# 'solver': ['newton-cg', 'lbfgs', 'sag']
|
||||
# },
|
||||
# {
|
||||
# #'clf': [LogisticRegression(**rs)],
|
||||
# #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
|
||||
# 'C': np.logspace(0, 4, 10),
|
||||
# 'penalty': ['l1', 'l2'],
|
||||
# 'max_iter': list(range(100,800,100)),
|
||||
# 'solver': ['liblinear']
|
||||
# }
|
||||
|
||||
]
|
||||
#-------------------------------------------------------------------------------
|
||||
# Grid search CV + FS
|
||||
gscv_lr = GridSearchCV(estimator = model_lr
|
||||
, param_grid = param_grid2
|
||||
, scoring = mcc_score_fn, refit = 'mcc'
|
||||
, cv = skf_cv
|
||||
, return_train_score = False
|
||||
, verbose = 3
|
||||
, **njobs)
|
||||
# ]
|
||||
# #-------------------------------------------------------------------------------
|
||||
# # Grid search CV + FS
|
||||
# gscv_lr = GridSearchCV(estimator = model_lr
|
||||
# , param_grid = param_grid2
|
||||
# , scoring = mcc_score_fn, refit = 'mcc'
|
||||
# , cv = skf_cv
|
||||
# , return_train_score = False
|
||||
# , verbose = 3
|
||||
# , **njobs)
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
################
|
||||
|
@ -64,27 +64,27 @@ gscv_lr = GridSearchCV(estimator = model_lr
|
|||
# Cannot get BEST model out
|
||||
################
|
||||
# Create pipeline
|
||||
pipeline = Pipeline([('pre', MinMaxScaler())
|
||||
#, ('fs', sfs_selector)
|
||||
, ('fs', model_rfecv )
|
||||
, ('clf', gscv_lr)])
|
||||
# pipeline = Pipeline([('pre', MinMaxScaler())
|
||||
# #, ('fs', sfs_selector)
|
||||
# , ('fs', model_rfecv )
|
||||
# , ('clf', gscv_lr)])
|
||||
|
||||
# Fit # don't assign fit
|
||||
#lr_fs_fit = pipeline.fit(X,y)
|
||||
pipeline.fit(X,y)
|
||||
# # Fit # don't assign fit
|
||||
# #lr_fs_fit = pipeline.fit(X,y)
|
||||
# pipeline.fit(X,y)
|
||||
|
||||
pipeline.best_params_
|
||||
# pipeline.best_params_
|
||||
|
||||
#https://github.com/scikit-learn/scikit-learn/issues/7536
|
||||
n_fs = gscv_lr.best_estimator_.n_features_in_
|
||||
n_fs
|
||||
# #https://github.com/scikit-learn/scikit-learn/issues/7536
|
||||
# n_fs = gscv_lr.best_estimator_.n_features_in_
|
||||
# n_fs
|
||||
|
||||
sel_features = X.columns[pipeline.named_steps['fs'].get_support()]
|
||||
print('\nNo. of features selected with RFECV for model'
|
||||
, pipeline.named_steps['clf'].estimator
|
||||
, ':', n_fs
|
||||
, '\nThese are:', sel_features
|
||||
)
|
||||
# sel_features = X.columns[pipeline.named_steps['fs'].get_support()]
|
||||
# print('\nNo. of features selected with RFECV for model'
|
||||
# , pipeline.named_steps['clf'].estimator
|
||||
# , ':', n_fs
|
||||
# , '\nThese are:', sel_features
|
||||
# )
|
||||
##############################################################
|
||||
# THIS ONE
|
||||
#########
|
||||
|
@ -106,28 +106,45 @@ param_grid2 = [
|
|||
|
||||
{'fs__min_features_to_select': [1]
|
||||
, 'fs__cv': [skf_cv]
|
||||
#, 'fs__scoring': ['matthews_corrcoef']},
|
||||
#, 'fs__scoring': [mcc_score_fn]}
|
||||
},
|
||||
|
||||
# {
|
||||
# #'clf': [LogisticRegression(**rs)],
|
||||
# 'clf__C': np.logspace(0, 4, 10),
|
||||
# 'clf__penalty': ['none', 'l1', 'l2', 'elasticnet'],
|
||||
# 'clf__max_iter': list(range(100,800,100)),
|
||||
# 'clf__solver': ['saga']
|
||||
# },
|
||||
# {
|
||||
# #'clf': [LogisticRegression(**rs)],
|
||||
# 'clf__C': np.logspace(0, 4, 10),
|
||||
# 'clf__penalty': ['l2', 'none'],
|
||||
# 'clf__max_iter': list(range(100,800,100)),
|
||||
# 'clf__solver': ['newton-cg', 'lbfgs', 'sag']
|
||||
# },
|
||||
# {
|
||||
# #'clf': [LogisticRegression(**rs)],
|
||||
# 'clf__C': np.logspace(0, 4, 10),
|
||||
# 'clf__penalty': ['l1', 'l2'],
|
||||
# 'clf__max_iter': list(range(100,800,100)),
|
||||
# 'clf__solver': ['liblinear']
|
||||
# }
|
||||
|
||||
{
|
||||
#'clf__estimator': [LogisticRegression(**rs)],
|
||||
{ #'clf': [LogisticRegression(**rs)],
|
||||
'clf__C': np.logspace(0, 4, 10),
|
||||
'clf__penalty': ['l2'],
|
||||
'clf__max_iter': list(range(100,200,100)),
|
||||
#'clf__solver': ['newton-cg', 'lbfgs', 'sag']
|
||||
'clf__solver': ['sag']
|
||||
|
||||
},
|
||||
{
|
||||
#'clf__estimator': [LogisticRegression(**rs)],
|
||||
'clf__C': np.logspace(0, 4, 10),
|
||||
'clf__penalty': ['l1', 'l2'],
|
||||
'clf__max_iter': list(range(100,200,100)),
|
||||
'clf__max_iter': [100],
|
||||
'clf__solver': ['liblinear']
|
||||
},
|
||||
|
||||
{ #'clf': [LogisticRegression(**rs)],
|
||||
'clf__C': np.logspace(0, 4, 10),
|
||||
'clf__penalty': ['l2'],
|
||||
'clf__max_iter':[100],
|
||||
'clf__solver': ['saga']
|
||||
}
|
||||
|
||||
|
||||
]
|
||||
# step 4: create pipeline
|
||||
pipeline = Pipeline([
|
||||
|
@ -149,12 +166,34 @@ gs_final = GridSearchCV(pipeline
|
|||
gs_final.fit(X,y)
|
||||
gs_final.best_params_
|
||||
gs_final.best_score_
|
||||
gs_final.best_estimator_
|
||||
|
||||
# assign the fit
|
||||
gsfit = gs_final.fit(X,y)
|
||||
#gsfit = gs_final.fit(X,y)
|
||||
#gsfit.best_estimator_
|
||||
gsfit.best_params_
|
||||
gsfit.best_score_
|
||||
#gsfit.best_params_
|
||||
#gsfit.best_score_
|
||||
|
||||
test_predict = gs_final.predict(X_bts)
|
||||
print(test_predict)
|
||||
print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, test_predict),2))
|
||||
print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, test_predict),2))
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# Now get the features out
|
||||
all_features = gs_final.feature_names_in_
|
||||
|
@ -163,7 +202,6 @@ all_features = gs_final.feature_names_in_
|
|||
sel_features = X.columns[gs_final.best_estimator_.named_steps['fs'].get_support()]
|
||||
n_sf = gs_final.best_estimator_.named_steps['fs'].n_features_
|
||||
|
||||
|
||||
# get model name
|
||||
model_name = gs_final.best_estimator_.named_steps['clf']
|
||||
b_model_params = gs_final.best_params_
|
||||
|
@ -179,4 +217,37 @@ print('\n========================================'
|
|||
, '\nThese are:', sel_features, '\n\n'
|
||||
, '\nBest Model hyperparams:', b_model_params
|
||||
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
######################################
|
||||
# Blind test
|
||||
######################################
|
||||
# See how it does on the BLIND test
|
||||
#print('\nBlind test score, mcc:', ))
|
||||
|
||||
#test_predict = gscv_lr_fit.predict(X_bts)
|
||||
test_predict = gs_final.predict(X_bts)
|
||||
print(test_predict)
|
||||
|
||||
print(accuracy_score(y_bts, test_predict))
|
||||
print(matthews_corrcoef(y_bts, test_predict))
|
||||
|
||||
# create a dict with all scores
|
||||
lr_bts_dict = {#'best_model': list(gscv_lr_fit_be_mod.items())
|
||||
'bts_fscore':None
|
||||
, 'bts_mcc':None
|
||||
, 'bts_precision':None
|
||||
, 'bts_recall':None
|
||||
, 'bts_accuracy':None
|
||||
, 'bts_roc_auc':None
|
||||
, 'bts_jaccard':None }
|
||||
lr_bts_dict
|
||||
lr_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2)
|
||||
lr_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2)
|
||||
lr_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2)
|
||||
lr_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2)
|
||||
lr_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2)
|
||||
lr_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2)
|
||||
lr_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
|
||||
lr_bts_dict
|
Loading…
Add table
Add a link
Reference in a new issue