From 39cd7b4259de689f9a30d403f10e28dcac38ea8b Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Sat, 21 May 2022 13:30:04 +0100 Subject: [PATCH] finally made the fs work within class and without --- UQ_FS_eg.py | 1 + UQ_LR_FS_p2.py | 107 ++++++++++++++++++++++++++++++++------- uq_ml_models/UQ_LR_FS.py | 97 ++--------------------------------- 3 files changed, 95 insertions(+), 110 deletions(-) diff --git a/UQ_FS_eg.py b/UQ_FS_eg.py index 80fa6a3..1c590ba 100644 --- a/UQ_FS_eg.py +++ b/UQ_FS_eg.py @@ -25,6 +25,7 @@ X_eg, y_eg = make_classification(n_samples=1000, pipe = Pipeline([('scaler', StandardScaler()), ('selector', SelectKBest(mutual_info_classif, k=9)), + ('classifier', LogisticRegression())]) search_space = [{'selector__k': [5, 6, 7, 10]}, diff --git a/UQ_LR_FS_p2.py b/UQ_LR_FS_p2.py index 05ea68a..f399795 100644 --- a/UQ_LR_FS_p2.py +++ b/UQ_LR_FS_p2.py @@ -21,18 +21,25 @@ class ClfSwitcher(BaseEstimator): def __init__( self, estimator = SGDClassifier(), - #feature = RFECV() + #feature = RFECV(SGDClassifier()) ): """ A Custom BaseEstimator that can switch between classifiers. :param estimator: sklearn object - The classifier """ self.estimator = estimator + #self.feature = feature def fit(self, X, y=None, **kwargs): self.estimator.fit(X, y) + #self.feature.fit(X, y) return self + # def transform(self, X, y=None): + # #self.estimator.transform(X, y) + # self.feature.transform(X) + # return self + def predict(self, X, y=None): return self.estimator.predict(X) @@ -42,23 +49,26 @@ class ClfSwitcher(BaseEstimator): def score(self, X, y): return self.estimator.score(X, y) +#%% parameters = [ - # {'feature__fs__estimator': LogisticRegression(**rs) - # , 'feature__fs__cv': [10] - # , 'feature__fs__scoring': ['matthews_corrcoef'] - # }, + + # {'fs__feature__min_features_to_select': [1] + # , 'fs__feature__scoring': ['matthews_corrcoef'] + # , 'fs__feature__cv': [skf_cv]}, + + {'fs__min_features_to_select': [1] + #, 'fs__scoring': ['matthews_corrcoef'] + , 'fs__cv': [skf_cv]}, { - 'clf__estimator': [LogisticRegression(**rs)], - 'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], - #'clf__estimator__C': np.logspace(0, 4, 10), - 'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'], - 'clf__estimator__max_iter': list(range(100,800,100)), - 'clf__estimator__solver': ['saga'] + 'clf__estimator': [LogisticRegression(**rs)], + #'clf__estimator__C': np.logspace(0, 4, 10), + 'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'], + 'clf__estimator__max_iter': list(range(100,800,100)), + 'clf__estimator__solver': ['saga'] }#, # { # 'clf__estimator': [MODEL2(**rs)], - # #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], # 'clf__estimator__C': np.logspace(0, 4, 10), # 'clf__estimator__penalty': ['l2', 'none'], # 'clf__estimator__max_iter': list(range(100,800,100)), @@ -68,13 +78,14 @@ parameters = [ #%% Create pipeline pipeline = Pipeline([ ('pre', MinMaxScaler()) -# , ('fs', RFECV(LogisticRegression(**rs), cv = rskf_cv, scoring = 'matthews_corrcoef')) - , ('selector', SelectKBest(mutual_info_classif, k=6)) - , ('clf', ClfSwitcher()) -]) + , ('fs', RFECV(LogisticRegression(**rs), scoring = 'matthews_corrcoef'))#cant be my mcc_fn +# , ('fs', ClfSwitcher()) + , ('clf', ClfSwitcher()) + ]) -#%% Grid search i.e hyperparameter tuning and refitting on mcc -mod_fs = GridSearchCV(pipeline +#%% +# Grid search i.e hyperparameter tuning and refitting on mcc +gscv_lr = GridSearchCV(pipeline , parameters , scoring = mcc_score_fn, refit = 'mcc' , cv = skf_cv @@ -82,6 +93,66 @@ mod_fs = GridSearchCV(pipeline , return_train_score = False , verbose = 3) +# Fit +gscv_lr.fit(X, y) + +#### +gscv_lr_fit = gscv_lr.fit(X, y) +gscv_lr_fit_be_mod = gscv_lr_fit.best_params_ +gscv_lr_fit_be_res = gscv_lr_fit.cv_results_ + +#%% Grid search i.e hyperparameter tuning and refitting on mcc + +param_grid2 = [ + + {'fs__min_features_to_select': [1] + , 'fs__cv': [skf_cv] + }, + + + { + #'clf__estimator': [LogisticRegression(**rs)], + 'clf__C': np.logspace(0, 4, 10), + 'clf__penalty': ['l2'], + 'clf__max_iter': list(range(100,200,100)), + #'clf__solver': ['newton-cg', 'lbfgs', 'sag'] + 'clf__solver': ['sag'] + + }, + { + #'clf__estimator': [LogisticRegression(**rs)], + 'clf__C': np.logspace(0, 4, 10), + 'clf__penalty': ['l1', 'l2'], + 'clf__max_iter': list(range(100,200,100)), + 'clf__solver': ['liblinear'] + } + +] +# step 4: create pipeline +pipeline = Pipeline([ + ('pre', MinMaxScaler()) + #, ('fs', model_rfecv) + , ('fs', RFECV(LogisticRegression(**rs), scoring = 'matthews_corrcoef')) + , ('clf', LogisticRegression(**rs))]) + +# step 5: Perform Gridsearch CV +gs_final = GridSearchCV(pipeline + , param_grid2 + , cv = skf_cv + , scoring = mcc_score_fn, refit = 'mcc' + , verbose = 1 + , return_train_score = False + , **njobs) + + + + + + + + + + #%% Fit mod_fs_fit = mod_fs.fit(X, y) mod_fs_fbm = mod_fs_fit.best_params_ diff --git a/uq_ml_models/UQ_LR_FS.py b/uq_ml_models/UQ_LR_FS.py index 0346809..6910fab 100644 --- a/uq_ml_models/UQ_LR_FS.py +++ b/uq_ml_models/UQ_LR_FS.py @@ -12,85 +12,6 @@ Created on Tue Mar 15 11:09:50 2022 @author: tanu """ -#%% Import libs -rs = {'random_state': 42} -njobs = {'n_jobs': 10} - -scoring_fn = ({'accuracy' : make_scorer(accuracy_score) - , 'fscore' : make_scorer(f1_score) - , 'mcc' : make_scorer(matthews_corrcoef) - , 'precision' : make_scorer(precision_score) - , 'recall' : make_scorer(recall_score) - , 'roc_auc' : make_scorer(roc_auc_score) - , 'jaccard' : make_scorer(jaccard_score) - }) - -mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} -jacc_score_fn = {'jcc': make_scorer(jaccard_score)} -#%% Get data -y.to_frame().value_counts().plot(kind = 'bar') -blind_test_df['dst_mode'].to_frame().value_counts().plot(kind = 'bar') - -# %% Logistic Regression + FS + hyperparameter -# https://www.tomasbeuzen.com/post/scikit-learn-gridsearch-pipelines/ -# from sklearn.feature_selection import SelectKBest, mutual_info_classif - -# # Create pipeline -# pipe = Pipeline([ -# ('pre', MinMaxScaler()) -# , ('fs', RFECV( LogisticRegression(**rs), cv = skf_cv, scoring = 'matthews_corrcoef', **njobs,verbose = 3)) -# #, ('fs', SelectKBest(mutual_info_classif, k=5)) -# , ('clf', LogisticRegression(**rs)) -# ]) - -# # Create search space -# param_grid = [{'fs__step': [1]}, - -# { -# 'clf': [LogisticRegression(**rs)], -# #'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], -# 'clf__C': np.logspace(0, 4, 10), -# 'clf__penalty': ['none', 'l1', 'l2', 'elasticnet'], -# 'clf__max_iter': list(range(100,800,100)), -# 'clf__solver': ['saga'] -# }, -# { -# 'clf': [LogisticRegression(**rs)], -# #'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], -# 'clf__C': np.logspace(0, 4, 10), -# 'clf__penalty': ['l2', 'none'], -# 'clf__max_iter': list(range(100,800,100)), -# 'clf__solver': ['newton-cg', 'lbfgs', 'sag'] -# }, -# { -# 'clf': [LogisticRegression(**rs)], -# #'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], -# 'clf__C': np.logspace(0, 4, 10), -# 'clf__penalty': ['l1', 'l2'], -# 'clf__max_iter': list(range(100,800,100)), -# 'clf__solver': ['liblinear'] -# }] - -# # Run Grid search -# gscv_fs_lr = GridSearchCV(pipe -# , param_grid -# , cv = skf_cv -# , scoring = mcc_score_fn, refit = 'mcc' -# , verbose = 3) - -# gscv_fs_lr_fit = gscv_fs_lr.fit(X, y) -# gscv_fs_lr_fit_be_mod = gscv_fs_lr_fit.best_params_ -# gscv_fs_lr_fit_be_res = gscv_fs_lr_fit.cv_results_ - -# print('Best model:\n', gscv_fs_lr_fit_be_mod) -# print('Best models score:\n', gscv_fs_lr_fit.best_score_, ':' , round(gscv_fs_lr_fit.best_score_, 2)) - -# #print('\nMean test score from fit results:', round(mean(gscv_fs_lr_fit_be_res['mean_test_mcc']),2)) -# print('\nMean test score from fit results:', round(np.nanmean(gscv_fs_lr_fit_be_res['mean_test_mcc']),2)) - -############################################################################## -#MANUAL - #%% Logistic Regression + hyperparam + FS: BaseEstimator: ClfSwitcher() model_lr = LogisticRegression(**rs) model_rfecv = RFECV(estimator = model_lr @@ -99,15 +20,6 @@ model_rfecv = RFECV(estimator = model_lr , scoring = 'matthews_corrcoef' ) -# model_rfecv = SequentialFeatureSelector(estimator = model_lr -# , n_features_to_select = 'auto' -# , tol = None -# # , cv = 10 -# , cv = rskf_cv -# # , direction ='backward' -# , direction ='forward' -# , **njobs) - param_grid2 = [ { #'clf__estimator': [LogisticRegression(**rs)], @@ -155,14 +67,15 @@ pipeline = Pipeline([('pre', MinMaxScaler()) # Fit lr_fs_fit = pipeline.fit(X,y) -lr_fs_fit_be_mod = lr_fs_fit.best_params_ -lr_fs_fit_be_res = lr_fs_fit.cv_results_ +#lr_fs_fit_be_mod = lr_fs_fit.best_params_ +#lr_fs_fit_be_res = lr_fs_fit.cv_results_ +dir(lr_fs_fit) print('Best model:\n', lr_fs_fit_be_mod) print('Best models score:\n', lr_fs_fit.best_score_, ':' , round(lr_fs_fit.best_score_, 2)) pipeline.predict(X_bts) -lr_fs.predict(X_bts) +lr_fs_fit.predict(X_bts) test_predict = pipeline.predict(X_bts) print(test_predict) @@ -238,4 +151,4 @@ lr_df #FIXME: tidy the index of the formatted df -############################################################################### \ No newline at end of file +###############################################################################