#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 11:09:50 2022
Updated on Mon May 16 05:59:12 2022

@author: tanu

Logistic regression with hyperparameter search and RFECV feature selection.

Two arrangements are exercised:

1. GridSearchCV as the final step of a Pipeline
   (MinMaxScaler -> RFECV -> GridSearchCV(LogisticRegression)).
2. A Pipeline (MinMaxScaler -> RFECV -> LogisticRegression) as the estimator
   of a GridSearchCV ("THIS ONE" below).

This script defines none of the following names and does not import them:
np, LogisticRegression, RFECV, GridSearchCV, Pipeline, MinMaxScaler,
X, y, rs, njobs, skf_cv, mcc_score_fn.
NOTE(review): presumably they are provided by the interactive session or a
setup script run beforehand -- confirm before running this file standalone.
"""
#%% Logistic Regression + hyperparam + FS: BaseEstimator: ClfSwitcher()
model_lr = LogisticRegression(**rs)
model_rfecv = RFECV(estimator = model_lr
                    , cv = skf_cv
                    #, cv = 10
                    , min_features_to_select = 1 # default
                    , scoring = 'matthews_corrcoef'
                    )

# One dict per solver family: each dict is searched as its own grid, so a
# solver is only ever paired with the penalties listed next to it.
# NOTE(review): 'elasticnet' needs an l1_ratio on LogisticRegression and none
# is set in this grid; unless `rs` supplies one, those candidates are expected
# to fail to fit -- confirm.
# NOTE(review): with penalty 'none' the C values are not used, so those
# candidates repeat the same fit -- confirm this is intended.
param_grid2 = [
    {
        #'clf__estimator': [LogisticRegression(**rs)],
        #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'C': np.logspace(0, 4, 10),
        'penalty': ['none', 'l1', 'l2', 'elasticnet'],
        'max_iter': list(range(100,800,100)),
        'solver': ['saga']
    },
    {
        #'clf__estimator': [LogisticRegression(**rs)],
        #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'C': np.logspace(0, 4, 10),
        'penalty': ['l2', 'none'],
        'max_iter': list(range(100,800,100)),
        'solver': ['newton-cg', 'lbfgs', 'sag']
    },
    {
        #'clf__estimator': [LogisticRegression(**rs)],
        #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'C': np.logspace(0, 4, 10),
        'penalty': ['l1', 'l2'],
        'max_iter': list(range(100,800,100)),
        'solver': ['liblinear']
    }

]
#-------------------------------------------------------------------------------
# Grid search CV + FS
gscv_lr = GridSearchCV(estimator = model_lr
                       , param_grid = param_grid2
                       , scoring = mcc_score_fn, refit = 'mcc'
                       , cv = skf_cv
                       , return_train_score = False
                       , verbose = 3
                       , **njobs)

#------------------------------------------------------------------------------
################
# NOTE: GS is going into pipeline,
# Cannot get BEST model out
################
# Create pipeline
pipeline = Pipeline([('pre', MinMaxScaler())
                     #, ('fs', sfs_selector)
                     , ('fs', model_rfecv )
                     , ('clf', gscv_lr)])

# Fit # dont assign fit
#lr_fs_fit = pipeline.fit(X,y)
pipeline.fit(X,y)

# A Pipeline has no best_params_ of its own; the fitted search object is the
# 'clf' step, so read the result from there.
pipeline.named_steps['clf'].best_params_

#https://github.com/scikit-learn/scikit-learn/issues/7536
# Number of features the tuned classifier was fitted on, i.e. what is left
# after the 'fs' step.
n_fs = gscv_lr.best_estimator_.n_features_in_
n_fs

# X.columns is indexed with the RFECV support mask to recover feature names.
sel_features = X.columns[pipeline.named_steps['fs'].get_support()]
print('\nNo. of features selected with RFECV for model'
      , pipeline.named_steps['clf'].estimator
      , ':', n_fs
      , '\nThese are:', sel_features
      )
##############################################################
# THIS ONE
#########
# Make Pipeline go into GS with FS
#########

# step 1: specify model
#modLR = LogisticRegression(**rs)

# step 2: specify fs
#model_rfecv = RFECV(estimator = model_lr
#                    , cv = skf_cv
#                    , min_features_to_select = 1 # default
#                    , scoring = 'matthews_corrcoef'
#                    )

# step 3: specify param grid as dict
# Keys use the '<step>__<param>' form to reach into the pipeline steps.
# NOTE(review): each dict in the list is explored as a separate grid, so the
# 'fs__*' settings (including fs__cv = skf_cv) apply only to the first grid,
# where 'clf' keeps its constructor settings; the 'clf__*' grids run with the
# RFECV exactly as constructed in step 4. Confirm this split is intended
# rather than a joint fs x clf search.
param_grid2 = [

    {'fs__min_features_to_select': [1]
     , 'fs__cv': [skf_cv]
     #, 'fs__scoring': ['matthews_corrcoef']},
     #, 'fs__scoring': [mcc_score_fn]}
     },


    {
        #'clf__estimator': [LogisticRegression(**rs)],
        'clf__C': np.logspace(0, 4, 10),
        'clf__penalty': ['l2'],
        'clf__max_iter': list(range(100,200,100)),
        #'clf__solver': ['newton-cg', 'lbfgs', 'sag']
        'clf__solver': ['sag']

    },
    {
        #'clf__estimator': [LogisticRegression(**rs)],
        'clf__C': np.logspace(0, 4, 10),
        'clf__penalty': ['l1', 'l2'],
        'clf__max_iter': list(range(100,200,100)),
        'clf__solver': ['liblinear']
    }

]
# step 4: create pipeline
pipeline = Pipeline([
    ('pre', MinMaxScaler())
    #, ('fs', model_rfecv)
    , ('fs', RFECV(LogisticRegression(**rs), scoring = 'matthews_corrcoef'))
    , ('clf', LogisticRegression(**rs))])

# step 5: Perform Gridsearch CV
gs_final = GridSearchCV(pipeline
                        , param_grid2
                        , cv = skf_cv
                        , scoring = mcc_score_fn, refit = 'mcc'
                        , verbose = 1
                        , return_train_score = False
                        , **njobs)

# Fit once and keep the handle: fit() returns the fitted search object, so
# gsfit and gs_final refer to the same thing and a second fit would only
# repeat the whole search.
gsfit = gs_final.fit(X,y)
#gsfit.best_estimator_
gsfit.best_params_
gsfit.best_score_

# Now get the features out
all_features = gs_final.feature_names_in_
#all_features = gsfit.feature_names_in_

# The refitted best pipeline carries the fitted RFECV in its 'fs' step.
sel_features = X.columns[gs_final.best_estimator_.named_steps['fs'].get_support()]
n_sf = gs_final.best_estimator_.named_steps['fs'].n_features_


# get model name
model_name = gs_final.best_estimator_.named_steps['clf']
b_model_params = gs_final.best_params_

print('\n========================================'
      , '\nRunning model:'
      , '\nModel name:', model_name
      , '\n==============================================='
      , '\nRunning feature selection with RFECV for model'
      , '\nTotal no. of features in model:', len(all_features)
      , '\nThese are:\n', all_features, '\n\n'
      , '\nNo of features for best model: ', n_sf
      , '\nThese are:', sel_features, '\n\n'
      , '\nBest Model hyperparams:', b_model_params

      )