diff --git a/UQ_FS_eg.py b/UQ_FS_eg.py
new file mode 100644
index 0000000..80fa6a3
--- /dev/null
+++ b/UQ_FS_eg.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sat May 21 02:52:36 2022
+
+@author: tanu
+
+Example: grid search over a scaler -> SelectKBest -> classifier pipeline on
+synthetic data, followed by inspection of the best pipeline found.
+"""
+# https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
+import pandas as pd
+from sklearn.pipeline import Pipeline
+from sklearn.datasets import make_classification
+from sklearn.preprocessing import StandardScaler
+from sklearn.model_selection import GridSearchCV
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.feature_selection import SelectKBest, mutual_info_classif
+#pd.options.plotting.backend = "plotly"
+
+# Synthetic binary classification data: 1000 samples x 30 features, of which
+# 5 are informative and 5 redundant. random_state fixes the generated data.
+X_eg, y_eg = make_classification(n_samples=1000,
+                                 n_features=30,
+                                 n_informative=5,
+                                 n_redundant=5,
+                                 n_classes=2,
+                                 random_state=123)
+
+# The step names are the prefixes of the '<step>__<param>' keys used below.
+pipe = Pipeline([('scaler', StandardScaler()),
+                 ('selector', SelectKBest(mutual_info_classif, k=9)),
+                 ('classifier', LogisticRegression())])
+
+# Each dict is searched as a separate grid. A step or parameter that a dict
+# does not mention keeps the value given in `pipe`: the first grid only varies
+# k with the default LogisticRegression, the other three swap the classifier
+# and leave k at 9.
+search_space = [{'selector__k': [5, 6, 7, 10]},
+                {'classifier': [LogisticRegression()],
+                 'classifier__C': [0.01, 1.0],
+                 'classifier__solver': ['saga', 'lbfgs']},
+                {'classifier': [RandomForestClassifier(n_estimators=100)],
+                 'classifier__max_depth': [5, 10, None]},
+                {'classifier': [KNeighborsClassifier()],
+                 'classifier__n_neighbors': [3, 7, 11],
+                 'classifier__weights': ['uniform', 'distance']}]
+
+# 10-fold cross-validated search; the default refit=True retrains the best
+# candidate on all of X_eg/y_eg and exposes it as best_estimator_.
+clf = GridSearchCV(pipe, search_space, cv=10, verbose=0)
+
+# fit() returns the search object, so clf2 is the fitted `clf`.
+clf2 = clf.fit(X_eg, y_eg)
+print('best params:', clf2.best_params_)
+
+# Feature selection of the winning pipeline. n_features_in_ is the number of
+# features the selector was given; the kept features come from get_support().
+# (GridSearchCV has no get_feature_names(), so calling it raised
+# AttributeError and stopped the script here.)
+best_selector = clf2.best_estimator_.named_steps['selector']
+print('n features in:', best_selector.n_features_in_)
+print('n features selected:', int(best_selector.get_support().sum()))
+print('selected feature indices:', best_selector.get_support(indices=True))
+
+# Final step of the winning pipeline, via the public named_steps accessor.
+# get_params() works for every candidate classifier, whereas .C and .solver
+# exist only when LogisticRegression wins.
+clf3 = clf2.best_estimator_ #
+best_classifier = clf3.named_steps['classifier']
+print('best classifier:', best_classifier)
+print('best classifier params:', best_classifier.get_params())
+
+
+fs_bmod = clf2.best_estimator_
+print('\nbest model with feature selection:', fs_bmod)
+
+
diff --git a/UQ_LR_FS_p2.py b/UQ_LR_FS_p2.py
new file mode 100644
index 0000000..05ea68a
--- /dev/null
+++ b/UQ_LR_FS_p2.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon May 16 05:59:12 2022
+
+@author: tanu
+"""
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Mar 15 11:09:50 2022
+
+@author: tanu
+"""
+#%% Import libraries, data, and scoring func: UQ_pnca_ML.py
+# NOTE(review): nothing is imported or loaded in this file. BaseEstimator,
+# SGDClassifier, LogisticRegression, Pipeline, MinMaxScaler, SelectKBest,
+# mutual_info_classif, GridSearchCV, np, matthews_corrcoef, accuracy_score,
+# mcc_score_fn, skf_cv, X, y, X_bts and y_bts must already be in scope --
+# presumably from running UQ_pnca_ML.py first in the same session (confirm).
+rs = {'random_state': 42}
+njobs = {'n_jobs': 10}
+#%% Logistic Regression + hyperparam: BaseEstimator: ClfSwitcher()
+
+class ClfSwitcher(BaseEstimator):
+    """Pipeline step that delegates to a swappable wrapped classifier.
+
+    The classifier is a constructor parameter, so a grid search can replace
+    it ('clf__estimator') and tune it ('clf__estimator__<param>').
+    """
+
+    def __init__(
+            self,
+            estimator = SGDClassifier(),
+            #feature = RFECV()
+            ):
+        """
+        A Custom BaseEstimator that can switch between classifiers.
+        :param estimator: sklearn object - The classifier
+        """
+        # NOTE: the default SGDClassifier() is built once, when the class is
+        # defined, and shared by every ClfSwitcher() created without an
+        # argument; fit() trains that object in place.
+        self.estimator = estimator
+
+    def fit(self, X, y=None, **kwargs):
+        """Fit the wrapped classifier on X, y and return self.
+
+        Extra keyword arguments are forwarded to the wrapped classifier's
+        fit() instead of being silently dropped.
+        """
+        self.estimator.fit(X, y, **kwargs)
+        return self
+
+    def predict(self, X, y=None):
+        """Return the wrapped classifier's predictions for X (y is unused)."""
+        return self.estimator.predict(X)
+
+    def predict_proba(self, X):
+        """Return the wrapped classifier's class probabilities for X."""
+        return self.estimator.predict_proba(X)
+
+    def score(self, X, y):
+        """Return the wrapped classifier's own score on X, y."""
+        return self.estimator.score(X, y)
+
+# Each dict is searched as a separate grid. 'elasticnet' gets its own grid
+# because LogisticRegression cannot fit that penalty without l1_ratio; when
+# it shared a grid with the other penalties, every elasticnet candidate
+# failed to fit.
+# NOTE(review): newer scikit-learn releases spell the 'none' penalty as None;
+# check this against the installed version.
+parameters = [
+    # {'feature__fs__estimator': LogisticRegression(**rs)
+    # , 'feature__fs__cv': [10]
+    # , 'feature__fs__scoring': ['matthews_corrcoef']
+    # },
+
+    {
+        'clf__estimator': [LogisticRegression(**rs)],
+        'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
+        #'clf__estimator__C': np.logspace(0, 4, 10),
+        'clf__estimator__penalty': ['none', 'l1', 'l2'],
+        'clf__estimator__max_iter': list(range(100,800,100)),
+        'clf__estimator__solver': ['saga']
+    },
+    {
+        'clf__estimator': [LogisticRegression(**rs)],
+        'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
+        'clf__estimator__penalty': ['elasticnet'],
+        # One mixing value keeps the number of elasticnet candidates the same.
+        'clf__estimator__l1_ratio': [0.5],
+        'clf__estimator__max_iter': list(range(100,800,100)),
+        'clf__estimator__solver': ['saga']
+    }#,
+    # {
+    #     'clf__estimator': [MODEL2(**rs)],
+    #     #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
+    #     'clf__estimator__C': np.logspace(0, 4, 10),
+    #     'clf__estimator__penalty': ['l2', 'none'],
+    #     'clf__estimator__max_iter': list(range(100,800,100)),
+    #     'clf__estimator__solver': ['newton-cg', 'lbfgs', 'sag']
+    # },
+]
+#%% Create pipeline
+# Min-max scaling -> 6 best features by mutual information -> switchable classifier.
+pipeline = Pipeline([
+    ('pre', MinMaxScaler())
+#    , ('fs', RFECV(LogisticRegression(**rs), cv = rskf_cv, scoring = 'matthews_corrcoef'))
+    , ('selector', SelectKBest(mutual_info_classif, k=6))
+    , ('clf', ClfSwitcher())
+])
+
+#%% Grid search i.e hyperparameter tuning and refitting on mcc
+# NOTE(review): refit = 'mcc' and the 'mean_test_mcc' key read further down
+# imply that mcc_score_fn is a dict of scorers with an 'mcc' entry -- confirm.
+mod_fs = GridSearchCV(pipeline
+                      , parameters
+                      , scoring = mcc_score_fn, refit = 'mcc'
+                      , cv = skf_cv
+                      , **njobs
+                      , return_train_score = False
+                      , verbose = 3)
+
+#%% Fit
+mod_fs_fit = mod_fs.fit(X, y)
+mod_fs_fbm = mod_fs_fit.best_params_
+mod_fs_fbmr = mod_fs_fit.cv_results_
+mod_fs_fbs = mod_fs_fit.best_score_
+print('Best model:\n', mod_fs_fbm)
+print('Best models score:\n', mod_fs_fbs, ':' , round(mod_fs_fbs, 2))
+
+#print('\nMean test score from fit results:', round(mean(mod_fs_fbmr['mean_test_mcc']),2))
+# nanmean leaves NaN entries out of the average.
+print('\nMean test score from fit results:', round(np.nanmean(mod_fs_fbmr['mean_test_mcc']),2))
+
+###############################################################################
+#%% Blind test
+######################################
+# Blind test
+######################################
+# Predict the held-out X_bts with the refitted best pipeline.
+test_predict = mod_fs_fit.predict(X_bts)
+print(test_predict)
+print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, test_predict),2))
+print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, test_predict),2))