diff --git a/UQ_FS_eg.py b/UQ_FS_eg.py index 1c590ba..49ae5c9 100644 --- a/UQ_FS_eg.py +++ b/UQ_FS_eg.py @@ -49,8 +49,7 @@ clf2.best_estimator_.named_steps['selector'].n_features_in_ clf2.best_estimator_ #n of best features clf2.best_params_ clf2.best_estimator_.get_params -clf2.get_feature_names() - +clf2.get_feature_names( clf3 = clf2.best_estimator_ # @@ -62,4 +61,37 @@ clf3._final_estimator.solver fs_bmod = clf2.best_estimator_ print('\nbest model with feature selection:', fs_bmod) +######################################################### +# my data + +pipe = Pipeline([ + ('pre', MinMaxScaler()) + ('selector', RFECV(LogisticRegression(**rs), cv = skf_cv, scoring = 'matthews_corrcoef')) + , ('classifier', LogisticRegression(**rs))]) + +search_space = [{'selector__min_features_to_select': [1,2]}, + {'classifier': [LogisticRegression()], + #'classifier__C': np.logspace(0, 4, 10), + 'classifier__C': [2, 2.8], + 'classifier__max_iter': [100], + 'classifier__penalty': ['l1', 'l2'], + 'classifier__solver': ['saga'] + }] #, + #{'classifier': [RandomForestClassifier(n_estimators=100)], + # 'classifier__max_depth': [5, 10, None]}, + #{'classifier': [KNeighborsClassifier()], + # 'classifier__n_neighbors': [3, 7, 11], + # 'classifier__weights': ['uniform', 'distance'] + #}] + +clf = GridSearchCV(pipe, search_space, cv=skf_cv, scoring = mcc_score_fn, refit = 'mcc', verbose=0) + +clf.fit(X, y) +clf.best_params_ +clf.best_score_ + +tp = clf.predict(X_bts) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2)) + diff --git a/UQ_LR_FS_p1.py b/UQ_LR_FS_p1.py index 4c444df..e02a64f 100644 --- a/UQ_LR_FS_p1.py +++ b/UQ_LR_FS_p1.py @@ -12,60 +12,20 @@ Created on Tue Mar 15 11:09:50 2022 @author: tanu """ -#%% Import libs -import numpy as np -import pandas as pd -from sklearn.model_selection import GridSearchCV -from sklearn import datasets -from sklearn.ensemble import ExtraTreesClassifier 
-from sklearn.ensemble import RandomForestClassifier -from sklearn.ensemble import AdaBoostClassifier -from sklearn.ensemble import GradientBoostingClassifier -from sklearn.svm import SVC -from sklearn.base import BaseEstimator -from sklearn.naive_bayes import MultinomialNB -from sklearn.linear_model import SGDClassifier -from sklearn.pipeline import Pipeline -from sklearn.model_selection import GridSearchCV -from sklearn.linear_model import LogisticRegression -from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder -from xgboost import XGBClassifier -##################### -from sklearn.feature_selection import RFE -from sklearn.feature_selection import RFECV -from sklearn.linear_model import LogisticRegression -from sklearn.feature_selection import SelectFromModel -from sklearn.feature_selection import SequentialFeatureSelector +# Attempting feature selection for LR WITHOUT ClfSwitcher Class +#%% Import libraries, data, and scoring func: UQ_pnca_ML.py rs = {'random_state': 42} njobs = {'n_jobs': 10} -#%% - -y.to_frame().value_counts().plot(kind = 'bar') -blind_test_df['dst_mode'].to_frame().value_counts().plot(kind = 'bar') - -scoring_fn = ({'accuracy' : make_scorer(accuracy_score) - , 'fscore' : make_scorer(f1_score) - , 'mcc' : make_scorer(matthews_corrcoef) - , 'precision' : make_scorer(precision_score) - , 'recall' : make_scorer(recall_score) - , 'roc_auc' : make_scorer(roc_auc_score) - , 'jaccard' : make_scorer(jaccard_score) - }) - -mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} -jacc_score_fn = {'jcc': make_scorer(jaccard_score)} - -#%% Logistic Regression + hyperparam + FS: BaseEstimator: ClfSwitcher() +#%% Logistic Regression + hyperparam + FS: Pipeline takes GridSearchCV (not the other way round!) 
model_lr = LogisticRegression(**rs) model_rfecv = RFECV(estimator = model_lr - , cv = rskf_cv + , cv = skf_cv #, cv = 10 , scoring = 'matthews_corrcoef' ) - -# model_rfecv = SequentialFeatureSelector(estimator = model_lr +# model_sfs = SequentialFeatureSelector(estimator = model_lr # , n_features_to_select = 'auto' # , tol = None # # , cv = 10 @@ -74,23 +34,9 @@ model_rfecv = RFECV(estimator = model_lr # , direction ='forward' # , **njobs) -# param_grid = [ -# { 'C': np.logspace(0, 4, 10), -# 'penalty': ['l1', 'l2'], -# 'max_iter': [100], -# 'solver': ['saga'] -# }#, -# # { 'C': [1], -# # 'penalty': ['l1'], -# # 'max_iter': [100], -# # 'solver': ['saga'] -# # } -# ] - param_grid2 = [ { #'clf__estimator': [LogisticRegression(**rs)], - #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'C': np.logspace(0, 4, 10), 'penalty': ['none', 'l1', 'l2', 'elasticnet'], 'max_iter': list(range(100,800,100)), @@ -98,7 +44,6 @@ param_grid2 = [ }, { #'clf__estimator': [LogisticRegression(**rs)], - #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'C': np.logspace(0, 4, 10), 'penalty': ['l2', 'none'], 'max_iter': list(range(100,800,100)), @@ -106,13 +51,24 @@ param_grid2 = [ }, { #'clf__estimator': [LogisticRegression(**rs)], - #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'C': np.logspace(0, 4, 10), 'penalty': ['l1', 'l2'], 'max_iter': list(range(100,800,100)), 'solver': ['liblinear'] } - + +# lesser params for testing + # { 'C': np.logspace(0, 4, 10), + # 'penalty': ['l1', 'l2'], + # 'max_iter': [100], + # 'solver': ['saga'] + # }, + # { 'C': [1], + # 'penalty': ['l1'], + # 'max_iter': [100], + # 'solver': ['saga'] + # } + ] #------------------------------------------------------------------------------- @@ -127,24 +83,21 @@ gscv_lr = GridSearchCV(model_lr #------------------------------------------------------------------------------ # Create pipeline -pipeline = Pipeline([('pre', MinMaxScaler()) +pipeline2 = Pipeline([('pre', MinMaxScaler()) #, ('feature_selection', sfs_selector) , 
('feature_selection', model_rfecv ) , ('clf', gscv_lr)]) # Fit -lr_fs = pipeline.fit(X,y) +pipeline2.fit(X,y) +pipeline2.predict(X_bts) + +# Assigning fit an then running predict: sanity check +#lr_fs = pipeline.fit(X,y) +#lr_fs.predict(X_bts) -pipeline.predict(X_bts) -lr_fs.predict(X_bts) -test_predict = pipeline.predict(X_bts) -print(test_predict) -print(np.array(y_bts)) -#y_btsf = np.array(y_bts) -print(accuracy_score(y_bts, test_predict)) -print(matthews_corrcoef(y_bts, test_predict)) ############################################################################### ##################### @@ -160,13 +113,12 @@ print(matthews_corrcoef(y_bts, test_predict)) #print('\nBlind test score, mcc:', )) #test_predict = gscv_lr_fit.predict(X_bts) -test_predict = pipeline.predict(X_bts) -test_predict_fs = sfs_selector.predict(X_bts) +test_predict = pipeline2.predict(X_bts) print(test_predict) -print(accuracy_score(y_bts, test_predict)) -print(matthews_corrcoef(y_bts, test_predict)) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, test_predict),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, test_predict),2)) # create a dict with all scores lr_bts_dict = {#'best_model': list(gscv_lr_fit_be_mod.items()) @@ -237,7 +189,7 @@ from sklearn.feature_selection import SequentialFeatureSelector # RFE: ~ model coef or feature_importance rfe_selector = RFECV(estimator = LogisticRegression(**rs - , penalty='l1' + , penalty='l2' , solver='saga' , max_iter = 100 , C= 1.0) @@ -249,6 +201,30 @@ rfe_fs = X.columns[rfe_selector.get_support()] print('\nFeatures selected from Recursive Feature Elimination:', len(rfe_fs) , '\nThese are:', rfe_fs) +# blind test +TEST_PREDICT = rfe_selector.predict(X_bts) +TEST_PREDICT + +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, TEST_PREDICT),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, TEST_PREDICT),2)) + +# add pipeline with preprocessing: changes numbers +pipe = Pipeline([ + ('pre', 
MinMaxScaler()) + #, ('fs', model_rfecv) + , ('fs', rfe_selector) + , ('clf', LogisticRegression(**rs))]) + +pipe.fit(X,y) + +tp = pipe.predict(X_bts) + +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2)) + +################################## + + # SFM: ~ model coef or feature_importance sfm_selector = SelectFromModel(estimator = LogisticRegression(**rs , penalty='l1' diff --git a/UQ_LR_FS_p2.py b/UQ_LR_FS_p2.py index f399795..fb5432c 100644 --- a/UQ_LR_FS_p2.py +++ b/UQ_LR_FS_p2.py @@ -12,6 +12,8 @@ Created on Tue Mar 15 11:09:50 2022 @author: tanu """ + +# similar to _p1 but with Clf_Switcher #%% Import libraries, data, and scoring func: UQ_pnca_ML.py rs = {'random_state': 42} njobs = {'n_jobs': 10} @@ -21,25 +23,17 @@ class ClfSwitcher(BaseEstimator): def __init__( self, estimator = SGDClassifier(), - #feature = RFECV(SGDClassifier()) ): """ A Custom BaseEstimator that can switch between classifiers. 
:param estimator: sklearn object - The classifier """ self.estimator = estimator - #self.feature = feature def fit(self, X, y=None, **kwargs): self.estimator.fit(X, y) - #self.feature.fit(X, y) return self - - # def transform(self, X, y=None): - # #self.estimator.transform(X, y) - # self.feature.transform(X) - # return self - + def predict(self, X, y=None): return self.estimator.predict(X) @@ -52,35 +46,49 @@ class ClfSwitcher(BaseEstimator): #%% parameters = [ - # {'fs__feature__min_features_to_select': [1] - # , 'fs__feature__scoring': ['matthews_corrcoef'] - # , 'fs__feature__cv': [skf_cv]}, - {'fs__min_features_to_select': [1] #, 'fs__scoring': ['matthews_corrcoef'] , 'fs__cv': [skf_cv]}, - { - 'clf__estimator': [LogisticRegression(**rs)], - #'clf__estimator__C': np.logspace(0, 4, 10), - 'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'], - 'clf__estimator__max_iter': list(range(100,800,100)), - 'clf__estimator__solver': ['saga'] - }#, # { - # 'clf__estimator': [MODEL2(**rs)], + # 'clf__estimator': [LogisticRegression(**rs)], + # 'clf__estimator__C': np.logspace(0, 4, 10), + # 'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'], + # 'clf__estimator__max_iter': list(range(100,800,100)), + # 'clf__estimator__solver': ['saga'] + # }, + # { + # 'clf__estimator': [LogisticRegression(**rs)], # 'clf__estimator__C': np.logspace(0, 4, 10), # 'clf__estimator__penalty': ['l2', 'none'], # 'clf__estimator__max_iter': list(range(100,800,100)), # 'clf__estimator__solver': ['newton-cg', 'lbfgs', 'sag'] # }, + # { + # 'clf__estimator': [LogisticRegression(**rs)], + # 'clf__estimator__C': np.logspace(0, 4, 10), + # 'clf__estimator__penalty': ['l1', 'l2'], + # 'clf__estimator__max_iter': list(range(100,800,100)), + # 'clf__estimator__solver': ['liblinear'] + # } + + {'fs__min_features_to_select': [1,2]}, + {'classifier': [LogisticRegression()], + #'classifier__C': np.logspace(0, 4, 10), + 'classifier__C': [2, 2.8], + 'classifier__max_iter': [100], + 
'classifier__penalty': ['l1', 'l2'], + 'classifier__solver': ['saga'] + + } ] #%% Create pipeline pipeline = Pipeline([ - ('pre', MinMaxScaler()) - , ('fs', RFECV(LogisticRegression(**rs), scoring = 'matthews_corrcoef'))#cant be my mcc_fn -# , ('fs', ClfSwitcher()) - , ('clf', ClfSwitcher()) + # ('pre', MinMaxScaler()) + ('fs', RFECV(LogisticRegression(**rs), scoring = 'matthews_corrcoef'))#cant be my mcc_fn + #, ('clf', ClfSwitcher()) + , ('classifier', ClfSwitcher()) + ]) #%% @@ -95,81 +103,66 @@ gscv_lr = GridSearchCV(pipeline # Fit gscv_lr.fit(X, y) +gscv_lr.best_estimator_ +gscv_lr.best_params_ +gscv_lr.best_score_ + +# Blind test +test_predict = gscv_lr.predict(X_bts) +print(test_predict) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, test_predict),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, test_predict),2)) + + #### gscv_lr_fit = gscv_lr.fit(X, y) gscv_lr_fit_be_mod = gscv_lr_fit.best_params_ gscv_lr_fit_be_res = gscv_lr_fit.cv_results_ +gscv_lr_fit.best_score_ -#%% Grid search i.e hyperparameter tuning and refitting on mcc - -param_grid2 = [ - - {'fs__min_features_to_select': [1] - , 'fs__cv': [skf_cv] - }, - - - { - #'clf__estimator': [LogisticRegression(**rs)], - 'clf__C': np.logspace(0, 4, 10), - 'clf__penalty': ['l2'], - 'clf__max_iter': list(range(100,200,100)), - #'clf__solver': ['newton-cg', 'lbfgs', 'sag'] - 'clf__solver': ['sag'] - - }, - { - #'clf__estimator': [LogisticRegression(**rs)], - 'clf__C': np.logspace(0, 4, 10), - 'clf__penalty': ['l1', 'l2'], - 'clf__max_iter': list(range(100,200,100)), - 'clf__solver': ['liblinear'] - } - -] -# step 4: create pipeline -pipeline = Pipeline([ - ('pre', MinMaxScaler()) - #, ('fs', model_rfecv) - , ('fs', RFECV(LogisticRegression(**rs), scoring = 'matthews_corrcoef')) - , ('clf', LogisticRegression(**rs))]) - -# step 5: Perform Gridsearch CV -gs_final = GridSearchCV(pipeline - , param_grid2 - , cv = skf_cv - , scoring = mcc_score_fn, refit = 'mcc' - , verbose = 1 - 
, return_train_score = False - , **njobs) - - - - - - - - - - -#%% Fit -mod_fs_fit = mod_fs.fit(X, y) -mod_fs_fbm = mod_fs_fit.best_params_ -mod_fs_fbmr = mod_fs_fit.cv_results_ -mod_fs_fbs = mod_fs_fit.best_score_ -print('Best model:\n', mod_fs_fbm) -print('Best models score:\n', mod_fs_fbs, ':' , round(mod_fs_fbs, 2)) +print('Best model:\n', gscv_lr_fit_be_mod) +print('Best models score:\n', gscv_lr_fit.best_score_, ':' + , round(gscv_lr_fit.best_score_, 2)) #print('\nMean test score from fit results:', round(mean(mod_fs_fbmr['mean_test_mcc']),2)) -print('\nMean test score from fit results:', round(np.nanmean(mod_fs_fbmr['mean_test_mcc']),2)) +print('\nMean test score from fit results:' + , round(np.nanmean(gscv_lr_fit_be_res['mean_test_mcc']),2)) + +#%% print selected features +# Now get the features out +all_features = gscv_lr.feature_names_in_ +#all_features = gsfit.feature_names_in_ + +sel_features = X.columns[gscv_lr.best_estimator_.named_steps['fs'].get_support()] +n_sf = gscv_lr.best_estimator_.named_steps['fs'].n_features_ + +# get model name +model_name = gscv_lr.best_estimator_.named_steps['clf'] +b_model_params = gscv_lr.best_params_ + +print('\n========================================' + , '\nRunning model:' + , '\nModel name:', model_name + , '\n===============================================' + , '\nRunning feature selection with RFECV for model' + , '\nTotal no. 
of features in model:', len(all_features) + , '\nThese are:\n', all_features, '\n\n' + , '\nNo of features for best model: ', n_sf + , '\nThese are:', sel_features, '\n\n' + , '\nBest Model hyperparams:', b_model_params + + ) + + + ############################################################################### #%% Blind test ###################################### # Blind test ###################################### -test_predict = mod_fs_fit.predict(X_bts) +test_predict = gscv_lr.predict(X_bts) print(test_predict) print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, test_predict),2)) print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, test_predict),2)) diff --git a/uq_ml_models/UQ_LR_FS2.py b/uq_ml_models/UQ_LR_FS2.py index 6719ae8..91d28a5 100644 --- a/uq_ml_models/UQ_LR_FS2.py +++ b/uq_ml_models/UQ_LR_FS2.py @@ -13,50 +13,50 @@ Created on Tue Mar 15 11:09:50 2022 @author: tanu """ #%% Logistic Regression + hyperparam + FS: BaseEstimator: ClfSwitcher() -model_lr = LogisticRegression(**rs) -model_rfecv = RFECV(estimator = model_lr - , cv = skf_cv - #, cv = 10 - , min_features_to_select = 1 # default - , scoring = 'matthews_corrcoef' - ) +# model_lr = LogisticRegression(**rs) +# model_rfecv = RFECV(estimator = model_lr +# , cv = skf_cv +# #, cv = 10 +# , min_features_to_select = 1 # default +# , scoring = 'matthews_corrcoef' +# ) -param_grid2 = [ - { - #'clf__estimator': [LogisticRegression(**rs)], - #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], - 'C': np.logspace(0, 4, 10), - 'penalty': ['none', 'l1', 'l2', 'elasticnet'], - 'max_iter': list(range(100,800,100)), - 'solver': ['saga'] - }, - { - #'clf__estimator': [LogisticRegression(**rs)], - #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], - 'C': np.logspace(0, 4, 10), - 'penalty': ['l2', 'none'], - 'max_iter': list(range(100,800,100)), - 'solver': ['newton-cg', 'lbfgs', 'sag'] - }, - { - #'clf__estimator': [LogisticRegression(**rs)], - #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], - 'C': np.logspace(0, 4, 
10), - 'penalty': ['l1', 'l2'], - 'max_iter': list(range(100,800,100)), - 'solver': ['liblinear'] - } +# param_grid2 = [ +# { +# #'clf': [LogisticRegression(**rs)], +# #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], +# 'C': np.logspace(0, 4, 10), +# 'penalty': ['none', 'l1', 'l2', 'elasticnet'], +# 'max_iter': list(range(100,800,100)), +# 'solver': ['saga'] +# }, +# { +# #'clf': [LogisticRegression(**rs)], +# #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], +# 'C': np.logspace(0, 4, 10), +# 'penalty': ['l2', 'none'], +# 'max_iter': list(range(100,800,100)), +# 'solver': ['newton-cg', 'lbfgs', 'sag'] +# }, +# { +# #'clf': [LogisticRegression(**rs)], +# #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], +# 'C': np.logspace(0, 4, 10), +# 'penalty': ['l1', 'l2'], +# 'max_iter': list(range(100,800,100)), +# 'solver': ['liblinear'] +# } -] -#------------------------------------------------------------------------------- -# Grid search CV + FS -gscv_lr = GridSearchCV(estimator = model_lr - , param_grid = param_grid2 - , scoring = mcc_score_fn, refit = 'mcc' - , cv = skf_cv - , return_train_score = False - , verbose = 3 - , **njobs) +# ] +# #------------------------------------------------------------------------------- +# # Grid search CV + FS +# gscv_lr = GridSearchCV(estimator = model_lr +# , param_grid = param_grid2 +# , scoring = mcc_score_fn, refit = 'mcc' +# , cv = skf_cv +# , return_train_score = False +# , verbose = 3 +# , **njobs) #------------------------------------------------------------------------------ ################ @@ -64,27 +64,27 @@ gscv_lr = GridSearchCV(estimator = model_lr # Cannot get BEST model out ################ # Create pipeline -pipeline = Pipeline([('pre', MinMaxScaler()) - #, ('fs', sfs_selector) - , ('fs', model_rfecv ) - , ('clf', gscv_lr)]) +# pipeline = Pipeline([('pre', MinMaxScaler()) +# #, ('fs', sfs_selector) +# , ('fs', model_rfecv ) +# , ('clf', gscv_lr)]) -# Fit # dont assign fit -#lr_fs_fit = pipeline.fit(X,y) -pipeline.fit(X,y) +# # Fit # 
dont assign fit +# #lr_fs_fit = pipeline.fit(X,y) +# pipeline.fit(X,y) -pipeline.best_params_ +# pipeline.best_params_ -#https://github.com/scikit-learn/scikit-learn/issues/7536 -n_fs = gscv_lr.best_estimator_.n_features_in_ -n_fs +# #https://github.com/scikit-learn/scikit-learn/issues/7536 +# n_fs = gscv_lr.best_estimator_.n_features_in_ +# n_fs -sel_features = X.columns[pipeline.named_steps['fs'].get_support()] -print('\nNo. of features selected with RFECV for model' - , pipeline.named_steps['clf'].estimator - , ':', n_fs - , '\nThese are:', sel_features - ) +# sel_features = X.columns[pipeline.named_steps['fs'].get_support()] +# print('\nNo. of features selected with RFECV for model' +# , pipeline.named_steps['clf'].estimator +# , ':', n_fs +# , '\nThese are:', sel_features +# ) ############################################################## # THIS ONE ######### @@ -106,28 +106,45 @@ param_grid2 = [ {'fs__min_features_to_select': [1] , 'fs__cv': [skf_cv] - #, 'fs__scoring': ['matthews_corrcoef']}, - #, 'fs__scoring': [mcc_score_fn]} }, + # { + # #'clf': [LogisticRegression(**rs)], + # 'clf__C': np.logspace(0, 4, 10), + # 'clf__penalty': ['none', 'l1', 'l2', 'elasticnet'], + # 'clf__max_iter': list(range(100,800,100)), + # 'clf__solver': ['saga'] + # }, + # { + # #'clf': [LogisticRegression(**rs)], + # 'clf__C': np.logspace(0, 4, 10), + # 'clf__penalty': ['l2', 'none'], + # 'clf__max_iter': list(range(100,800,100)), + # 'clf__solver': ['newton-cg', 'lbfgs', 'sag'] + # }, + # { + # #'clf': [LogisticRegression(**rs)], + # 'clf__C': np.logspace(0, 4, 10), + # 'clf__penalty': ['l1', 'l2'], + # 'clf__max_iter': list(range(100,800,100)), + # 'clf__solver': ['liblinear'] + # } - { - #'clf__estimator': [LogisticRegression(**rs)], + { #'clf': [LogisticRegression(**rs)], 'clf__C': np.logspace(0, 4, 10), 'clf__penalty': ['l2'], - 'clf__max_iter': list(range(100,200,100)), - #'clf__solver': ['newton-cg', 'lbfgs', 'sag'] - 'clf__solver': ['sag'] - - }, - { - #'clf__estimator': 
[LogisticRegression(**rs)], - 'clf__C': np.logspace(0, 4, 10), - 'clf__penalty': ['l1', 'l2'], - 'clf__max_iter': list(range(100,200,100)), + 'clf__max_iter': [100], 'clf__solver': ['liblinear'] + }, + + { #'clf': [LogisticRegression(**rs)], + 'clf__C': np.logspace(0, 4, 10), + 'clf__penalty': ['l2'], + 'clf__max_iter':[100], + 'clf__solver': ['saga'] } + ] # step 4: create pipeline pipeline = Pipeline([ @@ -149,12 +166,34 @@ gs_final = GridSearchCV(pipeline gs_final.fit(X,y) gs_final.best_params_ gs_final.best_score_ +gs_final.best_estimator_ # assign the fit -gsfit = gs_final.fit(X,y) +#gsfit = gs_final.fit(X,y) #gsfit.best_estimator_ -gsfit.best_params_ -gsfit.best_score_ +#gsfit.best_params_ +#gsfit.best_score_ + +test_predict = gs_final.predict(X_bts) +print(test_predict) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, test_predict),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, test_predict),2)) + + + + + + + + + + + + + + + + # Now get the features out all_features = gs_final.feature_names_in_ @@ -163,7 +202,6 @@ all_features = gs_final.feature_names_in_ sel_features = X.columns[gs_final.best_estimator_.named_steps['fs'].get_support()] n_sf = gs_final.best_estimator_.named_steps['fs'].n_features_ - # get model name model_name = gs_final.best_estimator_.named_steps['clf'] b_model_params = gs_final.best_params_ @@ -179,4 +217,37 @@ print('\n========================================' , '\nThese are:', sel_features, '\n\n' , '\nBest Model hyperparams:', b_model_params - ) \ No newline at end of file + ) + + +###################################### +# Blind test +###################################### +# See how it does on the BLIND test +#print('\nBlind test score, mcc:', )) + +#test_predict = gscv_lr_fit.predict(X_bts) +test_predict = gs_final.predict(X_bts) +print(test_predict) + +print(accuracy_score(y_bts, test_predict)) +print(matthews_corrcoef(y_bts, test_predict)) + +# create a dict with all scores +lr_bts_dict = 
{#'best_model': list(gscv_lr_fit_be_mod.items()) + 'bts_fscore':None + , 'bts_mcc':None + , 'bts_precision':None + , 'bts_recall':None + , 'bts_accuracy':None + , 'bts_roc_auc':None + , 'bts_jaccard':None } +lr_bts_dict +lr_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) +lr_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) +lr_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) +lr_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) +lr_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) +lr_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) +lr_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) +lr_bts_dict \ No newline at end of file diff --git a/uq_ml_models_FS/UQ_ABC.py b/uq_ml_models_FS/UQ_ABC.py new file mode 100644 index 0000000..23e5f8d --- /dev/null +++ b/uq_ml_models_FS/UQ_ABC.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed May 18 06:03:24 2022 + +@author: tanu +""" +#%% RandomForest + hyperparam: BaseEstimator: ClfSwitcher() +class ClfSwitcher(BaseEstimator): + def __init__( + self, + estimator = SGDClassifier(), + ): + """ + A Custom BaseEstimator that can switch between classifiers. 
+ :param estimator: sklearn object - The classifier + """ + self.estimator = estimator + + def fit(self, X, y=None, **kwargs): + self.estimator.fit(X, y) + return self + + def predict(self, X, y=None): + return self.estimator.predict(X) + + def predict_proba(self, X): + return self.estimator.predict_proba(X) + + def score(self, X, y): + return self.estimator.score(X, y) + +parameters = [ + { + 'clf__estimator': [AdaBoostClassifier(**rs)] + , 'clf__estimator__n_estimators': [1, 2, 5, 10] + #, 'clf__estimator__base_estimator' : ['SVC'] + #, 'clf__estimator___splitter' : ["best", "random"] + } +] + +# Create pipeline +pipeline = Pipeline([ + ('pre', MinMaxScaler()), + ('clf', ClfSwitcher()), +]) + +# Grid search i.e hyperparameter tuning and refitting on mcc +gscv_abc = GridSearchCV(pipeline + , parameters + #, scoring = 'matthews_corrcoef', refit = 'matthews_corrcoef' + , scoring = mcc_score_fn, refit = 'mcc' + , cv = skf_cv + , **njobs + , return_train_score = False + , verbose = 3) + +# Fit +gscv_abc_fit = gscv_abc.fit(X, y) + +gscv_abc_fit_be_mod = gscv_abc_fit.best_params_ +gscv_abc_fit_be_res = gscv_abc_fit.cv_results_ + +print('Best model:\n', gscv_abc_fit_be_mod) +print('Best models score:\n', gscv_abc_fit.best_score_, ':' , round(gscv_abc_fit.best_score_, 2)) + +print('\nMean test score from fit results:', round(mean(gscv_abc_fit_be_res['mean_test_mcc']),2)) +print('\nMean test score from fit results:', round(np.nanmean(gscv_abc_fit_be_res['mean_test_mcc']),2)) + +###################################### +# Blind test +###################################### + +# See how it does on the BLIND test +#print('\nBlind test score, mcc:', ) + +test_predict = gscv_abc_fit.predict(X_bts) +print(test_predict) +print(np.array(y_bts)) +y_btsf = np.array(y_bts) + +print(accuracy_score(y_btsf, test_predict)) +print(matthews_corrcoef(y_btsf, test_predict)) + +# create a dict with all scores +abc_bts_dict = {#'best_model': list(gscv_abc_fit_be_mod.items()) + 'bts_fscore' : None 
+ , 'bts_mcc' : None + , 'bts_precision': None + , 'bts_recall' : None + , 'bts_accuracy' : None + , 'bts_roc_auc' : None + , 'bts_jaccard' : None } +abc_bts_dict +abc_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) +abc_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) +abc_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) +abc_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) +abc_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) +abc_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) +abc_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) +abc_bts_dict + +# Create a df from dict with all scores +abc_bts_df = pd.DataFrame.from_dict(abc_bts_dict,orient = 'index') +abc_bts_df.columns = ['ABC'] +print(abc_bts_df) + +# Create df with best model params +model_params = pd.Series(['best_model_params', list(gscv_abc_fit_be_mod.items() )]) +model_params_df = model_params.to_frame() +model_params_df +model_params_df.columns = ['ABC'] +model_params_df.columns + +# Combine the df of scores and the best model params +abc_bts_df.columns +abc_output = pd.concat([model_params_df, abc_bts_df], axis = 0) +abc_output + +# Format the combined df +# Drop the best_model_params row from abc_output +abc_df = abc_output.drop([0], axis = 0) +abc_df + +#FIXME: tidy the index of the formatted df + +############################################################################### + + + diff --git a/uq_ml_models_FS/UQ_BC.py b/uq_ml_models_FS/UQ_BC.py new file mode 100644 index 0000000..776db9f --- /dev/null +++ b/uq_ml_models_FS/UQ_BC.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed May 18 06:03:24 2022 + +@author: tanu +""" +#%% RandomForest + hyperparam: BaseEstimator: ClfSwitcher() +class ClfSwitcher(BaseEstimator): + def __init__( + self, + estimator = SGDClassifier(), + ): + """ + A Custom BaseEstimator that can 
switch between classifiers. + :param estimator: sklearn object - The classifier + """ + self.estimator = estimator + + def fit(self, X, y=None, **kwargs): + self.estimator.fit(X, y) + return self + + def predict(self, X, y=None): + return self.estimator.predict(X) + + def predict_proba(self, X): + return self.estimator.predict_proba(X) + + def score(self, X, y): + return self.estimator.score(X, y) + +parameters = [ + { + 'clf__estimator': [BaggingClassifier(**rs + , **njobs + , bootstrap = True + , oob_score = True)] + , 'clf__estimator__n_estimators' : [10, 25, 50, 100, 150, 200, 500, 700, 1000] + # If None, then the base estimator is a DecisionTreeClassifier. + #, 'clf__estimator__base_estimator' : ['None', 'SVC()', 'KNeighborsClassifier()']# if none, DT is used + } +] + +# Create pipeline +pipeline = Pipeline([ + ('pre', MinMaxScaler()), + ('clf', ClfSwitcher()), +]) + +# Grid search i.e hyperparameter tuning and refitting on mcc +gscv_bc = GridSearchCV(pipeline + , parameters + #, scoring = 'f1', refit = 'f1' + , scoring = mcc_score_fn, refit = 'mcc' + , cv = skf_cv + , **njobs + , return_train_score = False + , verbose = 3) + +# Fit +gscv_bc_fit = gscv_bc.fit(X, y) + +gscv_bc_fit_be_mod = gscv_bc_fit.best_params_ +gscv_bc_fit_be_res = gscv_bc_fit.cv_results_ + +print('Best model:\n', gscv_bc_fit_be_mod) +print('Best models score:\n', gscv_bc_fit.best_score_, ':' , round(gscv_bc_fit.best_score_, 2)) + +print('\nMean test score from fit results:', round(mean(gscv_bc_fit_be_res['mean_test_mcc']),2)) +print('\nMean test score from fit results:', round(np.nanmean(gscv_bc_fit_be_res['mean_test_mcc']),2)) + +###################################### +# Blind test +###################################### + +# See how it does on the BLIND test +#print('\nBlind test score, mcc:', ) + +test_predict = gscv_bc_fit.predict(X_bts) +print(test_predict) +print(np.array(y_bts)) +y_btsf = np.array(y_bts) + +print(accuracy_score(y_btsf, test_predict)) +print(matthews_corrcoef(y_btsf, 
test_predict)) + +# create a dict with all scores +bc_bts_dict = {#'best_model': list(gscv_bc_fit_be_mod.items()) + 'bts_fscore' : None + , 'bts_mcc' : None + , 'bts_precision': None + , 'bts_recall' : None + , 'bts_accuracy' : None + , 'bts_roc_auc' : None + , 'bts_jaccard' : None } +bc_bts_dict +bc_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) +bc_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) +bc_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) +bc_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) +bc_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) +bc_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) +bc_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) +bc_bts_dict + +# Create a df from dict with all scores +bc_bts_df = pd.DataFrame.from_dict(bc_bts_dict,orient = 'index') +bc_bts_df.columns = ['BC'] +print(bc_bts_df) + +# Create df with best model params +model_params = pd.Series(['best_model_params', list(gscv_bc_fit_be_mod.items() )]) +model_params_df = model_params.to_frame() +model_params_df +model_params_df.columns = ['BC'] +model_params_df.columns + +# Combine the df of scores and the best model params +bc_bts_df.columns +bc_output = pd.concat([model_params_df, bc_bts_df], axis = 0) +bc_output + +# Format the combined df +# Drop the best_model_params row from bc_output +bc_df = bc_output.drop([0], axis = 0) +bc_df + +#FIXME: tidy the index of the formatted df + +############################################################################### + + + diff --git a/uq_ml_models_FS/UQ_BNB.py b/uq_ml_models_FS/UQ_BNB.py new file mode 100644 index 0000000..9092c2d --- /dev/null +++ b/uq_ml_models_FS/UQ_BNB.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed May 18 06:03:24 2022 + +@author: tanu +""" +#%% RandomForest + hyperparam: BaseEstimator: ClfSwitcher() +class 
ClfSwitcher(BaseEstimator): + def __init__( + self, + estimator = SGDClassifier(), + ): + """ + A Custom BaseEstimator that can switch between classifiers. + :param estimator: sklearn object - The classifier + """ + self.estimator = estimator + + def fit(self, X, y=None, **kwargs): + self.estimator.fit(X, y) + return self + + def predict(self, X, y=None): + return self.estimator.predict(X) + + def predict_proba(self, X): + return self.estimator.predict_proba(X) + + def score(self, X, y): + return self.estimator.score(X, y) + +parameters = [ + { + 'clf__estimator': [BernoulliNB()] + , 'clf__estimator__alpha': [1, 0] + , 'clf__estimator__binarize':[None, 0] + , 'clf__estimator__fit_prior': [True] + , 'clf__estimator__class_prior': [None] + } +] + +# Create pipeline +pipeline = Pipeline([ + ('pre', MinMaxScaler()), + ('clf', ClfSwitcher()), +]) + +# Grid search i.e hyperparameter tuning and refitting on mcc +gscv_bnb = GridSearchCV(pipeline + , parameters + #, scoring = 'f1', refit = 'f1' + , scoring = mcc_score_fn, refit = 'mcc' + , cv = skf_cv + , **njobs + , return_train_score = False + , verbose = 3) + +# Fit +gscv_bnb_fit = gscv_bnb.fit(X, y) + +gscv_bnb_fit_be_mod = gscv_bnb_fit.best_params_ +gscv_bnb_fit_be_res = gscv_bnb_fit.cv_results_ + +print('Best model:\n', gscv_bnb_fit_be_mod) +print('Best models score:\n', gscv_bnb_fit.best_score_, ':' , round(gscv_bnb_fit.best_score_, 2)) + +print('\nMean test score from fit results:', round(mean(gscv_bnb_fit_be_res['mean_test_mcc']),2)) +print('\nMean test score from fit results:', round(np.nanmean(gscv_bnb_fit_be_res['mean_test_mcc']),2)) + +###################################### +# Blind test +###################################### + +# See how it does on the BLIND test +#print('\nBlind test score, mcc:', ) + +test_predict = gscv_bnb_fit.predict(X_bts) +print(test_predict) +print(np.array(y_bts)) +y_btsf = np.array(y_bts) + +print(accuracy_score(y_btsf, test_predict)) +print(matthews_corrcoef(y_btsf, test_predict)) 
+ +# create a dict with all scores +bnb_bts_dict = {#'best_model': list(gscv_bnb_fit_be_mod.items()) + 'bts_fscore' : None + , 'bts_mcc' : None + , 'bts_precision': None + , 'bts_recall' : None + , 'bts_accuracy' : None + , 'bts_roc_auc' : None + , 'bts_jaccard' : None } +bnb_bts_dict +bnb_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) +bnb_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) +bnb_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) +bnb_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) +bnb_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) +bnb_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) +bnb_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) +bnb_bts_dict + +# Create a df from dict with all scores +bnb_bts_df = pd.DataFrame.from_dict(bnb_bts_dict,orient = 'index') +bnb_bts_df.columns = ['BNB'] +print(bnb_bts_df) + +# Create df with best model params +model_params = pd.Series(['best_model_params', list(gscv_bnb_fit_be_mod.items() )]) +model_params_df = model_params.to_frame() +model_params_df +model_params_df.columns = ['BNB'] +model_params_df.columns + +# Combine the df of scores and the best model params +bnb_bts_df.columns +bnb_output = pd.concat([model_params_df, bnb_bts_df], axis = 0) +bnb_output + +# Format the combined df +# Drop the best_model_params row from bnb_output +bnb_df = bnb_output.drop([0], axis = 0) +bnb_df + +#FIXME: tidy the index of the formatted df + +############################################################################### + + + diff --git a/uq_ml_models_FS/UQ_DT.py b/uq_ml_models_FS/UQ_DT.py new file mode 100644 index 0000000..bed047b --- /dev/null +++ b/uq_ml_models_FS/UQ_DT.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed May 18 06:03:24 2022 + +@author: tanu +""" +#%% RandomForest + hyperparam: BaseEstimator: ClfSwitcher() +class 
ClfSwitcher(BaseEstimator):
+    def __init__(
+        self,
+        estimator = SGDClassifier(),
+    ):
+        """
+        A Custom BaseEstimator that can switch between classifiers.
+        :param estimator: sklearn object - The classifier
+        """
+        self.estimator = estimator
+
+    def fit(self, X, y=None, **kwargs):
+        self.estimator.fit(X, y)
+        return self
+
+    def predict(self, X, y=None):
+        return self.estimator.predict(X)
+
+    def predict_proba(self, X):
+        return self.estimator.predict_proba(X)
+
+    def score(self, X, y):
+        return self.estimator.score(X, y)
+
+parameters = [
+    {
+        'clf__estimator': [DecisionTreeClassifier(**rs)]
+        , 'clf__estimator__max_depth': [None, 2, 4, 6, 8, 10, 12, 16, 20]
+        , 'clf__estimator__class_weight':['balanced']
+        , 'clf__estimator__criterion': ['gini', 'entropy', 'log_loss']
+        , 'clf__estimator__max_features': [None, 'sqrt', 'log2']
+        , 'clf__estimator__min_samples_leaf': [1, 2, 3, 4, 5, 10]
+        , 'clf__estimator__min_samples_split': [2, 5, 15, 20]
+    }
+]
+
+# Create pipeline
+pipeline = Pipeline([
+    ('pre', MinMaxScaler()),
+    ('clf', ClfSwitcher()),
+])
+
+# Grid search i.e hyperparameter tuning and refitting on mcc
+gscv_dt = GridSearchCV(pipeline
+                       , parameters
+                       #, scoring = 'f1', refit = 'f1'
+                       , scoring = mcc_score_fn, refit = 'mcc'
+                       , cv = skf_cv
+                       , **njobs
+                       , return_train_score = False
+                       , verbose = 3)
+
+# Fit
+gscv_dt_fit = gscv_dt.fit(X, y)
+
+gscv_dt_fit_be_mod = gscv_dt_fit.best_params_
+gscv_dt_fit_be_res = gscv_dt_fit.cv_results_
+
+print('Best model:\n', gscv_dt_fit_be_mod)
+print('Best models score:\n', gscv_dt_fit.best_score_, ':' , round(gscv_dt_fit.best_score_, 2))
+
+print('\nMean test score from fit results:', round(mean(gscv_dt_fit_be_res['mean_test_mcc']),2)) # reads the cv_results_ dict bound above as gscv_dt_fit_be_res
+print('\nMean test score from fit results:', round(np.nanmean(gscv_dt_fit_be_res['mean_test_mcc']),2))
+
+######################################
+# Blind test
+######################################
+
+# See how it does on the BLIND test
+#print('\nBlind test score, mcc:', )
+
+test_predict = gscv_dt_fit.predict(X_bts) +print(test_predict) +print(np.array(y_bts)) +y_btsf = np.array(y_bts) + +print(accuracy_score(y_btsf, test_predict)) +print(matthews_corrcoef(y_btsf, test_predict)) + +# create a dict with all scores +dt_bts_dict = {#'best_model': list(gscv_dt_fit_be_mod.items()) + 'bts_fscore' : None + , 'bts_mcc' : None + , 'bts_precision': None + , 'bts_recall' : None + , 'bts_accuracy' : None + , 'bts_roc_auc' : None + , 'bts_jaccard' : None } +dt_bts_dict +dt_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) +dt_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) +dt_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) +dt_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) +dt_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) +dt_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) +dt_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) +dt_bts_dict + +# Create a df from dict with all scores +dt_bts_df = pd.DataFrame.from_dict(dt_bts_dict,orient = 'index') +dt_bts_df.columns = ['DT'] +print(dt_bts_df) + +# Create df with best model params +model_params = pd.Series(['best_model_params', list(gscv_dt_fit_be_mod.items() )]) +model_params_df = model_params.to_frame() +model_params_df +model_params_df.columns = ['DT'] +model_params_df.columns + +# Combine the df of scores and the best model params +dt_bts_df.columns +dt_output = pd.concat([model_params_df, dt_bts_df], axis = 0) +dt_output + +# Format the combined df +# Drop the best_model_params row from dt_output +dt_df = dt_output.drop([0], axis = 0) +dt_df + +#FIXME: tidy the index of the formatted df + +############################################################################### + + + diff --git a/uq_ml_models_FS/UQ_GBC.py b/uq_ml_models_FS/UQ_GBC.py new file mode 100644 index 0000000..d692ce1 --- /dev/null +++ b/uq_ml_models_FS/UQ_GBC.py @@ -0,0 +1,134 @@ 
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed May 18 06:03:24 2022
+
+@author: tanu
+"""
+#%% GradientBoosting + hyperparam: BaseEstimator: ClfSwitcher()
+class ClfSwitcher(BaseEstimator):
+    def __init__(
+        self,
+        estimator = SGDClassifier(),
+    ):
+        """
+        A Custom BaseEstimator that can switch between classifiers.
+        :param estimator: sklearn object - The classifier
+        """
+        self.estimator = estimator
+
+    def fit(self, X, y=None, **kwargs):
+        self.estimator.fit(X, y)
+        return self
+
+    def predict(self, X, y=None):
+        return self.estimator.predict(X)
+
+    def predict_proba(self, X):
+        return self.estimator.predict_proba(X)
+
+    def score(self, X, y):
+        return self.estimator.score(X, y)
+
+parameters = [
+    {
+        'clf__estimator': [GradientBoostingClassifier(**rs)]
+        #, 'clf__estimator__n_estimators' : [10, 100, 200, 500, 1000] # wider alternative; a dict literal keeps only the last value of a repeated key
+        , 'clf__estimator__n_estimators' : [10, 100, 1000]
+        , 'clf__estimator__learning_rate': [0.001, 0.01, 0.1]
+        , 'clf__estimator__subsample' : [0.5, 0.7, 1.0]
+        , 'clf__estimator__max_depth' : [3, 7, 9]
+
+    }
+]
+
+# Create pipeline
+pipeline = Pipeline([
+    ('pre', MinMaxScaler()),
+    ('clf', ClfSwitcher()),
+])
+
+# Grid search i.e hyperparameter tuning and refitting on mcc
+gscv_gbc = GridSearchCV(pipeline
+                        , parameters
+                        #, scoring = 'f1', refit = 'f1'
+                        , scoring = mcc_score_fn, refit = 'mcc'
+                        , cv = skf_cv
+                        , **njobs
+                        , return_train_score = False
+                        , verbose = 3)
+
+# Fit
+gscv_gbc_fit = gscv_gbc.fit(X, y)
+
+gscv_gbc_fit_be_mod = gscv_gbc_fit.best_params_
+gscv_gbc_fit_be_res = gscv_gbc_fit.cv_results_
+
+print('Best model:\n', gscv_gbc_fit_be_mod)
+print('Best models score:\n', gscv_gbc_fit.best_score_, ':' , round(gscv_gbc_fit.best_score_, 2))
+
+print('\nMean test score from fit results:', round(mean(gscv_gbc_fit_be_res['mean_test_mcc']),2))
+print('\nMean test score from fit results:', round(np.nanmean(gscv_gbc_fit_be_res['mean_test_mcc']),2))
+
+######################################
+# Blind test
+###################################### + +# See how it does on the BLIND test +#print('\nBlind test score, mcc:', ) + +test_predict = gscv_gbc_fit.predict(X_bts) +print(test_predict) +print(np.array(y_bts)) +y_btsf = np.array(y_bts) + +print(accuracy_score(y_btsf, test_predict)) +print(matthews_corrcoef(y_btsf, test_predict)) + +# create a dict with all scores +gbc_bts_dict = {#'best_model': list(gscv_gbc_fit_be_mod.items()) + 'bts_fscore' : None + , 'bts_mcc' : None + , 'bts_precision': None + , 'bts_recall' : None + , 'bts_accuracy' : None + , 'bts_roc_auc' : None + , 'bts_jaccard' : None } +gbc_bts_dict +gbc_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) +gbc_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) +gbc_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) +gbc_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) +gbc_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) +gbc_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) +gbc_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) +gbc_bts_dict + +# Create a df from dict with all scores +gbc_bts_df = pd.DataFrame.from_dict(gbc_bts_dict,orient = 'index') +gbc_bts_df.columns = ['GBC'] +print(gbc_bts_df) + +# Create df with best model params +model_params = pd.Series(['best_model_params', list(gscv_gbc_fit_be_mod.items() )]) +model_params_df = model_params.to_frame() +model_params_df +model_params_df.columns = ['GBC'] +model_params_df.columns + +# Combine the df of scores and the best model params +gbc_bts_df.columns +gbc_output = pd.concat([model_params_df, gbc_bts_df], axis = 0) +gbc_output + +# Format the combined df +# Drop the best_model_params row from gbc_output +gbc_df = gbc_output.drop([0], axis = 0) +gbc_df + +#FIXME: tidy the index of the formatted df + +############################################################################### + + + diff --git 
a/uq_ml_models_FS/UQ_GNB.py b/uq_ml_models_FS/UQ_GNB.py new file mode 100644 index 0000000..3dab3c0 --- /dev/null +++ b/uq_ml_models_FS/UQ_GNB.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed May 18 06:03:24 2022 + +@author: tanu +""" +#%% RandomForest + hyperparam: BaseEstimator: ClfSwitcher() +class ClfSwitcher(BaseEstimator): + def __init__( + self, + estimator = SGDClassifier(), + ): + """ + A Custom BaseEstimator that can switch between classifiers. + :param estimator: sklearn object - The classifier + """ + self.estimator = estimator + + def fit(self, X, y=None, **kwargs): + self.estimator.fit(X, y) + return self + + def predict(self, X, y=None): + return self.estimator.predict(X) + + def predict_proba(self, X): + return self.estimator.predict_proba(X) + + def score(self, X, y): + return self.estimator.score(X, y) + +parameters = [ + { + 'clf__estimator': [GaussianNB()] + , 'clf__estimator__priors': [None] + , 'clf__estimator__var_smoothing': np.logspace(0,-9, num=100) + } +] + +# Create pipeline +pipeline = Pipeline([ + ('pre', MinMaxScaler()), + ('clf', ClfSwitcher()), +]) + +# Grid search i.e hyperparameter tuning and refitting on mcc +gscv_gnb = GridSearchCV(pipeline + , parameters + #, scoring = 'f1', refit = 'f1' + , scoring = mcc_score_fn, refit = 'mcc' + , cv = skf_cv + , **njobs + , return_train_score = False + , verbose = 3) + +# Fit +gscv_gnb_fit = gscv_gnb.fit(X, y) + +gscv_gnb_fit_be_mod = gscv_gnb_fit.best_params_ +gscv_gnb_fit_be_res = gscv_gnb_fit.cv_results_ + +print('Best model:\n', gscv_gnb_fit_be_mod) +print('Best models score:\n', gscv_gnb_fit.best_score_, ':' , round(gscv_gnb_fit.best_score_, 2)) + +print('\nMean test score from fit results:', round(mean(gscv_gnb_fit_be_res['mean_test_mcc']),2)) +print('\nMean test score from fit results:', round(np.nanmean(gscv_gnb_fit_be_res['mean_test_mcc']),2)) + +###################################### +# Blind test +###################################### + 
+# See how it does on the BLIND test +#print('\nBlind test score, mcc:', ) + +test_predict = gscv_gnb_fit.predict(X_bts) +print(test_predict) +print(np.array(y_bts)) +y_btsf = np.array(y_bts) + +print(accuracy_score(y_btsf, test_predict)) +print(matthews_corrcoef(y_btsf, test_predict)) + +# create a dict with all scores +gnb_bts_dict = {#'best_model': list(gscv_gnb_fit_be_mod.items()) + 'bts_fscore' : None + , 'bts_mcc' : None + , 'bts_precision': None + , 'bts_recall' : None + , 'bts_accuracy' : None + , 'bts_roc_auc' : None + , 'bts_jaccard' : None } +gnb_bts_dict +gnb_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) +gnb_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) +gnb_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) +gnb_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) +gnb_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) +gnb_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) +gnb_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) +gnb_bts_dict + +# Create a df from dict with all scores +gnb_bts_df = pd.DataFrame.from_dict(gnb_bts_dict,orient = 'index') +gnb_bts_df.columns = ['GNB'] +print(gnb_bts_df) + +# Create df with best model params +model_params = pd.Series(['best_model_params', list(gscv_gnb_fit_be_mod.items() )]) +model_params_df = model_params.to_frame() +model_params_df +model_params_df.columns = ['GNB'] +model_params_df.columns + +# Combine the df of scores and the best model params +gnb_bts_df.columns +gnb_output = pd.concat([model_params_df, gnb_bts_df], axis = 0) +gnb_output + +# Format the combined df +# Drop the best_model_params row from gnb_output +gnb_df = gnb_output.drop([0], axis = 0) +gnb_df + +#FIXME: tidy the index of the formatted df + +############################################################################### + + + diff --git a/uq_ml_models_FS/UQ_GPC.py b/uq_ml_models_FS/UQ_GPC.py new 
file mode 100644
index 0000000..2fc5a88
--- /dev/null
+++ b/uq_ml_models_FS/UQ_GPC.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed May 18 06:03:24 2022
+
+@author: tanu
+"""
+#%% GaussianProcess + hyperparam: BaseEstimator: ClfSwitcher()
+class ClfSwitcher(BaseEstimator):
+    def __init__(
+        self,
+        estimator = SGDClassifier(),
+    ):
+        """
+        A Custom BaseEstimator that can switch between classifiers.
+        :param estimator: sklearn object - The classifier
+        """
+        self.estimator = estimator
+
+    def fit(self, X, y=None, **kwargs):
+        self.estimator.fit(X, y)
+        return self
+
+    def predict(self, X, y=None):
+        return self.estimator.predict(X)
+
+    def predict_proba(self, X):
+        return self.estimator.predict_proba(X)
+
+    def score(self, X, y):
+        return self.estimator.score(X, y)
+
+parameters = [
+    {
+        'clf__estimator': [GaussianProcessClassifier(**rs)]
+
+        , 'clf__estimator__kernel': [1*RBF(), 1*DotProduct(), 1*Matern(), 1*RationalQuadratic(), 1*WhiteKernel()]
+    }
+]
+
+# Create pipeline
+pipeline = Pipeline([
+    ('pre', MinMaxScaler()),
+    ('clf', ClfSwitcher()),
+])
+
+# Grid search i.e hyperparameter tuning and refitting on mcc
+gscv_gpc = GridSearchCV(pipeline
+                        , parameters
+                        #, scoring = 'f1', refit = 'f1'
+                        , scoring = mcc_score_fn, refit = 'mcc'
+                        , cv = skf_cv
+                        , **njobs
+                        , return_train_score = False
+                        , verbose = 3)
+
+# Fit
+gscv_gpc_fit = gscv_gpc.fit(X, y)
+
+gscv_gpc_fit_be_mod = gscv_gpc_fit.best_params_
+gscv_gpc_fit_be_res = gscv_gpc_fit.cv_results_
+
+print('Best model:\n', gscv_gpc_fit_be_mod)
+print('Best models score:\n', gscv_gpc_fit.best_score_, ':' , round(gscv_gpc_fit.best_score_, 2))
+
+print('\nMean test score from fit results:', round(mean(gscv_gpc_fit_be_res['mean_test_mcc']),2)) # reads the cv_results_ dict bound above as gscv_gpc_fit_be_res
+print('\nMean test score from fit results:', round(np.nanmean(gscv_gpc_fit_be_res['mean_test_mcc']),2))
+
+######################################
+# Blind test
+######################################
+
+# See how it does on the BLIND
test +#print('\nBlind test score, mcc:', ) + +test_predict = gscv_gpc_fit.predict(X_bts) +print(test_predict) +print(np.array(y_bts)) +y_btsf = np.array(y_bts) + +print(accuracy_score(y_btsf, test_predict)) +print(matthews_corrcoef(y_btsf, test_predict)) + +# create a dict with all scores +gpc_bts_dict = {#'best_model': list(gscv_gpc_fit_be_mod.items()) + 'bts_fscore' : None + , 'bts_mcc' : None + , 'bts_precision': None + , 'bts_recall' : None + , 'bts_accuracy' : None + , 'bts_roc_auc' : None + , 'bts_jaccard' : None } +gpc_bts_dict +gpc_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) +gpc_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) +gpc_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) +gpc_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) +gpc_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) +gpc_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) +gpc_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) +gpc_bts_dict + +# Create a df from dict with all scores +gpc_bts_df = pd.DataFrame.from_dict(gpc_bts_dict,orient = 'index') +gpc_bts_df.columns = ['GPC'] +print(gpc_bts_df) + +# Create df with best model params +model_params = pd.Series(['best_model_params', list(gscv_gpc_fit_be_mod.items() )]) +model_params_df = model_params.to_frame() +model_params_df +model_params_df.columns = ['GPC'] +model_params_df.columns + +# Combine the df of scores and the best model params +gpc_bts_df.columns +gpc_output = pd.concat([model_params_df, gpc_bts_df], axis = 0) +gpc_output + +# Format the combined df +# Drop the best_model_params row from gpc_output +gpc_df = gpc_output.drop([0], axis = 0) +gpc_df + +#FIXME: tidy the index of the formatted df + +############################################################################### + + + diff --git a/uq_ml_models_FS/UQ_KNN.py b/uq_ml_models_FS/UQ_KNN.py new file mode 100644 index 
0000000..88b8fa0 --- /dev/null +++ b/uq_ml_models_FS/UQ_KNN.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed May 18 06:03:24 2022 + +@author: tanu +""" +#%% RandomForest + hyperparam: BaseEstimator: ClfSwitcher() +class ClfSwitcher(BaseEstimator): + def __init__( + self, + estimator = SGDClassifier(), + ): + """ + A Custom BaseEstimator that can switch between classifiers. + :param estimator: sklearn object - The classifier + """ + self.estimator = estimator + + def fit(self, X, y=None, **kwargs): + self.estimator.fit(X, y) + return self + + def predict(self, X, y=None): + return self.estimator.predict(X) + + def predict_proba(self, X): + return self.estimator.predict_proba(X) + + def score(self, X, y): + return self.estimator.score(X, y) + +parameters = [ + { + 'clf__estimator': [KNeighborsClassifier(**njobs)] + , 'clf__estimator__n_neighbors': range(21, 51, 2) + #, 'clf__estimator__n_neighbors': [5, 7, 11] + , 'clf__estimator__metric' : ['euclidean', 'manhattan', 'minkowski'] + , 'clf__estimator__weights' : ['uniform', 'distance'] + + } +] + +# Create pipeline +pipeline = Pipeline([ + ('pre', MinMaxScaler()), + ('clf', ClfSwitcher()), +]) + +# Grid search i.e hyperparameter tuning and refitting on mcc +gscv_knn = GridSearchCV(pipeline + , parameters + #, scoring = 'f1', refit = 'f1' + , scoring = mcc_score_fn, refit = 'mcc' + , cv = skf_cv + , **njobs + , return_train_score = False + , verbose = 3) + +# Fit +gscv_knn_fit = gscv_knn.fit(X, y) + +gscv_knn_fit_be_mod = gscv_knn_fit.best_params_ +gscv_knn_fit_be_res = gscv_knn_fit.cv_results_ + +print('Best model:\n', gscv_knn_fit_be_mod) +print('Best models score:\n', gscv_knn_fit.best_score_, ':' , round(gscv_knn_fit.best_score_, 2)) + +print('\nMean test score from fit results:', round(mean(gscv_knn_fit_be_res['mean_test_mcc']),2)) +print('\nMean test score from fit results:', round(np.nanmean(gscv_knn_fit_be_res['mean_test_mcc']),2)) + 
+###################################### +# Blind test +###################################### + +# See how it does on the BLIND test +#print('\nBlind test score, mcc:', ) + +test_predict = gscv_knn_fit.predict(X_bts) +print(test_predict) +print(np.array(y_bts)) +y_btsf = np.array(y_bts) + +print(accuracy_score(y_btsf, test_predict)) +print(matthews_corrcoef(y_btsf, test_predict)) + +# create a dict with all scores +knn_bts_dict = {#'best_model': list(gscv_knn_fit_be_mod.items()) + 'bts_fscore' : None + , 'bts_mcc' : None + , 'bts_precision': None + , 'bts_recall' : None + , 'bts_accuracy' : None + , 'bts_roc_auc' : None + , 'bts_jaccard' : None } +knn_bts_dict +knn_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) +knn_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) +knn_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) +knn_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) +knn_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) +knn_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) +knn_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) +knn_bts_dict + +# Create a df from dict with all scores +knn_bts_df = pd.DataFrame.from_dict(knn_bts_dict,orient = 'index') +knn_bts_df.columns = ['KNN'] +print(knn_bts_df) + +# Create df with best model params +model_params = pd.Series(['best_model_params', list(gscv_knn_fit_be_mod.items() )]) +model_params_df = model_params.to_frame() +model_params_df +model_params_df.columns = ['KNN'] +model_params_df.columns + +# Combine the df of scores and the best model params +knn_bts_df.columns +knn_output = pd.concat([model_params_df, knn_bts_df], axis = 0) +knn_output + +# Format the combined df +# Drop the best_model_params row from knn_output +knn_df = knn_output.drop([0], axis = 0) +knn_df + +#FIXME: tidy the index of the formatted df + 
+############################################################################### + + + diff --git a/uq_ml_models_FS/UQ_LR.py b/uq_ml_models_FS/UQ_LR.py new file mode 100644 index 0000000..879a926 --- /dev/null +++ b/uq_ml_models_FS/UQ_LR.py @@ -0,0 +1,205 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon May 16 05:59:12 2022 + +@author: tanu +""" +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Mar 15 11:09:50 2022 + +@author: tanu +""" +#%% Import libs +import numpy as np +import pandas as pd +from sklearn.model_selection import GridSearchCV +from sklearn import datasets +from sklearn.ensemble import ExtraTreesClassifier +from sklearn.ensemble import RandomForestClassifier +from sklearn.ensemble import AdaBoostClassifier +from sklearn.ensemble import GradientBoostingClassifier +from sklearn.svm import SVC + +from sklearn.base import BaseEstimator +from sklearn.naive_bayes import MultinomialNB +from sklearn.linear_model import SGDClassifier +from sklearn.pipeline import Pipeline +from sklearn.model_selection import GridSearchCV +from sklearn.linear_model import LogisticRegression +from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder +from xgboost import XGBClassifier +rs = {'random_state': 42} +njobs = {'n_jobs': 10} +#%% Get train-test split and scoring functions +# X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN] +# , num_df_wtgt['mutation_class'] +# , test_size = 0.33 +# , random_state = 2 +# , shuffle = True +# , stratify = num_df_wtgt['mutation_class']) + +y.to_frame().value_counts().plot(kind = 'bar') +blind_test_df['dst_mode'].to_frame().value_counts().plot(kind = 'bar') + +scoring_fn = ({'accuracy' : make_scorer(accuracy_score) + , 'fscore' : make_scorer(f1_score) + , 'mcc' : make_scorer(matthews_corrcoef) + , 'precision' : make_scorer(precision_score) + , 'recall' : make_scorer(recall_score) + , 'roc_auc' : make_scorer(roc_auc_score) + , 'jaccard' : 
make_scorer(jaccard_score) + }) + +mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} +jacc_score_fn = {'jcc': make_scorer(jaccard_score)} + +#%% Logistic Regression + hyperparam: BaseEstimator: ClfSwitcher() +class ClfSwitcher(BaseEstimator): + def __init__( + self, + estimator = SGDClassifier(), + ): + """ + A Custom BaseEstimator that can switch between classifiers. + :param estimator: sklearn object - The classifier + """ + self.estimator = estimator + + def fit(self, X, y=None, **kwargs): + self.estimator.fit(X, y) + return self + + def predict(self, X, y=None): + return self.estimator.predict(X) + + def predict_proba(self, X): + return self.estimator.predict_proba(X) + + def score(self, X, y): + return self.estimator.score(X, y) + +parameters = [ + { + 'clf__estimator': [LogisticRegression(**rs)], + #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], + 'clf__estimator__C': np.logspace(0, 4, 10), + 'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'], + 'clf__estimator__max_iter': list(range(100,800,100)), + 'clf__estimator__solver': ['saga'] + }, + { + 'clf__estimator': [LogisticRegression(**rs)], + #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], + 'clf__estimator__C': np.logspace(0, 4, 10), + 'clf__estimator__penalty': ['l2', 'none'], + 'clf__estimator__max_iter': list(range(100,800,100)), + 'clf__estimator__solver': ['newton-cg', 'lbfgs', 'sag'] + }, + { + 'clf__estimator': [LogisticRegression(**rs)], + #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], + 'clf__estimator__C': np.logspace(0, 4, 10), + 'clf__estimator__penalty': ['l1', 'l2'], + 'clf__estimator__max_iter': list(range(100,800,100)), + 'clf__estimator__solver': ['liblinear'] + } + +] + +# Create pipeline +pipeline = Pipeline([ + ('pre', MinMaxScaler()), + ('clf', ClfSwitcher()), +]) + +# Grid search i.e hyperparameter tuning and refitting on mcc +gscv_lr = GridSearchCV(pipeline + , parameters + #, scoring = 'f1', refit = 'f1' + , scoring = mcc_score_fn, 
refit = 'mcc' + , cv = skf_cv + , **njobs + , return_train_score = False + , verbose = 3) + +# Fit +gscv_lr_fit = gscv_lr.fit(X, y) +gscv_lr_fit_be_mod = gscv_lr_fit.best_params_ +gscv_lr_fit_be_res = gscv_lr_fit.cv_results_ + +print('Best model:\n', gscv_lr_fit_be_mod) +print('Best models score:\n', gscv_lr_fit.best_score_, ':' , round(gscv_lr_fit.best_score_, 2)) + +#print('\nMean test score from fit results:', round(mean(gscv_lr_fit_be_res['mean_test_mcc']),2)) +print('\nMean test score from fit results:', round(np.nanmean(gscv_lr_fit_be_res['mean_test_mcc']),2)) + + +###################################### +# Blind test +###################################### +# See how it does on the BLIND test +#print('\nBlind test score, mcc:', )) + +test_predict = gscv_lr_fit.predict(X_bts) +print(test_predict) +print(np.array(y_bts)) +y_btsf = np.array(y_bts) + +print(accuracy_score(y_bts, test_predict)) +print(matthews_corrcoef(y_bts, test_predict)) + +# create a dict with all scores +lr_bts_dict = {#'best_model': list(gscv_lr_fit_be_mod.items()) + 'bts_fscore':None + , 'bts_mcc':None + , 'bts_precision':None + , 'bts_recall':None + , 'bts_accuracy':None + , 'bts_roc_auc':None + , 'bts_jaccard':None } +lr_bts_dict +lr_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) +lr_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) +lr_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) +lr_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) +lr_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) +lr_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) +lr_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) +lr_bts_dict + +# Create a df from dict with all scores +lr_bts_df = pd.DataFrame.from_dict(lr_bts_dict,orient = 'index') +lr_bts_df.columns = ['Logistic_Regression'] +print(lr_bts_df) + +# d2 = {'best_model_params': lis(gscv_lr_fit_be_mod.items() )} +# d2 
+# def Merge(dict1, dict2): +# res = {**dict1, **dict2} +# return res +# d3 = Merge(d2, lr_bts_dict) +# d3 + +# Create df with best model params +model_params = pd.Series(['best_model_params', list(gscv_lr_fit_be_mod.items() )]) +model_params_df = model_params.to_frame() +model_params_df +model_params_df.columns = ['Logistic_Regression'] +model_params_df.columns + +# Combine the df of scores and the best model params +lr_bts_df.columns +lr_output = pd.concat([model_params_df, lr_bts_df], axis = 0) +lr_output + +# Format the combined df +# Drop the best_model_params row from lr_output +lr_df = lr_output.drop([0], axis = 0) +lr_df + +#FIXME: tidy the index of the formatted df + +############################################################################### diff --git a/uq_ml_models_FS/UQ_LR_FS.py b/uq_ml_models_FS/UQ_LR_FS.py new file mode 100644 index 0000000..6910fab --- /dev/null +++ b/uq_ml_models_FS/UQ_LR_FS.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon May 16 05:59:12 2022 + +@author: tanu +""" +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Mar 15 11:09:50 2022 + +@author: tanu +""" +#%% Logistic Regression + hyperparam + FS: BaseEstimator: ClfSwitcher() +model_lr = LogisticRegression(**rs) +model_rfecv = RFECV(estimator = model_lr + , cv = rskf_cv + #, cv = 10 + , scoring = 'matthews_corrcoef' + ) + +param_grid2 = [ + { + #'clf__estimator': [LogisticRegression(**rs)], + #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], + 'C': np.logspace(0, 4, 10), + 'penalty': ['none', 'l1', 'l2', 'elasticnet'], + 'max_iter': list(range(100,800,100)), + 'solver': ['saga'] + }, + { + #'clf__estimator': [LogisticRegression(**rs)], + #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], + 'C': np.logspace(0, 4, 10), + 'penalty': ['l2', 'none'], + 'max_iter': list(range(100,800,100)), + 'solver': ['newton-cg', 'lbfgs', 'sag'] + }, + { + #'clf__estimator': [LogisticRegression(**rs)], + #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 
+        'C': np.logspace(0, 4, 10),
+        'penalty': ['l1', 'l2'],
+        'max_iter': list(range(100,800,100)),
+        'solver': ['liblinear']
+    }
+
+]
+
+#-------------------------------------------------------------------------------
+# Grid search CV + FS
+gscv_lr = GridSearchCV(model_lr
+                       , param_grid2
+                       , scoring = mcc_score_fn, refit = 'mcc'
+                       , cv = skf_cv
+                       , return_train_score = False
+                       , verbose = 3
+                       , **njobs)
+
+#------------------------------------------------------------------------------
+# Create pipeline
+pipeline = Pipeline([('pre', MinMaxScaler())
+                     #, ('feature_selection', sfs_selector)
+                     , ('feature_selection', model_rfecv )
+                     , ('clf', gscv_lr)])
+
+# Fit
+lr_fs_fit = pipeline.fit(X,y)
+lr_fs_fit_be_mod = lr_fs_fit.named_steps['clf'].best_params_ # the Pipeline itself has no best_params_; the GridSearchCV is its 'clf' step
+lr_fs_fit_be_res = lr_fs_fit.named_steps['clf'].cv_results_ # same: search results live on the 'clf' step
+dir(lr_fs_fit)
+
+print('Best model:\n', lr_fs_fit_be_mod)
+print('Best models score:\n', lr_fs_fit.named_steps['clf'].best_score_, ':' , round(lr_fs_fit.named_steps['clf'].best_score_, 2))
+
+pipeline.predict(X_bts)
+lr_fs_fit.predict(X_bts)
+
+test_predict = pipeline.predict(X_bts)
+print(test_predict)
+print(np.array(y_bts))
+#y_btsf = np.array(y_bts)
+
+print(accuracy_score(y_bts, test_predict))
+print(matthews_corrcoef(y_bts, test_predict))
+
+
+######################################
+# Blind test
+######################################
+# See how it does on the BLIND test
+#print('\nBlind test score, mcc:', ))
+
+test_predict = lr_fs_fit.predict(X_bts)
+print(test_predict)
+print(np.array(y_bts))
+y_btsf = np.array(y_bts)
+
+print(accuracy_score(y_bts, test_predict))
+print(matthews_corrcoef(y_bts, test_predict))
+
+# create a dict with all scores
+lr_bts_dict = {#'best_model': list(gscv_lr_fit_be_mod.items())
+               'bts_fscore':None
+               , 'bts_mcc':None
+               , 'bts_precision':None
+               , 'bts_recall':None
+               , 'bts_accuracy':None
+               , 'bts_roc_auc':None
+               , 'bts_jaccard':None }
+lr_bts_dict
+lr_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2)
+lr_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2)
+lr_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) +lr_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) +lr_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) +lr_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) +lr_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) +lr_bts_dict + +# Create a df from dict with all scores +lr_bts_df = pd.DataFrame.from_dict(lr_bts_dict,orient = 'index') +lr_bts_df.columns = ['Logistic_Regression'] +print(lr_bts_df) + +# d2 = {'best_model_params': lis(gscv_lr_fit_be_mod.items() )} +# d2 +# def Merge(dict1, dict2): +# res = {**dict1, **dict2} +# return res +# d3 = Merge(d2, lr_bts_dict) +# d3 + +# Create df with best model params +model_params = pd.Series(['best_model_params', list(lr_fs_fit_be_mod.items() )]) +model_params_df = model_params.to_frame() +model_params_df +model_params_df.columns = ['Logistic_Regression'] +model_params_df.columns + +# Combine the df of scores and the best model params +lr_bts_df.columns +lr_output = pd.concat([model_params_df, lr_bts_df], axis = 0) +lr_output + +# Format the combined df +# Drop the best_model_params row from lr_output +lr_df = lr_output.drop([0], axis = 0) +lr_df + +#FIXME: tidy the index of the formatted df + +############################################################################### diff --git a/uq_ml_models_FS/UQ_LR_FS2.py b/uq_ml_models_FS/UQ_LR_FS2.py new file mode 100644 index 0000000..91d28a5 --- /dev/null +++ b/uq_ml_models_FS/UQ_LR_FS2.py @@ -0,0 +1,253 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon May 16 05:59:12 2022 + +@author: tanu +""" +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Mar 15 11:09:50 2022 + +@author: tanu +""" +#%% Logistic Regression + hyperparam + FS: BaseEstimator: ClfSwitcher() +# model_lr = LogisticRegression(**rs) +# model_rfecv = RFECV(estimator = model_lr +# , cv = skf_cv +# #, cv = 10 +# , 
min_features_to_select = 1 # default +# , scoring = 'matthews_corrcoef' +# ) + +# param_grid2 = [ +# { +# #'clf': [LogisticRegression(**rs)], +# #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], +# 'C': np.logspace(0, 4, 10), +# 'penalty': ['none', 'l1', 'l2', 'elasticnet'], +# 'max_iter': list(range(100,800,100)), +# 'solver': ['saga'] +# }, +# { +# #'clf': [LogisticRegression(**rs)], +# #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], +# 'C': np.logspace(0, 4, 10), +# 'penalty': ['l2', 'none'], +# 'max_iter': list(range(100,800,100)), +# 'solver': ['newton-cg', 'lbfgs', 'sag'] +# }, +# { +# #'clf': [LogisticRegression(**rs)], +# #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], +# 'C': np.logspace(0, 4, 10), +# 'penalty': ['l1', 'l2'], +# 'max_iter': list(range(100,800,100)), +# 'solver': ['liblinear'] +# } + +# ] +# #------------------------------------------------------------------------------- +# # Grid search CV + FS +# gscv_lr = GridSearchCV(estimator = model_lr +# , param_grid = param_grid2 +# , scoring = mcc_score_fn, refit = 'mcc' +# , cv = skf_cv +# , return_train_score = False +# , verbose = 3 +# , **njobs) + +#------------------------------------------------------------------------------ +################ +# NOTE: GS is going into pipeline, +# Cannot get BEST model out +################ +# Create pipeline +# pipeline = Pipeline([('pre', MinMaxScaler()) +# #, ('fs', sfs_selector) +# , ('fs', model_rfecv ) +# , ('clf', gscv_lr)]) + +# # Fit # dont assign fit +# #lr_fs_fit = pipeline.fit(X,y) +# pipeline.fit(X,y) + +# pipeline.best_params_ + +# #https://github.com/scikit-learn/scikit-learn/issues/7536 +# n_fs = gscv_lr.best_estimator_.n_features_in_ +# n_fs + +# sel_features = X.columns[pipeline.named_steps['fs'].get_support()] +# print('\nNo. 
of features selected with RFECV for model' +# , pipeline.named_steps['clf'].estimator +# , ':', n_fs +# , '\nThese are:', sel_features +# ) +############################################################## +# THIS ONE +######### +# Make Pipeline go into GS with FS +######### + +# step 1: specify model +#modLR = LogisticRegression(**rs) + +# step 2: specify fs +#model_rfecv = RFECV(estimator = model_lr + # , cv = skf_cv + #, min_features_to_select = 1 # default + #, scoring = 'matthews_corrcoef' + #) + +# step 3: specify param grid as dict +param_grid2 = [ + + {'fs__min_features_to_select': [1] + , 'fs__cv': [skf_cv] + }, + + # { + # #'clf': [LogisticRegression(**rs)], + # 'clf__C': np.logspace(0, 4, 10), + # 'clf__penalty': ['none', 'l1', 'l2', 'elasticnet'], + # 'clf__max_iter': list(range(100,800,100)), + # 'clf__solver': ['saga'] + # }, + # { + # #'clf': [LogisticRegression(**rs)], + # 'clf__C': np.logspace(0, 4, 10), + # 'clf__penalty': ['l2', 'none'], + # 'clf__max_iter': list(range(100,800,100)), + # 'clf__solver': ['newton-cg', 'lbfgs', 'sag'] + # }, + # { + # #'clf': [LogisticRegression(**rs)], + # 'clf__C': np.logspace(0, 4, 10), + # 'clf__penalty': ['l1', 'l2'], + # 'clf__max_iter': list(range(100,800,100)), + # 'clf__solver': ['liblinear'] + # } + + { #'clf': [LogisticRegression(**rs)], + 'clf__C': np.logspace(0, 4, 10), + 'clf__penalty': ['l2'], + 'clf__max_iter': [100], + 'clf__solver': ['liblinear'] + }, + + { #'clf': [LogisticRegression(**rs)], + 'clf__C': np.logspace(0, 4, 10), + 'clf__penalty': ['l2'], + 'clf__max_iter':[100], + 'clf__solver': ['saga'] + } + + +] +# step 4: create pipeline +pipeline = Pipeline([ + ('pre', MinMaxScaler()) + #, ('fs', model_rfecv) + , ('fs', RFECV(LogisticRegression(**rs), scoring = 'matthews_corrcoef')) + , ('clf', LogisticRegression(**rs))]) + +# step 5: Perform Gridsearch CV +gs_final = GridSearchCV(pipeline + , param_grid2 + , cv = skf_cv + , scoring = mcc_score_fn, refit = 'mcc' + , verbose = 1 + , 
return_train_score = False + , **njobs) + +#fit +gs_final.fit(X,y) +gs_final.best_params_ +gs_final.best_score_ +gs_final.best_estimator_ + +# assign the fit +#gsfit = gs_final.fit(X,y) +#gsfit.best_estimator_ +#gsfit.best_params_ +#gsfit.best_score_ + +test_predict = gs_final.predict(X_bts) +print(test_predict) +print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, test_predict),2)) +print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, test_predict),2)) + + + + + + + + + + + + + + + + + +# Now get the features out +all_features = gs_final.feature_names_in_ +#all_features = gsfit.feature_names_in_ + +sel_features = X.columns[gs_final.best_estimator_.named_steps['fs'].get_support()] +n_sf = gs_final.best_estimator_.named_steps['fs'].n_features_ + +# get model name +model_name = gs_final.best_estimator_.named_steps['clf'] +b_model_params = gs_final.best_params_ + +print('\n========================================' + , '\nRunning model:' + , '\nModel name:', model_name + , '\n===============================================' + , '\nRunning feature selection with RFECV for model' + , '\nTotal no. 
of features in model:', len(all_features) + , '\nThese are:\n', all_features, '\n\n' + , '\nNo of features for best model: ', n_sf + , '\nThese are:', sel_features, '\n\n' + , '\nBest Model hyperparams:', b_model_params + + ) + + +###################################### +# Blind test +###################################### +# See how it does on the BLIND test +#print('\nBlind test score, mcc:', )) + +#test_predict = gscv_lr_fit.predict(X_bts) +test_predict = gs_final.predict(X_bts) +print(test_predict) + +print(accuracy_score(y_bts, test_predict)) +print(matthews_corrcoef(y_bts, test_predict)) + +# create a dict with all scores +lr_bts_dict = {#'best_model': list(gscv_lr_fit_be_mod.items()) + 'bts_fscore':None + , 'bts_mcc':None + , 'bts_precision':None + , 'bts_recall':None + , 'bts_accuracy':None + , 'bts_roc_auc':None + , 'bts_jaccard':None } +lr_bts_dict +lr_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) +lr_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) +lr_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) +lr_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) +lr_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) +lr_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) +lr_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) +lr_bts_dict \ No newline at end of file diff --git a/uq_ml_models_FS/UQ_MLP.py b/uq_ml_models_FS/UQ_MLP.py new file mode 100644 index 0000000..8c84e04 --- /dev/null +++ b/uq_ml_models_FS/UQ_MLP.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed May 18 06:03:24 2022 + +@author: tanu +""" +#%% RandomForest + hyperparam: BaseEstimator: ClfSwitcher() +class ClfSwitcher(BaseEstimator): + def __init__( + self, + estimator = SGDClassifier(), + ): + """ + A Custom BaseEstimator that can switch between classifiers. 
+ :param estimator: sklearn object - The classifier + """ + self.estimator = estimator + + def fit(self, X, y=None, **kwargs): + self.estimator.fit(X, y) + return self + + def predict(self, X, y=None): + return self.estimator.predict(X) + + def predict_proba(self, X): + return self.estimator.predict_proba(X) + + def score(self, X, y): + return self.estimator.score(X, y) + +parameters = [ + { + 'clf__estimator': [MLPClassifier(**rs + , max_iter = 1000)] + , 'clf__estimator__hidden_layer_sizes': [(1), (2), (3), (5), (10)] + , 'clf__estimator__solver': ['lbfgs', 'sgd', 'adam'] + , 'clf__estimator__learning_rate': ['constant', 'invscaling', 'adaptive'] + #, 'clf__estimator__learning_rate': ['constant'] + + } +] + +# Create pipeline +pipeline = Pipeline([ + ('pre', MinMaxScaler()), + ('clf', ClfSwitcher()), +]) + +# Grid search i.e hyperparameter tuning and refitting on mcc +gscv_mlp = GridSearchCV(pipeline + , parameters + #, scoring = 'f1', refit = 'f1' + , scoring = mcc_score_fn, refit = 'mcc' + , cv = skf_cv + , **njobs + , return_train_score = False + , verbose = 3) + +# Fit +gscv_mlp_fit = gscv_mlp.fit(X, y) + +gscv_mlp_fit_be_mod = gscv_mlp_fit.best_params_ +gscv_mlp_fit_be_res = gscv_mlp_fit.cv_results_ + +print('Best model:\n', gscv_mlp_fit_be_mod) +print('Best models score:\n', gscv_mlp_fit.best_score_, ':' , round(gscv_mlp_fit.best_score_, 2)) + +print('\nMean test score from fit results:', round(mean(gscv_mlp_fit_be_res['mean_test_mcc']),2)) +print('\nMean test score from fit results:', round(np.nanmean(gscv_mlp_fit_be_res['mean_test_mcc']),2)) + +###################################### +# Blind test +###################################### + +# See how it does on the BLIND test +#print('\nBlind test score, mcc:', ) + +test_predict = gscv_mlp_fit.predict(X_bts) +print(test_predict) +print(np.array(y_bts)) +y_btsf = np.array(y_bts) + +print(accuracy_score(y_btsf, test_predict)) +print(matthews_corrcoef(y_btsf, test_predict)) + +# create a dict with all scores 
+mlp_bts_dict = {#'best_model': list(gscv_mlp_fit_be_mod.items()) + 'bts_fscore' : None + , 'bts_mcc' : None + , 'bts_precision': None + , 'bts_recall' : None + , 'bts_accuracy' : None + , 'bts_roc_auc' : None + , 'bts_jaccard' : None } +mlp_bts_dict +mlp_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) +mlp_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) +mlp_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) +mlp_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) +mlp_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) +mlp_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) +mlp_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) +mlp_bts_dict + +# Create a df from dict with all scores +mlp_bts_df = pd.DataFrame.from_dict(mlp_bts_dict,orient = 'index') +mlp_bts_df.columns = ['MLP'] +print(mlp_bts_df) + +# Create df with best model params +model_params = pd.Series(['best_model_params', list(gscv_mlp_fit_be_mod.items() )]) +model_params_df = model_params.to_frame() +model_params_df +model_params_df.columns = ['MLP'] +model_params_df.columns + +# Combine the df of scores and the best model params +mlp_bts_df.columns +mlp_output = pd.concat([model_params_df, mlp_bts_df], axis = 0) +mlp_output + +# Format the combined df +# Drop the best_model_params row from mlp_output +mlp_df = mlp_output.drop([0], axis = 0) +mlp_df + +#FIXME: tidy the index of the formatted df + +############################################################################### + + + diff --git a/uq_ml_models_FS/UQ_QDA.py b/uq_ml_models_FS/UQ_QDA.py new file mode 100644 index 0000000..5024c3c --- /dev/null +++ b/uq_ml_models_FS/UQ_QDA.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed May 18 06:03:24 2022 + +@author: tanu +""" +#%% RandomForest + hyperparam: BaseEstimator: ClfSwitcher() +class ClfSwitcher(BaseEstimator): + def 
__init__( + self, + estimator = SGDClassifier(), + ): + """ + A Custom BaseEstimator that can switch between classifiers. + :param estimator: sklearn object - The classifier + """ + self.estimator = estimator + + def fit(self, X, y=None, **kwargs): + self.estimator.fit(X, y) + return self + + def predict(self, X, y=None): + return self.estimator.predict(X) + + def predict_proba(self, X): + return self.estimator.predict_proba(X) + + def score(self, X, y): + return self.estimator.score(X, y) + +parameters = [ + { + 'clf__estimator': [QuadraticDiscriminantAnalysis()] + + } +] + +# Create pipeline +pipeline = Pipeline([ + ('pre', MinMaxScaler()), + ('clf', ClfSwitcher()), +]) + +# Grid search i.e hyperparameter tuning and refitting on mcc +gscv_qda = GridSearchCV(pipeline + , parameters + #, scoring = 'f1', refit = 'f1' + , scoring = mcc_score_fn, refit = 'mcc' + , cv = skf_cv + , **njobs + , return_train_score = False + , verbose = 3) + +# Fit +gscv_qda_fit = gscv_qda.fit(X, y) + +gscv_qda_fit_be_mod = gscv_qda_fit.best_params_ +gscv_qda_fit_be_res = gscv_qda_fit.cv_results_ + +print('Best model:\n', gscv_qda_fit_be_mod) +print('Best models score:\n', gscv_qda_fit.best_score_, ':' , round(gscv_qda_fit.best_score_, 2)) + +print('\nMean test score from fit results:', round(mean(gscv_qda_fit_be_res['mean_test_mcc']),2)) # plain mean over cv_results_; the NaN-aware variant follows +print('\nMean test score from fit results:', round(np.nanmean(gscv_qda_fit_be_res['mean_test_mcc']),2)) + +###################################### +# Blind test +###################################### + +# See how it does on the BLIND test +#print('\nBlind test score, mcc:', ) + +test_predict = gscv_qda_fit.predict(X_bts) +print(test_predict) +print(np.array(y_bts)) +y_btsf = np.array(y_bts) + +print(accuracy_score(y_btsf, test_predict)) +print(matthews_corrcoef(y_btsf, test_predict)) + +# create a dict with all scores +qda_bts_dict = {#'best_model': list(gscv_qda_fit_be_mod.items()) + 'bts_fscore' : None + , 'bts_mcc' : None + , 'bts_precision': None + , 
'bts_recall' : None + , 'bts_accuracy' : None + , 'bts_roc_auc' : None + , 'bts_jaccard' : None } +qda_bts_dict +qda_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) +qda_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) +qda_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) +qda_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) +qda_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) +qda_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) +qda_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) +qda_bts_dict + +# Create a df from dict with all scores +qda_bts_df = pd.DataFrame.from_dict(qda_bts_dict,orient = 'index') +qda_bts_df.columns = ['QDA'] +print(qda_bts_df) + +# Create df with best model params +model_params = pd.Series(['best_model_params', list(gscv_qda_fit_be_mod.items() )]) +model_params_df = model_params.to_frame() +model_params_df +model_params_df.columns = ['QDA'] +model_params_df.columns + +# Combine the df of scores and the best model params +qda_bts_df.columns +qda_output = pd.concat([model_params_df, qda_bts_df], axis = 0) +qda_output + +# Format the combined df +# Drop the best_model_params row from qda_output +qda_df = qda_output.drop([0], axis = 0) +qda_df + +#FIXME: tidy the index of the formatted df + +############################################################################### + + + diff --git a/uq_ml_models_FS/UQ_RC.py b/uq_ml_models_FS/UQ_RC.py new file mode 100644 index 0000000..2ea710f --- /dev/null +++ b/uq_ml_models_FS/UQ_RC.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed May 18 06:03:24 2022 + +@author: tanu +""" +#%% RandomForest + hyperparam: BaseEstimator: ClfSwitcher() +class ClfSwitcher(BaseEstimator): + def __init__( + self, + estimator = SGDClassifier(), + ): + """ + A Custom BaseEstimator that can switch between classifiers. 
+ :param estimator: sklearn object - The classifier + """ + self.estimator = estimator + + def fit(self, X, y=None, **kwargs): + self.estimator.fit(X, y) + return self + + def predict(self, X, y=None): + return self.estimator.predict(X) + + def predict_proba(self, X): + return self.estimator.predict_proba(X) + + def score(self, X, y): + return self.estimator.score(X, y) + +parameters = [ + {'clf__estimator' : [RidgeClassifier(**rs)] + , 'clf__estimator__alpha': [0.1, 0.2, 0.5, 0.8, 1.0] + } +] + +# Create pipeline +pipeline = Pipeline([ + ('pre', MinMaxScaler()), + ('clf', ClfSwitcher()), +]) + +# Grid search i.e hyperparameter tuning and refitting on mcc +gscv_rc = GridSearchCV(pipeline + , parameters + #, scoring = 'f1', refit = 'f1' + , scoring = mcc_score_fn, refit = 'mcc' + , cv = skf_cv + , **njobs + , return_train_score = False + , verbose = 3) + +# Fit +gscv_rc_fit = gscv_rc.fit(X, y) + +gscv_rc_fit_be_mod = gscv_rc_fit.best_params_ +gscv_rc_fit_be_res = gscv_rc_fit.cv_results_ + +print('Best model:\n', gscv_rc_fit_be_mod) +print('Best models score:\n', gscv_rc_fit.best_score_, ':' , round(gscv_rc_fit.best_score_, 2)) + +print('\nMean test score from fit results:', round(mean(gscv_rc_fit_be_res['mean_test_mcc']),2)) +print('\nMean test score from fit results:', round(np.nanmean(gscv_rc_fit_be_res['mean_test_mcc']),2)) + +###################################### +# Blind test +###################################### + +# See how it does on the BLIND test +#print('\nBlind test score, mcc:', ) + +test_predict = gscv_rc_fit.predict(X_bts) +print(test_predict) +print(np.array(y_bts)) +y_btsf = np.array(y_bts) + +print(accuracy_score(y_btsf, test_predict)) +print(matthews_corrcoef(y_btsf, test_predict)) + +# create a dict with all scores +rc_bts_dict = {#'best_model': list(gscv_rc_fit_be_mod.items()) + 'bts_fscore' : None + , 'bts_mcc' : None + , 'bts_precision': None + , 'bts_recall' : None + , 'bts_accuracy' : None + , 'bts_roc_auc' : None + , 'bts_jaccard' : None 
} +rc_bts_dict +rc_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) +rc_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) +rc_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) +rc_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) +rc_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) +rc_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) +rc_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) +rc_bts_dict + +# Create a df from dict with all scores +rc_bts_df = pd.DataFrame.from_dict(rc_bts_dict,orient = 'index') +rc_bts_df.columns = ['Ridge Classifier'] +print(rc_bts_df) + +# Create df with best model params +model_params = pd.Series(['best_model_params', list(gscv_rc_fit_be_mod.items() )]) +model_params_df = model_params.to_frame() +model_params_df +model_params_df.columns = ['Ridge Classifier'] +model_params_df.columns + +# Combine the df of scores and the best model params +rc_bts_df.columns +rc_output = pd.concat([model_params_df, rc_bts_df], axis = 0) +rc_output + +# Format the combined df +# Drop the best_model_params row from rc_output +rc_df = rc_output.drop([0], axis = 0) +rc_df + +#FIXME: tidy the index of the formatted df + +############################################################################### + + + diff --git a/uq_ml_models_FS/UQ_RF.py b/uq_ml_models_FS/UQ_RF.py new file mode 100644 index 0000000..7758f9a --- /dev/null +++ b/uq_ml_models_FS/UQ_RF.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed May 18 06:03:24 2022 + +@author: tanu +""" +#%% RandomForest + hyperparam: BaseEstimator: ClfSwitcher() +class ClfSwitcher(BaseEstimator): + def __init__( + self, + estimator = SGDClassifier(), + ): + """ + A Custom BaseEstimator that can switch between classifiers. 
+ :param estimator: sklearn object - The classifier + """ + self.estimator = estimator + + def fit(self, X, y=None, **kwargs): + self.estimator.fit(X, y) + return self + + def predict(self, X, y=None): + return self.estimator.predict(X) + + def predict_proba(self, X): + return self.estimator.predict_proba(X) + + def score(self, X, y): + return self.estimator.score(X, y) + +parameters = [ + { + 'clf__estimator': [RandomForestClassifier(**rs + , **njobs + , bootstrap = True + , oob_score = True)], + 'clf__estimator__max_depth': [4, 6, 8, 10, 12, 16, 20, None] + , 'clf__estimator__class_weight':['balanced','balanced_subsample'] + , 'clf__estimator__n_estimators': [10, 25, 50, 100] + , 'clf__estimator__criterion': ['gini', 'entropy', 'log_loss'] + , 'clf__estimator__max_features': ['sqrt', 'log2', None] # default is sqrt + , 'clf__estimator__min_samples_leaf': [1, 2, 3, 4, 5, 10] + , 'clf__estimator__min_samples_split': [2, 5, 15, 20] + } +] + +# Create pipeline +pipeline = Pipeline([ + ('pre', MinMaxScaler()), + ('clf', ClfSwitcher()), +]) + +# Grid search i.e hyperparameter tuning and refitting on mcc +gscv_rf = GridSearchCV(pipeline + , parameters + #, scoring = 'f1', refit = 'f1' + , scoring = mcc_score_fn, refit = 'mcc' + , cv = skf_cv + , **njobs + , return_train_score = False + , verbose = 3) + +# Fit +gscv_rf_fit = gscv_rf.fit(X, y) + +gscv_rf_fit_be_mod = gscv_rf_fit.best_params_ +gscv_rf_fit_be_res = gscv_rf_fit.cv_results_ + +print('Best model:\n', gscv_rf_fit_be_mod) +print('Best models score:\n', gscv_rf_fit.best_score_, ':' , round(gscv_rf_fit.best_score_, 2)) + +print('\nMean test score from fit results:', round(mean(gscv_rf_fit_be_res['mean_test_mcc']),2)) +print('\nMean test score from fit results:', round(np.nanmean(gscv_rf_fit_be_res['mean_test_mcc']),2)) + +###################################### +# Blind test +###################################### + +# See how it does on the BLIND test +#print('\nBlind test score, mcc:', ) + +test_predict = 
gscv_rf_fit.predict(X_bts) +print(test_predict) +print(np.array(y_bts)) +y_btsf = np.array(y_bts) + +print(accuracy_score(y_btsf, test_predict)) +print(matthews_corrcoef(y_btsf, test_predict)) + +# create a dict with all scores +rf_bts_dict = {#'best_model': list(gscv_rf_fit_be_mod.items()) + 'bts_fscore' : None + , 'bts_mcc' : None + , 'bts_precision': None + , 'bts_recall' : None + , 'bts_accuracy' : None + , 'bts_roc_auc' : None + , 'bts_jaccard' : None } +rf_bts_dict +rf_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) +rf_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) +rf_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) +rf_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) +rf_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) +rf_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) +rf_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) +rf_bts_dict + +# Create a df from dict with all scores +rf_bts_df = pd.DataFrame.from_dict(rf_bts_dict,orient = 'index') +rf_bts_df.columns = ['RF'] # column is named after this script's model (Random Forest) +print(rf_bts_df) + +# Create df with best model params +model_params = pd.Series(['best_model_params', list(gscv_rf_fit_be_mod.items() )]) +model_params_df = model_params.to_frame() +model_params_df +model_params_df.columns = ['RF'] # must equal the score frame's column so the concat below stacks into one column +model_params_df.columns + +# Combine the df of scores and the best model params +rf_bts_df.columns +rf_output = pd.concat([model_params_df, rf_bts_df], axis = 0) +rf_output + +# Format the combined df +# Drop the best_model_params row from rf_output +rf_df = rf_output.drop([0], axis = 0) +rf_df + +#FIXME: tidy the index of the formatted df + +############################################################################### + + + diff --git a/uq_ml_models_FS/UQ_SVC.py b/uq_ml_models_FS/UQ_SVC.py new file mode 100644 index 0000000..c430649 --- /dev/null +++ b/uq_ml_models_FS/UQ_SVC.py 
@@ -0,0 +1,134 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed May 18 06:03:24 2022 + +@author: tanu +""" +#%% RandomForest + hyperparam: BaseEstimator: ClfSwitcher() +class ClfSwitcher(BaseEstimator): + def __init__( + self, + estimator = SGDClassifier(), + ): + """ + A Custom BaseEstimator that can switch between classifiers. + :param estimator: sklearn object - The classifier + """ + self.estimator = estimator + + def fit(self, X, y=None, **kwargs): + self.estimator.fit(X, y) + return self + + def predict(self, X, y=None): + return self.estimator.predict(X) + + def predict_proba(self, X): + return self.estimator.predict_proba(X) + + def score(self, X, y): + return self.estimator.score(X, y) + +parameters = [ + { + 'clf__estimator': [SVC(**rs)] + , 'clf__estimator__kernel': ['poly', 'rbf', 'sigmoid'] + #, 'clf__estimator__kernel': ['linear'] + + , 'clf__estimator__C' : [50, 10, 1.0, 0.1, 0.01] + , 'clf__estimator__gamma': ['scale', 'auto'] + + } +] + +# Create pipeline +pipeline = Pipeline([ + ('pre', MinMaxScaler()), + ('clf', ClfSwitcher()), +]) + +# Grid search i.e hyperparameter tuning and refitting on mcc +gscv_svc = GridSearchCV(pipeline + , parameters + #, scoring = 'f1', refit = 'f1' + , scoring = mcc_score_fn, refit = 'mcc' + , cv = skf_cv + , **njobs + , return_train_score = False + , verbose = 3) + +# Fit +gscv_svc_fit = gscv_svc.fit(X, y) + +gscv_svc_fit_be_mod = gscv_svc_fit.best_params_ +gscv_svc_fit_be_res = gscv_svc_fit.cv_results_ + +print('Best model:\n', gscv_svc_fit_be_mod) +print('Best models score:\n', gscv_svc_fit.best_score_, ':' , round(gscv_svc_fit.best_score_, 2)) + +print('\nMean test score from fit results:', round(mean(gscv_svc_fit_be_res['mean_test_mcc']),2)) +print('\nMean test score from fit results:', round(np.nanmean(gscv_svc_fit_be_res['mean_test_mcc']),2)) + +###################################### +# Blind test +###################################### + +# See how it does on the BLIND test 
+#print('\nBlind test score, mcc:', ) + +test_predict = gscv_svc_fit.predict(X_bts) +print(test_predict) +print(np.array(y_bts)) +y_btsf = np.array(y_bts) + +print(accuracy_score(y_btsf, test_predict)) +print(matthews_corrcoef(y_btsf, test_predict)) + +# create a dict with all scores +svc_bts_dict = {#'best_model': list(gscv_svc_fit_be_mod.items()) + 'bts_fscore' : None + , 'bts_mcc' : None + , 'bts_precision': None + , 'bts_recall' : None + , 'bts_accuracy' : None + , 'bts_roc_auc' : None + , 'bts_jaccard' : None } +svc_bts_dict +svc_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) +svc_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) +svc_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) +svc_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) +svc_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) +svc_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) +svc_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) +svc_bts_dict + +# Create a df from dict with all scores +svc_bts_df = pd.DataFrame.from_dict(svc_bts_dict,orient = 'index') +svc_bts_df.columns = ['SVC'] +print(svc_bts_df) + +# Create df with best model params +model_params = pd.Series(['best_model_params', list(gscv_svc_fit_be_mod.items() )]) +model_params_df = model_params.to_frame() +model_params_df +model_params_df.columns = ['SVC'] +model_params_df.columns + +# Combine the df of scores and the best model params +svc_bts_df.columns +svc_output = pd.concat([model_params_df, svc_bts_df], axis = 0) +svc_output + +# Format the combined df +# Drop the best_model_params row from svc_output +svc_df = svc_output.drop([0], axis = 0) +svc_df + +#FIXME: tidy the index of the formatted df + +############################################################################### + + + diff --git a/uq_ml_models_FS/UQ_XGB.py b/uq_ml_models_FS/UQ_XGB.py new file mode 100644 index 
0000000..65a5e8f --- /dev/null +++ b/uq_ml_models_FS/UQ_XGB.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed May 18 06:03:24 2022 + +@author: tanu +""" + +#%% +#https://www.datatechnotes.com/2019/07/classification-example-with.html +# XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, +# colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1, +# max_delta_step=0, max_depth=3, min_child_weight=1, missing=None, +# n_estimators=100, n_jobs=1, nthread=None, +# objective='multi:softprob', random_state=0, reg_alpha=0, +# reg_lambda=1, scale_pos_weight=1, seed=None, silent=None, +# subsample=1, verbosity=1) + +#%% XGBoost + hyperparam: BaseEstimator: ClfSwitcher() +class ClfSwitcher(BaseEstimator): + def __init__( + self, + estimator = SGDClassifier(), + ): + """ + A Custom BaseEstimator that can switch between classifiers. + :param estimator: sklearn object - The classifier + """ + self.estimator = estimator + + def fit(self, X, y=None, **kwargs): + self.estimator.fit(X, y) + return self + + def predict(self, X, y=None): + return self.estimator.predict(X) + + def predict_proba(self, X): + return self.estimator.predict_proba(X) + + def score(self, X, y): + return self.estimator.score(X, y) + +parameters = [ + { + 'clf__estimator': [XGBClassifier(**rs , **njobs, verbose = 3)] + , 'clf__estimator__learning_rate': [0.01, 0.05, 0.1, 0.2] + , 'clf__estimator__max_depth': [4, 6, 8, 10, 12, 16, 20] + #, 'clf__estimator__min_samples_leaf': [4, 8, 12, 16, 20] + #, 'clf__estimator__max_features': ['auto', 'sqrt'] + } +] + +# Create pipeline +pipeline = Pipeline([ + ('pre', MinMaxScaler()), + ('clf', ClfSwitcher()), +]) + +# Grid search i.e hyperparameter tuning and refitting on mcc +gscv_xgb = GridSearchCV(pipeline + , parameters + #, scoring = 'f1', refit = 'f1' + , scoring = mcc_score_fn, refit = 'mcc' + , cv = skf_cv + , **njobs + , return_train_score = False + , verbose = 3) + +# Fit +gscv_xgb_fit = 
gscv_xgb.fit(X, y) + +gscv_xgb_fit_be_mod = gscv_xgb_fit.best_params_ +gscv_xgb_fit_be_res = gscv_xgb_fit.cv_results_ + +print('Best model:\n', gscv_xgb_fit_be_mod) +print('Best models score:\n', gscv_xgb_fit.best_score_, ':' , round(gscv_xgb_fit.best_score_, 2)) + +print('\nMean test score from fit results:', round(mean(gscv_xgb_fit_be_res['mean_test_mcc']),2)) +print('\nMean test score from fit results:', round(np.nanmean(gscv_xgb_fit_be_res['mean_test_mcc']),2)) + +###################################### +# Blind test +###################################### + +# See how it does on the BLIND test +#print('\nBlind test score, mcc:', ) + +test_predict = gscv_xgb_fit.predict(X_bts) +print(test_predict) +print(np.array(y_bts)) +y_btsf = np.array(y_bts) + +print(accuracy_score(y_btsf, test_predict)) +print(matthews_corrcoef(y_btsf, test_predict)) + +# create a dict with all scores +xgb_bts_dict = {#'best_model': list(gscv_xgb_fit_be_mod.items()) + 'bts_fscore' : None + , 'bts_mcc' : None + , 'bts_precision': None + , 'bts_recall' : None + , 'bts_accuracy' : None + , 'bts_roc_auc' : None + , 'bts_jaccard' : None } +xgb_bts_dict +xgb_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2) +xgb_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2) +xgb_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2) +xgb_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2) +xgb_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2) +xgb_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2) +xgb_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) +xgb_bts_dict + +# Create a df from dict with all scores +xgb_bts_df = pd.DataFrame.from_dict(xgb_bts_dict,orient = 'index') +xgb_bts_df.columns = ['XGBoost'] +print(xgb_bts_df) + +# Create df with best model params +model_params = pd.Series(['best_model_params', list(gscv_xgb_fit_be_mod.items() )]) +model_params_df = 
model_params.to_frame() +model_params_df +model_params_df.columns = ['XGBoost'] +model_params_df.columns + +# Combine the df of scores and the best model params +xgb_bts_df.columns +xgb_output = pd.concat([model_params_df, xgb_bts_df], axis = 0) +xgb_output + +# Format the combined df +# Drop the best_model_params row from xgb_output +xgb_df = xgb_output.drop([0], axis = 0) +xgb_df + +#FIXME: tidy the index of the formatted df + +############################################################################### + + + diff --git a/uq_ml_models_FS/pnca_num_fs_hy.txt b/uq_ml_models_FS/pnca_num_fs_hy.txt new file mode 100644 index 0000000..a0ffbea --- /dev/null +++ b/uq_ml_models_FS/pnca_num_fs_hy.txt @@ -0,0 +1,8 @@ + Logistic_Regression +bts_fscore 0.71 +bts_mcc 0.34 +bts_precision 0.61 +bts_recall 0.87 +bts_accuracy 0.65 +bts_roc_auc 0.65 +bts_jaccard 0.55 diff --git a/uq_ml_models_FS/pnca_num_hy.txt b/uq_ml_models_FS/pnca_num_hy.txt new file mode 100644 index 0000000..338aab6 --- /dev/null +++ b/uq_ml_models_FS/pnca_num_hy.txt @@ -0,0 +1,316 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri May 20 00:36:17 2022 + +@author: tanu +""" +# pnca [ numerical ONLY + NO oversampling] + +# LR: hyperparm + +{'clf__estimator': LogisticRegression(penalty='l1', random_state=42, solver='saga'), + 'clf__estimator__C': 1.0, + 'clf__estimator__max_iter': 100, + 'clf__estimator__penalty': 'l1', + 'clf__estimator__solver': 'saga'} + + Logistic_Regression +bts_fscore 0.70 +bts_mcc 0.29 +bts_precision 0.57 +bts_recall 0.92 +bts_accuracy 0.61 +bts_roc_auc 0.61 +bts_jaccard 0.54 + +# LR: FS + hyperparam +{'bts_fscore': 0.71, + 'bts_mcc': 0.34, + 'bts_precision': 0.61, + 'bts_recall': 0.87, + 'bts_accuracy': 0.65, + 'bts_roc_auc': 0.65, + 'bts_jaccard': 0.55} +####################################################################### +# RF: hyperparam [~45] + +Best model: + {'clf__estimator': RandomForestClassifier(class_weight='balanced', max_depth=4, max_features=None, 
+ min_samples_leaf=2, min_samples_split=15, + n_estimators=10, n_jobs=10, oob_score=True, + random_state=42), 'clf__estimator__class_weight': 'balanced', 'clf__estimator__criterion': 'gini', 'clf__estimator__max_depth': 4, 'clf__estimator__max_features': None, 'clf__estimator__min_samples_leaf': 2, 'clf__estimator__min_samples_split': 15, 'clf__estimator__n_estimators': 10} +Best models score: + 0.3329374281771619 : 0.33 + + RF +bts_fscore 0.69 +bts_mcc 0.37 +bts_precision 0.67 +bts_recall 0.72 +bts_accuracy 0.68 +bts_roc_auc 0.68 +bts_jaccard 0.53 + + +####################################################################### +# ABC: hyperparam + +{'clf__estimator': AdaBoostClassifier(n_estimators=2, random_state=42), + 'clf__estimator__n_estimators': 2} + ABC +1 [(clf__estimator, AdaBoostClassifier(n_estimat... +bts_fscore 0.71 +bts_mcc 0.36 +bts_precision 0.63 +bts_recall 0.83 +bts_accuracy 0.67 +bts_roc_auc 0.67 +bts_jaccard 0.56 +####################################################################### +# BC: hyperparam +{'clf__estimator': BaggingClassifier(n_estimators=200, n_jobs=10, oob_score=True, random_state=42), + 'clf__estimator__n_estimators': 200} + BC +0 best_model_params +1 [(clf__estimator, BaggingClassifier(n_estimato... +bts_fscore 0.72 +bts_mcc 0.37 +bts_precision 0.64 +bts_recall 0.82 +bts_accuracy 0.68 +bts_roc_auc 0.68 +bts_jaccard 0.56 +####################################################################### +# BNB: hyperparam +{'clf__estimator': BernoulliNB(alpha=1, binarize=None), + 'clf__estimator__alpha': 1, + 'clf__estimator__binarize': None, + 'clf__estimator__class_prior': None, + 'clf__estimator__fit_prior': True} + + BNB +1 [(clf__estimator, BernoulliNB(alpha=1, binariz... 
+bts_fscore 0.72 +bts_mcc 0.35 +bts_precision 0.6 +bts_recall 0.92 +bts_accuracy 0.65 +bts_roc_auc 0.65 +bts_jaccard 0.56 +####################################################################### +# DT: hyperparam +{'clf__estimator': DecisionTreeClassifier(class_weight='balanced', criterion='entropy', + max_depth=2, random_state=42), + 'clf__estimator__class_weight': 'balanced', + 'clf__estimator__criterion': 'entropy', + 'clf__estimator__max_depth': 2, + 'clf__estimator__max_features': None, + 'clf__estimator__min_samples_leaf': 1, + 'clf__estimator__min_samples_split': 2} + + + DT +1 [(clf__estimator, DecisionTreeClassifier(class... +bts_fscore 0.72 +bts_mcc 0.42 +bts_precision 0.69 +bts_recall 0.76 +bts_accuracy 0.71 +bts_roc_auc 0.71 +bts_jaccard 0.57 +####################################################################### +# GBC: hyperparam +{'clf__estimator': GradientBoostingClassifier(learning_rate=0.01, max_depth=7, random_state=42, + subsample=0.5), + 'clf__estimator__learning_rate': 0.01, + 'clf__estimator__max_depth': 7, + 'clf__estimator__n_estimators': 100, + 'clf__estimator__subsample': 0.5} + + + GBC +1 [(clf__estimator, GradientBoostingClassifier(l... +bts_fscore 0.71 +bts_mcc 0.33 +bts_precision 0.6 +bts_recall 0.88 +bts_accuracy 0.64 +bts_roc_auc 0.65 +bts_jaccard 0.55 + +####################################################################### +# GNB: hyperparam +{'clf__estimator': GaussianNB(var_smoothing=0.006579332246575682), + 'clf__estimator__priors': None, + 'clf__estimator__var_smoothing': 0.006579332246575682} + GNB +1 [(clf__estimator, GaussianNB(var_smoothing=0.0... 
+bts_fscore 0.72 +bts_mcc 0.46 +bts_precision 0.73 +bts_recall 0.71 +bts_accuracy 0.73 +bts_roc_auc 0.73 +bts_jaccard 0.57 +####################################################################### +# GPC: hyperparam +{'clf__estimator': GaussianProcessClassifier(kernel=1**2 * Matern(length_scale=1, nu=1.5), + random_state=42), + 'clf__estimator__kernel': 1**2 * Matern(length_scale=1, nu=1.5)} + + ConvergenceWarning: The optimal value found for dimension 0 of parameter k2__alpha is close to the specified upper bound 100000.0. Increasing the bound and calling fit again may find a better value. + warnings.warn( + GPC +1 [(clf__estimator, GaussianProcessClassifier(ke... +bts_fscore 0.73 +bts_mcc 0.38 +bts_precision 0.6 +bts_recall 0.92 +bts_accuracy 0.66 +bts_roc_auc 0.66 +bts_jaccard 0.58 +####################################################################### +# KNN: hyperparam +Best model: + {'clf__estimator': KNeighborsClassifier(metric='euclidean', n_jobs=10, n_neighbors=11, + weights='distance'), 'clf__estimator__metric': 'euclidean', 'clf__estimator__n_neighbors': 11, 'clf__estimator__weights': 'distance'} + + 1 [(clf__estimator, KNeighborsClassifier(metric=... +bts_fscore 0.69 +bts_mcc 0.26 +bts_precision 0.58 +bts_recall 0.85 +bts_accuracy 0.62 +bts_roc_auc 0.62 +bts_jaccard 0.52 + +Best model: + {'clf__estimator': KNeighborsClassifier(metric='euclidean', n_jobs=10, n_neighbors=29), 'clf__estimator__metric': 'euclidean', 'clf__estimator__n_neighbors': 29, 'clf__estimator__weights': 'uniform'} + + KNN +1 [(clf__estimator, KNeighborsClassifier(metric=... 
+bts_fscore 0.73 +bts_mcc 0.37 +bts_precision 0.6 +bts_recall 0.92 +bts_accuracy 0.65 +bts_roc_auc 0.65 +bts_jaccard 0.57 + +####################################################################### +# MLP: hyperparam +#constant lr, tried others as well, but comes back with constant +{'clf__estimator': MLPClassifier(hidden_layer_sizes=3, max_iter=500, random_state=42, + solver='lbfgs'), + 'clf__estimator__hidden_layer_sizes': 3, + 'clf__estimator__learning_rate': 'constant', + 'clf__estimator__solver': 'lbfgs'} + + +1 [(clf__estimator, MLPClassifier(hidden_layer_s... +bts_fscore 0.71 +bts_mcc 0.34 +bts_precision 0.61 +bts_recall 0.86 +bts_accuracy 0.65 +bts_roc_auc 0.65 +bts_jaccard 0.55 + +####################################################################### +# QDA: hyperparam +Best model: + {'clf__estimator': QuadraticDiscriminantAnalysis()} + + QDA +1 [(clf__estimator, QuadraticDiscriminantAnalysi... +bts_fscore 0.66 +bts_mcc 0.33 +bts_precision 0.67 +bts_recall 0.65 +bts_accuracy 0.67 +bts_roc_auc 0.67 +bts_jaccard 0.49 +####################################################################### +# RC: hyperparam +Best model: + {'clf__estimator': RidgeClassifier(alpha=0.8, random_state=42) + , 'clf__estimator__alpha': 0.8} + + Ridge Classifier +1 [(clf__estimator, RidgeClassifier(alpha=0.8, r... +bts_fscore 0.71 +bts_mcc 0.31 +bts_precision 0.59 +bts_recall 0.88 +bts_accuracy 0.64 +bts_roc_auc 0.64 +bts_jaccard 0.55 + +####################################################################### +# SVC: hyperparam +Best model: + {'clf__estimator': SVC(C=10, kernel='linear', random_state=42), 'clf__estimator__C': 10, 'clf__estimator__gamma': 'scale', 'clf__estimator__kernel': 'linear'} + + SVC +1 [(clf__estimator, SVC(C=10, kernel='linear', r... 
+bts_fscore 0.71 +bts_mcc 0.31 +bts_precision 0.57 +bts_recall 0.93 +bts_accuracy 0.62 +bts_roc_auc 0.62 +bts_jaccard 0.55 + +Best model: + {'clf__estimator': SVC(C=10, gamma='auto', random_state=42), 'clf__estimator__C': 10, 'clf__estimator__gamma': 'auto', 'clf__estimator__kernel': 'rbf'} +Best models score: + SVC +1 [(clf__estimator, SVC(C=10, gamma='auto', rand... +bts_fscore 0.71 +bts_mcc 0.32 +bts_precision 0.58 +bts_recall 0.93 +bts_accuracy 0.63 +bts_roc_auc 0.63 +bts_jaccard 0.56 + +Best model: + {'clf__estimator': SVC(C=50, gamma='auto', kernel='sigmoid', random_state=42), 'clf__estimator__C': 50, 'clf__estimator__gamma': 'auto', 'clf__estimator__kernel': 'sigmoid'} + + SVC +1 [(clf__estimator, SVC(C=50, gamma='auto', kern... +bts_fscore 0.72 +bts_mcc 0.33 +bts_precision 0.58 +bts_recall 0.93 +bts_accuracy 0.63 +bts_roc_auc 0.63 +bts_jaccard 0.56 +####################################################################### +# XGB: hyperparam + +Best model: + {'clf__estimator': XGBClassifier(base_score=None, booster=None, colsample_bylevel=None, + colsample_bynode=None, colsample_bytree=None, + enable_categorical=False, gamma=None, gpu_id=None, + importance_type=None, interaction_constraints=None, + learning_rate=0.01, max_delta_step=None, max_depth=6, + max_features='auto', min_child_weight=None, min_samples_leaf=4, + missing=nan, monotone_constraints=None, n_estimators=100, + n_jobs=10, num_parallel_tree=None, predictor=None, + random_state=42, reg_alpha=None, reg_lambda=None, + scale_pos_weight=None, subsample=None, tree_method=None, + validate_parameters=None, verbosity=None), 'clf__estimator__learning_rate': 0.01, 'clf__estimator__max_depth': 6, 'clf__estimator__max_features': 'auto', 'clf__estimator__min_samples_leaf': 4} + + XGBoost +0 best_model_params +1 [(clf__estimator, XGBClassifier(base_score=Non... +bts_fscore 0.68 +bts_mcc 0.31 +bts_precision 0.63 +bts_recall 0.73 +bts_accuracy 0.65 +bts_roc_auc 0.65 +bts_jaccard 0.51 +