copy of ML dir to an FS-only version

This commit is contained in:
Tanushree Tunstall 2022-05-22 23:30:58 +01:00
parent 52cc16f3fa
commit 80e6b3af96
23 changed files with 3115 additions and 243 deletions

View file

@ -49,8 +49,7 @@ clf2.best_estimator_.named_steps['selector'].n_features_in_
clf2.best_estimator_ #n of best features
clf2.best_params_
clf2.best_estimator_.get_params
clf2.get_feature_names()
clf2.get_feature_names(
clf3 = clf2.best_estimator_ #
@ -62,4 +61,37 @@ clf3._final_estimator.solver
fs_bmod = clf2.best_estimator_
print('\nbest model with feature selection:', fs_bmod)
#########################################################
# my data
pipe = Pipeline([
('pre', MinMaxScaler())
('selector', RFECV(LogisticRegression(**rs), cv = skf_cv, scoring = 'matthews_corrcoef'))
, ('classifier', LogisticRegression(**rs))])
search_space = [{'selector__min_features_to_select': [1,2]},
{'classifier': [LogisticRegression()],
#'classifier__C': np.logspace(0, 4, 10),
'classifier__C': [2, 2.8],
'classifier__max_iter': [100],
'classifier__penalty': ['l1', 'l2'],
'classifier__solver': ['saga']
}] #,
#{'classifier': [RandomForestClassifier(n_estimators=100)],
# 'classifier__max_depth': [5, 10, None]},
#{'classifier': [KNeighborsClassifier()],
# 'classifier__n_neighbors': [3, 7, 11],
# 'classifier__weights': ['uniform', 'distance']
#}]
clf = GridSearchCV(pipe, search_space, cv=skf_cv, scoring = mcc_score_fn, refit = 'mcc', verbose=0)
clf.fit(X, y)
clf.best_params_
clf.best_score_
tp = clf.predict(X_bts)
print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2))
print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2))

View file

@ -12,60 +12,20 @@ Created on Tue Mar 15 11:09:50 2022
@author: tanu
"""
#%% Import libs
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.base import BaseEstimator
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from xgboost import XGBClassifier
#####################
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SequentialFeatureSelector
# Attempting feature selection for LR WITHOUT ClfSwitcher Class
#%% Import libraries, data, and scoring func: UQ_pnca_ML.py
rs = {'random_state': 42}
njobs = {'n_jobs': 10}
#%%
y.to_frame().value_counts().plot(kind = 'bar')
blind_test_df['dst_mode'].to_frame().value_counts().plot(kind = 'bar')
scoring_fn = ({'accuracy' : make_scorer(accuracy_score)
, 'fscore' : make_scorer(f1_score)
, 'mcc' : make_scorer(matthews_corrcoef)
, 'precision' : make_scorer(precision_score)
, 'recall' : make_scorer(recall_score)
, 'roc_auc' : make_scorer(roc_auc_score)
, 'jaccard' : make_scorer(jaccard_score)
})
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
#%% Logistic Regression + hyperparam + FS: BaseEstimator: ClfSwitcher()
#%% Logistic Regression + hyperparam + FS: Pipeline takes GridSearchCV (not the other way round!)
model_lr = LogisticRegression(**rs)
model_rfecv = RFECV(estimator = model_lr
, cv = rskf_cv
, cv = skf_cv
#, cv = 10
, scoring = 'matthews_corrcoef'
)
# model_rfecv = SequentialFeatureSelector(estimator = model_lr
# model_sfs = SequentialFeatureSelector(estimator = model_lr
# , n_features_to_select = 'auto'
# , tol = None
# # , cv = 10
@ -74,23 +34,9 @@ model_rfecv = RFECV(estimator = model_lr
# , direction ='forward'
# , **njobs)
# param_grid = [
# { 'C': np.logspace(0, 4, 10),
# 'penalty': ['l1', 'l2'],
# 'max_iter': [100],
# 'solver': ['saga']
# }#,
# # { 'C': [1],
# # 'penalty': ['l1'],
# # 'max_iter': [100],
# # 'solver': ['saga']
# # }
# ]
param_grid2 = [
{
#'clf__estimator': [LogisticRegression(**rs)],
#'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
'C': np.logspace(0, 4, 10),
'penalty': ['none', 'l1', 'l2', 'elasticnet'],
'max_iter': list(range(100,800,100)),
@ -98,7 +44,6 @@ param_grid2 = [
},
{
#'clf__estimator': [LogisticRegression(**rs)],
#'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
'C': np.logspace(0, 4, 10),
'penalty': ['l2', 'none'],
'max_iter': list(range(100,800,100)),
@ -106,13 +51,24 @@ param_grid2 = [
},
{
#'clf__estimator': [LogisticRegression(**rs)],
#'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
'C': np.logspace(0, 4, 10),
'penalty': ['l1', 'l2'],
'max_iter': list(range(100,800,100)),
'solver': ['liblinear']
}
# lesser params for testing
# { 'C': np.logspace(0, 4, 10),
# 'penalty': ['l1', 'l2'],
# 'max_iter': [100],
# 'solver': ['saga']
# },
# { 'C': [1],
# 'penalty': ['l1'],
# 'max_iter': [100],
# 'solver': ['saga']
# }
]
#-------------------------------------------------------------------------------
@ -127,24 +83,21 @@ gscv_lr = GridSearchCV(model_lr
#------------------------------------------------------------------------------
# Create pipeline
pipeline = Pipeline([('pre', MinMaxScaler())
pipeline2 = Pipeline([('pre', MinMaxScaler())
#, ('feature_selection', sfs_selector)
, ('feature_selection', model_rfecv )
, ('clf', gscv_lr)])
# Fit
lr_fs = pipeline.fit(X,y)
pipeline2.fit(X,y)
pipeline2.predict(X_bts)
# Assigning fit and then running predict: sanity check
#lr_fs = pipeline.fit(X,y)
#lr_fs.predict(X_bts)
pipeline.predict(X_bts)
lr_fs.predict(X_bts)
test_predict = pipeline.predict(X_bts)
print(test_predict)
print(np.array(y_bts))
#y_btsf = np.array(y_bts)
print(accuracy_score(y_bts, test_predict))
print(matthews_corrcoef(y_bts, test_predict))
###############################################################################
#####################
@ -160,13 +113,12 @@ print(matthews_corrcoef(y_bts, test_predict))
#print('\nBlind test score, mcc:', ))
#test_predict = gscv_lr_fit.predict(X_bts)
test_predict = pipeline.predict(X_bts)
test_predict_fs = sfs_selector.predict(X_bts)
test_predict = pipeline2.predict(X_bts)
print(test_predict)
print(accuracy_score(y_bts, test_predict))
print(matthews_corrcoef(y_bts, test_predict))
print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, test_predict),2))
print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, test_predict),2))
# create a dict with all scores
lr_bts_dict = {#'best_model': list(gscv_lr_fit_be_mod.items())
@ -237,7 +189,7 @@ from sklearn.feature_selection import SequentialFeatureSelector
# RFE: ~ model coef or feature_importance
rfe_selector = RFECV(estimator = LogisticRegression(**rs
, penalty='l1'
, penalty='l2'
, solver='saga'
, max_iter = 100
, C= 1.0)
@ -249,6 +201,30 @@ rfe_fs = X.columns[rfe_selector.get_support()]
print('\nFeatures selected from Recursive Feature Elimination:', len(rfe_fs)
, '\nThese are:', rfe_fs)
# blind test
TEST_PREDICT = rfe_selector.predict(X_bts)
TEST_PREDICT
print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, TEST_PREDICT),2))
print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, TEST_PREDICT),2))
# add pipeline with preprocessing: changes numbers
pipe = Pipeline([
('pre', MinMaxScaler())
#, ('fs', model_rfecv)
, ('fs', rfe_selector)
, ('clf', LogisticRegression(**rs))])
pipe.fit(X,y)
tp = pipe.predict(X_bts)
print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2))
print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2))
##################################
# SFM: ~ model coef or feature_importance
sfm_selector = SelectFromModel(estimator = LogisticRegression(**rs
, penalty='l1'

View file

@ -12,6 +12,8 @@ Created on Tue Mar 15 11:09:50 2022
@author: tanu
"""
# similar to _p1 but with Clf_Switcher
#%% Import libraries, data, and scoring func: UQ_pnca_ML.py
rs = {'random_state': 42}
njobs = {'n_jobs': 10}
@ -21,25 +23,17 @@ class ClfSwitcher(BaseEstimator):
def __init__(
self,
estimator = SGDClassifier(),
#feature = RFECV(SGDClassifier())
):
"""
A Custom BaseEstimator that can switch between classifiers.
:param estimator: sklearn object - The classifier
"""
self.estimator = estimator
#self.feature = feature
def fit(self, X, y=None, **kwargs):
self.estimator.fit(X, y)
#self.feature.fit(X, y)
return self
# def transform(self, X, y=None):
# #self.estimator.transform(X, y)
# self.feature.transform(X)
# return self
def predict(self, X, y=None):
return self.estimator.predict(X)
@ -52,35 +46,49 @@ class ClfSwitcher(BaseEstimator):
#%%
parameters = [
# {'fs__feature__min_features_to_select': [1]
# , 'fs__feature__scoring': ['matthews_corrcoef']
# , 'fs__feature__cv': [skf_cv]},
{'fs__min_features_to_select': [1]
#, 'fs__scoring': ['matthews_corrcoef']
, 'fs__cv': [skf_cv]},
{
'clf__estimator': [LogisticRegression(**rs)],
#'clf__estimator__C': np.logspace(0, 4, 10),
'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'],
'clf__estimator__max_iter': list(range(100,800,100)),
'clf__estimator__solver': ['saga']
}#,
# {
# 'clf__estimator': [MODEL2(**rs)],
# 'clf__estimator': [LogisticRegression(**rs)],
# 'clf__estimator__C': np.logspace(0, 4, 10),
# 'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'],
# 'clf__estimator__max_iter': list(range(100,800,100)),
# 'clf__estimator__solver': ['saga']
# },
# {
# 'clf__estimator': [LogisticRegression(**rs)],
# 'clf__estimator__C': np.logspace(0, 4, 10),
# 'clf__estimator__penalty': ['l2', 'none'],
# 'clf__estimator__max_iter': list(range(100,800,100)),
# 'clf__estimator__solver': ['newton-cg', 'lbfgs', 'sag']
# },
# {
# 'clf__estimator': [LogisticRegression(**rs)],
# 'clf__estimator__C': np.logspace(0, 4, 10),
# 'clf__estimator__penalty': ['l1', 'l2'],
# 'clf__estimator__max_iter': list(range(100,800,100)),
# 'clf__estimator__solver': ['liblinear']
# }
{'fs__min_features_to_select': [1,2]},
{'classifier': [LogisticRegression()],
#'classifier__C': np.logspace(0, 4, 10),
'classifier__C': [2, 2.8],
'classifier__max_iter': [100],
'classifier__penalty': ['l1', 'l2'],
'classifier__solver': ['saga']
}
]
#%% Create pipeline
pipeline = Pipeline([
('pre', MinMaxScaler())
, ('fs', RFECV(LogisticRegression(**rs), scoring = 'matthews_corrcoef'))#cant be my mcc_fn
# , ('fs', ClfSwitcher())
, ('clf', ClfSwitcher())
# ('pre', MinMaxScaler())
('fs', RFECV(LogisticRegression(**rs), scoring = 'matthews_corrcoef'))#cant be my mcc_fn
#, ('clf', ClfSwitcher())
, ('classifier', ClfSwitcher())
])
#%%
@ -95,81 +103,66 @@ gscv_lr = GridSearchCV(pipeline
# Fit
gscv_lr.fit(X, y)
gscv_lr.best_estimator_
gscv_lr.best_params_
gscv_lr.best_score_
# Blind test
test_predict = gscv_lr.predict(X_bts)
print(test_predict)
print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, test_predict),2))
print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, test_predict),2))
####
gscv_lr_fit = gscv_lr.fit(X, y)
gscv_lr_fit_be_mod = gscv_lr_fit.best_params_
gscv_lr_fit_be_res = gscv_lr_fit.cv_results_
gscv_lr_fit.best_score_
#%% Grid search i.e hyperparameter tuning and refitting on mcc
param_grid2 = [
{'fs__min_features_to_select': [1]
, 'fs__cv': [skf_cv]
},
{
#'clf__estimator': [LogisticRegression(**rs)],
'clf__C': np.logspace(0, 4, 10),
'clf__penalty': ['l2'],
'clf__max_iter': list(range(100,200,100)),
#'clf__solver': ['newton-cg', 'lbfgs', 'sag']
'clf__solver': ['sag']
},
{
#'clf__estimator': [LogisticRegression(**rs)],
'clf__C': np.logspace(0, 4, 10),
'clf__penalty': ['l1', 'l2'],
'clf__max_iter': list(range(100,200,100)),
'clf__solver': ['liblinear']
}
]
# step 4: create pipeline
pipeline = Pipeline([
('pre', MinMaxScaler())
#, ('fs', model_rfecv)
, ('fs', RFECV(LogisticRegression(**rs), scoring = 'matthews_corrcoef'))
, ('clf', LogisticRegression(**rs))])
# step 5: Perform Gridsearch CV
gs_final = GridSearchCV(pipeline
, param_grid2
, cv = skf_cv
, scoring = mcc_score_fn, refit = 'mcc'
, verbose = 1
, return_train_score = False
, **njobs)
#%% Fit
mod_fs_fit = mod_fs.fit(X, y)
mod_fs_fbm = mod_fs_fit.best_params_
mod_fs_fbmr = mod_fs_fit.cv_results_
mod_fs_fbs = mod_fs_fit.best_score_
print('Best model:\n', mod_fs_fbm)
print('Best models score:\n', mod_fs_fbs, ':' , round(mod_fs_fbs, 2))
print('Best model:\n', gscv_lr_fit_be_mod)
print('Best models score:\n', gscv_lr_fit.best_score_, ':'
, round(gscv_lr_fit.best_score_, 2))
#print('\nMean test score from fit results:', round(mean(mod_fs_fbmr['mean_test_mcc']),2))
print('\nMean test score from fit results:', round(np.nanmean(mod_fs_fbmr['mean_test_mcc']),2))
print('\nMean test score from fit results:'
, round(np.nanmean(gscv_lr_fit_be_res['mean_test_mcc']),2))
#%% print selected features
# Now get the features out
all_features = gscv_lr.feature_names_in_
#all_features = gsfit.feature_names_in_
sel_features = X.columns[gscv_lr.best_estimator_.named_steps['fs'].get_support()]
n_sf = gscv_lr.best_estimator_.named_steps['fs'].n_features_
# get model name
model_name = gscv_lr.best_estimator_.named_steps['clf']
b_model_params = gscv_lr.best_params_
print('\n========================================'
, '\nRunning model:'
, '\nModel name:', model_name
, '\n==============================================='
, '\nRunning feature selection with RFECV for model'
, '\nTotal no. of features in model:', len(all_features)
, '\nThese are:\n', all_features, '\n\n'
, '\nNo of features for best model: ', n_sf
, '\nThese are:', sel_features, '\n\n'
, '\nBest Model hyperparams:', b_model_params
)
###############################################################################
#%% Blind test
######################################
# Blind test
######################################
test_predict = mod_fs_fit.predict(X_bts)
test_predict = gscv_lr.predict(X_bts)
print(test_predict)
print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, test_predict),2))
print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, test_predict),2))

View file

@ -13,50 +13,50 @@ Created on Tue Mar 15 11:09:50 2022
@author: tanu
"""
#%% Logistic Regression + hyperparam + FS: BaseEstimator: ClfSwitcher()
model_lr = LogisticRegression(**rs)
model_rfecv = RFECV(estimator = model_lr
, cv = skf_cv
#, cv = 10
, min_features_to_select = 1 # default
, scoring = 'matthews_corrcoef'
)
# model_lr = LogisticRegression(**rs)
# model_rfecv = RFECV(estimator = model_lr
# , cv = skf_cv
# #, cv = 10
# , min_features_to_select = 1 # default
# , scoring = 'matthews_corrcoef'
# )
param_grid2 = [
{
#'clf__estimator': [LogisticRegression(**rs)],
#'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
'C': np.logspace(0, 4, 10),
'penalty': ['none', 'l1', 'l2', 'elasticnet'],
'max_iter': list(range(100,800,100)),
'solver': ['saga']
},
{
#'clf__estimator': [LogisticRegression(**rs)],
#'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
'C': np.logspace(0, 4, 10),
'penalty': ['l2', 'none'],
'max_iter': list(range(100,800,100)),
'solver': ['newton-cg', 'lbfgs', 'sag']
},
{
#'clf__estimator': [LogisticRegression(**rs)],
#'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
'C': np.logspace(0, 4, 10),
'penalty': ['l1', 'l2'],
'max_iter': list(range(100,800,100)),
'solver': ['liblinear']
}
# param_grid2 = [
# {
# #'clf': [LogisticRegression(**rs)],
# #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
# 'C': np.logspace(0, 4, 10),
# 'penalty': ['none', 'l1', 'l2', 'elasticnet'],
# 'max_iter': list(range(100,800,100)),
# 'solver': ['saga']
# },
# {
# #'clf': [LogisticRegression(**rs)],
# #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
# 'C': np.logspace(0, 4, 10),
# 'penalty': ['l2', 'none'],
# 'max_iter': list(range(100,800,100)),
# 'solver': ['newton-cg', 'lbfgs', 'sag']
# },
# {
# #'clf': [LogisticRegression(**rs)],
# #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
# 'C': np.logspace(0, 4, 10),
# 'penalty': ['l1', 'l2'],
# 'max_iter': list(range(100,800,100)),
# 'solver': ['liblinear']
# }
]
#-------------------------------------------------------------------------------
# Grid search CV + FS
gscv_lr = GridSearchCV(estimator = model_lr
, param_grid = param_grid2
, scoring = mcc_score_fn, refit = 'mcc'
, cv = skf_cv
, return_train_score = False
, verbose = 3
, **njobs)
# ]
# #-------------------------------------------------------------------------------
# # Grid search CV + FS
# gscv_lr = GridSearchCV(estimator = model_lr
# , param_grid = param_grid2
# , scoring = mcc_score_fn, refit = 'mcc'
# , cv = skf_cv
# , return_train_score = False
# , verbose = 3
# , **njobs)
#------------------------------------------------------------------------------
################
@ -64,27 +64,27 @@ gscv_lr = GridSearchCV(estimator = model_lr
# Cannot get BEST model out
################
# Create pipeline
pipeline = Pipeline([('pre', MinMaxScaler())
#, ('fs', sfs_selector)
, ('fs', model_rfecv )
, ('clf', gscv_lr)])
# pipeline = Pipeline([('pre', MinMaxScaler())
# #, ('fs', sfs_selector)
# , ('fs', model_rfecv )
# , ('clf', gscv_lr)])
# Fit # dont assign fit
#lr_fs_fit = pipeline.fit(X,y)
pipeline.fit(X,y)
# # Fit # dont assign fit
# #lr_fs_fit = pipeline.fit(X,y)
# pipeline.fit(X,y)
pipeline.best_params_
# pipeline.best_params_
#https://github.com/scikit-learn/scikit-learn/issues/7536
n_fs = gscv_lr.best_estimator_.n_features_in_
n_fs
# #https://github.com/scikit-learn/scikit-learn/issues/7536
# n_fs = gscv_lr.best_estimator_.n_features_in_
# n_fs
sel_features = X.columns[pipeline.named_steps['fs'].get_support()]
print('\nNo. of features selected with RFECV for model'
, pipeline.named_steps['clf'].estimator
, ':', n_fs
, '\nThese are:', sel_features
)
# sel_features = X.columns[pipeline.named_steps['fs'].get_support()]
# print('\nNo. of features selected with RFECV for model'
# , pipeline.named_steps['clf'].estimator
# , ':', n_fs
# , '\nThese are:', sel_features
# )
##############################################################
# THIS ONE
#########
@ -106,28 +106,45 @@ param_grid2 = [
{'fs__min_features_to_select': [1]
, 'fs__cv': [skf_cv]
#, 'fs__scoring': ['matthews_corrcoef']},
#, 'fs__scoring': [mcc_score_fn]}
},
# {
# #'clf': [LogisticRegression(**rs)],
# 'clf__C': np.logspace(0, 4, 10),
# 'clf__penalty': ['none', 'l1', 'l2', 'elasticnet'],
# 'clf__max_iter': list(range(100,800,100)),
# 'clf__solver': ['saga']
# },
# {
# #'clf': [LogisticRegression(**rs)],
# 'clf__C': np.logspace(0, 4, 10),
# 'clf__penalty': ['l2', 'none'],
# 'clf__max_iter': list(range(100,800,100)),
# 'clf__solver': ['newton-cg', 'lbfgs', 'sag']
# },
# {
# #'clf': [LogisticRegression(**rs)],
# 'clf__C': np.logspace(0, 4, 10),
# 'clf__penalty': ['l1', 'l2'],
# 'clf__max_iter': list(range(100,800,100)),
# 'clf__solver': ['liblinear']
# }
{
#'clf__estimator': [LogisticRegression(**rs)],
{ #'clf': [LogisticRegression(**rs)],
'clf__C': np.logspace(0, 4, 10),
'clf__penalty': ['l2'],
'clf__max_iter': list(range(100,200,100)),
#'clf__solver': ['newton-cg', 'lbfgs', 'sag']
'clf__solver': ['sag']
},
{
#'clf__estimator': [LogisticRegression(**rs)],
'clf__C': np.logspace(0, 4, 10),
'clf__penalty': ['l1', 'l2'],
'clf__max_iter': list(range(100,200,100)),
'clf__max_iter': [100],
'clf__solver': ['liblinear']
},
{ #'clf': [LogisticRegression(**rs)],
'clf__C': np.logspace(0, 4, 10),
'clf__penalty': ['l2'],
'clf__max_iter':[100],
'clf__solver': ['saga']
}
]
# step 4: create pipeline
pipeline = Pipeline([
@ -149,12 +166,34 @@ gs_final = GridSearchCV(pipeline
gs_final.fit(X,y)
gs_final.best_params_
gs_final.best_score_
gs_final.best_estimator_
# assign the fit
gsfit = gs_final.fit(X,y)
#gsfit = gs_final.fit(X,y)
#gsfit.best_estimator_
gsfit.best_params_
gsfit.best_score_
#gsfit.best_params_
#gsfit.best_score_
test_predict = gs_final.predict(X_bts)
print(test_predict)
print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, test_predict),2))
print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, test_predict),2))
# Now get the features out
all_features = gs_final.feature_names_in_
@ -163,7 +202,6 @@ all_features = gs_final.feature_names_in_
sel_features = X.columns[gs_final.best_estimator_.named_steps['fs'].get_support()]
n_sf = gs_final.best_estimator_.named_steps['fs'].n_features_
# get model name
model_name = gs_final.best_estimator_.named_steps['clf']
b_model_params = gs_final.best_params_
@ -180,3 +218,36 @@ print('\n========================================'
, '\nBest Model hyperparams:', b_model_params
)
######################################
# Blind test
######################################
# See how it does on the BLIND test
#print('\nBlind test score, mcc:', ))
#test_predict = gscv_lr_fit.predict(X_bts)
test_predict = gs_final.predict(X_bts)
print(test_predict)
print(accuracy_score(y_bts, test_predict))
print(matthews_corrcoef(y_bts, test_predict))
# create a dict with all scores
lr_bts_dict = {#'best_model': list(gscv_lr_fit_be_mod.items())
'bts_fscore':None
, 'bts_mcc':None
, 'bts_precision':None
, 'bts_recall':None
, 'bts_accuracy':None
, 'bts_roc_auc':None
, 'bts_jaccard':None }
lr_bts_dict
lr_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2)
lr_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2)
lr_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2)
lr_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2)
lr_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2)
lr_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2)
lr_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
lr_bts_dict

131
uq_ml_models_FS/UQ_ABC.py Normal file
View file

@ -0,0 +1,131 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 18 06:03:24 2022
@author: tanu
"""
#%% AdaBoostClassifier + hyperparam: BaseEstimator: ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    """Thin wrapper that lets a grid search swap the classifier itself.

    The wrapped classifier is exposed as the ``estimator`` parameter, so a
    parameter grid can set ``<step>__estimator`` to pick the model and
    ``<step>__estimator__<param>`` to tune it.
    """

    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.

        :param estimator: sklearn object - The classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped classifier and return ``self``.

        :param X: training features, passed straight to the classifier
        :param y: training labels, passed straight to the classifier
        :param kwargs: extra fit parameters, forwarded to the classifier's
            ``fit`` instead of being silently dropped
        """
        self.estimator.fit(X, y, **kwargs)
        return self

    def predict(self, X, y=None):
        """Return the wrapped classifier's predictions for X (``y`` is unused)."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped classifier's ``predict_proba`` output for X."""
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped classifier's own ``score`` on X, y."""
        return self.estimator.score(X, y)
# Search space for the 'clf' step: the classifier to plug into ClfSwitcher
# plus the hyperparameters to try for it
parameters = [
    {
        'clf__estimator': [AdaBoostClassifier(**rs)]
        , 'clf__estimator__n_estimators': [1, 2, 5, 10]
        #, 'clf__estimator__base_estimator' : ['SVC']
        #, 'clf__estimator___splitter' : ["best", "random"]
    }
]

# Create pipeline
pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

# Grid search i.e hyperparameter tuning and refitting on mcc
gscv_abc = GridSearchCV(pipeline
                        , parameters
                        #, scoring = 'matthews_corrcoef', refit = 'matthews_corrcoef'
                        , scoring = mcc_score_fn, refit = 'mcc'
                        , cv = skf_cv
                        , **njobs
                        , return_train_score = False
                        , verbose = 3)

# Fit
gscv_abc_fit = gscv_abc.fit(X, y)
gscv_abc_fit_be_mod = gscv_abc_fit.best_params_
gscv_abc_fit_be_res = gscv_abc_fit.cv_results_

print('Best model:\n', gscv_abc_fit_be_mod)
print('Best models score:\n', gscv_abc_fit.best_score_, ':' , round(gscv_abc_fit.best_score_, 2))

# Average over every candidate in the grid. np.nanmean is used (rather than a
# plain mean) so that a candidate with a NaN score does not turn the whole
# summary into NaN.
print('\nMean test score from fit results:', round(np.nanmean(gscv_abc_fit_be_res['mean_test_mcc']),2))
######################################
# Blind test
######################################
# Predict the blind-test set with the model refitted by the grid search
test_predict = gscv_abc_fit.predict(X_bts)
y_btsf = np.array(y_bts)

print(test_predict)
print(y_btsf)
print(accuracy_score(y_btsf, test_predict))
print(matthews_corrcoef(y_btsf, test_predict))

# Blind-test scores, each rounded to 2 d.p., keyed by metric
abc_bts_dict = {
    'bts_fscore': round(f1_score(y_bts, test_predict), 2),
    'bts_mcc': round(matthews_corrcoef(y_bts, test_predict), 2),
    'bts_precision': round(precision_score(y_bts, test_predict), 2),
    'bts_recall': round(recall_score(y_bts, test_predict), 2),
    'bts_accuracy': round(accuracy_score(y_bts, test_predict), 2),
    'bts_roc_auc': round(roc_auc_score(y_bts, test_predict), 2),
    'bts_jaccard': round(jaccard_score(y_bts, test_predict), 2),
}

# One row per metric, single column named after the model
abc_bts_df = pd.DataFrame.from_dict(abc_bts_dict, orient='index', columns=['ABC'])
print(abc_bts_df)

# Two-row frame: row 0 holds the label text, row 1 the best parameters
model_params = pd.Series(['best_model_params', list(gscv_abc_fit_be_mod.items())])
model_params_df = model_params.to_frame(name='ABC')

# Stack best-model params on top of the scores
abc_output = pd.concat([model_params_df, abc_bts_df], axis=0)

# Remove the label row (index 0); the params row keeps its positional index 1
abc_df = abc_output.drop(index=[0])
#FIXME: tidy the index of the formatted df
###############################################################################

134
uq_ml_models_FS/UQ_BC.py Normal file
View file

@ -0,0 +1,134 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 18 06:03:24 2022
@author: tanu
"""
#%% BaggingClassifier + hyperparam: BaseEstimator: ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    """Thin wrapper that lets a grid search swap the classifier itself.

    The wrapped classifier is exposed as the ``estimator`` parameter, so a
    parameter grid can set ``<step>__estimator`` to pick the model and
    ``<step>__estimator__<param>`` to tune it.
    """

    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.

        :param estimator: sklearn object - The classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped classifier and return ``self``.

        :param X: training features, passed straight to the classifier
        :param y: training labels, passed straight to the classifier
        :param kwargs: extra fit parameters, forwarded to the classifier's
            ``fit`` instead of being silently dropped
        """
        self.estimator.fit(X, y, **kwargs)
        return self

    def predict(self, X, y=None):
        """Return the wrapped classifier's predictions for X (``y`` is unused)."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped classifier's ``predict_proba`` output for X."""
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped classifier's own ``score`` on X, y."""
        return self.estimator.score(X, y)
# Search space for the 'clf' step: the classifier to plug into ClfSwitcher
# plus the hyperparameters to try for it
parameters = [
    {
        'clf__estimator': [BaggingClassifier(**rs
                                             , **njobs
                                             , bootstrap = True
                                             , oob_score = True)]
        , 'clf__estimator__n_estimators' : [10, 25, 50, 100, 150, 200, 500, 700, 1000]
        # If None, then the base estimator is a DecisionTreeClassifier.
        #, 'clf__estimator__base_estimator' : ['None', 'SVC()', 'KNeighborsClassifier()']# if none, DT is used
    }
]

# Create pipeline
pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

# Grid search i.e hyperparameter tuning and refitting on mcc
gscv_bc = GridSearchCV(pipeline
                       , parameters
                       #, scoring = 'f1', refit = 'f1'
                       , scoring = mcc_score_fn, refit = 'mcc'
                       , cv = skf_cv
                       , **njobs
                       , return_train_score = False
                       , verbose = 3)

# Fit
gscv_bc_fit = gscv_bc.fit(X, y)
gscv_bc_fit_be_mod = gscv_bc_fit.best_params_
gscv_bc_fit_be_res = gscv_bc_fit.cv_results_

print('Best model:\n', gscv_bc_fit_be_mod)
print('Best models score:\n', gscv_bc_fit.best_score_, ':' , round(gscv_bc_fit.best_score_, 2))

# Average over every candidate in the grid. np.nanmean is used (rather than a
# plain mean) so that a candidate with a NaN score does not turn the whole
# summary into NaN.
print('\nMean test score from fit results:', round(np.nanmean(gscv_bc_fit_be_res['mean_test_mcc']),2))
######################################
# Blind test
######################################
# Predict the blind-test set with the model refitted by the grid search
test_predict = gscv_bc_fit.predict(X_bts)
y_btsf = np.array(y_bts)

print(test_predict)
print(y_btsf)
print(accuracy_score(y_btsf, test_predict))
print(matthews_corrcoef(y_btsf, test_predict))

# Blind-test scores, each rounded to 2 d.p., keyed by metric
bc_bts_dict = {
    'bts_fscore': round(f1_score(y_bts, test_predict), 2),
    'bts_mcc': round(matthews_corrcoef(y_bts, test_predict), 2),
    'bts_precision': round(precision_score(y_bts, test_predict), 2),
    'bts_recall': round(recall_score(y_bts, test_predict), 2),
    'bts_accuracy': round(accuracy_score(y_bts, test_predict), 2),
    'bts_roc_auc': round(roc_auc_score(y_bts, test_predict), 2),
    'bts_jaccard': round(jaccard_score(y_bts, test_predict), 2),
}

# One row per metric, single column named after the model
bc_bts_df = pd.DataFrame.from_dict(bc_bts_dict, orient='index', columns=['BC'])
print(bc_bts_df)

# Two-row frame: row 0 holds the label text, row 1 the best parameters
model_params = pd.Series(['best_model_params', list(gscv_bc_fit_be_mod.items())])
model_params_df = model_params.to_frame(name='BC')

# Stack best-model params on top of the scores
bc_output = pd.concat([model_params_df, bc_bts_df], axis=0)

# Remove the label row (index 0); the params row keeps its positional index 1
bc_df = bc_output.drop(index=[0])
#FIXME: tidy the index of the formatted df
###############################################################################

132
uq_ml_models_FS/UQ_BNB.py Normal file
View file

@ -0,0 +1,132 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 18 06:03:24 2022
@author: tanu
"""
#%% BernoulliNB + hyperparam: BaseEstimator: ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    """Thin wrapper that lets a grid search swap the classifier itself.

    The wrapped classifier is exposed as the ``estimator`` parameter, so a
    parameter grid can set ``<step>__estimator`` to pick the model and
    ``<step>__estimator__<param>`` to tune it.
    """

    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.

        :param estimator: sklearn object - The classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped classifier and return ``self``.

        :param X: training features, passed straight to the classifier
        :param y: training labels, passed straight to the classifier
        :param kwargs: extra fit parameters, forwarded to the classifier's
            ``fit`` instead of being silently dropped
        """
        self.estimator.fit(X, y, **kwargs)
        return self

    def predict(self, X, y=None):
        """Return the wrapped classifier's predictions for X (``y`` is unused)."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped classifier's ``predict_proba`` output for X."""
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped classifier's own ``score`` on X, y."""
        return self.estimator.score(X, y)
# Search space for the 'clf' step: the classifier to plug into ClfSwitcher
# plus the hyperparameters to try for it
parameters = [
    {
        'clf__estimator': [BernoulliNB()]
        , 'clf__estimator__alpha': [1, 0]
        , 'clf__estimator__binarize':[None, 0]
        , 'clf__estimator__fit_prior': [True]
        , 'clf__estimator__class_prior': [None]
    }
]

# Create pipeline
pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

# Grid search i.e hyperparameter tuning and refitting on mcc
gscv_bnb = GridSearchCV(pipeline
                        , parameters
                        #, scoring = 'f1', refit = 'f1'
                        , scoring = mcc_score_fn, refit = 'mcc'
                        , cv = skf_cv
                        , **njobs
                        , return_train_score = False
                        , verbose = 3)

# Fit
gscv_bnb_fit = gscv_bnb.fit(X, y)
gscv_bnb_fit_be_mod = gscv_bnb_fit.best_params_
gscv_bnb_fit_be_res = gscv_bnb_fit.cv_results_

print('Best model:\n', gscv_bnb_fit_be_mod)
print('Best models score:\n', gscv_bnb_fit.best_score_, ':' , round(gscv_bnb_fit.best_score_, 2))

# Average over every candidate in the grid. np.nanmean is used (rather than a
# plain mean) so that a candidate with a NaN score does not turn the whole
# summary into NaN.
print('\nMean test score from fit results:', round(np.nanmean(gscv_bnb_fit_be_res['mean_test_mcc']),2))
######################################
# Blind test
######################################
# Predict the blind-test set with the model refitted by the grid search
test_predict = gscv_bnb_fit.predict(X_bts)
y_btsf = np.array(y_bts)

print(test_predict)
print(y_btsf)
print(accuracy_score(y_btsf, test_predict))
print(matthews_corrcoef(y_btsf, test_predict))

# Blind-test scores, each rounded to 2 d.p., keyed by metric
bnb_bts_dict = {
    'bts_fscore': round(f1_score(y_bts, test_predict), 2),
    'bts_mcc': round(matthews_corrcoef(y_bts, test_predict), 2),
    'bts_precision': round(precision_score(y_bts, test_predict), 2),
    'bts_recall': round(recall_score(y_bts, test_predict), 2),
    'bts_accuracy': round(accuracy_score(y_bts, test_predict), 2),
    'bts_roc_auc': round(roc_auc_score(y_bts, test_predict), 2),
    'bts_jaccard': round(jaccard_score(y_bts, test_predict), 2),
}

# One row per metric, single column named after the model
bnb_bts_df = pd.DataFrame.from_dict(bnb_bts_dict, orient='index', columns=['BNB'])
print(bnb_bts_df)

# Two-row frame: row 0 holds the label text, row 1 the best parameters
model_params = pd.Series(['best_model_params', list(gscv_bnb_fit_be_mod.items())])
model_params_df = model_params.to_frame(name='BNB')

# Stack best-model params on top of the scores
bnb_output = pd.concat([model_params_df, bnb_bts_df], axis=0)

# Remove the label row (index 0); the params row keeps its positional index 1
bnb_df = bnb_output.drop(index=[0])
#FIXME: tidy the index of the formatted df
###############################################################################

134
uq_ml_models_FS/UQ_DT.py Normal file
View file

@ -0,0 +1,134 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 18 06:03:24 2022
@author: tanu
"""
#%% DecisionTree + hyperparam: BaseEstimator: ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    """Wrapper estimator whose classifier can be swapped via ``estimator``.

    Every method forwards to the method of the same name on the wrapped
    classifier, which lets a parameter grid set ``estimator`` itself.
    """

    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped classifier on (X, y) and return this wrapper.

        Extra keyword arguments are accepted but are not forwarded.
        """
        wrapped = self.estimator
        wrapped.fit(X, y)
        return self

    def predict(self, X, y=None):
        """Return the wrapped classifier's predictions for X (y is unused)."""
        wrapped = self.estimator
        return wrapped.predict(X)

    def predict_proba(self, X):
        """Return the wrapped classifier's ``predict_proba`` output for X."""
        wrapped = self.estimator
        return wrapped.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped classifier's own ``score`` for (X, y)."""
        wrapped = self.estimator
        return wrapped.score(X, y)
# Hyperparameter grid for the decision tree held inside ClfSwitcher.
# Keys are '<pipeline step>__estimator__<param>' so that GridSearchCV can
# reach the classifier nested in the 'clf' step.
# NOTE(review): the 'log_loss' criterion needs scikit-learn >= 1.1 — confirm
# the installed version, otherwise those candidates will fail to fit.
parameters = [
    {
        'clf__estimator': [DecisionTreeClassifier(**rs)]
        , 'clf__estimator__max_depth': [None, 2, 4, 6, 8, 10, 12, 16, 20]
        , 'clf__estimator__class_weight':['balanced']
        , 'clf__estimator__criterion': ['gini', 'entropy', 'log_loss']
        , 'clf__estimator__max_features': [None, 'sqrt', 'log2']
        , 'clf__estimator__min_samples_leaf': [1, 2, 3, 4, 5, 10]
        , 'clf__estimator__min_samples_split': [2, 5, 15, 20]
    }
]

# Create pipeline: scale features, then the switchable classifier
pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

# Grid search i.e hyperparameter tuning and refitting on mcc
gscv_dt = GridSearchCV(pipeline
                       , parameters
                       #, scoring = 'f1', refit = 'f1'
                       , scoring = mcc_score_fn, refit = 'mcc'
                       , cv = skf_cv
                       , **njobs
                       , return_train_score = False
                       , verbose = 3)

# Fit
gscv_dt_fit = gscv_dt.fit(X, y)
gscv_dt_fit_be_mod = gscv_dt_fit.best_params_
gscv_dt_fit_be_res = gscv_dt_fit.cv_results_
print('Best model:\n', gscv_dt_fit_be_mod)
print('Best models score:\n', gscv_dt_fit.best_score_, ':' , round(gscv_dt_fit.best_score_, 2))
# Report the mean CV test MCC once, with np.nanmean so NaN entries in
# cv_results_ are ignored. The previous extra print used a bare mean() on the
# misspelled name 'gscv_dt_fit_be_re', which raised NameError.
print('\nMean test score from fit results:', round(np.nanmean(gscv_dt_fit_be_res['mean_test_mcc']),2))
######################################
# Blind test
######################################
# Predict on the blind test set with the fitted grid search object
test_predict = gscv_dt_fit.predict(X_bts)
print(test_predict)
y_btsf = np.array(y_bts)
print(y_btsf)
print(accuracy_score(y_btsf, test_predict))
print(matthews_corrcoef(y_btsf, test_predict))

# Blind-test scores, each rounded to 2 d.p.
dt_bts_dict = {
    'bts_fscore'     : round(f1_score(y_bts, test_predict), 2),
    'bts_mcc'        : round(matthews_corrcoef(y_bts, test_predict), 2),
    'bts_precision'  : round(precision_score(y_bts, test_predict), 2),
    'bts_recall'     : round(recall_score(y_bts, test_predict), 2),
    'bts_accuracy'   : round(accuracy_score(y_bts, test_predict), 2),
    'bts_roc_auc'    : round(roc_auc_score(y_bts, test_predict), 2),
    'bts_jaccard'    : round(jaccard_score(y_bts, test_predict), 2),
}

# One-column frame of scores, indexed by score name
dt_bts_df = pd.DataFrame.from_dict(dt_bts_dict, orient='index', columns=['DT'])
print(dt_bts_df)

# Two-row frame: row 0 is the label 'best_model_params', row 1 is the list
# of (param, value) pairs from the best model
model_params = pd.Series(['best_model_params', list(gscv_dt_fit_be_mod.items())])
model_params_df = model_params.to_frame(name='DT')

# Stack the params frame on top of the scores frame
dt_output = pd.concat([model_params_df, dt_bts_df], axis=0)

# Drop the label row (index 0); the params row (index 1) stays
dt_df = dt_output.drop(index=[0])
#FIXME: tidy the index of the formatted df
###############################################################################

134
uq_ml_models_FS/UQ_GBC.py Normal file
View file

@ -0,0 +1,134 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 18 06:03:24 2022
@author: tanu
"""
#%% GradientBoosting + hyperparam: BaseEstimator: ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    """Wrapper estimator whose classifier can be swapped via ``estimator``.

    Every method forwards to the method of the same name on the wrapped
    classifier, which lets a parameter grid set ``estimator`` itself.
    """

    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped classifier on (X, y) and return this wrapper.

        Extra keyword arguments are accepted but are not forwarded.
        """
        wrapped = self.estimator
        wrapped.fit(X, y)
        return self

    def predict(self, X, y=None):
        """Return the wrapped classifier's predictions for X (y is unused)."""
        wrapped = self.estimator
        return wrapped.predict(X)

    def predict_proba(self, X):
        """Return the wrapped classifier's ``predict_proba`` output for X."""
        wrapped = self.estimator
        return wrapped.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped classifier's own ``score`` for (X, y)."""
        wrapped = self.estimator
        return wrapped.score(X, y)
# Hyperparameter grid for the gradient boosting classifier held inside
# ClfSwitcher. Keys are '<pipeline step>__estimator__<param>' so that
# GridSearchCV can reach the classifier nested in the 'clf' step.
parameters = [
    {
        'clf__estimator': [GradientBoostingClassifier(**rs)]
        # The key below used to appear twice in this dict literal; Python keeps
        # only the last value, so [10, 100, 1000] was the grid actually
        # searched. The discarded alternative is kept here as a comment.
        #, 'clf__estimator__n_estimators' : [10, 100, 200, 500, 1000]
        , 'clf__estimator__n_estimators' : [10, 100, 1000]
        , 'clf__estimator__learning_rate': [0.001, 0.01, 0.1]
        , 'clf__estimator__subsample'    : [0.5, 0.7, 1.0]
        , 'clf__estimator__max_depth'    : [3, 7, 9]
    }
]

# Create pipeline: scale features, then the switchable classifier
pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

# Grid search i.e hyperparameter tuning and refitting on mcc
gscv_gbc = GridSearchCV(pipeline
                        , parameters
                        #, scoring = 'f1', refit = 'f1'
                        , scoring = mcc_score_fn, refit = 'mcc'
                        , cv = skf_cv
                        , **njobs
                        , return_train_score = False
                        , verbose = 3)

# Fit
gscv_gbc_fit = gscv_gbc.fit(X, y)
gscv_gbc_fit_be_mod = gscv_gbc_fit.best_params_
gscv_gbc_fit_be_res = gscv_gbc_fit.cv_results_
print('Best model:\n', gscv_gbc_fit_be_mod)
print('Best models score:\n', gscv_gbc_fit.best_score_, ':' , round(gscv_gbc_fit.best_score_, 2))
# Report the mean CV test MCC once, with np.nanmean so NaN entries in
# cv_results_ are ignored (the duplicate bare mean() print was removed).
print('\nMean test score from fit results:', round(np.nanmean(gscv_gbc_fit_be_res['mean_test_mcc']),2))
######################################
# Blind test
######################################
# Predict on the blind test set with the fitted grid search object
test_predict = gscv_gbc_fit.predict(X_bts)
print(test_predict)
y_btsf = np.array(y_bts)
print(y_btsf)
print(accuracy_score(y_btsf, test_predict))
print(matthews_corrcoef(y_btsf, test_predict))

# Blind-test scores, each rounded to 2 d.p.
gbc_bts_dict = {
    'bts_fscore'     : round(f1_score(y_bts, test_predict), 2),
    'bts_mcc'        : round(matthews_corrcoef(y_bts, test_predict), 2),
    'bts_precision'  : round(precision_score(y_bts, test_predict), 2),
    'bts_recall'     : round(recall_score(y_bts, test_predict), 2),
    'bts_accuracy'   : round(accuracy_score(y_bts, test_predict), 2),
    'bts_roc_auc'    : round(roc_auc_score(y_bts, test_predict), 2),
    'bts_jaccard'    : round(jaccard_score(y_bts, test_predict), 2),
}

# One-column frame of scores, indexed by score name
gbc_bts_df = pd.DataFrame.from_dict(gbc_bts_dict, orient='index', columns=['GBC'])
print(gbc_bts_df)

# Two-row frame: row 0 is the label 'best_model_params', row 1 is the list
# of (param, value) pairs from the best model
model_params = pd.Series(['best_model_params', list(gscv_gbc_fit_be_mod.items())])
model_params_df = model_params.to_frame(name='GBC')

# Stack the params frame on top of the scores frame
gbc_output = pd.concat([model_params_df, gbc_bts_df], axis=0)

# Drop the label row (index 0); the params row (index 1) stays
gbc_df = gbc_output.drop(index=[0])
#FIXME: tidy the index of the formatted df
###############################################################################

130
uq_ml_models_FS/UQ_GNB.py Normal file
View file

@ -0,0 +1,130 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 18 06:03:24 2022
@author: tanu
"""
#%% Gaussian Naive Bayes + hyperparam: BaseEstimator: ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    """Wrapper estimator whose classifier can be swapped via ``estimator``.

    Every method forwards to the method of the same name on the wrapped
    classifier, which lets a parameter grid set ``estimator`` itself.
    """

    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped classifier on (X, y) and return this wrapper.

        Extra keyword arguments are accepted but are not forwarded.
        """
        wrapped = self.estimator
        wrapped.fit(X, y)
        return self

    def predict(self, X, y=None):
        """Return the wrapped classifier's predictions for X (y is unused)."""
        wrapped = self.estimator
        return wrapped.predict(X)

    def predict_proba(self, X):
        """Return the wrapped classifier's ``predict_proba`` output for X."""
        wrapped = self.estimator
        return wrapped.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped classifier's own ``score`` for (X, y)."""
        wrapped = self.estimator
        return wrapped.score(X, y)
# Hyperparameter grid for Gaussian naive Bayes held inside ClfSwitcher.
# Keys are '<pipeline step>__estimator__<param>' so that GridSearchCV can
# reach the classifier nested in the 'clf' step.
parameters = [
    {
        'clf__estimator': [GaussianNB()]
        , 'clf__estimator__priors': [None]
        # 100 log-spaced values from 1 down to 1e-9
        , 'clf__estimator__var_smoothing': np.logspace(0,-9, num=100)
    }
]

# Create pipeline: scale features, then the switchable classifier
pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

# Grid search i.e hyperparameter tuning and refitting on mcc
gscv_gnb = GridSearchCV(pipeline
                        , parameters
                        #, scoring = 'f1', refit = 'f1'
                        , scoring = mcc_score_fn, refit = 'mcc'
                        , cv = skf_cv
                        , **njobs
                        , return_train_score = False
                        , verbose = 3)

# Fit
gscv_gnb_fit = gscv_gnb.fit(X, y)
gscv_gnb_fit_be_mod = gscv_gnb_fit.best_params_
gscv_gnb_fit_be_res = gscv_gnb_fit.cv_results_
print('Best model:\n', gscv_gnb_fit_be_mod)
print('Best models score:\n', gscv_gnb_fit.best_score_, ':' , round(gscv_gnb_fit.best_score_, 2))
# Report the mean CV test MCC once, with np.nanmean so NaN entries in
# cv_results_ are ignored (the duplicate bare mean() print was removed).
print('\nMean test score from fit results:', round(np.nanmean(gscv_gnb_fit_be_res['mean_test_mcc']),2))
######################################
# Blind test
######################################
# Predict on the blind test set with the fitted grid search object
test_predict = gscv_gnb_fit.predict(X_bts)
print(test_predict)
y_btsf = np.array(y_bts)
print(y_btsf)
print(accuracy_score(y_btsf, test_predict))
print(matthews_corrcoef(y_btsf, test_predict))

# Blind-test scores, each rounded to 2 d.p.
gnb_bts_dict = {
    'bts_fscore'     : round(f1_score(y_bts, test_predict), 2),
    'bts_mcc'        : round(matthews_corrcoef(y_bts, test_predict), 2),
    'bts_precision'  : round(precision_score(y_bts, test_predict), 2),
    'bts_recall'     : round(recall_score(y_bts, test_predict), 2),
    'bts_accuracy'   : round(accuracy_score(y_bts, test_predict), 2),
    'bts_roc_auc'    : round(roc_auc_score(y_bts, test_predict), 2),
    'bts_jaccard'    : round(jaccard_score(y_bts, test_predict), 2),
}

# One-column frame of scores, indexed by score name
gnb_bts_df = pd.DataFrame.from_dict(gnb_bts_dict, orient='index', columns=['GNB'])
print(gnb_bts_df)

# Two-row frame: row 0 is the label 'best_model_params', row 1 is the list
# of (param, value) pairs from the best model
model_params = pd.Series(['best_model_params', list(gscv_gnb_fit_be_mod.items())])
model_params_df = model_params.to_frame(name='GNB')

# Stack the params frame on top of the scores frame
gnb_output = pd.concat([model_params_df, gnb_bts_df], axis=0)

# Drop the label row (index 0); the params row (index 1) stays
gnb_df = gnb_output.drop(index=[0])
#FIXME: tidy the index of the formatted df
###############################################################################

130
uq_ml_models_FS/UQ_GPC.py Normal file
View file

@ -0,0 +1,130 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 18 06:03:24 2022
@author: tanu
"""
#%% Gaussian Process + hyperparam: BaseEstimator: ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    """Wrapper estimator whose classifier can be swapped via ``estimator``.

    Every method forwards to the method of the same name on the wrapped
    classifier, which lets a parameter grid set ``estimator`` itself.
    """

    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped classifier on (X, y) and return this wrapper.

        Extra keyword arguments are accepted but are not forwarded.
        """
        wrapped = self.estimator
        wrapped.fit(X, y)
        return self

    def predict(self, X, y=None):
        """Return the wrapped classifier's predictions for X (y is unused)."""
        wrapped = self.estimator
        return wrapped.predict(X)

    def predict_proba(self, X):
        """Return the wrapped classifier's ``predict_proba`` output for X."""
        wrapped = self.estimator
        return wrapped.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped classifier's own ``score`` for (X, y)."""
        wrapped = self.estimator
        return wrapped.score(X, y)
# Hyperparameter grid for the Gaussian process classifier held inside
# ClfSwitcher: only the kernel is varied. Keys are
# '<pipeline step>__estimator__<param>' so that GridSearchCV can reach the
# classifier nested in the 'clf' step.
parameters = [
    {
        'clf__estimator': [GaussianProcessClassifier(**rs)]
        , 'clf__estimator__kernel': [1*RBF(), 1*DotProduct(), 1*Matern(), 1*RationalQuadratic(), 1*WhiteKernel()]
    }
]

# Create pipeline: scale features, then the switchable classifier
pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

# Grid search i.e hyperparameter tuning and refitting on mcc
gscv_gpc = GridSearchCV(pipeline
                        , parameters
                        #, scoring = 'f1', refit = 'f1'
                        , scoring = mcc_score_fn, refit = 'mcc'
                        , cv = skf_cv
                        , **njobs
                        , return_train_score = False
                        , verbose = 3)

# Fit
gscv_gpc_fit = gscv_gpc.fit(X, y)
gscv_gpc_fit_be_mod = gscv_gpc_fit.best_params_
gscv_gpc_fit_be_res = gscv_gpc_fit.cv_results_
print('Best model:\n', gscv_gpc_fit_be_mod)
print('Best models score:\n', gscv_gpc_fit.best_score_, ':' , round(gscv_gpc_fit.best_score_, 2))
# Report the mean CV test MCC once, with np.nanmean so NaN entries in
# cv_results_ are ignored. The previous extra print used a bare mean() on the
# misspelled name 'gscv_gpc_fit_be_re', which raised NameError.
print('\nMean test score from fit results:', round(np.nanmean(gscv_gpc_fit_be_res['mean_test_mcc']),2))
######################################
# Blind test
######################################
# Predict on the blind test set with the fitted grid search object
test_predict = gscv_gpc_fit.predict(X_bts)
print(test_predict)
y_btsf = np.array(y_bts)
print(y_btsf)
print(accuracy_score(y_btsf, test_predict))
print(matthews_corrcoef(y_btsf, test_predict))

# Blind-test scores, each rounded to 2 d.p.
gpc_bts_dict = {
    'bts_fscore'     : round(f1_score(y_bts, test_predict), 2),
    'bts_mcc'        : round(matthews_corrcoef(y_bts, test_predict), 2),
    'bts_precision'  : round(precision_score(y_bts, test_predict), 2),
    'bts_recall'     : round(recall_score(y_bts, test_predict), 2),
    'bts_accuracy'   : round(accuracy_score(y_bts, test_predict), 2),
    'bts_roc_auc'    : round(roc_auc_score(y_bts, test_predict), 2),
    'bts_jaccard'    : round(jaccard_score(y_bts, test_predict), 2),
}

# One-column frame of scores, indexed by score name
gpc_bts_df = pd.DataFrame.from_dict(gpc_bts_dict, orient='index', columns=['GPC'])
print(gpc_bts_df)

# Two-row frame: row 0 is the label 'best_model_params', row 1 is the list
# of (param, value) pairs from the best model
model_params = pd.Series(['best_model_params', list(gscv_gpc_fit_be_mod.items())])
model_params_df = model_params.to_frame(name='GPC')

# Stack the params frame on top of the scores frame
gpc_output = pd.concat([model_params_df, gpc_bts_df], axis=0)

# Drop the label row (index 0); the params row (index 1) stays
gpc_df = gpc_output.drop(index=[0])
#FIXME: tidy the index of the formatted df
###############################################################################

133
uq_ml_models_FS/UQ_KNN.py Normal file
View file

@ -0,0 +1,133 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 18 06:03:24 2022
@author: tanu
"""
#%% K-Nearest Neighbours + hyperparam: BaseEstimator: ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    """Wrapper estimator whose classifier can be swapped via ``estimator``.

    Every method forwards to the method of the same name on the wrapped
    classifier, which lets a parameter grid set ``estimator`` itself.
    """

    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped classifier on (X, y) and return this wrapper.

        Extra keyword arguments are accepted but are not forwarded.
        """
        wrapped = self.estimator
        wrapped.fit(X, y)
        return self

    def predict(self, X, y=None):
        """Return the wrapped classifier's predictions for X (y is unused)."""
        wrapped = self.estimator
        return wrapped.predict(X)

    def predict_proba(self, X):
        """Return the wrapped classifier's ``predict_proba`` output for X."""
        wrapped = self.estimator
        return wrapped.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped classifier's own ``score`` for (X, y)."""
        wrapped = self.estimator
        return wrapped.score(X, y)
# Hyperparameter grid for k-nearest neighbours held inside ClfSwitcher.
# Keys are '<pipeline step>__estimator__<param>' so that GridSearchCV can
# reach the classifier nested in the 'clf' step.
# NOTE(review): n_jobs from `njobs` is given both to the classifier here and
# to GridSearchCV below — confirm the nested parallelism is intended.
parameters = [
    {
        'clf__estimator': [KNeighborsClassifier(**njobs)]
        # odd neighbour counts 21, 23, ..., 49
        , 'clf__estimator__n_neighbors': range(21, 51, 2)
        #, 'clf__estimator__n_neighbors': [5, 7, 11]
        , 'clf__estimator__metric' : ['euclidean', 'manhattan', 'minkowski']
        , 'clf__estimator__weights' : ['uniform', 'distance']
    }
]

# Create pipeline: scale features, then the switchable classifier
pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

# Grid search i.e hyperparameter tuning and refitting on mcc
gscv_knn = GridSearchCV(pipeline
                        , parameters
                        #, scoring = 'f1', refit = 'f1'
                        , scoring = mcc_score_fn, refit = 'mcc'
                        , cv = skf_cv
                        , **njobs
                        , return_train_score = False
                        , verbose = 3)

# Fit
gscv_knn_fit = gscv_knn.fit(X, y)
gscv_knn_fit_be_mod = gscv_knn_fit.best_params_
gscv_knn_fit_be_res = gscv_knn_fit.cv_results_
print('Best model:\n', gscv_knn_fit_be_mod)
print('Best models score:\n', gscv_knn_fit.best_score_, ':' , round(gscv_knn_fit.best_score_, 2))
# Report the mean CV test MCC once, with np.nanmean so NaN entries in
# cv_results_ are ignored (the duplicate bare mean() print was removed).
print('\nMean test score from fit results:', round(np.nanmean(gscv_knn_fit_be_res['mean_test_mcc']),2))
######################################
# Blind test
######################################
# Predict on the blind test set with the fitted grid search object
test_predict = gscv_knn_fit.predict(X_bts)
print(test_predict)
y_btsf = np.array(y_bts)
print(y_btsf)
print(accuracy_score(y_btsf, test_predict))
print(matthews_corrcoef(y_btsf, test_predict))

# Blind-test scores, each rounded to 2 d.p.
knn_bts_dict = {
    'bts_fscore'     : round(f1_score(y_bts, test_predict), 2),
    'bts_mcc'        : round(matthews_corrcoef(y_bts, test_predict), 2),
    'bts_precision'  : round(precision_score(y_bts, test_predict), 2),
    'bts_recall'     : round(recall_score(y_bts, test_predict), 2),
    'bts_accuracy'   : round(accuracy_score(y_bts, test_predict), 2),
    'bts_roc_auc'    : round(roc_auc_score(y_bts, test_predict), 2),
    'bts_jaccard'    : round(jaccard_score(y_bts, test_predict), 2),
}

# One-column frame of scores, indexed by score name
knn_bts_df = pd.DataFrame.from_dict(knn_bts_dict, orient='index', columns=['KNN'])
print(knn_bts_df)

# Two-row frame: row 0 is the label 'best_model_params', row 1 is the list
# of (param, value) pairs from the best model
model_params = pd.Series(['best_model_params', list(gscv_knn_fit_be_mod.items())])
model_params_df = model_params.to_frame(name='KNN')

# Stack the params frame on top of the scores frame
knn_output = pd.concat([model_params_df, knn_bts_df], axis=0)

# Drop the label row (index 0); the params row (index 1) stays
knn_df = knn_output.drop(index=[0])
#FIXME: tidy the index of the formatted df
###############################################################################

205
uq_ml_models_FS/UQ_LR.py Normal file
View file

@ -0,0 +1,205 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon May 16 05:59:12 2022
@author: tanu
"""
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 11:09:50 2022
@author: tanu
"""
#%% Import libs
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.base import BaseEstimator
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from xgboost import XGBClassifier
# Keyword bundles unpacked into estimators / searches as **rs and **njobs
rs = dict(random_state = 42)
njobs = dict(n_jobs = 10)

#%% Get train-test split and scoring functions
# X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
# , num_df_wtgt['mutation_class']
# , test_size = 0.33
# , random_state = 2
# , shuffle = True
# , stratify = num_df_wtgt['mutation_class'])

# Bar plots of class counts: training target, then the blind test 'dst_mode'
train_class_counts = y.to_frame().value_counts()
train_class_counts.plot(kind = 'bar')
bts_class_counts = blind_test_df['dst_mode'].to_frame().value_counts()
bts_class_counts.plot(kind = 'bar')

# Full set of scorers, built from (name, metric function) pairs
scoring_fn = {
    score_name: make_scorer(metric_fn)
    for score_name, metric_fn in (
        ('accuracy'  , accuracy_score),
        ('fscore'    , f1_score),
        ('mcc'       , matthews_corrcoef),
        ('precision' , precision_score),
        ('recall'    , recall_score),
        ('roc_auc'   , roc_auc_score),
        ('jaccard'   , jaccard_score),
    )
}

# Single-metric scorer dicts
mcc_score_fn = dict(mcc = make_scorer(matthews_corrcoef))
jacc_score_fn = dict(jcc = make_scorer(jaccard_score))
#%% Logistic Regression + hyperparam: BaseEstimator: ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    """Wrapper estimator whose classifier can be swapped via ``estimator``.

    Every method forwards to the method of the same name on the wrapped
    classifier, which lets a parameter grid set ``estimator`` itself.
    """

    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped classifier on (X, y) and return this wrapper.

        Extra keyword arguments are accepted but are not forwarded.
        """
        wrapped = self.estimator
        wrapped.fit(X, y)
        return self

    def predict(self, X, y=None):
        """Return the wrapped classifier's predictions for X (y is unused)."""
        wrapped = self.estimator
        return wrapped.predict(X)

    def predict_proba(self, X):
        """Return the wrapped classifier's ``predict_proba`` output for X."""
        wrapped = self.estimator
        return wrapped.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped classifier's own ``score`` for (X, y)."""
        wrapped = self.estimator
        return wrapped.score(X, y)
# Values shared by the three solver-specific grids below
# (alternative C grid tried earlier: [0.001, 0.01, 0.1, 1, 10, 100, 1000])
lr_C_values = np.logspace(0, 4, 10)
lr_max_iter_values = list(range(100,800,100))

# One grid per solver family, each with the penalties listed for it.
# Keys are '<pipeline step>__estimator__<param>' so that GridSearchCV can
# reach the classifier nested in the 'clf' step.
parameters = [
    # saga
    {
        'clf__estimator': [LogisticRegression(**rs)],
        'clf__estimator__C': lr_C_values,
        'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'],
        'clf__estimator__max_iter': lr_max_iter_values,
        'clf__estimator__solver': ['saga'],
    },
    # newton-cg / lbfgs / sag
    {
        'clf__estimator': [LogisticRegression(**rs)],
        'clf__estimator__C': lr_C_values,
        'clf__estimator__penalty': ['l2', 'none'],
        'clf__estimator__max_iter': lr_max_iter_values,
        'clf__estimator__solver': ['newton-cg', 'lbfgs', 'sag'],
    },
    # liblinear
    {
        'clf__estimator': [LogisticRegression(**rs)],
        'clf__estimator__C': lr_C_values,
        'clf__estimator__penalty': ['l1', 'l2'],
        'clf__estimator__max_iter': lr_max_iter_values,
        'clf__estimator__solver': ['liblinear'],
    },
]

# Create pipeline: scale features, then the switchable classifier
pipeline_steps = [('pre', MinMaxScaler()), ('clf', ClfSwitcher())]
pipeline = Pipeline(pipeline_steps)

# Grid search i.e hyperparameter tuning and refitting on mcc
# (earlier alternative: scoring = 'f1', refit = 'f1')
gscv_lr = GridSearchCV(
    pipeline,
    parameters,
    scoring = mcc_score_fn,
    refit = 'mcc',
    cv = skf_cv,
    return_train_score = False,
    verbose = 3,
    **njobs,
)

# Fit, then keep the best params and the full CV results table
gscv_lr_fit = gscv_lr.fit(X, y)
gscv_lr_fit_be_mod = gscv_lr_fit.best_params_
gscv_lr_fit_be_res = gscv_lr_fit.cv_results_

print('Best model:\n', gscv_lr_fit_be_mod)
print('Best models score:\n', gscv_lr_fit.best_score_, ':' , round(gscv_lr_fit.best_score_, 2))
# nanmean ignores NaN entries among the per-candidate mean test scores
print('\nMean test score from fit results:', round(np.nanmean(gscv_lr_fit_be_res['mean_test_mcc']),2))
######################################
# Blind test
######################################
# Predict on the blind test set with the fitted grid search object
test_predict = gscv_lr_fit.predict(X_bts)
print(test_predict)
y_btsf = np.array(y_bts)
print(y_btsf)
print(accuracy_score(y_bts, test_predict))
print(matthews_corrcoef(y_bts, test_predict))

# Blind-test scores, each rounded to 2 d.p.
lr_bts_dict = {
    'bts_fscore'     : round(f1_score(y_bts, test_predict), 2),
    'bts_mcc'        : round(matthews_corrcoef(y_bts, test_predict), 2),
    'bts_precision'  : round(precision_score(y_bts, test_predict), 2),
    'bts_recall'     : round(recall_score(y_bts, test_predict), 2),
    'bts_accuracy'   : round(accuracy_score(y_bts, test_predict), 2),
    'bts_roc_auc'    : round(roc_auc_score(y_bts, test_predict), 2),
    'bts_jaccard'    : round(jaccard_score(y_bts, test_predict), 2),
}

# One-column frame of scores, indexed by score name
lr_bts_df = pd.DataFrame.from_dict(lr_bts_dict, orient='index', columns=['Logistic_Regression'])
print(lr_bts_df)

# Two-row frame: row 0 is the label 'best_model_params', row 1 is the list
# of (param, value) pairs from the best model
model_params = pd.Series(['best_model_params', list(gscv_lr_fit_be_mod.items())])
model_params_df = model_params.to_frame(name='Logistic_Regression')

# Stack the params frame on top of the scores frame
lr_output = pd.concat([model_params_df, lr_bts_df], axis=0)

# Drop the label row (index 0); the params row (index 1) stays
lr_df = lr_output.drop(index=[0])
#FIXME: tidy the index of the formatted df
###############################################################################

154
uq_ml_models_FS/UQ_LR_FS.py Normal file
View file

@ -0,0 +1,154 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon May 16 05:59:12 2022
@author: tanu
"""
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 11:09:50 2022
@author: tanu
"""
#%% Logistic Regression + hyperparam + FS: RFECV followed by GridSearchCV inside a Pipeline
# Base classifier, used both by the feature selector and by the grid search
model_lr = LogisticRegression(**rs)

# Recursive feature elimination with CV, scored on MCC
# (earlier alternative: cv = 10)
model_rfecv = RFECV(
    estimator = model_lr,
    cv = rskf_cv,
    scoring = 'matthews_corrcoef',
)

# Values shared by the three solver-specific grids below
# (alternative C grid tried earlier: [0.001, 0.01, 0.1, 1, 10, 100, 1000])
lr_C_values = np.logspace(0, 4, 10)
lr_max_iter_values = list(range(100,800,100))

# Grids are expressed directly on LogisticRegression parameters because the
# search below wraps model_lr itself, not a pipeline
param_grid2 = [
    # saga
    {
        'C': lr_C_values,
        'penalty': ['none', 'l1', 'l2', 'elasticnet'],
        'max_iter': lr_max_iter_values,
        'solver': ['saga'],
    },
    # newton-cg / lbfgs / sag
    {
        'C': lr_C_values,
        'penalty': ['l2', 'none'],
        'max_iter': lr_max_iter_values,
        'solver': ['newton-cg', 'lbfgs', 'sag'],
    },
    # liblinear
    {
        'C': lr_C_values,
        'penalty': ['l1', 'l2'],
        'max_iter': lr_max_iter_values,
        'solver': ['liblinear'],
    },
]

#-------------------------------------------------------------------------------
# Grid search CV + FS
gscv_lr = GridSearchCV(
    model_lr,
    param_grid2,
    scoring = mcc_score_fn,
    refit = 'mcc',
    cv = skf_cv,
    return_train_score = False,
    verbose = 3,
    **njobs,
)

#------------------------------------------------------------------------------
# Create pipeline: scaling -> RFECV feature selection -> grid search
# (earlier alternative for the selection step: sfs_selector)
pipeline_steps = [
    ('pre', MinMaxScaler()),
    ('feature_selection', model_rfecv),
    ('clf', gscv_lr),
]
pipeline = Pipeline(pipeline_steps)
# Fit the whole pipeline (scaling -> RFECV feature selection -> grid search)
lr_fs_fit = pipeline.fit(X,y)

# A Pipeline has no best_params_ / best_score_ / cv_results_ of its own;
# they live on the GridSearchCV object that is the pipeline's final 'clf'
# step. Previously lr_fs_fit_be_mod was never assigned (NameError) and
# best_score_ was read from the Pipeline (AttributeError).
lr_fs_gscv = lr_fs_fit.named_steps['clf']
lr_fs_fit_be_mod = lr_fs_gscv.best_params_
lr_fs_fit_be_res = lr_fs_gscv.cv_results_
print('Best model:\n', lr_fs_fit_be_mod)
print('Best models score:\n', lr_fs_gscv.best_score_, ':' , round(lr_fs_gscv.best_score_, 2))

# Quick look at blind-test predictions from the fitted pipeline
# (the two discarded, repeated predict() calls were removed)
test_predict = lr_fs_fit.predict(X_bts)
print(test_predict)
print(np.array(y_bts))
#y_btsf = np.array(y_bts)
print(accuracy_score(y_bts, test_predict))
print(matthews_corrcoef(y_bts, test_predict))
######################################
# Blind test
######################################
# Predict on the blind test set with the fitted pipeline
test_predict = lr_fs_fit.predict(X_bts)
print(test_predict)
y_btsf = np.array(y_bts)
print(y_btsf)
print(accuracy_score(y_bts, test_predict))
print(matthews_corrcoef(y_bts, test_predict))

# Blind-test scores, each rounded to 2 d.p.
lr_bts_dict = {
    'bts_fscore'     : round(f1_score(y_bts, test_predict), 2),
    'bts_mcc'        : round(matthews_corrcoef(y_bts, test_predict), 2),
    'bts_precision'  : round(precision_score(y_bts, test_predict), 2),
    'bts_recall'     : round(recall_score(y_bts, test_predict), 2),
    'bts_accuracy'   : round(accuracy_score(y_bts, test_predict), 2),
    'bts_roc_auc'    : round(roc_auc_score(y_bts, test_predict), 2),
    'bts_jaccard'    : round(jaccard_score(y_bts, test_predict), 2),
}

# One-column frame of scores, indexed by score name
lr_bts_df = pd.DataFrame.from_dict(lr_bts_dict, orient='index', columns=['Logistic_Regression'])
print(lr_bts_df)

# Two-row frame: row 0 is the label 'best_model_params', row 1 is the list
# of (param, value) pairs from the best model
model_params = pd.Series(['best_model_params', list(lr_fs_fit_be_mod.items())])
model_params_df = model_params.to_frame(name='Logistic_Regression')

# Stack the params frame on top of the scores frame
lr_output = pd.concat([model_params_df, lr_bts_df], axis=0)

# Drop the label row (index 0); the params row (index 1) stays
lr_df = lr_output.drop(index=[0])
#FIXME: tidy the index of the formatted df
###############################################################################

View file

@ -0,0 +1,253 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon May 16 05:59:12 2022
@author: tanu
"""
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 11:09:50 2022
@author: tanu
"""
#%% Logistic Regression + hyperparam + FS: Pipeline (with RFECV) inside GridSearchCV
# model_lr = LogisticRegression(**rs)
# model_rfecv = RFECV(estimator = model_lr
# , cv = skf_cv
# #, cv = 10
# , min_features_to_select = 1 # default
# , scoring = 'matthews_corrcoef'
# )
# param_grid2 = [
# {
# #'clf': [LogisticRegression(**rs)],
# #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
# 'C': np.logspace(0, 4, 10),
# 'penalty': ['none', 'l1', 'l2', 'elasticnet'],
# 'max_iter': list(range(100,800,100)),
# 'solver': ['saga']
# },
# {
# #'clf': [LogisticRegression(**rs)],
# #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
# 'C': np.logspace(0, 4, 10),
# 'penalty': ['l2', 'none'],
# 'max_iter': list(range(100,800,100)),
# 'solver': ['newton-cg', 'lbfgs', 'sag']
# },
# {
# #'clf': [LogisticRegression(**rs)],
# #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
# 'C': np.logspace(0, 4, 10),
# 'penalty': ['l1', 'l2'],
# 'max_iter': list(range(100,800,100)),
# 'solver': ['liblinear']
# }
# ]
# #-------------------------------------------------------------------------------
# # Grid search CV + FS
# gscv_lr = GridSearchCV(estimator = model_lr
# , param_grid = param_grid2
# , scoring = mcc_score_fn, refit = 'mcc'
# , cv = skf_cv
# , return_train_score = False
# , verbose = 3
# , **njobs)
#------------------------------------------------------------------------------
################
# NOTE: GS is going into pipeline,
# Cannot get BEST model out
################
# Create pipeline
# pipeline = Pipeline([('pre', MinMaxScaler())
# #, ('fs', sfs_selector)
# , ('fs', model_rfecv )
# , ('clf', gscv_lr)])
# # Fit # dont assign fit
# #lr_fs_fit = pipeline.fit(X,y)
# pipeline.fit(X,y)
# pipeline.best_params_
# #https://github.com/scikit-learn/scikit-learn/issues/7536
# n_fs = gscv_lr.best_estimator_.n_features_in_
# n_fs
# sel_features = X.columns[pipeline.named_steps['fs'].get_support()]
# print('\nNo. of features selected with RFECV for model'
# , pipeline.named_steps['clf'].estimator
# , ':', n_fs
# , '\nThese are:', sel_features
# )
##############################################################
# THIS ONE
#########
# Make Pipeline go into GS with FS
#########
# step 1: specify model
#modLR = LogisticRegression(**rs)
# step 2: specify fs
#model_rfecv = RFECV(estimator = model_lr
# , cv = skf_cv
#, min_features_to_select = 1 # default
#, scoring = 'matthews_corrcoef'
#)
# step 3: specify param grid as dict
# Three separate grids, handed to GridSearchCV as a list of dicts:
#   1. settings for the 'fs' (RFECV) step
#   2. a C sweep for the 'clf' step with the liblinear solver
#   3. the same C sweep with the saga solver
# Wider grids tried earlier (penalties none/l1/l2/elasticnet, max_iter
# 100-700, and the newton-cg / lbfgs / sag solvers) are not part of this list.
param_grid2 = [
    {
        'fs__min_features_to_select': [1],
        'fs__cv': [skf_cv],
    },
    {
        'clf__C': np.logspace(0, 4, 10),
        'clf__penalty': ['l2'],
        'clf__max_iter': [100],
        'clf__solver': ['liblinear'],
    },
    {
        'clf__C': np.logspace(0, 4, 10),
        'clf__penalty': ['l2'],
        'clf__max_iter': [100],
        'clf__solver': ['saga'],
    },
]
# step 4: create pipeline
# min-max scaling -> RFECV feature selection (scored by MCC) -> final classifier
pipeline = Pipeline(steps = [
    ('pre', MinMaxScaler()),
    ('fs', RFECV(estimator = LogisticRegression(**rs), scoring = 'matthews_corrcoef')),
    ('clf', LogisticRegression(**rs)),
])

# step 5: Perform Gridsearch CV
# assumes mcc_score_fn is a scorer dict with an 'mcc' key (refit = 'mcc') - TODO confirm
gs_final = GridSearchCV(
    estimator = pipeline,
    param_grid = param_grid2,
    scoring = mcc_score_fn,
    refit = 'mcc',
    cv = skf_cv,
    return_train_score = False,
    verbose = 1,
    **njobs,
)

# fit in place; the return value of fit() is deliberately not assigned
gs_final.fit(X, y)

# inspection only: these bare expressions do nothing when run as a script
gs_final.best_params_
gs_final.best_score_
gs_final.best_estimator_

# first look at the blind test set
test_predict = gs_final.predict(X_bts)
print(test_predict)
print('\nMCC on Blind test:', round(matthews_corrcoef(y_bts, test_predict), 2))
print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, test_predict), 2))
# Now get the features out
# best hyperparameters and the final classifier of the refitted best pipeline
# (model_name holds the classifier object itself, not a string)
b_model_params = gs_final.best_params_
model_name = gs_final.best_estimator_.named_steps['clf']

# every input feature name, as exposed by the fitted search object
all_features = gs_final.feature_names_in_

# RFECV's support mask picks the retained columns out of X
# (X.columns is used, so X must be a DataFrame)
n_sf = gs_final.best_estimator_.named_steps['fs'].n_features_
sel_features = X.columns[gs_final.best_estimator_.named_steps['fs'].get_support()]

print(
    '\n========================================',
    '\nRunning model:',
    '\nModel name:', model_name,
    '\n===============================================',
    '\nRunning feature selection with RFECV for model',
    '\nTotal no. of features in model:', len(all_features),
    '\nThese are:\n', all_features, '\n\n',
    '\nNo of features for best model: ', n_sf,
    '\nThese are:', sel_features, '\n\n',
    '\nBest Model hyperparams:', b_model_params,
)
######################################
# Blind test
######################################
# Predict the blind test set with the refitted best pipeline
test_predict = gs_final.predict(X_bts)
print(test_predict)
print(accuracy_score(y_bts, test_predict))
print(matthews_corrcoef(y_bts, test_predict))

# Blind-test scores, each rounded to 2 d.p. Keys start out as None and are
# then filled one metric at a time, in the order listed.
# NOTE(review): roc_auc is computed from predicted labels here, not from
# probabilities or decision scores.
lr_bts_dict = dict.fromkeys((
    'bts_fscore',
    'bts_mcc',
    'bts_precision',
    'bts_recall',
    'bts_accuracy',
    'bts_roc_auc',
    'bts_jaccard',
))
lr_bts_dict.update(
    (key, round(metric(y_bts, test_predict), 2))
    for key, metric in (
        ('bts_fscore', f1_score),
        ('bts_mcc', matthews_corrcoef),
        ('bts_precision', precision_score),
        ('bts_recall', recall_score),
        ('bts_accuracy', accuracy_score),
        ('bts_roc_auc', roc_auc_score),
        ('bts_jaccard', jaccard_score),
    )
)
lr_bts_dict

134
uq_ml_models_FS/UQ_MLP.py Normal file
View file

@ -0,0 +1,134 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 18 06:03:24 2022
@author: tanu
"""
#%% MLP + hyperparam: BaseEstimator: ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    """
    A Custom BaseEstimator that can switch between classifiers.

    The wrapped classifier lives in ``estimator``; fit, predict,
    predict_proba and score all delegate to it, so a grid search can swap
    the classifier by setting the ``estimator`` parameter.

    :param estimator: sklearn object - The classifier
    """

    def __init__(
        self,
        # NOTE: this default instance is created once, when the class body runs
        estimator = SGDClassifier(),
    ):
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped classifier on X, y and return this wrapper.

        Extra keyword arguments are forwarded to the wrapped classifier's
        ``fit`` rather than being silently dropped.
        """
        self.estimator.fit(X, y, **kwargs)
        return self

    def predict(self, X, y=None):
        """Return the wrapped classifier's predictions for X (y is unused)."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped classifier's predict_proba output for X."""
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped classifier's own score on X, y."""
        return self.estimator.score(X, y)
# Search space: one grid that puts an MLPClassifier into the 'clf' step and
# varies its hidden layer size, solver and learning-rate schedule.
parameters = [
    {
        'clf__estimator': [MLPClassifier(max_iter = 1000, **rs)],
        # plain ints: a parenthesised spelling such as (1) is not a tuple
        'clf__estimator__hidden_layer_sizes': [1, 2, 3, 5, 10],
        'clf__estimator__solver': ['lbfgs', 'sgd', 'adam'],
        'clf__estimator__learning_rate': ['constant', 'invscaling', 'adaptive'],
    },
]

# Create pipeline: min-max scaling, then the switchable classifier
pipeline = Pipeline(steps = [
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

# Grid search i.e hyperparameter tuning and refitting on mcc
# assumes mcc_score_fn is a scorer dict with an 'mcc' key
# ('mean_test_mcc' is read from cv_results_ below) - TODO confirm
gscv_mlp = GridSearchCV(
    estimator = pipeline,
    param_grid = parameters,
    scoring = mcc_score_fn,
    refit = 'mcc',
    cv = skf_cv,
    return_train_score = False,
    verbose = 3,
    **njobs,
)

# Fit; the value returned by fit() is kept and used from here on
gscv_mlp_fit = gscv_mlp.fit(X, y)
gscv_mlp_fit_be_mod = gscv_mlp_fit.best_params_
gscv_mlp_fit_be_res = gscv_mlp_fit.cv_results_

print('Best model:\n', gscv_mlp_fit_be_mod)
print('Best models score:\n', gscv_mlp_fit.best_score_, ':', round(gscv_mlp_fit.best_score_, 2))
# Average of the per-candidate mean CV scores: first with mean(), then with
# np.nanmean(), which ignores NaN entries.
print('\nMean test score from fit results:', round(mean(gscv_mlp_fit_be_res['mean_test_mcc']), 2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_mlp_fit_be_res['mean_test_mcc']), 2))
######################################
# Blind test
######################################
# Predict the blind test set with the fitted grid search
test_predict = gscv_mlp_fit.predict(X_bts)
print(test_predict)
y_btsf = np.array(y_bts)
print(y_btsf)
print(accuracy_score(y_btsf, test_predict))
print(matthews_corrcoef(y_btsf, test_predict))

# Blind-test scores, each rounded to 2 d.p. Keys start out as None and are
# then filled one metric at a time, in the order listed.
# NOTE(review): roc_auc is computed from predicted labels here, not from
# probabilities or decision scores.
mlp_bts_dict = dict.fromkeys((
    'bts_fscore',
    'bts_mcc',
    'bts_precision',
    'bts_recall',
    'bts_accuracy',
    'bts_roc_auc',
    'bts_jaccard',
))
mlp_bts_dict.update(
    (key, round(metric(y_bts, test_predict), 2))
    for key, metric in (
        ('bts_fscore', f1_score),
        ('bts_mcc', matthews_corrcoef),
        ('bts_precision', precision_score),
        ('bts_recall', recall_score),
        ('bts_accuracy', accuracy_score),
        ('bts_roc_auc', roc_auc_score),
        ('bts_jaccard', jaccard_score),
    )
)
mlp_bts_dict

# One-column frame of scores (metric names as the index)
mlp_bts_df = pd.DataFrame.from_dict(mlp_bts_dict, orient = 'index')
mlp_bts_df.columns = ['MLP']
print(mlp_bts_df)

# Two-row frame: row 0 is the literal label 'best_model_params',
# row 1 is the list of (name, value) pairs of the best parameters
model_params = pd.Series(['best_model_params', list(gscv_mlp_fit_be_mod.items())])
model_params_df = model_params.to_frame(name = 'MLP')

# Stack the params rows on top of the score rows
mlp_output = pd.concat([model_params_df, mlp_bts_df])
mlp_output

# Drop the label row (index 0); the params row (index 1) and scores remain
mlp_df = mlp_output.drop(index = [0])
mlp_df
#FIXME: tidy the index of the formatted df
###############################################################################

129
uq_ml_models_FS/UQ_QDA.py Normal file
View file

@ -0,0 +1,129 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 18 06:03:24 2022
@author: tanu
"""
#%% QDA + hyperparam: BaseEstimator: ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    """
    A Custom BaseEstimator that can switch between classifiers.

    The wrapped classifier lives in ``estimator``; fit, predict,
    predict_proba and score all delegate to it, so a grid search can swap
    the classifier by setting the ``estimator`` parameter.

    :param estimator: sklearn object - The classifier
    """

    def __init__(
        self,
        # NOTE: this default instance is created once, when the class body runs
        estimator = SGDClassifier(),
    ):
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped classifier on X, y and return this wrapper.

        Extra keyword arguments are forwarded to the wrapped classifier's
        ``fit`` rather than being silently dropped.
        """
        self.estimator.fit(X, y, **kwargs)
        return self

    def predict(self, X, y=None):
        """Return the wrapped classifier's predictions for X (y is unused)."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped classifier's predict_proba output for X."""
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped classifier's own score on X, y."""
        return self.estimator.score(X, y)
# Search space: a single candidate, QuadraticDiscriminantAnalysis with its
# default settings, placed into the 'clf' step.
parameters = [
    {
        'clf__estimator': [QuadraticDiscriminantAnalysis()],
    },
]

# Create pipeline: min-max scaling, then the switchable classifier
pipeline = Pipeline(steps = [
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

# Grid search i.e hyperparameter tuning and refitting on mcc
# assumes mcc_score_fn is a scorer dict with an 'mcc' key
# ('mean_test_mcc' is read from cv_results_ below) - TODO confirm
gscv_qda = GridSearchCV(
    estimator = pipeline,
    param_grid = parameters,
    scoring = mcc_score_fn,
    refit = 'mcc',
    cv = skf_cv,
    return_train_score = False,
    verbose = 3,
    **njobs,
)

# Fit; the value returned by fit() is kept and used from here on
gscv_qda_fit = gscv_qda.fit(X, y)
gscv_qda_fit_be_mod = gscv_qda_fit.best_params_
gscv_qda_fit_be_res = gscv_qda_fit.cv_results_

print('Best model:\n', gscv_qda_fit_be_mod)
print('Best models score:\n', gscv_qda_fit.best_score_, ':', round(gscv_qda_fit.best_score_, 2))
# Average of the per-candidate mean CV scores: first with mean(), then with
# np.nanmean(), which ignores NaN entries.
# (the first print used to read the undefined name gscv_qda_fit_be_re)
print('\nMean test score from fit results:', round(mean(gscv_qda_fit_be_res['mean_test_mcc']), 2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_qda_fit_be_res['mean_test_mcc']), 2))
######################################
# Blind test
######################################
# Predict the blind test set with the fitted grid search
test_predict = gscv_qda_fit.predict(X_bts)
print(test_predict)
y_btsf = np.array(y_bts)
print(y_btsf)
print(accuracy_score(y_btsf, test_predict))
print(matthews_corrcoef(y_btsf, test_predict))

# Blind-test scores, each rounded to 2 d.p. Keys start out as None and are
# then filled one metric at a time, in the order listed.
# NOTE(review): roc_auc is computed from predicted labels here, not from
# probabilities or decision scores.
qda_bts_dict = dict.fromkeys((
    'bts_fscore',
    'bts_mcc',
    'bts_precision',
    'bts_recall',
    'bts_accuracy',
    'bts_roc_auc',
    'bts_jaccard',
))
qda_bts_dict.update(
    (key, round(metric(y_bts, test_predict), 2))
    for key, metric in (
        ('bts_fscore', f1_score),
        ('bts_mcc', matthews_corrcoef),
        ('bts_precision', precision_score),
        ('bts_recall', recall_score),
        ('bts_accuracy', accuracy_score),
        ('bts_roc_auc', roc_auc_score),
        ('bts_jaccard', jaccard_score),
    )
)
qda_bts_dict

# One-column frame of scores (metric names as the index)
qda_bts_df = pd.DataFrame.from_dict(qda_bts_dict, orient = 'index')
qda_bts_df.columns = ['QDA']
print(qda_bts_df)

# Two-row frame: row 0 is the literal label 'best_model_params',
# row 1 is the list of (name, value) pairs of the best parameters
model_params = pd.Series(['best_model_params', list(gscv_qda_fit_be_mod.items())])
model_params_df = model_params.to_frame(name = 'QDA')

# Stack the params rows on top of the score rows
qda_output = pd.concat([model_params_df, qda_bts_df])
qda_output

# Drop the label row (index 0); the params row (index 1) and scores remain
qda_df = qda_output.drop(index = [0])
qda_df
#FIXME: tidy the index of the formatted df
###############################################################################

128
uq_ml_models_FS/UQ_RC.py Normal file
View file

@ -0,0 +1,128 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 18 06:03:24 2022
@author: tanu
"""
#%% RidgeClassifier + hyperparam: BaseEstimator: ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    """
    A Custom BaseEstimator that can switch between classifiers.

    The wrapped classifier lives in ``estimator``; fit, predict,
    predict_proba and score all delegate to it, so a grid search can swap
    the classifier by setting the ``estimator`` parameter.

    :param estimator: sklearn object - The classifier
    """

    def __init__(
        self,
        # NOTE: this default instance is created once, when the class body runs
        estimator = SGDClassifier(),
    ):
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped classifier on X, y and return this wrapper.

        Extra keyword arguments are forwarded to the wrapped classifier's
        ``fit`` rather than being silently dropped.
        """
        self.estimator.fit(X, y, **kwargs)
        return self

    def predict(self, X, y=None):
        """Return the wrapped classifier's predictions for X (y is unused)."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped classifier's predict_proba output for X."""
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped classifier's own score on X, y."""
        return self.estimator.score(X, y)
# Search space: a RidgeClassifier in the 'clf' step, swept over alpha
parameters = [
    {
        'clf__estimator': [RidgeClassifier(**rs)],
        'clf__estimator__alpha': [0.1, 0.2, 0.5, 0.8, 1.0],
    },
]

# Create pipeline: min-max scaling, then the switchable classifier
pipeline = Pipeline(steps = [
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

# Grid search i.e hyperparameter tuning and refitting on mcc
# assumes mcc_score_fn is a scorer dict with an 'mcc' key
# ('mean_test_mcc' is read from cv_results_ below) - TODO confirm
gscv_rc = GridSearchCV(
    estimator = pipeline,
    param_grid = parameters,
    scoring = mcc_score_fn,
    refit = 'mcc',
    cv = skf_cv,
    return_train_score = False,
    verbose = 3,
    **njobs,
)

# Fit; the value returned by fit() is kept and used from here on
gscv_rc_fit = gscv_rc.fit(X, y)
gscv_rc_fit_be_mod = gscv_rc_fit.best_params_
gscv_rc_fit_be_res = gscv_rc_fit.cv_results_

print('Best model:\n', gscv_rc_fit_be_mod)
print('Best models score:\n', gscv_rc_fit.best_score_, ':', round(gscv_rc_fit.best_score_, 2))
# Average of the per-candidate mean CV scores: first with mean(), then with
# np.nanmean(), which ignores NaN entries.
print('\nMean test score from fit results:', round(mean(gscv_rc_fit_be_res['mean_test_mcc']), 2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_rc_fit_be_res['mean_test_mcc']), 2))
######################################
# Blind test
######################################
# Predict the blind test set with the fitted grid search
test_predict = gscv_rc_fit.predict(X_bts)
print(test_predict)
y_btsf = np.array(y_bts)
print(y_btsf)
print(accuracy_score(y_btsf, test_predict))
print(matthews_corrcoef(y_btsf, test_predict))

# Blind-test scores, each rounded to 2 d.p. Keys start out as None and are
# then filled one metric at a time, in the order listed.
# NOTE(review): roc_auc is computed from predicted labels here, not from
# probabilities or decision scores.
rc_bts_dict = dict.fromkeys((
    'bts_fscore',
    'bts_mcc',
    'bts_precision',
    'bts_recall',
    'bts_accuracy',
    'bts_roc_auc',
    'bts_jaccard',
))
rc_bts_dict.update(
    (key, round(metric(y_bts, test_predict), 2))
    for key, metric in (
        ('bts_fscore', f1_score),
        ('bts_mcc', matthews_corrcoef),
        ('bts_precision', precision_score),
        ('bts_recall', recall_score),
        ('bts_accuracy', accuracy_score),
        ('bts_roc_auc', roc_auc_score),
        ('bts_jaccard', jaccard_score),
    )
)
rc_bts_dict

# One-column frame of scores (metric names as the index)
rc_bts_df = pd.DataFrame.from_dict(rc_bts_dict, orient = 'index')
rc_bts_df.columns = ['Ridge Classifier']
print(rc_bts_df)

# Two-row frame: row 0 is the literal label 'best_model_params',
# row 1 is the list of (name, value) pairs of the best parameters
model_params = pd.Series(['best_model_params', list(gscv_rc_fit_be_mod.items())])
model_params_df = model_params.to_frame(name = 'Ridge Classifier')

# Stack the params rows on top of the score rows
rc_output = pd.concat([model_params_df, rc_bts_df])
rc_output

# Drop the label row (index 0); the params row (index 1) and scores remain
rc_df = rc_output.drop(index = [0])
rc_df
#FIXME: tidy the index of the formatted df
###############################################################################

138
uq_ml_models_FS/UQ_RF.py Normal file
View file

@ -0,0 +1,138 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 18 06:03:24 2022
@author: tanu
"""
#%% RandomForest + hyperparam: BaseEstimator: ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    """
    A Custom BaseEstimator that can switch between classifiers.

    The wrapped classifier lives in ``estimator``; fit, predict,
    predict_proba and score all delegate to it, so a grid search can swap
    the classifier by setting the ``estimator`` parameter.

    :param estimator: sklearn object - The classifier
    """

    def __init__(
        self,
        # NOTE: this default instance is created once, when the class body runs
        estimator = SGDClassifier(),
    ):
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped classifier on X, y and return this wrapper.

        Extra keyword arguments are forwarded to the wrapped classifier's
        ``fit`` rather than being silently dropped.
        """
        self.estimator.fit(X, y, **kwargs)
        return self

    def predict(self, X, y=None):
        """Return the wrapped classifier's predictions for X (y is unused)."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped classifier's predict_proba output for X."""
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped classifier's own score on X, y."""
        return self.estimator.score(X, y)
# Search space: a bootstrapped RandomForestClassifier (with OOB scoring) in
# the 'clf' step. The lists below multiply out to
# 8 * 2 * 4 * 3 * 3 * 6 * 4 = 13824 candidates per CV split.
# NOTE(review): njobs is given to both the forest and the grid search -
# confirm the nested parallelism is intended.
parameters = [
    {
        'clf__estimator': [
            RandomForestClassifier(bootstrap = True, oob_score = True, **rs, **njobs),
        ],
        'clf__estimator__max_depth': [4, 6, 8, 10, 12, 16, 20, None],
        'clf__estimator__class_weight': ['balanced', 'balanced_subsample'],
        'clf__estimator__n_estimators': [10, 25, 50, 100],
        'clf__estimator__criterion': ['gini', 'entropy', 'log_loss'],
        'clf__estimator__max_features': ['sqrt', 'log2', None],  # default is sqrt
        'clf__estimator__min_samples_leaf': [1, 2, 3, 4, 5, 10],
        'clf__estimator__min_samples_split': [2, 5, 15, 20],
    },
]

# Create pipeline: min-max scaling, then the switchable classifier
pipeline = Pipeline(steps = [
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

# Grid search i.e hyperparameter tuning and refitting on mcc
# assumes mcc_score_fn is a scorer dict with an 'mcc' key
# ('mean_test_mcc' is read from cv_results_ below) - TODO confirm
gscv_rf = GridSearchCV(
    estimator = pipeline,
    param_grid = parameters,
    scoring = mcc_score_fn,
    refit = 'mcc',
    cv = skf_cv,
    return_train_score = False,
    verbose = 3,
    **njobs,
)

# Fit; the value returned by fit() is kept and used from here on
gscv_rf_fit = gscv_rf.fit(X, y)
gscv_rf_fit_be_mod = gscv_rf_fit.best_params_
gscv_rf_fit_be_res = gscv_rf_fit.cv_results_

print('Best model:\n', gscv_rf_fit_be_mod)
print('Best models score:\n', gscv_rf_fit.best_score_, ':', round(gscv_rf_fit.best_score_, 2))
# Average of the per-candidate mean CV scores: first with mean(), then with
# np.nanmean(), which ignores NaN entries.
print('\nMean test score from fit results:', round(mean(gscv_rf_fit_be_res['mean_test_mcc']), 2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_rf_fit_be_res['mean_test_mcc']), 2))
######################################
# Blind test
######################################
# Predict the blind test set with the fitted grid search
test_predict = gscv_rf_fit.predict(X_bts)
print(test_predict)
y_btsf = np.array(y_bts)
print(y_btsf)
print(accuracy_score(y_btsf, test_predict))
print(matthews_corrcoef(y_btsf, test_predict))

# Blind-test scores, each rounded to 2 d.p. Keys start out as None and are
# then filled one metric at a time, in the order listed.
# NOTE(review): roc_auc is computed from predicted labels here, not from
# probabilities or decision scores.
rf_bts_dict = dict.fromkeys((
    'bts_fscore',
    'bts_mcc',
    'bts_precision',
    'bts_recall',
    'bts_accuracy',
    'bts_roc_auc',
    'bts_jaccard',
))
rf_bts_dict.update(
    (key, round(metric(y_bts, test_predict), 2))
    for key, metric in (
        ('bts_fscore', f1_score),
        ('bts_mcc', matthews_corrcoef),
        ('bts_precision', precision_score),
        ('bts_recall', recall_score),
        ('bts_accuracy', accuracy_score),
        ('bts_roc_auc', roc_auc_score),
        ('bts_jaccard', jaccard_score),
    )
)
rf_bts_dict

# One-column frame of scores (metric names as the index).
# The column is labelled 'RF'; it used to carry the copy-pasted label
# 'Logistic_Regression', which mislabelled the random-forest results.
rf_bts_df = pd.DataFrame.from_dict(rf_bts_dict, orient = 'index')
rf_bts_df.columns = ['RF']
print(rf_bts_df)

# Two-row frame: row 0 is the literal label 'best_model_params',
# row 1 is the list of (name, value) pairs of the best parameters.
# Same column label as the scores frame so the two stack into one column.
model_params = pd.Series(['best_model_params', list(gscv_rf_fit_be_mod.items())])
model_params_df = model_params.to_frame(name = 'RF')

# Stack the params rows on top of the score rows
rf_output = pd.concat([model_params_df, rf_bts_df])
rf_output

# Drop the label row (index 0); the params row (index 1) and scores remain
rf_df = rf_output.drop(index = [0])
rf_df
#FIXME: tidy the index of the formatted df
###############################################################################

134
uq_ml_models_FS/UQ_SVC.py Normal file
View file

@ -0,0 +1,134 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 18 06:03:24 2022
@author: tanu
"""
#%% SVC + hyperparam: BaseEstimator: ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    """
    A Custom BaseEstimator that can switch between classifiers.

    The wrapped classifier lives in ``estimator``; fit, predict,
    predict_proba and score all delegate to it, so a grid search can swap
    the classifier by setting the ``estimator`` parameter.

    :param estimator: sklearn object - The classifier
    """

    def __init__(
        self,
        # NOTE: this default instance is created once, when the class body runs
        estimator = SGDClassifier(),
    ):
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped classifier on X, y and return this wrapper.

        Extra keyword arguments are forwarded to the wrapped classifier's
        ``fit`` rather than being silently dropped.
        """
        self.estimator.fit(X, y, **kwargs)
        return self

    def predict(self, X, y=None):
        """Return the wrapped classifier's predictions for X (y is unused)."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped classifier's predict_proba output for X."""
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped classifier's own score on X, y."""
        return self.estimator.score(X, y)
# Search space: an SVC in the 'clf' step, varied over kernel, C and gamma.
# The 'linear' kernel is not part of this grid.
parameters = [
    {
        'clf__estimator': [SVC(**rs)],
        'clf__estimator__kernel': ['poly', 'rbf', 'sigmoid'],
        'clf__estimator__C': [50, 10, 1.0, 0.1, 0.01],
        'clf__estimator__gamma': ['scale', 'auto'],
    },
]

# Create pipeline: min-max scaling, then the switchable classifier
pipeline = Pipeline(steps = [
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

# Grid search i.e hyperparameter tuning and refitting on mcc
# assumes mcc_score_fn is a scorer dict with an 'mcc' key
# ('mean_test_mcc' is read from cv_results_ below) - TODO confirm
gscv_svc = GridSearchCV(
    estimator = pipeline,
    param_grid = parameters,
    scoring = mcc_score_fn,
    refit = 'mcc',
    cv = skf_cv,
    return_train_score = False,
    verbose = 3,
    **njobs,
)

# Fit; the value returned by fit() is kept and used from here on
gscv_svc_fit = gscv_svc.fit(X, y)
gscv_svc_fit_be_mod = gscv_svc_fit.best_params_
gscv_svc_fit_be_res = gscv_svc_fit.cv_results_

print('Best model:\n', gscv_svc_fit_be_mod)
print('Best models score:\n', gscv_svc_fit.best_score_, ':', round(gscv_svc_fit.best_score_, 2))
# Average of the per-candidate mean CV scores: first with mean(), then with
# np.nanmean(), which ignores NaN entries.
print('\nMean test score from fit results:', round(mean(gscv_svc_fit_be_res['mean_test_mcc']), 2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_svc_fit_be_res['mean_test_mcc']), 2))
######################################
# Blind test
######################################
# Predict the blind test set with the fitted grid search
test_predict = gscv_svc_fit.predict(X_bts)
print(test_predict)
y_btsf = np.array(y_bts)
print(y_btsf)
print(accuracy_score(y_btsf, test_predict))
print(matthews_corrcoef(y_btsf, test_predict))

# Blind-test scores, each rounded to 2 d.p. Keys start out as None and are
# then filled one metric at a time, in the order listed.
# NOTE(review): roc_auc is computed from predicted labels here, not from
# probabilities or decision scores.
svc_bts_dict = dict.fromkeys((
    'bts_fscore',
    'bts_mcc',
    'bts_precision',
    'bts_recall',
    'bts_accuracy',
    'bts_roc_auc',
    'bts_jaccard',
))
svc_bts_dict.update(
    (key, round(metric(y_bts, test_predict), 2))
    for key, metric in (
        ('bts_fscore', f1_score),
        ('bts_mcc', matthews_corrcoef),
        ('bts_precision', precision_score),
        ('bts_recall', recall_score),
        ('bts_accuracy', accuracy_score),
        ('bts_roc_auc', roc_auc_score),
        ('bts_jaccard', jaccard_score),
    )
)
svc_bts_dict

# One-column frame of scores (metric names as the index)
svc_bts_df = pd.DataFrame.from_dict(svc_bts_dict, orient = 'index')
svc_bts_df.columns = ['SVC']
print(svc_bts_df)

# Two-row frame: row 0 is the literal label 'best_model_params',
# row 1 is the list of (name, value) pairs of the best parameters
model_params = pd.Series(['best_model_params', list(gscv_svc_fit_be_mod.items())])
model_params_df = model_params.to_frame(name = 'SVC')

# Stack the params rows on top of the score rows
svc_output = pd.concat([model_params_df, svc_bts_df])
svc_output

# Drop the label row (index 0); the params row (index 1) and scores remain
svc_df = svc_output.drop(index = [0])
svc_df
#FIXME: tidy the index of the formatted df
###############################################################################

143
uq_ml_models_FS/UQ_XGB.py Normal file
View file

@ -0,0 +1,143 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 18 06:03:24 2022
@author: tanu
"""
#%%
#https://www.datatechnotes.com/2019/07/classification-example-with.html
# XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
# colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
# max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
# n_estimators=100, n_jobs=1, nthread=None,
# objective='multi:softprob', random_state=0, reg_alpha=0,
# reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
# subsample=1, verbosity=1)
#%% XGBoost + hyperparam: BaseEstimator: ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    """
    A Custom BaseEstimator that can switch between classifiers.

    The wrapped classifier lives in ``estimator``; fit, predict,
    predict_proba and score all delegate to it, so a grid search can swap
    the classifier by setting the ``estimator`` parameter.

    :param estimator: sklearn object - The classifier
    """

    def __init__(
        self,
        # NOTE: this default instance is created once, when the class body runs
        estimator = SGDClassifier(),
    ):
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped classifier on X, y and return this wrapper.

        Extra keyword arguments are forwarded to the wrapped classifier's
        ``fit`` rather than being silently dropped.
        """
        self.estimator.fit(X, y, **kwargs)
        return self

    def predict(self, X, y=None):
        """Return the wrapped classifier's predictions for X (y is unused)."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped classifier's predict_proba output for X."""
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped classifier's own score on X, y."""
        return self.estimator.score(X, y)
# Search space: an XGBClassifier in the 'clf' step, varied over learning
# rate and tree depth.
# XGBClassifier's logging knob is `verbosity` (0 = silent ... 3 = debug);
# `verbose` is not one of its constructor parameters, so the setting is
# spelled `verbosity` here.
# NOTE(review): njobs is given to both the classifier and the grid search -
# confirm the nested parallelism is intended.
parameters = [
    {
        'clf__estimator': [XGBClassifier(verbosity = 3, **rs, **njobs)],
        'clf__estimator__learning_rate': [0.01, 0.05, 0.1, 0.2],
        'clf__estimator__max_depth': [4, 6, 8, 10, 12, 16, 20],
    },
]

# Create pipeline: min-max scaling, then the switchable classifier
pipeline = Pipeline(steps = [
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

# Grid search i.e hyperparameter tuning and refitting on mcc
# assumes mcc_score_fn is a scorer dict with an 'mcc' key
# ('mean_test_mcc' is read from cv_results_ below) - TODO confirm
gscv_xgb = GridSearchCV(
    estimator = pipeline,
    param_grid = parameters,
    scoring = mcc_score_fn,
    refit = 'mcc',
    cv = skf_cv,
    return_train_score = False,
    verbose = 3,
    **njobs,
)

# Fit; the value returned by fit() is kept and used from here on
gscv_xgb_fit = gscv_xgb.fit(X, y)
gscv_xgb_fit_be_mod = gscv_xgb_fit.best_params_
gscv_xgb_fit_be_res = gscv_xgb_fit.cv_results_

print('Best model:\n', gscv_xgb_fit_be_mod)
print('Best models score:\n', gscv_xgb_fit.best_score_, ':', round(gscv_xgb_fit.best_score_, 2))
# Average of the per-candidate mean CV scores: first with mean(), then with
# np.nanmean(), which ignores NaN entries.
print('\nMean test score from fit results:', round(mean(gscv_xgb_fit_be_res['mean_test_mcc']), 2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_xgb_fit_be_res['mean_test_mcc']), 2))
######################################
# Blind test
######################################
# Predict the blind test set with the fitted grid search
test_predict = gscv_xgb_fit.predict(X_bts)
print(test_predict)
y_btsf = np.array(y_bts)
print(y_btsf)
print(accuracy_score(y_btsf, test_predict))
print(matthews_corrcoef(y_btsf, test_predict))

# Blind-test scores, each rounded to 2 d.p. Keys start out as None and are
# then filled one metric at a time, in the order listed.
# NOTE(review): roc_auc is computed from predicted labels here, not from
# probabilities or decision scores.
xgb_bts_dict = dict.fromkeys((
    'bts_fscore',
    'bts_mcc',
    'bts_precision',
    'bts_recall',
    'bts_accuracy',
    'bts_roc_auc',
    'bts_jaccard',
))
xgb_bts_dict.update(
    (key, round(metric(y_bts, test_predict), 2))
    for key, metric in (
        ('bts_fscore', f1_score),
        ('bts_mcc', matthews_corrcoef),
        ('bts_precision', precision_score),
        ('bts_recall', recall_score),
        ('bts_accuracy', accuracy_score),
        ('bts_roc_auc', roc_auc_score),
        ('bts_jaccard', jaccard_score),
    )
)
xgb_bts_dict

# One-column frame of scores (metric names as the index)
xgb_bts_df = pd.DataFrame.from_dict(xgb_bts_dict, orient = 'index')
xgb_bts_df.columns = ['XGBoost']
print(xgb_bts_df)

# Two-row frame: row 0 is the literal label 'best_model_params',
# row 1 is the list of (name, value) pairs of the best parameters
model_params = pd.Series(['best_model_params', list(gscv_xgb_fit_be_mod.items())])
model_params_df = model_params.to_frame(name = 'XGBoost')

# Stack the params rows on top of the score rows
xgb_output = pd.concat([model_params_df, xgb_bts_df])
xgb_output

# Drop the label row (index 0); the params row (index 1) and scores remain
xgb_df = xgb_output.drop(index = [0])
xgb_df
#FIXME: tidy the index of the formatted df
###############################################################################

View file

@ -0,0 +1,8 @@
Logistic_Regression
bts_fscore 0.71
bts_mcc 0.34
bts_precision 0.61
bts_recall 0.87
bts_accuracy 0.65
bts_roc_auc 0.65
bts_jaccard 0.55

View file

@ -0,0 +1,316 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri May 20 00:36:17 2022
@author: tanu
"""
# pnca [ numerical ONLY + NO oversampling]
# LR: hyperparam
{'clf__estimator': LogisticRegression(penalty='l1', random_state=42, solver='saga'),
'clf__estimator__C': 1.0,
'clf__estimator__max_iter': 100,
'clf__estimator__penalty': 'l1',
'clf__estimator__solver': 'saga'}
Logistic_Regression
bts_fscore 0.70
bts_mcc 0.29
bts_precision 0.57
bts_recall 0.92
bts_accuracy 0.61
bts_roc_auc 0.61
bts_jaccard 0.54
# LR: FS + hyperparam
{'bts_fscore': 0.71,
'bts_mcc': 0.34,
'bts_precision': 0.61,
'bts_recall': 0.87,
'bts_accuracy': 0.65,
'bts_roc_auc': 0.65,
'bts_jaccard': 0.55}
#######################################################################
# RF: hyperparam [~45]
Best model:
{'clf__estimator': RandomForestClassifier(class_weight='balanced', max_depth=4, max_features=None,
min_samples_leaf=2, min_samples_split=15,
n_estimators=10, n_jobs=10, oob_score=True,
random_state=42), 'clf__estimator__class_weight': 'balanced', 'clf__estimator__criterion': 'gini', 'clf__estimator__max_depth': 4, 'clf__estimator__max_features': None, 'clf__estimator__min_samples_leaf': 2, 'clf__estimator__min_samples_split': 15, 'clf__estimator__n_estimators': 10}
Best models score:
0.3329374281771619 : 0.33
RF
bts_fscore 0.69
bts_mcc 0.37
bts_precision 0.67
bts_recall 0.72
bts_accuracy 0.68
bts_roc_auc 0.68
bts_jaccard 0.53
#######################################################################
# ABC: hyperparam
{'clf__estimator': AdaBoostClassifier(n_estimators=2, random_state=42),
'clf__estimator__n_estimators': 2}
ABC
1 [(clf__estimator, AdaBoostClassifier(n_estimat...
bts_fscore 0.71
bts_mcc 0.36
bts_precision 0.63
bts_recall 0.83
bts_accuracy 0.67
bts_roc_auc 0.67
bts_jaccard 0.56
#######################################################################
# BC: hyperparam
{'clf__estimator': BaggingClassifier(n_estimators=200, n_jobs=10, oob_score=True, random_state=42),
'clf__estimator__n_estimators': 200}
BC
0 best_model_params
1 [(clf__estimator, BaggingClassifier(n_estimato...
bts_fscore 0.72
bts_mcc 0.37
bts_precision 0.64
bts_recall 0.82
bts_accuracy 0.68
bts_roc_auc 0.68
bts_jaccard 0.56
#######################################################################
# BNB: hyperparam
{'clf__estimator': BernoulliNB(alpha=1, binarize=None),
'clf__estimator__alpha': 1,
'clf__estimator__binarize': None,
'clf__estimator__class_prior': None,
'clf__estimator__fit_prior': True}
BNB
1 [(clf__estimator, BernoulliNB(alpha=1, binariz...
bts_fscore 0.72
bts_mcc 0.35
bts_precision 0.6
bts_recall 0.92
bts_accuracy 0.65
bts_roc_auc 0.65
bts_jaccard 0.56
#######################################################################
# DT: hyperparam
{'clf__estimator': DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
max_depth=2, random_state=42),
'clf__estimator__class_weight': 'balanced',
'clf__estimator__criterion': 'entropy',
'clf__estimator__max_depth': 2,
'clf__estimator__max_features': None,
'clf__estimator__min_samples_leaf': 1,
'clf__estimator__min_samples_split': 2}
DT
1 [(clf__estimator, DecisionTreeClassifier(class...
bts_fscore 0.72
bts_mcc 0.42
bts_precision 0.69
bts_recall 0.76
bts_accuracy 0.71
bts_roc_auc 0.71
bts_jaccard 0.57
#######################################################################
# GBC: hyperparam
{'clf__estimator': GradientBoostingClassifier(learning_rate=0.01, max_depth=7, random_state=42,
subsample=0.5),
'clf__estimator__learning_rate': 0.01,
'clf__estimator__max_depth': 7,
'clf__estimator__n_estimators': 100,
'clf__estimator__subsample': 0.5}
GBC
1 [(clf__estimator, GradientBoostingClassifier(l...
bts_fscore 0.71
bts_mcc 0.33
bts_precision 0.6
bts_recall 0.88
bts_accuracy 0.64
bts_roc_auc 0.65
bts_jaccard 0.55
#######################################################################
# GNB: hyperparam
{'clf__estimator': GaussianNB(var_smoothing=0.006579332246575682),
'clf__estimator__priors': None,
'clf__estimator__var_smoothing': 0.006579332246575682}
GNB
1 [(clf__estimator, GaussianNB(var_smoothing=0.0...
bts_fscore 0.72
bts_mcc 0.46
bts_precision 0.73
bts_recall 0.71
bts_accuracy 0.73
bts_roc_auc 0.73
bts_jaccard 0.57
#######################################################################
# GPC: hyperparam
{'clf__estimator': GaussianProcessClassifier(kernel=1**2 * Matern(length_scale=1, nu=1.5),
random_state=42),
'clf__estimator__kernel': 1**2 * Matern(length_scale=1, nu=1.5)}
ConvergenceWarning: The optimal value found for dimension 0 of parameter k2__alpha is close to the specified upper bound 100000.0. Increasing the bound and calling fit again may find a better value.
warnings.warn(
GPC
1 [(clf__estimator, GaussianProcessClassifier(ke...
bts_fscore 0.73
bts_mcc 0.38
bts_precision 0.6
bts_recall 0.92
bts_accuracy 0.66
bts_roc_auc 0.66
bts_jaccard 0.58
#######################################################################
# KNN: hyperparam
Best model:
{'clf__estimator': KNeighborsClassifier(metric='euclidean', n_jobs=10, n_neighbors=11,
weights='distance'), 'clf__estimator__metric': 'euclidean', 'clf__estimator__n_neighbors': 11, 'clf__estimator__weights': 'distance'}
1 [(clf__estimator, KNeighborsClassifier(metric=...
bts_fscore 0.69
bts_mcc 0.26
bts_precision 0.58
bts_recall 0.85
bts_accuracy 0.62
bts_roc_auc 0.62
bts_jaccard 0.52
Best model:
{'clf__estimator': KNeighborsClassifier(metric='euclidean', n_jobs=10, n_neighbors=29), 'clf__estimator__metric': 'euclidean', 'clf__estimator__n_neighbors': 29, 'clf__estimator__weights': 'uniform'}
KNN
1 [(clf__estimator, KNeighborsClassifier(metric=...
bts_fscore 0.73
bts_mcc 0.37
bts_precision 0.6
bts_recall 0.92
bts_accuracy 0.65
bts_roc_auc 0.65
bts_jaccard 0.57
#######################################################################
# MLP: hyperparam
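# Learning rate: 'constant'. Other learning-rate schedules were also tried, but the search always came back with 'constant'.
# NOTE(review): in scikit-learn, MLPClassifier's learning_rate is only used when solver='sgd'; with the selected solver='lbfgs' it likely has no effect, which would explain why 'constant' is always returned - confirm.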
{'clf__estimator': MLPClassifier(hidden_layer_sizes=3, max_iter=500, random_state=42,
solver='lbfgs'),
'clf__estimator__hidden_layer_sizes': 3,
'clf__estimator__learning_rate': 'constant',
'clf__estimator__solver': 'lbfgs'}
1 [(clf__estimator, MLPClassifier(hidden_layer_s...
bts_fscore 0.71
bts_mcc 0.34
bts_precision 0.61
bts_recall 0.86
bts_accuracy 0.65
bts_roc_auc 0.65
bts_jaccard 0.55
#######################################################################
# QDA: hyperparam
Best model:
{'clf__estimator': QuadraticDiscriminantAnalysis()}
QDA
1 [(clf__estimator, QuadraticDiscriminantAnalysi...
bts_fscore 0.66
bts_mcc 0.33
bts_precision 0.67
bts_recall 0.65
bts_accuracy 0.67
bts_roc_auc 0.67
bts_jaccard 0.49
#######################################################################
# RC: hyperparam
Best model:
{'clf__estimator': RidgeClassifier(alpha=0.8, random_state=42)
, 'clf__estimator__alpha': 0.8}
Ridge Classifier
1 [(clf__estimator, RidgeClassifier(alpha=0.8, r...
bts_fscore 0.71
bts_mcc 0.31
bts_precision 0.59
bts_recall 0.88
bts_accuracy 0.64
bts_roc_auc 0.64
bts_jaccard 0.55
#######################################################################
# SVC: hyperparam
Best model:
{'clf__estimator': SVC(C=10, kernel='linear', random_state=42), 'clf__estimator__C': 10, 'clf__estimator__gamma': 'scale', 'clf__estimator__kernel': 'linear'}
SVC
1 [(clf__estimator, SVC(C=10, kernel='linear', r...
bts_fscore 0.71
bts_mcc 0.31
bts_precision 0.57
bts_recall 0.93
bts_accuracy 0.62
bts_roc_auc 0.62
bts_jaccard 0.55
Best model:
{'clf__estimator': SVC(C=10, gamma='auto', random_state=42), 'clf__estimator__C': 10, 'clf__estimator__gamma': 'auto', 'clf__estimator__kernel': 'rbf'}
Best models score:
SVC
1 [(clf__estimator, SVC(C=10, gamma='auto', rand...
bts_fscore 0.71
bts_mcc 0.32
bts_precision 0.58
bts_recall 0.93
bts_accuracy 0.63
bts_roc_auc 0.63
bts_jaccard 0.56
Best model:
{'clf__estimator': SVC(C=50, gamma='auto', kernel='sigmoid', random_state=42), 'clf__estimator__C': 50, 'clf__estimator__gamma': 'auto', 'clf__estimator__kernel': 'sigmoid'}
SVC
1 [(clf__estimator, SVC(C=50, gamma='auto', kern...
bts_fscore 0.72
bts_mcc 0.33
bts_precision 0.58
bts_recall 0.93
bts_accuracy 0.63
bts_roc_auc 0.63
bts_jaccard 0.56
#######################################################################
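# XGB: hyperparam
# NOTE(review): max_features and min_samples_leaf (listed below) are scikit-learn tree parameters, not documented XGBClassifier parameters; XGBoost presumably ignores them, so only learning_rate and max_depth were effectively tuned - confirm.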
Best model:
{'clf__estimator': XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
colsample_bynode=None, colsample_bytree=None,
enable_categorical=False, gamma=None, gpu_id=None,
importance_type=None, interaction_constraints=None,
learning_rate=0.01, max_delta_step=None, max_depth=6,
max_features='auto', min_child_weight=None, min_samples_leaf=4,
missing=nan, monotone_constraints=None, n_estimators=100,
n_jobs=10, num_parallel_tree=None, predictor=None,
random_state=42, reg_alpha=None, reg_lambda=None,
scale_pos_weight=None, subsample=None, tree_method=None,
validate_parameters=None, verbosity=None), 'clf__estimator__learning_rate': 0.01, 'clf__estimator__max_depth': 6, 'clf__estimator__max_features': 'auto', 'clf__estimator__min_samples_leaf': 4}
XGBoost
0 best_model_params
1 [(clf__estimator, XGBClassifier(base_score=Non...
bts_fscore 0.68
bts_mcc 0.31
bts_precision 0.63
bts_recall 0.73
bts_accuracy 0.65
bts_roc_auc 0.65
bts_jaccard 0.51