Added and ran the hyperparameter-tuning scripts for all of the different classifiers, but could not get feature selection and hyperparameter tuning to run together successfully.

This commit is contained in:
Tanushree Tunstall 2022-05-20 08:09:24 +01:00
parent 74af5ef890
commit 37bda41f44
18 changed files with 131 additions and 142 deletions

View file

@ -60,19 +60,19 @@ jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
#%% Logistic Regression + hyperparam + FS: BaseEstimator: ClfSwitcher() #%% Logistic Regression + hyperparam + FS: BaseEstimator: ClfSwitcher()
model_lr = LogisticRegression(**rs) model_lr = LogisticRegression(**rs)
model_rfecv = RFECV(estimator = model_lr model_rfecv = RFECV(estimator = model_lr
, cv = skf_cv , cv = rskf_cv
#, cv = 10 #, cv = 10
, scoring = 'matthews_corrcoef' , scoring = 'matthews_corrcoef'
) )
model_rfecv = SequentialFeatureSelector(estimator = model_lr # model_rfecv = SequentialFeatureSelector(estimator = model_lr
, n_features_to_select = 'auto' # , n_features_to_select = 'auto'
, tol = None # , tol = None
# , cv = 10 # # , cv = 10
, cv = skf_cv # , cv = rskf_cv
# , direction ='backward' # # , direction ='backward'
, direction ='forward' # , direction ='forward'
, **njobs) # , **njobs)
# param_grid = [ # param_grid = [
# { 'C': np.logspace(0, 4, 10), # { 'C': np.logspace(0, 4, 10),
@ -297,3 +297,5 @@ print('\nFeatures selected from Sequential Feature Selector (Greedy):', len(sfsb
# Features selected from Sequential Feature Selector (Greedy, Forward): 6 [CV = 10] # Features selected from Sequential Feature Selector (Greedy, Forward): 6 [CV = 10]
#These are: Index(['duet_stability_change', 'deepddg', 'ddg_dynamut2', 'rsa', 'kd_values', 'maf'] #These are: Index(['duet_stability_change', 'deepddg', 'ddg_dynamut2', 'rsa', 'kd_values', 'maf']
############################################################################### ###############################################################################
# IMP: nice eg of including it as part of pipeline
# https://www.tomasbeuzen.com/post/scikit-learn-gridsearch-pipelines/

View file

@ -34,12 +34,7 @@ from xgboost import XGBClassifier
rs = {'random_state': 42} rs = {'random_state': 42}
njobs = {'n_jobs': 10} njobs = {'n_jobs': 10}
#%% Get train-test split and scoring functions #%% Get train-test split and scoring functions
# X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
# , num_df_wtgt['mutation_class']
# , test_size = 0.33
# , random_state = 2
# , shuffle = True
# , stratify = num_df_wtgt['mutation_class'])
y.to_frame().value_counts().plot(kind = 'bar') y.to_frame().value_counts().plot(kind = 'bar')
blind_test_df['dst_mode'].to_frame().value_counts().plot(kind = 'bar') blind_test_df['dst_mode'].to_frame().value_counts().plot(kind = 'bar')
@ -90,22 +85,22 @@ parameters = [
'clf__estimator__max_iter': list(range(100,800,100)), 'clf__estimator__max_iter': list(range(100,800,100)),
'clf__estimator__solver': ['saga'] 'clf__estimator__solver': ['saga']
}, },
# { {
# 'clf__estimator': [LogisticRegression(**rs)], 'clf__estimator': [LogisticRegression(**rs)],
# #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
# 'clf__estimator__C': np.logspace(0, 4, 10), 'clf__estimator__C': np.logspace(0, 4, 10),
# 'clf__estimator__penalty': ['l2', 'none'], 'clf__estimator__penalty': ['l2', 'none'],
# 'clf__estimator__max_iter': list(range(100,800,100)), 'clf__estimator__max_iter': list(range(100,800,100)),
# 'clf__estimator__solver': ['newton-cg', 'lbfgs', 'sag'] 'clf__estimator__solver': ['newton-cg', 'lbfgs', 'sag']
# }, },
# { {
# 'clf__estimator': [LogisticRegression(**rs)], 'clf__estimator': [LogisticRegression(**rs)],
# #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
# 'clf__estimator__C': np.logspace(0, 4, 10), 'clf__estimator__C': np.logspace(0, 4, 10),
# 'clf__estimator__penalty': ['l1', 'l2'], 'clf__estimator__penalty': ['l1', 'l2'],
# 'clf__estimator__max_iter': list(range(100,800,100)), 'clf__estimator__max_iter': list(range(100,800,100)),
# 'clf__estimator__solver': ['liblinear'] 'clf__estimator__solver': ['liblinear']
# } }
] ]
@ -120,7 +115,8 @@ gscv_lr = GridSearchCV(pipeline
, parameters , parameters
#, scoring = 'f1', refit = 'f1' #, scoring = 'f1', refit = 'f1'
, scoring = mcc_score_fn, refit = 'mcc' , scoring = mcc_score_fn, refit = 'mcc'
, cv = skf_cv #, cv = skf_cv
, cv = rskf_cv
, **njobs , **njobs
, return_train_score = False , return_train_score = False
, verbose = 3) , verbose = 3)
@ -138,7 +134,6 @@ print('\nMean test score from fit results:', round(np.nanmean(gscv_lr_fit_be_res
############################################################################### ###############################################################################
###################################### ######################################
# Blind test # Blind test
###################################### ######################################
@ -186,7 +181,7 @@ print(lr_bts_df)
# d3 # d3
# Create df with best model params # Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_lr_fit_be_mod.items() )]) model_params = pd.Series(['best_model_params', list(gscv_lr_fit_be_mod.items())])
model_params_df = model_params.to_frame() model_params_df = model_params.to_frame()
model_params_df model_params_df
model_params_df.columns = ['Logistic_Regression'] model_params_df.columns = ['Logistic_Regression']
@ -209,3 +204,4 @@ lr_df
print(confusion_matrix(y_bts, test_predict)) print(confusion_matrix(y_bts, test_predict))
cm = confusion_matrix(y_bts, test_predict) cm = confusion_matrix(y_bts, test_predict)

View file

@ -5,6 +5,8 @@ Created on Sun Mar 6 13:41:54 2022
@author: tanu @author: tanu
""" """
#https://stackoverflow.com/questions/51695322/compare-multiple-algorithms-with-sklearn-pipeline
import os, sys import os, sys
import pandas as pd import pandas as pd
import numpy as np import numpy as np
@ -19,7 +21,21 @@ from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process import kernels
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import DotProduct
from sklearn.gaussian_process.kernels import Matern
from sklearn.gaussian_process.kernels import RationalQuadratic
from sklearn.gaussian_process.kernels import WhiteKernel
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier from sklearn.linear_model import SGDClassifier
@ -87,7 +103,7 @@ rskf_cv = RepeatedStratifiedKFold(n_splits = 10
#, shuffle = False, random_state= None) #, shuffle = False, random_state= None)
#, shuffle = True #, shuffle = True
,**rs) ,**rs)
#my_mcc = make_scorer({'mcc':make_scorer(matthews_corrcoef})
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
#%% #%%

View file

@ -33,8 +33,8 @@ class ClfSwitcher(BaseEstimator):
parameters = [ parameters = [
{ {
'clf__estimator': [AdaBoostClassifier(**rs)] 'clf__estimator': [AdaBoostClassifier(**rs)]
, 'clf__estimator__n_estimators': [none, 1, 2] , 'clf__estimator__n_estimators': [1, 2, 5, 10]
, 'clf__estimator__base_estiamtor' : ['None', 1*SVC(), 1*KNeighborsClassifier()] #, 'clf__estimator__base_estimator' : ['SVC']
#, 'clf__estimator___splitter' : ["best", "random"] #, 'clf__estimator___splitter' : ["best", "random"]
} }
] ]
@ -48,7 +48,7 @@ pipeline = Pipeline([
# Grid search i.e hyperparameter tuning and refitting on mcc # Grid search i.e hyperparameter tuning and refitting on mcc
gscv_abc = GridSearchCV(pipeline gscv_abc = GridSearchCV(pipeline
, parameters , parameters
#, scoring = 'f1', refit = 'f1' #, scoring = 'matthews_corrcoef', refit = 'matthews_corrcoef'
, scoring = mcc_score_fn, refit = 'mcc' , scoring = mcc_score_fn, refit = 'mcc'
, cv = skf_cv , cv = skf_cv
, **njobs , **njobs
@ -64,7 +64,7 @@ gscv_abc_fit_be_res = gscv_abc_fit.cv_results_
print('Best model:\n', gscv_abc_fit_be_mod) print('Best model:\n', gscv_abc_fit_be_mod)
print('Best models score:\n', gscv_abc_fit.best_score_, ':' , round(gscv_abc_fit.best_score_, 2)) print('Best models score:\n', gscv_abc_fit.best_score_, ':' , round(gscv_abc_fit.best_score_, 2))
print('\nMean test score from fit results:', round(mean(gscv_abc_fit_be_re['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(mean(gscv_abc_fit_be_res['mean_test_mcc']),2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_abc_fit_be_res['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(np.nanmean(gscv_abc_fit_be_res['mean_test_mcc']),2))
###################################### ######################################
@ -102,17 +102,15 @@ abc_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
abc_bts_dict abc_bts_dict
# Create a df from dict with all scores # Create a df from dict with all scores
pd.DataFrame.from_dict(abc_bts_dict, orient = 'index', columns = 'best_model')
abc_bts_df = pd.DataFrame.from_dict(abc_bts_dict,orient = 'index') abc_bts_df = pd.DataFrame.from_dict(abc_bts_dict,orient = 'index')
abc_bts_df.columns = ['Logistic_Regression'] abc_bts_df.columns = ['ABC']
print(abc_bts_df) print(abc_bts_df)
# Create df with best model params # Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_abc_fit_be_mod.items() )]) model_params = pd.Series(['best_model_params', list(gscv_abc_fit_be_mod.items() )])
model_params_df = model_params.to_frame() model_params_df = model_params.to_frame()
model_params_df model_params_df
model_params_df.columns = ['Logistic_Regression'] model_params_df.columns = ['ABC']
model_params_df.columns model_params_df.columns
# Combine the df of scores and the best model params # Combine the df of scores and the best model params

View file

@ -33,13 +33,12 @@ class ClfSwitcher(BaseEstimator):
parameters = [ parameters = [
{ {
'clf__estimator': [BaggingClassifier(**rs 'clf__estimator': [BaggingClassifier(**rs
, **njobs , **njobs
, bootstrap = True , bootstrap = True
, oob_score = True)], , oob_score = True)]
, 'clf__estimator__n_estimators' : [10, 100, 1000] , 'clf__estimator__n_estimators' : [10, 25, 50, 100, 150, 200, 500, 700, 1000]
# If None, then the base estimator is a DecisionTreeClassifier. # If None, then the base estimator is a DecisionTreeClassifier.
, 'clf__estimator__base_estimator' : ['None', 'SVC()', 'KNeighborsClassifier()']# if none, DT is used #, 'clf__estimator__base_estimator' : ['None', 'SVC()', 'KNeighborsClassifier()']# if none, DT is used
, 'clf__estimator__gamma': ['scale', 'auto']
} }
] ]
@ -68,7 +67,7 @@ gscv_bc_fit_be_res = gscv_bc_fit.cv_results_
print('Best model:\n', gscv_bc_fit_be_mod) print('Best model:\n', gscv_bc_fit_be_mod)
print('Best models score:\n', gscv_bc_fit.best_score_, ':' , round(gscv_bc_fit.best_score_, 2)) print('Best models score:\n', gscv_bc_fit.best_score_, ':' , round(gscv_bc_fit.best_score_, 2))
print('\nMean test score from fit results:', round(mean(gscv_bc_fit_be_re['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(mean(gscv_bc_fit_be_res['mean_test_mcc']),2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_bc_fit_be_res['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(np.nanmean(gscv_bc_fit_be_res['mean_test_mcc']),2))
###################################### ######################################
@ -106,17 +105,15 @@ bc_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
bc_bts_dict bc_bts_dict
# Create a df from dict with all scores # Create a df from dict with all scores
pd.DataFrame.from_dict(bc_bts_dict, orient = 'index', columns = 'best_model')
bc_bts_df = pd.DataFrame.from_dict(bc_bts_dict,orient = 'index') bc_bts_df = pd.DataFrame.from_dict(bc_bts_dict,orient = 'index')
bc_bts_df.columns = ['Logistic_Regression'] bc_bts_df.columns = ['BC']
print(bc_bts_df) print(bc_bts_df)
# Create df with best model params # Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_bc_fit_be_mod.items() )]) model_params = pd.Series(['best_model_params', list(gscv_bc_fit_be_mod.items() )])
model_params_df = model_params.to_frame() model_params_df = model_params.to_frame()
model_params_df model_params_df
model_params_df.columns = ['Logistic_Regression'] model_params_df.columns = ['BC']
model_params_df.columns model_params_df.columns
# Combine the df of scores and the best model params # Combine the df of scores and the best model params

View file

@ -33,10 +33,10 @@ class ClfSwitcher(BaseEstimator):
parameters = [ parameters = [
{ {
'clf__estimator': [BernoulliNB()] 'clf__estimator': [BernoulliNB()]
, 'clf__estimator__alpha': [0, 1] , 'clf__estimator__alpha': [1, 0]
, 'clf__estimator__binarize':['None', 0] , 'clf__estimator__binarize':[None, 0]
, 'clf__estimator__fit_prior': [True] , 'clf__estimator__fit_prior': [True]
, 'clf__estimator__class_prior': ['None'] , 'clf__estimator__class_prior': [None]
} }
] ]
@ -65,7 +65,7 @@ gscv_bnb_fit_be_res = gscv_bnb_fit.cv_results_
print('Best model:\n', gscv_bnb_fit_be_mod) print('Best model:\n', gscv_bnb_fit_be_mod)
print('Best models score:\n', gscv_bnb_fit.best_score_, ':' , round(gscv_bnb_fit.best_score_, 2)) print('Best models score:\n', gscv_bnb_fit.best_score_, ':' , round(gscv_bnb_fit.best_score_, 2))
print('\nMean test score from fit results:', round(mean(gscv_bnb_fit_be_re['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(mean(gscv_bnb_fit_be_res['mean_test_mcc']),2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_bnb_fit_be_res['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(np.nanmean(gscv_bnb_fit_be_res['mean_test_mcc']),2))
###################################### ######################################
@ -103,17 +103,15 @@ bnb_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
bnb_bts_dict bnb_bts_dict
# Create a df from dict with all scores # Create a df from dict with all scores
pd.DataFrame.from_dict(bnb_bts_dict, orient = 'index', columns = 'best_model')
bnb_bts_df = pd.DataFrame.from_dict(bnb_bts_dict,orient = 'index') bnb_bts_df = pd.DataFrame.from_dict(bnb_bts_dict,orient = 'index')
bnb_bts_df.columns = ['Logistic_Regression'] bnb_bts_df.columns = ['BNB']
print(bnb_bts_df) print(bnb_bts_df)
# Create df with best model params # Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_bnb_fit_be_mod.items() )]) model_params = pd.Series(['best_model_params', list(gscv_bnb_fit_be_mod.items() )])
model_params_df = model_params.to_frame() model_params_df = model_params.to_frame()
model_params_df model_params_df
model_params_df.columns = ['Logistic_Regression'] model_params_df.columns = ['BNB']
model_params_df.columns model_params_df.columns
# Combine the df of scores and the best model params # Combine the df of scores and the best model params

View file

@ -32,10 +32,9 @@ class ClfSwitcher(BaseEstimator):
parameters = [ parameters = [
{ {
'clf__estimator': [DecisionTreeClassifier(**rs 'clf__estimator': [DecisionTreeClassifier(**rs)]
, **njobs)]
, 'clf__estimator__max_depth': [None, 2, 4, 6, 8, 10, 12, 16, 20] , 'clf__estimator__max_depth': [None, 2, 4, 6, 8, 10, 12, 16, 20]
, 'clf__estimator__class_weight':['balanced','balanced_subsample'] , 'clf__estimator__class_weight':['balanced']
, 'clf__estimator__criterion': ['gini', 'entropy', 'log_loss'] , 'clf__estimator__criterion': ['gini', 'entropy', 'log_loss']
, 'clf__estimator__max_features': [None, 'sqrt', 'log2'] , 'clf__estimator__max_features': [None, 'sqrt', 'log2']
, 'clf__estimator__min_samples_leaf': [1, 2, 3, 4, 5, 10] , 'clf__estimator__min_samples_leaf': [1, 2, 3, 4, 5, 10]
@ -106,17 +105,15 @@ dt_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
dt_bts_dict dt_bts_dict
# Create a df from dict with all scores # Create a df from dict with all scores
pd.DataFrame.from_dict(dt_bts_dict, orient = 'index', columns = 'best_model')
dt_bts_df = pd.DataFrame.from_dict(dt_bts_dict,orient = 'index') dt_bts_df = pd.DataFrame.from_dict(dt_bts_dict,orient = 'index')
dt_bts_df.columns = ['Logistic_Regression'] dt_bts_df.columns = ['DT']
print(dt_bts_df) print(dt_bts_df)
# Create df with best model params # Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_dt_fit_be_mod.items() )]) model_params = pd.Series(['best_model_params', list(gscv_dt_fit_be_mod.items() )])
model_params_df = model_params.to_frame() model_params_df = model_params.to_frame()
model_params_df model_params_df
model_params_df.columns = ['Logistic_Regression'] model_params_df.columns = ['DT']
model_params_df.columns model_params_df.columns
# Combine the df of scores and the best model params # Combine the df of scores and the best model params

View file

@ -67,7 +67,7 @@ gscv_gbc_fit_be_res = gscv_gbc_fit.cv_results_
print('Best model:\n', gscv_gbc_fit_be_mod) print('Best model:\n', gscv_gbc_fit_be_mod)
print('Best models score:\n', gscv_gbc_fit.best_score_, ':' , round(gscv_gbc_fit.best_score_, 2)) print('Best models score:\n', gscv_gbc_fit.best_score_, ':' , round(gscv_gbc_fit.best_score_, 2))
print('\nMean test score from fit results:', round(mean(gscv_gbc_fit_be_re['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(mean(gscv_gbc_fit_be_res['mean_test_mcc']),2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_gbc_fit_be_res['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(np.nanmean(gscv_gbc_fit_be_res['mean_test_mcc']),2))
###################################### ######################################
@ -105,17 +105,15 @@ gbc_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
gbc_bts_dict gbc_bts_dict
# Create a df from dict with all scores # Create a df from dict with all scores
pd.DataFrame.from_dict(gbc_bts_dict, orient = 'index', columns = 'best_model')
gbc_bts_df = pd.DataFrame.from_dict(gbc_bts_dict,orient = 'index') gbc_bts_df = pd.DataFrame.from_dict(gbc_bts_dict,orient = 'index')
gbc_bts_df.columns = ['Logistic_Regression'] gbc_bts_df.columns = ['GBC']
print(gbc_bts_df) print(gbc_bts_df)
# Create df with best model params # Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_gbc_fit_be_mod.items() )]) model_params = pd.Series(['best_model_params', list(gscv_gbc_fit_be_mod.items() )])
model_params_df = model_params.to_frame() model_params_df = model_params.to_frame()
model_params_df model_params_df
model_params_df.columns = ['Logistic_Regression'] model_params_df.columns = ['GBC']
model_params_df.columns model_params_df.columns
# Combine the df of scores and the best model params # Combine the df of scores and the best model params

View file

@ -32,7 +32,7 @@ class ClfSwitcher(BaseEstimator):
parameters = [ parameters = [
{ {
'clf__estimator': [GaussianNB(**rs)] 'clf__estimator': [GaussianNB()]
, 'clf__estimator__priors': [None] , 'clf__estimator__priors': [None]
, 'clf__estimator__var_smoothing': np.logspace(0,-9, num=100) , 'clf__estimator__var_smoothing': np.logspace(0,-9, num=100)
} }
@ -63,7 +63,7 @@ gscv_gnb_fit_be_res = gscv_gnb_fit.cv_results_
print('Best model:\n', gscv_gnb_fit_be_mod) print('Best model:\n', gscv_gnb_fit_be_mod)
print('Best models score:\n', gscv_gnb_fit.best_score_, ':' , round(gscv_gnb_fit.best_score_, 2)) print('Best models score:\n', gscv_gnb_fit.best_score_, ':' , round(gscv_gnb_fit.best_score_, 2))
print('\nMean test score from fit results:', round(mean(gscv_gnb_fit_be_re['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(mean(gscv_gnb_fit_be_res['mean_test_mcc']),2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_gnb_fit_be_res['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(np.nanmean(gscv_gnb_fit_be_res['mean_test_mcc']),2))
###################################### ######################################
@ -101,17 +101,15 @@ gnb_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
gnb_bts_dict gnb_bts_dict
# Create a df from dict with all scores # Create a df from dict with all scores
pd.DataFrame.from_dict(gnb_bts_dict, orient = 'index', columns = 'best_model')
gnb_bts_df = pd.DataFrame.from_dict(gnb_bts_dict,orient = 'index') gnb_bts_df = pd.DataFrame.from_dict(gnb_bts_dict,orient = 'index')
gnb_bts_df.columns = ['Logistic_Regression'] gnb_bts_df.columns = ['GNB']
print(gnb_bts_df) print(gnb_bts_df)
# Create df with best model params # Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_gnb_fit_be_mod.items() )]) model_params = pd.Series(['best_model_params', list(gscv_gnb_fit_be_mod.items() )])
model_params_df = model_params.to_frame() model_params_df = model_params.to_frame()
model_params_df model_params_df
model_params_df.columns = ['Logistic_Regression'] model_params_df.columns = ['GNB']
model_params_df.columns model_params_df.columns
# Combine the df of scores and the best model params # Combine the df of scores and the best model params

View file

@ -101,17 +101,15 @@ gpc_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
gpc_bts_dict gpc_bts_dict
# Create a df from dict with all scores # Create a df from dict with all scores
pd.DataFrame.from_dict(gpc_bts_dict, orient = 'index', columns = 'best_model')
gpc_bts_df = pd.DataFrame.from_dict(gpc_bts_dict,orient = 'index') gpc_bts_df = pd.DataFrame.from_dict(gpc_bts_dict,orient = 'index')
gpc_bts_df.columns = ['Logistic_Regression'] gpc_bts_df.columns = ['GPC']
print(gpc_bts_df) print(gpc_bts_df)
# Create df with best model params # Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_gpc_fit_be_mod.items() )]) model_params = pd.Series(['best_model_params', list(gscv_gpc_fit_be_mod.items() )])
model_params_df = model_params.to_frame() model_params_df = model_params.to_frame()
model_params_df model_params_df
model_params_df.columns = ['Logistic_Regression'] model_params_df.columns = ['GPC']
model_params_df.columns model_params_df.columns
# Combine the df of scores and the best model params # Combine the df of scores and the best model params

View file

@ -32,10 +32,9 @@ class ClfSwitcher(BaseEstimator):
parameters = [ parameters = [
{ {
'clf__estimator': [KNeighborsClassifier(**rs 'clf__estimator': [KNeighborsClassifier(**njobs)]
, **njobs] , 'clf__estimator__n_neighbors': range(21, 51, 2)
#, 'clf__estimator__n_neighbors': range(1, 21, 2) #, 'clf__estimator__n_neighbors': [5, 7, 11]
, 'clf__estimator__n_neighbors': [5, 7, 11]
, 'clf__estimator__metric' : ['euclidean', 'manhattan', 'minkowski'] , 'clf__estimator__metric' : ['euclidean', 'manhattan', 'minkowski']
, 'clf__estimator__weights' : ['uniform', 'distance'] , 'clf__estimator__weights' : ['uniform', 'distance']
@ -67,7 +66,7 @@ gscv_knn_fit_be_res = gscv_knn_fit.cv_results_
print('Best model:\n', gscv_knn_fit_be_mod) print('Best model:\n', gscv_knn_fit_be_mod)
print('Best models score:\n', gscv_knn_fit.best_score_, ':' , round(gscv_knn_fit.best_score_, 2)) print('Best models score:\n', gscv_knn_fit.best_score_, ':' , round(gscv_knn_fit.best_score_, 2))
print('\nMean test score from fit results:', round(mean(gscv_knn_fit_be_re['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(mean(gscv_knn_fit_be_res['mean_test_mcc']),2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_knn_fit_be_res['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(np.nanmean(gscv_knn_fit_be_res['mean_test_mcc']),2))
###################################### ######################################
@ -105,17 +104,15 @@ knn_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
knn_bts_dict knn_bts_dict
# Create a df from dict with all scores # Create a df from dict with all scores
pd.DataFrame.from_dict(knn_bts_dict, orient = 'index', columns = 'best_model')
knn_bts_df = pd.DataFrame.from_dict(knn_bts_dict,orient = 'index') knn_bts_df = pd.DataFrame.from_dict(knn_bts_dict,orient = 'index')
knn_bts_df.columns = ['Logistic_Regression'] knn_bts_df.columns = ['KNN']
print(knn_bts_df) print(knn_bts_df)
# Create df with best model params # Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_knn_fit_be_mod.items() )]) model_params = pd.Series(['best_model_params', list(gscv_knn_fit_be_mod.items() )])
model_params_df = model_params.to_frame() model_params_df = model_params.to_frame()
model_params_df model_params_df
model_params_df.columns = ['Logistic_Regression'] model_params_df.columns = ['KNN']
model_params_df.columns model_params_df.columns
# Combine the df of scores and the best model params # Combine the df of scores and the best model params

View file

@ -171,8 +171,6 @@ lr_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
lr_bts_dict lr_bts_dict
# Create a df from dict with all scores # Create a df from dict with all scores
pd.DataFrame.from_dict(lr_bts_dict, orient = 'index', columns = 'best_model')
lr_bts_df = pd.DataFrame.from_dict(lr_bts_dict,orient = 'index') lr_bts_df = pd.DataFrame.from_dict(lr_bts_dict,orient = 'index')
lr_bts_df.columns = ['Logistic_Regression'] lr_bts_df.columns = ['Logistic_Regression']
print(lr_bts_df) print(lr_bts_df)

View file

@ -33,12 +33,11 @@ class ClfSwitcher(BaseEstimator):
parameters = [ parameters = [
{ {
'clf__estimator': [MLPClassifier(**rs 'clf__estimator': [MLPClassifier(**rs
, **njobs , max_iter = 1000)]
, max_iter = 500)], , 'clf__estimator__hidden_layer_sizes': [(1), (2), (3), (5), (10)]
, 'clf__estimator__hidden_layer_sizes': [(1), (2), (3)] , 'clf__estimator__solver': ['lbfgs', 'sgd', 'adam']
, 'clf__estimator__max_features': ['auto', 'sqrt'] , 'clf__estimator__learning_rate': ['constant', 'invscaling', 'adaptive']
, 'clf__estimator__min_samples_leaf': [2, 4, 8] #, 'clf__estimator__learning_rate': ['constant']
, 'clf__estimator__min_samples_split': [10, 20]
} }
] ]
@ -68,7 +67,7 @@ gscv_mlp_fit_be_res = gscv_mlp_fit.cv_results_
print('Best model:\n', gscv_mlp_fit_be_mod) print('Best model:\n', gscv_mlp_fit_be_mod)
print('Best models score:\n', gscv_mlp_fit.best_score_, ':' , round(gscv_mlp_fit.best_score_, 2)) print('Best models score:\n', gscv_mlp_fit.best_score_, ':' , round(gscv_mlp_fit.best_score_, 2))
print('\nMean test score from fit results:', round(mean(gscv_mlp_fit_be_re['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(mean(gscv_mlp_fit_be_res['mean_test_mcc']),2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_mlp_fit_be_res['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(np.nanmean(gscv_mlp_fit_be_res['mean_test_mcc']),2))
###################################### ######################################
@ -106,17 +105,15 @@ mlp_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
mlp_bts_dict mlp_bts_dict
# Create a df from dict with all scores # Create a df from dict with all scores
pd.DataFrame.from_dict(mlp_bts_dict, orient = 'index', columns = 'best_model')
mlp_bts_df = pd.DataFrame.from_dict(mlp_bts_dict,orient = 'index') mlp_bts_df = pd.DataFrame.from_dict(mlp_bts_dict,orient = 'index')
mlp_bts_df.columns = ['Logistic_Regression'] mlp_bts_df.columns = ['MLP']
print(mlp_bts_df) print(mlp_bts_df)
# Create df with best model params # Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_mlp_fit_be_mod.items() )]) model_params = pd.Series(['best_model_params', list(gscv_mlp_fit_be_mod.items() )])
model_params_df = model_params.to_frame() model_params_df = model_params.to_frame()
model_params_df model_params_df
model_params_df.columns = ['Logistic_Regression'] model_params_df.columns = ['MLP']
model_params_df.columns model_params_df.columns
# Combine the df of scores and the best model params # Combine the df of scores and the best model params

View file

@ -100,17 +100,15 @@ qda_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
qda_bts_dict qda_bts_dict
# Create a df from dict with all scores # Create a df from dict with all scores
pd.DataFrame.from_dict(qda_bts_dict, orient = 'index', columns = 'best_model')
qda_bts_df = pd.DataFrame.from_dict(qda_bts_dict,orient = 'index') qda_bts_df = pd.DataFrame.from_dict(qda_bts_dict,orient = 'index')
qda_bts_df.columns = ['Logistic_Regression'] qda_bts_df.columns = ['QDA']
print(qda_bts_df) print(qda_bts_df)
# Create df with best model params # Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_qda_fit_be_mod.items() )]) model_params = pd.Series(['best_model_params', list(gscv_qda_fit_be_mod.items() )])
model_params_df = model_params.to_frame() model_params_df = model_params.to_frame()
model_params_df model_params_df
model_params_df.columns = ['Logistic_Regression'] model_params_df.columns = ['QDA']
model_params_df.columns model_params_df.columns
# Combine the df of scores and the best model params # Combine the df of scores and the best model params

View file

@ -31,11 +31,9 @@ class ClfSwitcher(BaseEstimator):
return self.estimator.score(X, y) return self.estimator.score(X, y)
parameters = [ parameters = [
{ {'clf__estimator' : [RidgeClassifier(**rs)]
'clf__estimator': [RidgeClassifier(**rs , 'clf__estimator__alpha': [0.1, 0.2, 0.5, 0.8, 1.0]
, **njobs)], }
, 'clf__estimator__alpha': [0.1, 0.2, 0.5, 0.8, 1.0]
}
] ]
# Create pipeline # Create pipeline
@ -63,7 +61,7 @@ gscv_rc_fit_be_res = gscv_rc_fit.cv_results_
print('Best model:\n', gscv_rc_fit_be_mod) print('Best model:\n', gscv_rc_fit_be_mod)
print('Best models score:\n', gscv_rc_fit.best_score_, ':' , round(gscv_rc_fit.best_score_, 2)) print('Best models score:\n', gscv_rc_fit.best_score_, ':' , round(gscv_rc_fit.best_score_, 2))
print('\nMean test score from fit results:', round(mean(gscv_rc_fit_be_re['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(mean(gscv_rc_fit_be_res['mean_test_mcc']),2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_rc_fit_be_res['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(np.nanmean(gscv_rc_fit_be_res['mean_test_mcc']),2))
###################################### ######################################
@ -101,17 +99,15 @@ rc_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
rc_bts_dict rc_bts_dict
# Create a df from dict with all scores # Create a df from dict with all scores
pd.DataFrame.from_dict(rc_bts_dict, orient = 'index', columns = 'best_model')
rc_bts_df = pd.DataFrame.from_dict(rc_bts_dict,orient = 'index') rc_bts_df = pd.DataFrame.from_dict(rc_bts_dict,orient = 'index')
rc_bts_df.columns = ['Logistic_Regression'] rc_bts_df.columns = ['Ridge Classifier']
print(rc_bts_df) print(rc_bts_df)
# Create df with best model params # Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_rc_fit_be_mod.items() )]) model_params = pd.Series(['best_model_params', list(gscv_rc_fit_be_mod.items() )])
model_params_df = model_params.to_frame() model_params_df = model_params.to_frame()
model_params_df model_params_df
model_params_df.columns = ['Logistic_Regression'] model_params_df.columns = ['Ridge Classifier']
model_params_df.columns model_params_df.columns
# Combine the df of scores and the best model params # Combine the df of scores and the best model params

View file

@ -71,7 +71,7 @@ gscv_rf_fit_be_res = gscv_rf_fit.cv_results_
print('Best model:\n', gscv_rf_fit_be_mod) print('Best model:\n', gscv_rf_fit_be_mod)
print('Best models score:\n', gscv_rf_fit.best_score_, ':' , round(gscv_rf_fit.best_score_, 2)) print('Best models score:\n', gscv_rf_fit.best_score_, ':' , round(gscv_rf_fit.best_score_, 2))
print('\nMean test score from fit results:', round(mean(gscv_rf_fit_be_re['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(mean(gscv_rf_fit_be_res['mean_test_mcc']),2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_rf_fit_be_res['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(np.nanmean(gscv_rf_fit_be_res['mean_test_mcc']),2))
###################################### ######################################
@ -109,8 +109,6 @@ rf_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
rf_bts_dict rf_bts_dict
# Create a df from dict with all scores # Create a df from dict with all scores
pd.DataFrame.from_dict(rf_bts_dict, orient = 'index', columns = 'best_model')
rf_bts_df = pd.DataFrame.from_dict(rf_bts_dict,orient = 'index') rf_bts_df = pd.DataFrame.from_dict(rf_bts_dict,orient = 'index')
rf_bts_df.columns = ['Logistic_Regression'] rf_bts_df.columns = ['Logistic_Regression']
print(rf_bts_df) print(rf_bts_df)

View file

@ -32,9 +32,10 @@ class ClfSwitcher(BaseEstimator):
parameters = [ parameters = [
{ {
'clf__estimator': [SVC(**rs 'clf__estimator': [SVC(**rs)]
, **njobs)], , 'clf__estimator__kernel': ['poly', 'rbf', 'sigmoid']
, 'clf__estimator__kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} #, 'clf__estimator__kernel': ['linear']
, 'clf__estimator__C' : [50, 10, 1.0, 0.1, 0.01] , 'clf__estimator__C' : [50, 10, 1.0, 0.1, 0.01]
, 'clf__estimator__gamma': ['scale', 'auto'] , 'clf__estimator__gamma': ['scale', 'auto']
@ -66,7 +67,7 @@ gscv_svc_fit_be_res = gscv_svc_fit.cv_results_
print('Best model:\n', gscv_svc_fit_be_mod) print('Best model:\n', gscv_svc_fit_be_mod)
print('Best models score:\n', gscv_svc_fit.best_score_, ':' , round(gscv_svc_fit.best_score_, 2)) print('Best models score:\n', gscv_svc_fit.best_score_, ':' , round(gscv_svc_fit.best_score_, 2))
print('\nMean test score from fit results:', round(mean(gscv_svc_fit_be_re['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(mean(gscv_svc_fit_be_res['mean_test_mcc']),2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_svc_fit_be_res['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(np.nanmean(gscv_svc_fit_be_res['mean_test_mcc']),2))
###################################### ######################################
@ -104,17 +105,15 @@ svc_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
svc_bts_dict svc_bts_dict
# Create a df from dict with all scores # Create a df from dict with all scores
pd.DataFrame.from_dict(svc_bts_dict, orient = 'index', columns = 'best_model')
svc_bts_df = pd.DataFrame.from_dict(svc_bts_dict,orient = 'index') svc_bts_df = pd.DataFrame.from_dict(svc_bts_dict,orient = 'index')
svc_bts_df.columns = ['Logistic_Regression'] svc_bts_df.columns = ['SVC']
print(svc_bts_df) print(svc_bts_df)
# Create df with best model params # Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_svc_fit_be_mod.items() )]) model_params = pd.Series(['best_model_params', list(gscv_svc_fit_be_mod.items() )])
model_params_df = model_params.to_frame() model_params_df = model_params.to_frame()
model_params_df model_params_df
model_params_df.columns = ['Logistic_Regression'] model_params_df.columns = ['SVC']
model_params_df.columns model_params_df.columns
# Combine the df of scores and the best model params # Combine the df of scores and the best model params

View file

@ -5,7 +5,18 @@ Created on Wed May 18 06:03:24 2022
@author: tanu @author: tanu
""" """
#%% RandomForest + hyperparam: BaseEstimator: ClfSwitcher()
#%%
#https://www.datatechnotes.com/2019/07/classification-example-with.html
# XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
# colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
# max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
# n_estimators=100, n_jobs=1, nthread=None,
# objective='multi:softprob', random_state=0, reg_alpha=0,
# reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
# subsample=1, verbosity=1)
#%% XGBoost + hyperparam: BaseEstimator: ClfSwitcher()
class ClfSwitcher(BaseEstimator): class ClfSwitcher(BaseEstimator):
def __init__( def __init__(
self, self,
@ -32,12 +43,11 @@ class ClfSwitcher(BaseEstimator):
parameters = [ parameters = [
{ {
'clf__estimator': [XGBClassifier(**rs 'clf__estimator': [XGBClassifier(**rs , **njobs, verbose = 3)]
, **njobs]
, 'clf__estimator__learning_rate': [0.01, 0.05, 0.1, 0.2] , 'clf__estimator__learning_rate': [0.01, 0.05, 0.1, 0.2]
, 'clf__estimator__max_depth': [4, 6, 8, 10, 12, 16, 20] , 'clf__estimator__max_depth': [4, 6, 8, 10, 12, 16, 20]
, 'clf__estimator__min_samples_leaf': [4, 8, 12, 16, 20] #, 'clf__estimator__min_samples_leaf': [4, 8, 12, 16, 20]
, 'clf__estimator__max_features': ['auto', 'sqrt'] #, 'clf__estimator__max_features': ['auto', 'sqrt']
} }
] ]
@ -66,7 +76,7 @@ gscv_xgb_fit_be_res = gscv_xgb_fit.cv_results_
print('Best model:\n', gscv_xgb_fit_be_mod) print('Best model:\n', gscv_xgb_fit_be_mod)
print('Best models score:\n', gscv_xgb_fit.best_score_, ':' , round(gscv_xgb_fit.best_score_, 2)) print('Best models score:\n', gscv_xgb_fit.best_score_, ':' , round(gscv_xgb_fit.best_score_, 2))
print('\nMean test score from fit results:', round(mean(gscv_xgb_fit_be_re['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(mean(gscv_xgb_fit_be_res['mean_test_mcc']),2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_xgb_fit_be_res['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(np.nanmean(gscv_xgb_fit_be_res['mean_test_mcc']),2))
###################################### ######################################
@ -104,17 +114,15 @@ xgb_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
xgb_bts_dict xgb_bts_dict
# Create a df from dict with all scores # Create a df from dict with all scores
pd.DataFrame.from_dict(xgb_bts_dict, orient = 'index', columns = 'best_model')
xgb_bts_df = pd.DataFrame.from_dict(xgb_bts_dict,orient = 'index') xgb_bts_df = pd.DataFrame.from_dict(xgb_bts_dict,orient = 'index')
xgb_bts_df.columns = ['Logistic_Regression'] xgb_bts_df.columns = ['XGBoost']
print(xgb_bts_df) print(xgb_bts_df)
# Create df with best model params # Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_xgb_fit_be_mod.items() )]) model_params = pd.Series(['best_model_params', list(gscv_xgb_fit_be_mod.items() )])
model_params_df = model_params.to_frame() model_params_df = model_params.to_frame()
model_params_df model_params_df
model_params_df.columns = ['Logistic_Regression'] model_params_df.columns = ['XGBoost']
model_params_df.columns model_params_df.columns
# Combine the df of scores and the best model params # Combine the df of scores and the best model params