diff --git a/UQ_LR_FS.py b/UQ_LR_FS.py index 67942b8..9110b77 100644 --- a/UQ_LR_FS.py +++ b/UQ_LR_FS.py @@ -60,19 +60,19 @@ jacc_score_fn = {'jcc': make_scorer(jaccard_score)} #%% Logistic Regression + hyperparam + FS: BaseEstimator: ClfSwitcher() model_lr = LogisticRegression(**rs) model_rfecv = RFECV(estimator = model_lr - , cv = skf_cv + , cv = rskf_cv #, cv = 10 , scoring = 'matthews_corrcoef' ) -model_rfecv = SequentialFeatureSelector(estimator = model_lr - , n_features_to_select = 'auto' - , tol = None -# , cv = 10 - , cv = skf_cv -# , direction ='backward' - , direction ='forward' - , **njobs) +# model_rfecv = SequentialFeatureSelector(estimator = model_lr +# , n_features_to_select = 'auto' +# , tol = None +# # , cv = 10 +# , cv = rskf_cv +# # , direction ='backward' +# , direction ='forward' +# , **njobs) # param_grid = [ # { 'C': np.logspace(0, 4, 10), @@ -296,4 +296,6 @@ print('\nFeatures selected from Sequential Feature Selector (Greedy):', len(sfsb # Features selected from Sequential Feature Selector (Greedy, Forward): 6 [CV = 10] #These are: Index(['duet_stability_change', 'deepddg', 'ddg_dynamut2', 'rsa', 'kd_values', 'maf'] -############################################################################### \ No newline at end of file +############################################################################### +# IMP: nice eg of including it as part of pipeline +# https://www.tomasbeuzen.com/post/scikit-learn-gridsearch-pipelines/ \ No newline at end of file diff --git a/UQ_LR_p1.py b/UQ_LR_p1.py index 813bd2a..dc628b5 100644 --- a/UQ_LR_p1.py +++ b/UQ_LR_p1.py @@ -34,12 +34,7 @@ from xgboost import XGBClassifier rs = {'random_state': 42} njobs = {'n_jobs': 10} #%% Get train-test split and scoring functions -# X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN] -# , num_df_wtgt['mutation_class'] -# , test_size = 0.33 -# , random_state = 2 -# , shuffle = True -# , stratify = num_df_wtgt['mutation_class']) + y.to_frame().value_counts().plot(kind = 'bar') blind_test_df['dst_mode'].to_frame().value_counts().plot(kind = 'bar') @@ -90,22 +85,22 @@ parameters = [ 'clf__estimator__max_iter': list(range(100,800,100)), 'clf__estimator__solver': ['saga'] }, - # { - # 'clf__estimator': [LogisticRegression(**rs)], - # #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], - # 'clf__estimator__C': np.logspace(0, 4, 10), - # 'clf__estimator__penalty': ['l2', 'none'], - # 'clf__estimator__max_iter': list(range(100,800,100)), - # 'clf__estimator__solver': ['newton-cg', 'lbfgs', 'sag'] - # }, - # { - # 'clf__estimator': [LogisticRegression(**rs)], - # #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], - # 'clf__estimator__C': np.logspace(0, 4, 10), - # 'clf__estimator__penalty': ['l1', 'l2'], - # 'clf__estimator__max_iter': list(range(100,800,100)), - # 'clf__estimator__solver': ['liblinear'] - # } + { + 'clf__estimator': [LogisticRegression(**rs)], + #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], + 'clf__estimator__C': np.logspace(0, 4, 10), + 'clf__estimator__penalty': ['l2', 'none'], + 'clf__estimator__max_iter': list(range(100,800,100)), + 'clf__estimator__solver': ['newton-cg', 'lbfgs', 'sag'] + }, + { + 'clf__estimator': [LogisticRegression(**rs)], + #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], + 'clf__estimator__C': np.logspace(0, 4, 10), + 'clf__estimator__penalty': ['l1', 'l2'], + 'clf__estimator__max_iter': list(range(100,800,100)), + 'clf__estimator__solver': ['liblinear'] + } ] @@ -120,7 +115,8 @@ gscv_lr = 
GridSearchCV(pipeline , parameters #, scoring = 'f1', refit = 'f1' , scoring = mcc_score_fn, refit = 'mcc' - , cv = skf_cv + #, cv = skf_cv + , cv = rskf_cv , **njobs , return_train_score = False , verbose = 3) @@ -138,7 +134,6 @@ print('\nMean test score from fit results:', round(np.nanmean(gscv_lr_fit_be_res ############################################################################### - ###################################### # Blind test ###################################### @@ -186,7 +181,7 @@ print(lr_bts_df) # d3 # Create df with best model params -model_params = pd.Series(['best_model_params', list(gscv_lr_fit_be_mod.items() )]) +model_params = pd.Series(['best_model_params', list(gscv_lr_fit_be_mod.items())]) model_params_df = model_params.to_frame() model_params_df model_params_df.columns = ['Logistic_Regression'] @@ -209,3 +204,4 @@ lr_df print(confusion_matrix(y_bts, test_predict)) cm = confusion_matrix(y_bts, test_predict) + diff --git a/UQ_pnca_ML.py b/UQ_pnca_ML.py index 76b5dfa..293d878 100644 --- a/UQ_pnca_ML.py +++ b/UQ_pnca_ML.py @@ -5,6 +5,8 @@ Created on Sun Mar 6 13:41:54 2022 @author: tanu """ + +#https://stackoverflow.com/questions/51695322/compare-multiple-algorithms-with-sklearn-pipeline import os, sys import pandas as pd import numpy as np @@ -19,7 +21,21 @@ from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble import GradientBoostingClassifier +from sklearn.ensemble import BaggingClassifier +from sklearn.naive_bayes import GaussianNB +from sklearn.gaussian_process import GaussianProcessClassifier +from sklearn.gaussian_process import kernels +from sklearn.gaussian_process.kernels import RBF +from sklearn.gaussian_process.kernels import DotProduct +from sklearn.gaussian_process.kernels import Matern +from sklearn.gaussian_process.kernels import RationalQuadratic +from sklearn.gaussian_process.kernels import WhiteKernel + +from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis from sklearn.neural_network import MLPClassifier + +from sklearn.linear_model import RidgeClassifier +from sklearn.svm import SVC from xgboost import XGBClassifier from sklearn.naive_bayes import MultinomialNB from sklearn.linear_model import SGDClassifier @@ -87,7 +103,7 @@ rskf_cv = RepeatedStratifiedKFold(n_splits = 10 #, shuffle = False, random_state= None) #, shuffle = True ,**rs) -#my_mcc = make_scorer({'mcc':make_scorer(matthews_corrcoef}) + mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} #%% diff --git a/uq_ml_models/UQ_ABC.py b/uq_ml_models/UQ_ABC.py index c812d02..23e5f8d 100644 --- a/uq_ml_models/UQ_ABC.py +++ b/uq_ml_models/UQ_ABC.py @@ -33,8 +33,8 @@ class ClfSwitcher(BaseEstimator): parameters = [ { 'clf__estimator': [AdaBoostClassifier(**rs)] - , 'clf__estimator__n_estimators': [none, 1, 2] - , 'clf__estimator__base_estiamtor' : ['None', 1*SVC(), 1*KNeighborsClassifier()] + , 'clf__estimator__n_estimators': [1, 2, 5, 10] + #, 'clf__estimator__base_estimator' : ['SVC'] #, 'clf__estimator___splitter' : ["best", "random"] } ] @@ -48,7 +48,7 @@ pipeline = Pipeline([ # Grid search i.e hyperparameter tuning and refitting on mcc gscv_abc = GridSearchCV(pipeline , parameters - #, scoring = 'f1', refit = 'f1' + #, scoring = 'matthews_corrcoef', refit = 'matthews_corrcoef' , scoring = mcc_score_fn, refit = 'mcc' , cv = skf_cv , **njobs @@ -64,7 +64,7 @@ gscv_abc_fit_be_res = gscv_abc_fit.cv_results_ print('Best model:\n', 
gscv_abc_fit_be_mod) print('Best models score:\n', gscv_abc_fit.best_score_, ':' , round(gscv_abc_fit.best_score_, 2)) -print('\nMean test score from fit results:', round(mean(gscv_abc_fit_be_re['mean_test_mcc']),2)) +print('\nMean test score from fit results:', round(mean(gscv_abc_fit_be_res['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(np.nanmean(gscv_abc_fit_be_res['mean_test_mcc']),2)) ###################################### @@ -102,17 +102,15 @@ abc_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) abc_bts_dict # Create a df from dict with all scores -pd.DataFrame.from_dict(abc_bts_dict, orient = 'index', columns = 'best_model') - abc_bts_df = pd.DataFrame.from_dict(abc_bts_dict,orient = 'index') -abc_bts_df.columns = ['Logistic_Regression'] +abc_bts_df.columns = ['ABC'] print(abc_bts_df) # Create df with best model params model_params = pd.Series(['best_model_params', list(gscv_abc_fit_be_mod.items() )]) model_params_df = model_params.to_frame() model_params_df -model_params_df.columns = ['Logistic_Regression'] +model_params_df.columns = ['ABC'] model_params_df.columns # Combine the df of scores and the best model params diff --git a/uq_ml_models/UQ_BC.py b/uq_ml_models/UQ_BC.py index 0938a56..776db9f 100644 --- a/uq_ml_models/UQ_BC.py +++ b/uq_ml_models/UQ_BC.py @@ -33,13 +33,12 @@ class ClfSwitcher(BaseEstimator): parameters = [ { 'clf__estimator': [BaggingClassifier(**rs - , **njobs - , bootstrap = True - , oob_score = True)], - , 'clf__estimator__n_estimators' : [10, 100, 1000] + , **njobs + , bootstrap = True + , oob_score = True)] + , 'clf__estimator__n_estimators' : [10, 25, 50, 100, 150, 200, 500, 700, 1000] # If None, then the base estimator is a DecisionTreeClassifier. - , 'clf__estimator__base_estimator' : ['None', 'SVC()', 'KNeighborsClassifier()']# if none, DT is used - , 'clf__estimator__gamma': ['scale', 'auto'] + #, 'clf__estimator__base_estimator' : ['None', 'SVC()', 'KNeighborsClassifier()']# if none, DT is used } ] @@ -68,7 +67,7 @@ gscv_bc_fit_be_res = gscv_bc_fit.cv_results_ print('Best model:\n', gscv_bc_fit_be_mod) print('Best models score:\n', gscv_bc_fit.best_score_, ':' , round(gscv_bc_fit.best_score_, 2)) -print('\nMean test score from fit results:', round(mean(gscv_bc_fit_be_re['mean_test_mcc']),2)) +print('\nMean test score from fit results:', round(mean(gscv_bc_fit_be_res['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(np.nanmean(gscv_bc_fit_be_res['mean_test_mcc']),2)) ###################################### @@ -106,17 +105,15 @@ bc_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) bc_bts_dict # Create a df from dict with all scores -pd.DataFrame.from_dict(bc_bts_dict, orient = 'index', columns = 'best_model') - bc_bts_df = pd.DataFrame.from_dict(bc_bts_dict,orient = 'index') -bc_bts_df.columns = ['Logistic_Regression'] +bc_bts_df.columns = ['BC'] print(bc_bts_df) # Create df with best model params model_params = pd.Series(['best_model_params', list(gscv_bc_fit_be_mod.items() )]) model_params_df = model_params.to_frame() model_params_df -model_params_df.columns = ['Logistic_Regression'] +model_params_df.columns = ['BC'] model_params_df.columns # Combine the df of scores and the best model params diff --git a/uq_ml_models/UQ_BNB.py b/uq_ml_models/UQ_BNB.py index 52c6cbe..9092c2d 100644 --- a/uq_ml_models/UQ_BNB.py +++ b/uq_ml_models/UQ_BNB.py @@ -33,10 +33,10 @@ class ClfSwitcher(BaseEstimator): parameters = [ { 'clf__estimator': [BernoulliNB()] - , 
'clf__estimator__alpha': [0, 1] - , 'clf__estimator__binarize':['None', 0] + , 'clf__estimator__alpha': [1, 0] + , 'clf__estimator__binarize':[None, 0] , 'clf__estimator__fit_prior': [True] - , 'clf__estimator__class_prior': ['None'] + , 'clf__estimator__class_prior': [None] } ] @@ -65,7 +65,7 @@ gscv_bnb_fit_be_res = gscv_bnb_fit.cv_results_ print('Best model:\n', gscv_bnb_fit_be_mod) print('Best models score:\n', gscv_bnb_fit.best_score_, ':' , round(gscv_bnb_fit.best_score_, 2)) -print('\nMean test score from fit results:', round(mean(gscv_bnb_fit_be_re['mean_test_mcc']),2)) +print('\nMean test score from fit results:', round(mean(gscv_bnb_fit_be_res['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(np.nanmean(gscv_bnb_fit_be_res['mean_test_mcc']),2)) ###################################### @@ -103,17 +103,15 @@ bnb_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) bnb_bts_dict # Create a df from dict with all scores -pd.DataFrame.from_dict(bnb_bts_dict, orient = 'index', columns = 'best_model') - bnb_bts_df = pd.DataFrame.from_dict(bnb_bts_dict,orient = 'index') -bnb_bts_df.columns = ['Logistic_Regression'] +bnb_bts_df.columns = ['BNB'] print(bnb_bts_df) # Create df with best model params model_params = pd.Series(['best_model_params', list(gscv_bnb_fit_be_mod.items() )]) model_params_df = model_params.to_frame() model_params_df -model_params_df.columns = ['Logistic_Regression'] +model_params_df.columns = ['BNB'] model_params_df.columns # Combine the df of scores and the best model params diff --git a/uq_ml_models/UQ_DT.py b/uq_ml_models/UQ_DT.py index 9272a61..bed047b 100644 --- a/uq_ml_models/UQ_DT.py +++ b/uq_ml_models/UQ_DT.py @@ -32,10 +32,9 @@ class ClfSwitcher(BaseEstimator): parameters = [ { - 'clf__estimator': [DecisionTreeClassifier(**rs - , **njobs)] + 'clf__estimator': [DecisionTreeClassifier(**rs)] , 'clf__estimator__max_depth': [None, 2, 4, 6, 8, 10, 12, 16, 20] - , 'clf__estimator__class_weight':['balanced','balanced_subsample'] + , 'clf__estimator__class_weight':['balanced'] , 'clf__estimator__criterion': ['gini', 'entropy', 'log_loss'] , 'clf__estimator__max_features': [None, 'sqrt', 'log2'] , 'clf__estimator__min_samples_leaf': [1, 2, 3, 4, 5, 10] @@ -106,17 +105,15 @@ dt_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) dt_bts_dict # Create a df from dict with all scores -pd.DataFrame.from_dict(dt_bts_dict, orient = 'index', columns = 'best_model') - dt_bts_df = pd.DataFrame.from_dict(dt_bts_dict,orient = 'index') -dt_bts_df.columns = ['Logistic_Regression'] +dt_bts_df.columns = ['DT'] print(dt_bts_df) # Create df with best model params model_params = pd.Series(['best_model_params', list(gscv_dt_fit_be_mod.items() )]) model_params_df = model_params.to_frame() model_params_df -model_params_df.columns = ['Logistic_Regression'] +model_params_df.columns = ['DT'] model_params_df.columns # Combine the df of scores and the best model params diff --git a/uq_ml_models/UQ_GBC.py b/uq_ml_models/UQ_GBC.py index ae204c8..d692ce1 100644 --- a/uq_ml_models/UQ_GBC.py +++ b/uq_ml_models/UQ_GBC.py @@ -67,7 +67,7 @@ gscv_gbc_fit_be_res = gscv_gbc_fit.cv_results_ print('Best model:\n', gscv_gbc_fit_be_mod) print('Best models score:\n', gscv_gbc_fit.best_score_, ':' , round(gscv_gbc_fit.best_score_, 2)) -print('\nMean test score from fit results:', round(mean(gscv_gbc_fit_be_re['mean_test_mcc']),2)) +print('\nMean test score from fit results:', round(mean(gscv_gbc_fit_be_res['mean_test_mcc']),2)) print('\nMean test score from 
fit results:', round(np.nanmean(gscv_gbc_fit_be_res['mean_test_mcc']),2)) ###################################### @@ -105,17 +105,15 @@ gbc_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) gbc_bts_dict # Create a df from dict with all scores -pd.DataFrame.from_dict(gbc_bts_dict, orient = 'index', columns = 'best_model') - gbc_bts_df = pd.DataFrame.from_dict(gbc_bts_dict,orient = 'index') -gbc_bts_df.columns = ['Logistic_Regression'] +gbc_bts_df.columns = ['GBC'] print(gbc_bts_df) # Create df with best model params model_params = pd.Series(['best_model_params', list(gscv_gbc_fit_be_mod.items() )]) model_params_df = model_params.to_frame() model_params_df -model_params_df.columns = ['Logistic_Regression'] +model_params_df.columns = ['GBC'] model_params_df.columns # Combine the df of scores and the best model params diff --git a/uq_ml_models/UQ_GNB.py b/uq_ml_models/UQ_GNB.py index a2ea1bb..3dab3c0 100644 --- a/uq_ml_models/UQ_GNB.py +++ b/uq_ml_models/UQ_GNB.py @@ -32,7 +32,7 @@ class ClfSwitcher(BaseEstimator): parameters = [ { - 'clf__estimator': [GaussianNB(**rs)] + 'clf__estimator': [GaussianNB()] , 'clf__estimator__priors': [None] , 'clf__estimator__var_smoothing': np.logspace(0,-9, num=100) } @@ -63,7 +63,7 @@ gscv_gnb_fit_be_res = gscv_gnb_fit.cv_results_ print('Best model:\n', gscv_gnb_fit_be_mod) print('Best models score:\n', gscv_gnb_fit.best_score_, ':' , round(gscv_gnb_fit.best_score_, 2)) -print('\nMean test score from fit results:', round(mean(gscv_gnb_fit_be_re['mean_test_mcc']),2)) +print('\nMean test score from fit results:', round(mean(gscv_gnb_fit_be_res['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(np.nanmean(gscv_gnb_fit_be_res['mean_test_mcc']),2)) ###################################### @@ -101,17 +101,15 @@ gnb_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) gnb_bts_dict # Create a df from dict with all scores -pd.DataFrame.from_dict(gnb_bts_dict, orient = 'index', columns = 'best_model') - gnb_bts_df = pd.DataFrame.from_dict(gnb_bts_dict,orient = 'index') -gnb_bts_df.columns = ['Logistic_Regression'] +gnb_bts_df.columns = ['GNB'] print(gnb_bts_df) # Create df with best model params model_params = pd.Series(['best_model_params', list(gscv_gnb_fit_be_mod.items() )]) model_params_df = model_params.to_frame() model_params_df -model_params_df.columns = ['Logistic_Regression'] +model_params_df.columns = ['GNB'] model_params_df.columns # Combine the df of scores and the best model params diff --git a/uq_ml_models/UQ_GPC.py b/uq_ml_models/UQ_GPC.py index f59fa39..2fc5a88 100644 --- a/uq_ml_models/UQ_GPC.py +++ b/uq_ml_models/UQ_GPC.py @@ -101,17 +101,15 @@ gpc_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) gpc_bts_dict # Create a df from dict with all scores -pd.DataFrame.from_dict(gpc_bts_dict, orient = 'index', columns = 'best_model') - gpc_bts_df = pd.DataFrame.from_dict(gpc_bts_dict,orient = 'index') -gpc_bts_df.columns = ['Logistic_Regression'] +gpc_bts_df.columns = ['GPC'] print(gpc_bts_df) # Create df with best model params model_params = pd.Series(['best_model_params', list(gscv_gpc_fit_be_mod.items() )]) model_params_df = model_params.to_frame() model_params_df -model_params_df.columns = ['Logistic_Regression'] +model_params_df.columns = ['GPC'] model_params_df.columns # Combine the df of scores and the best model params diff --git a/uq_ml_models/UQ_KNN.py b/uq_ml_models/UQ_KNN.py index cd670ec..88b8fa0 100644 --- a/uq_ml_models/UQ_KNN.py +++ b/uq_ml_models/UQ_KNN.py @@ -32,10 
+32,9 @@ class ClfSwitcher(BaseEstimator): parameters = [ { - 'clf__estimator': [KNeighborsClassifier(**rs - , **njobs] - #, 'clf__estimator__n_neighbors': range(1, 21, 2) - , 'clf__estimator__n_neighbors': [5, 7, 11] + 'clf__estimator': [KNeighborsClassifier(**njobs)] + , 'clf__estimator__n_neighbors': range(21, 51, 2) + #, 'clf__estimator__n_neighbors': [5, 7, 11] , 'clf__estimator__metric' : ['euclidean', 'manhattan', 'minkowski'] , 'clf__estimator__weights' : ['uniform', 'distance'] @@ -67,7 +66,7 @@ gscv_knn_fit_be_res = gscv_knn_fit.cv_results_ print('Best model:\n', gscv_knn_fit_be_mod) print('Best models score:\n', gscv_knn_fit.best_score_, ':' , round(gscv_knn_fit.best_score_, 2)) -print('\nMean test score from fit results:', round(mean(gscv_knn_fit_be_re['mean_test_mcc']),2)) +print('\nMean test score from fit results:', round(mean(gscv_knn_fit_be_res['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(np.nanmean(gscv_knn_fit_be_res['mean_test_mcc']),2)) ###################################### @@ -105,17 +104,15 @@ knn_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) knn_bts_dict # Create a df from dict with all scores -pd.DataFrame.from_dict(knn_bts_dict, orient = 'index', columns = 'best_model') - knn_bts_df = pd.DataFrame.from_dict(knn_bts_dict,orient = 'index') -knn_bts_df.columns = ['Logistic_Regression'] +knn_bts_df.columns = ['KNN'] print(knn_bts_df) # Create df with best model params model_params = pd.Series(['best_model_params', list(gscv_knn_fit_be_mod.items() )]) model_params_df = model_params.to_frame() model_params_df -model_params_df.columns = ['Logistic_Regression'] +model_params_df.columns = ['KNN'] model_params_df.columns # Combine the df of scores and the best model params diff --git a/uq_ml_models/UQ_LR.py b/uq_ml_models/UQ_LR.py index 9f20f32..879a926 100644 --- a/uq_ml_models/UQ_LR.py +++ b/uq_ml_models/UQ_LR.py @@ -171,8 +171,6 @@ lr_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) lr_bts_dict # Create a df from dict with all scores -pd.DataFrame.from_dict(lr_bts_dict, orient = 'index', columns = 'best_model') - lr_bts_df = pd.DataFrame.from_dict(lr_bts_dict,orient = 'index') lr_bts_df.columns = ['Logistic_Regression'] print(lr_bts_df) diff --git a/uq_ml_models/UQ_MLP.py b/uq_ml_models/UQ_MLP.py index 0f77283..8c84e04 100644 --- a/uq_ml_models/UQ_MLP.py +++ b/uq_ml_models/UQ_MLP.py @@ -33,12 +33,11 @@ class ClfSwitcher(BaseEstimator): parameters = [ { 'clf__estimator': [MLPClassifier(**rs - , **njobs - , max_iter = 500)], - , 'clf__estimator__hidden_layer_sizes': [(1), (2), (3)] - , 'clf__estimator__max_features': ['auto', 'sqrt'] - , 'clf__estimator__min_samples_leaf': [2, 4, 8] - , 'clf__estimator__min_samples_split': [10, 20] + , max_iter = 1000)] + , 'clf__estimator__hidden_layer_sizes': [(1), (2), (3), (5), (10)] + , 'clf__estimator__solver': ['lbfgs', 'sgd', 'adam'] + , 'clf__estimator__learning_rate': ['constant', 'invscaling', 'adaptive'] + #, 'clf__estimator__learning_rate': ['constant'] } ] @@ -68,7 +67,7 @@ gscv_mlp_fit_be_res = gscv_mlp_fit.cv_results_ print('Best model:\n', gscv_mlp_fit_be_mod) print('Best models score:\n', gscv_mlp_fit.best_score_, ':' , round(gscv_mlp_fit.best_score_, 2)) -print('\nMean test score from fit results:', round(mean(gscv_mlp_fit_be_re['mean_test_mcc']),2)) +print('\nMean test score from fit results:', round(mean(gscv_mlp_fit_be_res['mean_test_mcc']),2)) print('\nMean test score from fit results:', 
round(np.nanmean(gscv_mlp_fit_be_res['mean_test_mcc']),2)) ###################################### @@ -106,17 +105,15 @@ mlp_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) mlp_bts_dict # Create a df from dict with all scores -pd.DataFrame.from_dict(mlp_bts_dict, orient = 'index', columns = 'best_model') - mlp_bts_df = pd.DataFrame.from_dict(mlp_bts_dict,orient = 'index') -mlp_bts_df.columns = ['Logistic_Regression'] +mlp_bts_df.columns = ['MLP'] print(mlp_bts_df) # Create df with best model params model_params = pd.Series(['best_model_params', list(gscv_mlp_fit_be_mod.items() )]) model_params_df = model_params.to_frame() model_params_df -model_params_df.columns = ['Logistic_Regression'] +model_params_df.columns = ['MLP'] model_params_df.columns # Combine the df of scores and the best model params diff --git a/uq_ml_models/UQ_QDA.py b/uq_ml_models/UQ_QDA.py index ff252ff..5024c3c 100644 --- a/uq_ml_models/UQ_QDA.py +++ b/uq_ml_models/UQ_QDA.py @@ -100,17 +100,15 @@ qda_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) qda_bts_dict # Create a df from dict with all scores -pd.DataFrame.from_dict(qda_bts_dict, orient = 'index', columns = 'best_model') - qda_bts_df = pd.DataFrame.from_dict(qda_bts_dict,orient = 'index') -qda_bts_df.columns = ['Logistic_Regression'] +qda_bts_df.columns = ['QDA'] print(qda_bts_df) # Create df with best model params model_params = pd.Series(['best_model_params', list(gscv_qda_fit_be_mod.items() )]) model_params_df = model_params.to_frame() model_params_df -model_params_df.columns = ['Logistic_Regression'] +model_params_df.columns = ['QDA'] model_params_df.columns # Combine the df of scores and the best model params diff --git a/uq_ml_models/UQ_RC.py b/uq_ml_models/UQ_RC.py index 1f37717..2ea710f 100644 --- a/uq_ml_models/UQ_RC.py +++ b/uq_ml_models/UQ_RC.py @@ -31,11 +31,9 @@ class ClfSwitcher(BaseEstimator): return self.estimator.score(X, y) parameters = [ - { - 'clf__estimator': [RidgeClassifier(**rs - , **njobs)], - , 'clf__estimator__alpha': [0.1, 0.2, 0.5, 0.8, 1.0] - } + {'clf__estimator' : [RidgeClassifier(**rs)] + , 'clf__estimator__alpha': [0.1, 0.2, 0.5, 0.8, 1.0] + } ] # Create pipeline @@ -63,7 +61,7 @@ gscv_rc_fit_be_res = gscv_rc_fit.cv_results_ print('Best model:\n', gscv_rc_fit_be_mod) print('Best models score:\n', gscv_rc_fit.best_score_, ':' , round(gscv_rc_fit.best_score_, 2)) -print('\nMean test score from fit results:', round(mean(gscv_rc_fit_be_re['mean_test_mcc']),2)) +print('\nMean test score from fit results:', round(mean(gscv_rc_fit_be_res['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(np.nanmean(gscv_rc_fit_be_res['mean_test_mcc']),2)) ###################################### @@ -101,17 +99,15 @@ rc_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) rc_bts_dict # Create a df from dict with all scores -pd.DataFrame.from_dict(rc_bts_dict, orient = 'index', columns = 'best_model') - rc_bts_df = pd.DataFrame.from_dict(rc_bts_dict,orient = 'index') -rc_bts_df.columns = ['Logistic_Regression'] +rc_bts_df.columns = ['Ridge Classifier'] print(rc_bts_df) # Create df with best model params model_params = pd.Series(['best_model_params', list(gscv_rc_fit_be_mod.items() )]) model_params_df = model_params.to_frame() model_params_df -model_params_df.columns = ['Logistic_Regression'] +model_params_df.columns = ['Ridge Classifier'] model_params_df.columns # Combine the df of scores and the best model params diff --git a/uq_ml_models/UQ_RF.py b/uq_ml_models/UQ_RF.py index 
36d9a50..7758f9a 100644 --- a/uq_ml_models/UQ_RF.py +++ b/uq_ml_models/UQ_RF.py @@ -71,7 +71,7 @@ gscv_rf_fit_be_res = gscv_rf_fit.cv_results_ print('Best model:\n', gscv_rf_fit_be_mod) print('Best models score:\n', gscv_rf_fit.best_score_, ':' , round(gscv_rf_fit.best_score_, 2)) -print('\nMean test score from fit results:', round(mean(gscv_rf_fit_be_re['mean_test_mcc']),2)) +print('\nMean test score from fit results:', round(mean(gscv_rf_fit_be_res['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(np.nanmean(gscv_rf_fit_be_res['mean_test_mcc']),2)) ###################################### @@ -109,8 +109,6 @@ rf_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) rf_bts_dict # Create a df from dict with all scores -pd.DataFrame.from_dict(rf_bts_dict, orient = 'index', columns = 'best_model') - rf_bts_df = pd.DataFrame.from_dict(rf_bts_dict,orient = 'index') rf_bts_df.columns = ['Logistic_Regression'] print(rf_bts_df) diff --git a/uq_ml_models/UQ_SVC.py b/uq_ml_models/UQ_SVC.py index edb15be..c430649 100644 --- a/uq_ml_models/UQ_SVC.py +++ b/uq_ml_models/UQ_SVC.py @@ -32,9 +32,10 @@ class ClfSwitcher(BaseEstimator): parameters = [ { - 'clf__estimator': [SVC(**rs - , **njobs)], - , 'clf__estimator__kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} + 'clf__estimator': [SVC(**rs)] + , 'clf__estimator__kernel': ['poly', 'rbf', 'sigmoid'] + #, 'clf__estimator__kernel': ['linear'] + , 'clf__estimator__C' : [50, 10, 1.0, 0.1, 0.01] , 'clf__estimator__gamma': ['scale', 'auto'] @@ -66,7 +67,7 @@ gscv_svc_fit_be_res = gscv_svc_fit.cv_results_ print('Best model:\n', gscv_svc_fit_be_mod) print('Best models score:\n', gscv_svc_fit.best_score_, ':' , round(gscv_svc_fit.best_score_, 2)) -print('\nMean test score from fit results:', round(mean(gscv_svc_fit_be_re['mean_test_mcc']),2)) +print('\nMean test score from fit results:', round(mean(gscv_svc_fit_be_res['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(np.nanmean(gscv_svc_fit_be_res['mean_test_mcc']),2)) ###################################### @@ -104,17 +105,15 @@ svc_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) svc_bts_dict # Create a df from dict with all scores -pd.DataFrame.from_dict(svc_bts_dict, orient = 'index', columns = 'best_model') - svc_bts_df = pd.DataFrame.from_dict(svc_bts_dict,orient = 'index') -svc_bts_df.columns = ['Logistic_Regression'] +svc_bts_df.columns = ['SVC'] print(svc_bts_df) # Create df with best model params model_params = pd.Series(['best_model_params', list(gscv_svc_fit_be_mod.items() )]) model_params_df = model_params.to_frame() model_params_df -model_params_df.columns = ['Logistic_Regression'] +model_params_df.columns = ['SVC'] model_params_df.columns # Combine the df of scores and the best model params diff --git a/uq_ml_models/UQ_XGB.py b/uq_ml_models/UQ_XGB.py index 6cfe705..65a5e8f 100644 --- a/uq_ml_models/UQ_XGB.py +++ b/uq_ml_models/UQ_XGB.py @@ -5,7 +5,18 @@ Created on Wed May 18 06:03:24 2022 @author: tanu """ -#%% RandomForest + hyperparam: BaseEstimator: ClfSwitcher() + +#%% +#https://www.datatechnotes.com/2019/07/classification-example-with.html +# XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, +# colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1, +# max_delta_step=0, max_depth=3, min_child_weight=1, missing=None, +# n_estimators=100, n_jobs=1, nthread=None, +# objective='multi:softprob', random_state=0, reg_alpha=0, +# reg_lambda=1, scale_pos_weight=1, seed=None, 
silent=None, +# subsample=1, verbosity=1) + +#%% XGBoost + hyperparam: BaseEstimator: ClfSwitcher() class ClfSwitcher(BaseEstimator): def __init__( self, @@ -32,12 +43,11 @@ class ClfSwitcher(BaseEstimator): parameters = [ { - 'clf__estimator': [XGBClassifier(**rs - , **njobs] + 'clf__estimator': [XGBClassifier(**rs , **njobs, verbose = 3)] , 'clf__estimator__learning_rate': [0.01, 0.05, 0.1, 0.2] , 'clf__estimator__max_depth': [4, 6, 8, 10, 12, 16, 20] - , 'clf__estimator__min_samples_leaf': [4, 8, 12, 16, 20] - , 'clf__estimator__max_features': ['auto', 'sqrt'] + #, 'clf__estimator__min_samples_leaf': [4, 8, 12, 16, 20] + #, 'clf__estimator__max_features': ['auto', 'sqrt'] } ] @@ -66,7 +76,7 @@ gscv_xgb_fit_be_res = gscv_xgb_fit.cv_results_ print('Best model:\n', gscv_xgb_fit_be_mod) print('Best models score:\n', gscv_xgb_fit.best_score_, ':' , round(gscv_xgb_fit.best_score_, 2)) -print('\nMean test score from fit results:', round(mean(gscv_xgb_fit_be_re['mean_test_mcc']),2)) +print('\nMean test score from fit results:', round(mean(gscv_xgb_fit_be_res['mean_test_mcc']),2)) print('\nMean test score from fit results:', round(np.nanmean(gscv_xgb_fit_be_res['mean_test_mcc']),2)) ###################################### @@ -104,17 +114,15 @@ xgb_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2) xgb_bts_dict # Create a df from dict with all scores -pd.DataFrame.from_dict(xgb_bts_dict, orient = 'index', columns = 'best_model') - xgb_bts_df = pd.DataFrame.from_dict(xgb_bts_dict,orient = 'index') -xgb_bts_df.columns = ['Logistic_Regression'] +xgb_bts_df.columns = ['XGBoost'] print(xgb_bts_df) # Create df with best model params model_params = pd.Series(['best_model_params', list(gscv_xgb_fit_be_mod.items() )]) model_params_df = model_params.to_frame() model_params_df -model_params_df.columns = ['Logistic_Regression'] +model_params_df.columns = ['XGBoost'] model_params_df.columns # Combine the df of scores and the best model params
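
###############################################################################
# For reference, a minimal self-contained sketch of the ClfSwitcher pattern
# these scripts share: one Pipeline whose final step GridSearchCV swaps between
# candidate estimators, scored and refit on MCC (cf. the Stack Overflow link
# added at the top of UQ_pnca_ML.py). The synthetic data, the two candidate
# estimators and their parameter values below are illustrative assumptions,
# not the project's actual splits or grids.

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

class ClfSwitcher(BaseEstimator):
    """Placeholder final step whose `estimator` param is set by GridSearchCV."""
    def __init__(self, estimator=LogisticRegression()):
        self.estimator = estimator
    def fit(self, X, y=None, **fit_params):
        self.estimator.fit(X, y)
        return self
    def predict(self, X):
        return self.estimator.predict(X)
    def score(self, X, y):
        return self.estimator.score(X, y)

# Toy data standing in for the real training split
X, y = make_classification(n_samples=200, n_features=10, random_state=42)

pipeline = Pipeline([('prep', MinMaxScaler()),
                     ('clf', ClfSwitcher())])

# One dict per candidate estimator, exactly as in the UQ_* scripts
parameters = [
    {'clf__estimator': [LogisticRegression(max_iter=500, random_state=42)],
     'clf__estimator__C': np.logspace(0, 4, 5)},
    {'clf__estimator': [RandomForestClassifier(random_state=42)],
     'clf__estimator__n_estimators': [100, 200]},
]

mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
skf_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

gscv = GridSearchCV(pipeline, parameters,
                    scoring=mcc_score_fn, refit='mcc',
                    cv=skf_cv, n_jobs=-1, verbose=1)
gscv.fit(X, y)
print('Best model:\n', gscv.best_params_)
print('Best CV MCC:', round(gscv.best_score_, 2))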
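
###############################################################################
# A hedged sketch of the idea flagged at the end of UQ_LR_FS.py ("including it
# as part of pipeline", per the tomasbeuzen.com post): wrap RFECV as a Pipeline
# step so feature selection is re-fitted inside every CV split of the grid
# search, rather than once on the full training set. The data, grid values and
# step names here are placeholders, not the project's actual settings.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# Toy data standing in for the numerical feature matrix
X, y = make_classification(n_samples=200, n_features=15, n_informative=5,
                           random_state=42)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

pipe = Pipeline([
    ('prep', MinMaxScaler()),
    # RFECV acts as a transformer: it selects features before the classifier
    ('fs', RFECV(estimator=LogisticRegression(max_iter=500, random_state=42),
                 cv=cv, scoring='matthews_corrcoef')),
    ('clf', LogisticRegression(max_iter=500, random_state=42)),
])

param_grid = {
    'fs__min_features_to_select': [3, 5],
    'clf__C': np.logspace(0, 2, 3),
}

gscv = GridSearchCV(pipe, param_grid,
                    scoring=make_scorer(matthews_corrcoef),
                    cv=cv, n_jobs=-1)
gscv.fit(X, y)
print('Best params:', gscv.best_params_)
print('Selected feature mask:', gscv.best_estimator_.named_steps['fs'].support_)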