added pratice and feature selection scripts for LR and hyperparam for all classification models as separate scripts in uq_ml_models
This commit is contained in:
parent
fa0f5e5b39
commit
8b0f69bbd9
17 changed files with 2604 additions and 0 deletions
299
UQ_LR_FS.py
Normal file
299
UQ_LR_FS.py
Normal file
|
@ -0,0 +1,299 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Mon May 16 05:59:12 2022
|
||||||
|
|
||||||
|
@author: tanu
|
||||||
|
"""
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Tue Mar 15 11:09:50 2022
|
||||||
|
|
||||||
|
@author: tanu
|
||||||
|
"""
|
||||||
|
#%% Import libs
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.model_selection import GridSearchCV
|
||||||
|
from sklearn import datasets
|
||||||
|
from sklearn.ensemble import ExtraTreesClassifier
|
||||||
|
from sklearn.ensemble import RandomForestClassifier
|
||||||
|
from sklearn.ensemble import AdaBoostClassifier
|
||||||
|
from sklearn.ensemble import GradientBoostingClassifier
|
||||||
|
from sklearn.svm import SVC
|
||||||
|
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.naive_bayes import MultinomialNB
|
||||||
|
from sklearn.linear_model import SGDClassifier
|
||||||
|
from sklearn.pipeline import Pipeline
|
||||||
|
from sklearn.model_selection import GridSearchCV
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
|
||||||
|
from xgboost import XGBClassifier
|
||||||
|
#####################
|
||||||
|
from sklearn.feature_selection import RFE
|
||||||
|
from sklearn.feature_selection import RFECV
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from sklearn.feature_selection import SelectFromModel
|
||||||
|
from sklearn.feature_selection import SequentialFeatureSelector
|
||||||
|
|
||||||
|
rs = {'random_state': 42}
|
||||||
|
njobs = {'n_jobs': 10}
|
||||||
|
#%%
|
||||||
|
|
||||||
|
y.to_frame().value_counts().plot(kind = 'bar')
|
||||||
|
blind_test_df['dst_mode'].to_frame().value_counts().plot(kind = 'bar')
|
||||||
|
|
||||||
|
scoring_fn = ({'accuracy' : make_scorer(accuracy_score)
|
||||||
|
, 'fscore' : make_scorer(f1_score)
|
||||||
|
, 'mcc' : make_scorer(matthews_corrcoef)
|
||||||
|
, 'precision' : make_scorer(precision_score)
|
||||||
|
, 'recall' : make_scorer(recall_score)
|
||||||
|
, 'roc_auc' : make_scorer(roc_auc_score)
|
||||||
|
, 'jaccard' : make_scorer(jaccard_score)
|
||||||
|
})
|
||||||
|
|
||||||
|
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
|
||||||
|
jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
|
||||||
|
|
||||||
|
#%% Logistic Regression + hyperparam + FS: BaseEstimator: ClfSwitcher()
|
||||||
|
model_lr = LogisticRegression(**rs)
|
||||||
|
model_rfecv = RFECV(estimator = model_lr
|
||||||
|
, cv = skf_cv
|
||||||
|
#, cv = 10
|
||||||
|
, scoring = 'matthews_corrcoef'
|
||||||
|
)
|
||||||
|
|
||||||
|
model_rfecv = SequentialFeatureSelector(estimator = model_lr
|
||||||
|
, n_features_to_select = 'auto'
|
||||||
|
, tol = None
|
||||||
|
# , cv = 10
|
||||||
|
, cv = skf_cv
|
||||||
|
# , direction ='backward'
|
||||||
|
, direction ='forward'
|
||||||
|
, **njobs)
|
||||||
|
|
||||||
|
# param_grid = [
|
||||||
|
# { 'C': np.logspace(0, 4, 10),
|
||||||
|
# 'penalty': ['l1', 'l2'],
|
||||||
|
# 'max_iter': [100],
|
||||||
|
# 'solver': ['saga']
|
||||||
|
# }#,
|
||||||
|
# # { 'C': [1],
|
||||||
|
# # 'penalty': ['l1'],
|
||||||
|
# # 'max_iter': [100],
|
||||||
|
# # 'solver': ['saga']
|
||||||
|
# # }
|
||||||
|
# ]
|
||||||
|
|
||||||
|
param_grid2 = [
|
||||||
|
{
|
||||||
|
#'clf__estimator': [LogisticRegression(**rs)],
|
||||||
|
#'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
|
||||||
|
'C': np.logspace(0, 4, 10),
|
||||||
|
'penalty': ['none', 'l1', 'l2', 'elasticnet'],
|
||||||
|
'max_iter': list(range(100,800,100)),
|
||||||
|
'solver': ['saga']
|
||||||
|
},
|
||||||
|
{
|
||||||
|
#'clf__estimator': [LogisticRegression(**rs)],
|
||||||
|
#'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
|
||||||
|
'C': np.logspace(0, 4, 10),
|
||||||
|
'penalty': ['l2', 'none'],
|
||||||
|
'max_iter': list(range(100,800,100)),
|
||||||
|
'solver': ['newton-cg', 'lbfgs', 'sag']
|
||||||
|
},
|
||||||
|
{
|
||||||
|
#'clf__estimator': [LogisticRegression(**rs)],
|
||||||
|
#'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
|
||||||
|
'C': np.logspace(0, 4, 10),
|
||||||
|
'penalty': ['l1', 'l2'],
|
||||||
|
'max_iter': list(range(100,800,100)),
|
||||||
|
'solver': ['liblinear']
|
||||||
|
}
|
||||||
|
|
||||||
|
]
|
||||||
|
|
||||||
|
#-------------------------------------------------------------------------------
|
||||||
|
# Grid search CV + FS
|
||||||
|
gscv_lr = GridSearchCV(model_lr
|
||||||
|
, param_grid2
|
||||||
|
, scoring = mcc_score_fn, refit = 'mcc'
|
||||||
|
, cv = skf_cv
|
||||||
|
, return_train_score = False
|
||||||
|
, verbose = 3
|
||||||
|
, **njobs)
|
||||||
|
|
||||||
|
#------------------------------------------------------------------------------
|
||||||
|
# Create pipeline
|
||||||
|
pipeline = Pipeline([('pre', MinMaxScaler())
|
||||||
|
#, ('feature_selection', sfs_selector)
|
||||||
|
, ('feature_selection', model_rfecv )
|
||||||
|
, ('clf', gscv_lr)])
|
||||||
|
|
||||||
|
# Fit
|
||||||
|
lr_fs = pipeline.fit(X,y)
|
||||||
|
|
||||||
|
pipeline.predict(X_bts)
|
||||||
|
lr_fs.predict(X_bts)
|
||||||
|
|
||||||
|
test_predict = pipeline.predict(X_bts)
|
||||||
|
print(test_predict)
|
||||||
|
print(np.array(y_bts))
|
||||||
|
#y_btsf = np.array(y_bts)
|
||||||
|
|
||||||
|
print(accuracy_score(y_bts, test_predict))
|
||||||
|
print(matthews_corrcoef(y_bts, test_predict))
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
#####################
|
||||||
|
# Feature selection: AFTER model selection
|
||||||
|
# https://towardsdatascience.com/5-feature-selection-method-from-scikit-learn-you-should-know-ed4d116e4172
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
######################################
|
||||||
|
# Blind test
|
||||||
|
######################################
|
||||||
|
# See how it does on the BLIND test
|
||||||
|
#print('\nBlind test score, mcc:', ))
|
||||||
|
|
||||||
|
#test_predict = gscv_lr_fit.predict(X_bts)
|
||||||
|
test_predict = pipeline.predict(X_bts)
|
||||||
|
test_predict_fs = sfs_selector.predict(X_bts)
|
||||||
|
|
||||||
|
print(test_predict)
|
||||||
|
|
||||||
|
print(accuracy_score(y_bts, test_predict))
|
||||||
|
print(matthews_corrcoef(y_bts, test_predict))
|
||||||
|
|
||||||
|
# create a dict with all scores
|
||||||
|
lr_bts_dict = {#'best_model': list(gscv_lr_fit_be_mod.items())
|
||||||
|
'bts_fscore':None
|
||||||
|
, 'bts_mcc':None
|
||||||
|
, 'bts_precision':None
|
||||||
|
, 'bts_recall':None
|
||||||
|
, 'bts_accuracy':None
|
||||||
|
, 'bts_roc_auc':None
|
||||||
|
, 'bts_jaccard':None }
|
||||||
|
lr_bts_dict
|
||||||
|
lr_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2)
|
||||||
|
lr_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2)
|
||||||
|
lr_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2)
|
||||||
|
lr_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2)
|
||||||
|
lr_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2)
|
||||||
|
lr_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2)
|
||||||
|
lr_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
|
||||||
|
lr_bts_dict
|
||||||
|
|
||||||
|
# Create a df from dict with all scores
|
||||||
|
lr_bts_df = pd.DataFrame.from_dict(lr_bts_dict,orient = 'index')
|
||||||
|
lr_bts_df.columns = ['Logistic_Regression']
|
||||||
|
print(lr_bts_df)
|
||||||
|
|
||||||
|
# d2 = {'best_model_params': lis(gscv_lr_fit_be_mod.items() )}
|
||||||
|
# d2
|
||||||
|
# def Merge(dict1, dict2):
|
||||||
|
# res = {**dict1, **dict2}
|
||||||
|
# return res
|
||||||
|
# d3 = Merge(d2, lr_bts_dict)
|
||||||
|
# d3
|
||||||
|
|
||||||
|
# Create df with best model params
|
||||||
|
model_params = pd.Series(['best_model_params', list(gscv_lr_fit_be_mod.items() )])
|
||||||
|
model_params_df = model_params.to_frame()
|
||||||
|
model_params_df
|
||||||
|
model_params_df.columns = ['Logistic_Regression']
|
||||||
|
model_params_df.columns
|
||||||
|
|
||||||
|
# Combine the df of scores and the best model params
|
||||||
|
lr_bts_df.columns
|
||||||
|
lr_output = pd.concat([model_params_df, lr_bts_df], axis = 0)
|
||||||
|
lr_output
|
||||||
|
|
||||||
|
# Format the combined df
|
||||||
|
# Drop the best_model_params row from lr_output
|
||||||
|
lr_df = lr_output.drop([0], axis = 0)
|
||||||
|
lr_df
|
||||||
|
|
||||||
|
#FIXME: tidy the index of the formatted df
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# FIXME: confusion matrix
|
||||||
|
|
||||||
|
print(confusion_matrix(y_bts, test_predict))
|
||||||
|
#%% Feature selection
|
||||||
|
|
||||||
|
#####################
|
||||||
|
# Feature selection: AFTER model selection?
|
||||||
|
# ADD that within the loop
|
||||||
|
# https://towardsdatascience.com/5-feature-selection-method-from-scikit-learn-you-should-know-ed4d116e4172
|
||||||
|
#####################
|
||||||
|
from sklearn.feature_selection import RFE
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from sklearn.feature_selection import SelectFromModel
|
||||||
|
from sklearn.feature_selection import SequentialFeatureSelector
|
||||||
|
|
||||||
|
# RFE: ~ model coef or feature_importance
|
||||||
|
rfe_selector = RFE(estimator = LogisticRegression(**rs
|
||||||
|
, penalty='l1'
|
||||||
|
, solver='saga'
|
||||||
|
, max_iter = 100
|
||||||
|
, C= 1.0)
|
||||||
|
, n_features_to_select = None # median by default
|
||||||
|
, step = 1)
|
||||||
|
rfe_selector.fit(X, y)
|
||||||
|
rfe_fs = X.columns[rfe_selector.get_support()]
|
||||||
|
print('\nFeatures selected from Recursive Feature Elimination:', len(rfe_fs)
|
||||||
|
, '\nThese are:', rfe_fs)
|
||||||
|
|
||||||
|
# SFM: ~ model coef or feature_importance
|
||||||
|
sfm_selector = SelectFromModel(estimator = LogisticRegression(**rs
|
||||||
|
, penalty='l1'
|
||||||
|
, solver='saga'
|
||||||
|
, max_iter = 100
|
||||||
|
, C= 1.0)
|
||||||
|
, threshold = "median"
|
||||||
|
, max_features = None ) # median by default
|
||||||
|
sfm_selector.fit(X, y)
|
||||||
|
sfm_fs = X.columns[sfm_selector.get_support()]
|
||||||
|
|
||||||
|
print('\nFeatures selected from Select From Model:', len(sfm_fs)
|
||||||
|
, '\nThese are:', sfm_fs)
|
||||||
|
|
||||||
|
# SFS:ML CV
|
||||||
|
sfs_selector = SequentialFeatureSelector(estimator = LogisticRegression(**rs
|
||||||
|
, penalty='l1'
|
||||||
|
, solver='saga'
|
||||||
|
, max_iter = 100
|
||||||
|
, C = 1.0)
|
||||||
|
, n_features_to_select = 'auto'
|
||||||
|
, tol = None
|
||||||
|
, cv = 10
|
||||||
|
#, cv = skf_cv
|
||||||
|
# , direction ='backward'
|
||||||
|
, direction ='forward'
|
||||||
|
|
||||||
|
, **njobs)
|
||||||
|
sfs_selector.fit(X, y)
|
||||||
|
sfsb_fs = X.columns[sfs_selector.get_support()]
|
||||||
|
|
||||||
|
print('\nFeatures selected from Sequential Feature Selector (Greedy):', len(sfsb_fs)
|
||||||
|
, '\nThese are:', sfsb_fs)
|
||||||
|
|
||||||
|
#Features selected from Sequential Feature Selector (Greedy, Backward): 7 [CV = SKF_CV]
|
||||||
|
#These are: Index(['ligand_distance', 'duet_stability_change', 'ddg_foldx', 'deepddg',
|
||||||
|
# 'contacts', 'rd_values', 'snap2_score']
|
||||||
|
|
||||||
|
#Features selected from Sequential Feature Selector (Greedy, Backward): 7 [CV=10]
|
||||||
|
#These are: Index(['ligand_distance', 'deepddg', 'contacts', 'rsa', 'kd_values',
|
||||||
|
# 'rd_values', 'maf']
|
||||||
|
|
||||||
|
#-----
|
||||||
|
# Features selected from Sequential Feature Selector (Greedy, Forward): 6 [CV = SKF_CV]
|
||||||
|
# These are: Index(['ligand_distance', 'ddg_dynamut2', 'rsa', 'kd_values', 'rd_values', 'maf']
|
||||||
|
|
||||||
|
# Features selected from Sequential Feature Selector (Greedy, Forward): 6 [CV = 10]
|
||||||
|
#These are: Index(['duet_stability_change', 'deepddg', 'ddg_dynamut2', 'rsa', 'kd_values', 'maf']
|
||||||
|
###############################################################################
|
211
UQ_LR_p1.py
Normal file
211
UQ_LR_p1.py
Normal file
|
@ -0,0 +1,211 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Mon May 16 05:59:12 2022
|
||||||
|
|
||||||
|
@author: tanu
|
||||||
|
"""
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Tue Mar 15 11:09:50 2022
|
||||||
|
|
||||||
|
@author: tanu
|
||||||
|
"""
|
||||||
|
#%% Import libs
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.model_selection import GridSearchCV
|
||||||
|
from sklearn import datasets
|
||||||
|
from sklearn.ensemble import ExtraTreesClassifier
|
||||||
|
from sklearn.ensemble import RandomForestClassifier
|
||||||
|
from sklearn.ensemble import AdaBoostClassifier
|
||||||
|
from sklearn.ensemble import GradientBoostingClassifier
|
||||||
|
from sklearn.svm import SVC
|
||||||
|
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.naive_bayes import MultinomialNB
|
||||||
|
from sklearn.linear_model import SGDClassifier
|
||||||
|
from sklearn.pipeline import Pipeline
|
||||||
|
from sklearn.model_selection import GridSearchCV
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
|
||||||
|
from xgboost import XGBClassifier
|
||||||
|
rs = {'random_state': 42}
|
||||||
|
njobs = {'n_jobs': 10}
|
||||||
|
#%% Get train-test split and scoring functions
|
||||||
|
# X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
|
||||||
|
# , num_df_wtgt['mutation_class']
|
||||||
|
# , test_size = 0.33
|
||||||
|
# , random_state = 2
|
||||||
|
# , shuffle = True
|
||||||
|
# , stratify = num_df_wtgt['mutation_class'])
|
||||||
|
|
||||||
|
y.to_frame().value_counts().plot(kind = 'bar')
|
||||||
|
blind_test_df['dst_mode'].to_frame().value_counts().plot(kind = 'bar')
|
||||||
|
|
||||||
|
scoring_fn = ({'accuracy' : make_scorer(accuracy_score)
|
||||||
|
, 'fscore' : make_scorer(f1_score)
|
||||||
|
, 'mcc' : make_scorer(matthews_corrcoef)
|
||||||
|
, 'precision' : make_scorer(precision_score)
|
||||||
|
, 'recall' : make_scorer(recall_score)
|
||||||
|
, 'roc_auc' : make_scorer(roc_auc_score)
|
||||||
|
, 'jaccard' : make_scorer(jaccard_score)
|
||||||
|
})
|
||||||
|
|
||||||
|
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
|
||||||
|
jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
|
||||||
|
|
||||||
|
#%% Logistic Regression + hyperparam: BaseEstimator: ClfSwitcher()
|
||||||
|
class ClfSwitcher(BaseEstimator):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
estimator = SGDClassifier(),
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
A Custom BaseEstimator that can switch between classifiers.
|
||||||
|
:param estimator: sklearn object - The classifier
|
||||||
|
"""
|
||||||
|
self.estimator = estimator
|
||||||
|
|
||||||
|
def fit(self, X, y=None, **kwargs):
|
||||||
|
self.estimator.fit(X, y)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def predict(self, X, y=None):
|
||||||
|
return self.estimator.predict(X)
|
||||||
|
|
||||||
|
def predict_proba(self, X):
|
||||||
|
return self.estimator.predict_proba(X)
|
||||||
|
|
||||||
|
def score(self, X, y):
|
||||||
|
return self.estimator.score(X, y)
|
||||||
|
|
||||||
|
parameters = [
|
||||||
|
{
|
||||||
|
'clf__estimator': [LogisticRegression(**rs)],
|
||||||
|
#'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
|
||||||
|
'clf__estimator__C': np.logspace(0, 4, 10),
|
||||||
|
'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'],
|
||||||
|
'clf__estimator__max_iter': list(range(100,800,100)),
|
||||||
|
'clf__estimator__solver': ['saga']
|
||||||
|
},
|
||||||
|
# {
|
||||||
|
# 'clf__estimator': [LogisticRegression(**rs)],
|
||||||
|
# #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
|
||||||
|
# 'clf__estimator__C': np.logspace(0, 4, 10),
|
||||||
|
# 'clf__estimator__penalty': ['l2', 'none'],
|
||||||
|
# 'clf__estimator__max_iter': list(range(100,800,100)),
|
||||||
|
# 'clf__estimator__solver': ['newton-cg', 'lbfgs', 'sag']
|
||||||
|
# },
|
||||||
|
# {
|
||||||
|
# 'clf__estimator': [LogisticRegression(**rs)],
|
||||||
|
# #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
|
||||||
|
# 'clf__estimator__C': np.logspace(0, 4, 10),
|
||||||
|
# 'clf__estimator__penalty': ['l1', 'l2'],
|
||||||
|
# 'clf__estimator__max_iter': list(range(100,800,100)),
|
||||||
|
# 'clf__estimator__solver': ['liblinear']
|
||||||
|
# }
|
||||||
|
|
||||||
|
]
|
||||||
|
|
||||||
|
# Create pipeline
|
||||||
|
pipeline = Pipeline([
|
||||||
|
('pre', MinMaxScaler()),
|
||||||
|
('clf', ClfSwitcher()),
|
||||||
|
])
|
||||||
|
|
||||||
|
# Grid search i.e hyperparameter tuning and refitting on mcc
|
||||||
|
gscv_lr = GridSearchCV(pipeline
|
||||||
|
, parameters
|
||||||
|
#, scoring = 'f1', refit = 'f1'
|
||||||
|
, scoring = mcc_score_fn, refit = 'mcc'
|
||||||
|
, cv = skf_cv
|
||||||
|
, **njobs
|
||||||
|
, return_train_score = False
|
||||||
|
, verbose = 3)
|
||||||
|
|
||||||
|
# Fit
|
||||||
|
gscv_lr_fit = gscv_lr.fit(X, y)
|
||||||
|
gscv_lr_fit_be_mod = gscv_lr_fit.best_params_
|
||||||
|
gscv_lr_fit_be_res = gscv_lr_fit.cv_results_
|
||||||
|
|
||||||
|
print('Best model:\n', gscv_lr_fit_be_mod)
|
||||||
|
print('Best models score:\n', gscv_lr_fit.best_score_, ':' , round(gscv_lr_fit.best_score_, 2))
|
||||||
|
|
||||||
|
#print('\nMean test score from fit results:', round(mean(gscv_lr_fit_be_res['mean_test_mcc']),2))
|
||||||
|
print('\nMean test score from fit results:', round(np.nanmean(gscv_lr_fit_be_res['mean_test_mcc']),2))
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
|
||||||
|
######################################
|
||||||
|
# Blind test
|
||||||
|
######################################
|
||||||
|
# See how it does on the BLIND test
|
||||||
|
#print('\nBlind test score, mcc:', ))
|
||||||
|
|
||||||
|
test_predict = gscv_lr_fit.predict(X_bts)
|
||||||
|
print(test_predict)
|
||||||
|
print(np.array(y_bts))
|
||||||
|
y_btsf = np.array(y_bts)
|
||||||
|
|
||||||
|
print(accuracy_score(y_bts, test_predict))
|
||||||
|
print(matthews_corrcoef(y_bts, test_predict))
|
||||||
|
|
||||||
|
# create a dict with all scores
|
||||||
|
lr_bts_dict = {#'best_model': list(gscv_lr_fit_be_mod.items())
|
||||||
|
'bts_fscore':None
|
||||||
|
, 'bts_mcc':None
|
||||||
|
, 'bts_precision':None
|
||||||
|
, 'bts_recall':None
|
||||||
|
, 'bts_accuracy':None
|
||||||
|
, 'bts_roc_auc':None
|
||||||
|
, 'bts_jaccard':None }
|
||||||
|
lr_bts_dict
|
||||||
|
lr_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2)
|
||||||
|
lr_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2)
|
||||||
|
lr_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2)
|
||||||
|
lr_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2)
|
||||||
|
lr_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2)
|
||||||
|
lr_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2)
|
||||||
|
lr_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
|
||||||
|
lr_bts_dict
|
||||||
|
|
||||||
|
# Create a df from dict with all scores
|
||||||
|
lr_bts_df = pd.DataFrame.from_dict(lr_bts_dict,orient = 'index')
|
||||||
|
lr_bts_df.columns = ['Logistic_Regression']
|
||||||
|
print(lr_bts_df)
|
||||||
|
|
||||||
|
# d2 = {'best_model_params': lis(gscv_lr_fit_be_mod.items() )}
|
||||||
|
# d2
|
||||||
|
# def Merge(dict1, dict2):
|
||||||
|
# res = {**dict1, **dict2}
|
||||||
|
# return res
|
||||||
|
# d3 = Merge(d2, lr_bts_dict)
|
||||||
|
# d3
|
||||||
|
|
||||||
|
# Create df with best model params
|
||||||
|
model_params = pd.Series(['best_model_params', list(gscv_lr_fit_be_mod.items() )])
|
||||||
|
model_params_df = model_params.to_frame()
|
||||||
|
model_params_df
|
||||||
|
model_params_df.columns = ['Logistic_Regression']
|
||||||
|
model_params_df.columns
|
||||||
|
|
||||||
|
# Combine the df of scores and the best model params
|
||||||
|
lr_bts_df.columns
|
||||||
|
lr_output = pd.concat([model_params_df, lr_bts_df], axis = 0)
|
||||||
|
lr_output
|
||||||
|
|
||||||
|
# Format the combined df
|
||||||
|
# Drop the best_model_params row from lr_output
|
||||||
|
lr_df = lr_output.drop([0], axis = 0)
|
||||||
|
lr_df
|
||||||
|
|
||||||
|
#FIXME: tidy the index of the formatted df
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# FIXME: confusion matrix
|
||||||
|
print(confusion_matrix(y_bts, test_predict))
|
||||||
|
|
||||||
|
cm = confusion_matrix(y_bts, test_predict)
|
133
uq_ml_models/UQ_ABC.py
Normal file
133
uq_ml_models/UQ_ABC.py
Normal file
|
@ -0,0 +1,133 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Wed May 18 06:03:24 2022
|
||||||
|
|
||||||
|
@author: tanu
|
||||||
|
"""
|
||||||
|
#%% RandomForest + hyperparam: BaseEstimator: ClfSwitcher()
|
||||||
|
class ClfSwitcher(BaseEstimator):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
estimator = SGDClassifier(),
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
A Custom BaseEstimator that can switch between classifiers.
|
||||||
|
:param estimator: sklearn object - The classifier
|
||||||
|
"""
|
||||||
|
self.estimator = estimator
|
||||||
|
|
||||||
|
def fit(self, X, y=None, **kwargs):
|
||||||
|
self.estimator.fit(X, y)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def predict(self, X, y=None):
|
||||||
|
return self.estimator.predict(X)
|
||||||
|
|
||||||
|
def predict_proba(self, X):
|
||||||
|
return self.estimator.predict_proba(X)
|
||||||
|
|
||||||
|
def score(self, X, y):
|
||||||
|
return self.estimator.score(X, y)
|
||||||
|
|
||||||
|
parameters = [
|
||||||
|
{
|
||||||
|
'clf__estimator': [AdaBoostClassifier(**rs)]
|
||||||
|
, 'clf__estimator__n_estimators': [none, 1, 2]
|
||||||
|
, 'clf__estimator__base_estiamtor' : ['None', 1*SVC(), 1*KNeighborsClassifier()]
|
||||||
|
#, 'clf__estimator___splitter' : ["best", "random"]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
# Create pipeline
|
||||||
|
pipeline = Pipeline([
|
||||||
|
('pre', MinMaxScaler()),
|
||||||
|
('clf', ClfSwitcher()),
|
||||||
|
])
|
||||||
|
|
||||||
|
# Grid search i.e hyperparameter tuning and refitting on mcc
|
||||||
|
gscv_abc = GridSearchCV(pipeline
|
||||||
|
, parameters
|
||||||
|
#, scoring = 'f1', refit = 'f1'
|
||||||
|
, scoring = mcc_score_fn, refit = 'mcc'
|
||||||
|
, cv = skf_cv
|
||||||
|
, **njobs
|
||||||
|
, return_train_score = False
|
||||||
|
, verbose = 3)
|
||||||
|
|
||||||
|
# Fit
|
||||||
|
gscv_abc_fit = gscv_abc.fit(X, y)
|
||||||
|
|
||||||
|
gscv_abc_fit_be_mod = gscv_abc_fit.best_params_
|
||||||
|
gscv_abc_fit_be_res = gscv_abc_fit.cv_results_
|
||||||
|
|
||||||
|
print('Best model:\n', gscv_abc_fit_be_mod)
|
||||||
|
print('Best models score:\n', gscv_abc_fit.best_score_, ':' , round(gscv_abc_fit.best_score_, 2))
|
||||||
|
|
||||||
|
print('\nMean test score from fit results:', round(mean(gscv_abc_fit_be_re['mean_test_mcc']),2))
|
||||||
|
print('\nMean test score from fit results:', round(np.nanmean(gscv_abc_fit_be_res['mean_test_mcc']),2))
|
||||||
|
|
||||||
|
######################################
|
||||||
|
# Blind test
|
||||||
|
######################################
|
||||||
|
|
||||||
|
# See how it does on the BLIND test
|
||||||
|
#print('\nBlind test score, mcc:', )
|
||||||
|
|
||||||
|
test_predict = gscv_abc_fit.predict(X_bts)
|
||||||
|
print(test_predict)
|
||||||
|
print(np.array(y_bts))
|
||||||
|
y_btsf = np.array(y_bts)
|
||||||
|
|
||||||
|
print(accuracy_score(y_btsf, test_predict))
|
||||||
|
print(matthews_corrcoef(y_btsf, test_predict))
|
||||||
|
|
||||||
|
# create a dict with all scores
|
||||||
|
abc_bts_dict = {#'best_model': list(gscv_abc_fit_be_mod.items())
|
||||||
|
'bts_fscore' : None
|
||||||
|
, 'bts_mcc' : None
|
||||||
|
, 'bts_precision': None
|
||||||
|
, 'bts_recall' : None
|
||||||
|
, 'bts_accuracy' : None
|
||||||
|
, 'bts_roc_auc' : None
|
||||||
|
, 'bts_jaccard' : None }
|
||||||
|
abc_bts_dict
|
||||||
|
abc_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2)
|
||||||
|
abc_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2)
|
||||||
|
abc_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2)
|
||||||
|
abc_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2)
|
||||||
|
abc_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2)
|
||||||
|
abc_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2)
|
||||||
|
abc_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
|
||||||
|
abc_bts_dict
|
||||||
|
|
||||||
|
# Create a df from dict with all scores
|
||||||
|
pd.DataFrame.from_dict(abc_bts_dict, orient = 'index', columns = 'best_model')
|
||||||
|
|
||||||
|
abc_bts_df = pd.DataFrame.from_dict(abc_bts_dict,orient = 'index')
|
||||||
|
abc_bts_df.columns = ['Logistic_Regression']
|
||||||
|
print(abc_bts_df)
|
||||||
|
|
||||||
|
# Create df with best model params
|
||||||
|
model_params = pd.Series(['best_model_params', list(gscv_abc_fit_be_mod.items() )])
|
||||||
|
model_params_df = model_params.to_frame()
|
||||||
|
model_params_df
|
||||||
|
model_params_df.columns = ['Logistic_Regression']
|
||||||
|
model_params_df.columns
|
||||||
|
|
||||||
|
# Combine the df of scores and the best model params
|
||||||
|
abc_bts_df.columns
|
||||||
|
abc_output = pd.concat([model_params_df, abc_bts_df], axis = 0)
|
||||||
|
abc_output
|
||||||
|
|
||||||
|
# Format the combined df
|
||||||
|
# Drop the best_model_params row from abc_output
|
||||||
|
abc_df = abc_output.drop([0], axis = 0)
|
||||||
|
abc_df
|
||||||
|
|
||||||
|
#FIXME: tidy the index of the formatted df
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
|
||||||
|
|
137
uq_ml_models/UQ_BC.py
Normal file
137
uq_ml_models/UQ_BC.py
Normal file
|
@ -0,0 +1,137 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Wed May 18 06:03:24 2022
|
||||||
|
|
||||||
|
@author: tanu
|
||||||
|
"""
|
||||||
|
#%% RandomForest + hyperparam: BaseEstimator: ClfSwitcher()
|
||||||
|
class ClfSwitcher(BaseEstimator):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
estimator = SGDClassifier(),
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
A Custom BaseEstimator that can switch between classifiers.
|
||||||
|
:param estimator: sklearn object - The classifier
|
||||||
|
"""
|
||||||
|
self.estimator = estimator
|
||||||
|
|
||||||
|
def fit(self, X, y=None, **kwargs):
|
||||||
|
self.estimator.fit(X, y)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def predict(self, X, y=None):
|
||||||
|
return self.estimator.predict(X)
|
||||||
|
|
||||||
|
def predict_proba(self, X):
|
||||||
|
return self.estimator.predict_proba(X)
|
||||||
|
|
||||||
|
def score(self, X, y):
|
||||||
|
return self.estimator.score(X, y)
|
||||||
|
|
||||||
|
parameters = [
|
||||||
|
{
|
||||||
|
'clf__estimator': [BaggingClassifier(**rs
|
||||||
|
, **njobs
|
||||||
|
, bootstrap = True
|
||||||
|
, oob_score = True)],
|
||||||
|
, 'clf__estimator__n_estimators' : [10, 100, 1000]
|
||||||
|
# If None, then the base estimator is a DecisionTreeClassifier.
|
||||||
|
, 'clf__estimator__base_estimator' : ['None', 'SVC()', 'KNeighborsClassifier()']# if none, DT is used
|
||||||
|
, 'clf__estimator__gamma': ['scale', 'auto']
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
# Create pipeline
|
||||||
|
pipeline = Pipeline([
|
||||||
|
('pre', MinMaxScaler()),
|
||||||
|
('clf', ClfSwitcher()),
|
||||||
|
])
|
||||||
|
|
||||||
|
# Grid search i.e hyperparameter tuning and refitting on mcc
|
||||||
|
gscv_bc = GridSearchCV(pipeline
|
||||||
|
, parameters
|
||||||
|
#, scoring = 'f1', refit = 'f1'
|
||||||
|
, scoring = mcc_score_fn, refit = 'mcc'
|
||||||
|
, cv = skf_cv
|
||||||
|
, **njobs
|
||||||
|
, return_train_score = False
|
||||||
|
, verbose = 3)
|
||||||
|
|
||||||
|
# Fit
|
||||||
|
gscv_bc_fit = gscv_bc.fit(X, y)
|
||||||
|
|
||||||
|
gscv_bc_fit_be_mod = gscv_bc_fit.best_params_
|
||||||
|
gscv_bc_fit_be_res = gscv_bc_fit.cv_results_
|
||||||
|
|
||||||
|
print('Best model:\n', gscv_bc_fit_be_mod)
|
||||||
|
print('Best models score:\n', gscv_bc_fit.best_score_, ':' , round(gscv_bc_fit.best_score_, 2))
|
||||||
|
|
||||||
|
print('\nMean test score from fit results:', round(mean(gscv_bc_fit_be_re['mean_test_mcc']),2))
|
||||||
|
print('\nMean test score from fit results:', round(np.nanmean(gscv_bc_fit_be_res['mean_test_mcc']),2))
|
||||||
|
|
||||||
|
######################################
|
||||||
|
# Blind test
|
||||||
|
######################################
|
||||||
|
|
||||||
|
# See how it does on the BLIND test
|
||||||
|
#print('\nBlind test score, mcc:', )
|
||||||
|
|
||||||
|
test_predict = gscv_bc_fit.predict(X_bts)
|
||||||
|
print(test_predict)
|
||||||
|
print(np.array(y_bts))
|
||||||
|
y_btsf = np.array(y_bts)
|
||||||
|
|
||||||
|
print(accuracy_score(y_btsf, test_predict))
|
||||||
|
print(matthews_corrcoef(y_btsf, test_predict))
|
||||||
|
|
||||||
|
# create a dict with all scores
|
||||||
|
bc_bts_dict = {#'best_model': list(gscv_bc_fit_be_mod.items())
|
||||||
|
'bts_fscore' : None
|
||||||
|
, 'bts_mcc' : None
|
||||||
|
, 'bts_precision': None
|
||||||
|
, 'bts_recall' : None
|
||||||
|
, 'bts_accuracy' : None
|
||||||
|
, 'bts_roc_auc' : None
|
||||||
|
, 'bts_jaccard' : None }
|
||||||
|
bc_bts_dict
|
||||||
|
bc_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2)
|
||||||
|
bc_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2)
|
||||||
|
bc_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2)
|
||||||
|
bc_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2)
|
||||||
|
bc_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2)
|
||||||
|
bc_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2)
|
||||||
|
bc_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
|
||||||
|
bc_bts_dict
|
||||||
|
|
||||||
|
# Create a df from dict with all scores
|
||||||
|
pd.DataFrame.from_dict(bc_bts_dict, orient = 'index', columns = 'best_model')
|
||||||
|
|
||||||
|
bc_bts_df = pd.DataFrame.from_dict(bc_bts_dict,orient = 'index')
|
||||||
|
bc_bts_df.columns = ['Logistic_Regression']
|
||||||
|
print(bc_bts_df)
|
||||||
|
|
||||||
|
# Create df with best model params
|
||||||
|
model_params = pd.Series(['best_model_params', list(gscv_bc_fit_be_mod.items() )])
|
||||||
|
model_params_df = model_params.to_frame()
|
||||||
|
model_params_df
|
||||||
|
model_params_df.columns = ['Logistic_Regression']
|
||||||
|
model_params_df.columns
|
||||||
|
|
||||||
|
# Combine the df of scores and the best model params
|
||||||
|
bc_bts_df.columns
|
||||||
|
bc_output = pd.concat([model_params_df, bc_bts_df], axis = 0)
|
||||||
|
bc_output
|
||||||
|
|
||||||
|
# Format the combined df
|
||||||
|
# Drop the best_model_params row from bc_output
|
||||||
|
bc_df = bc_output.drop([0], axis = 0)
|
||||||
|
bc_df
|
||||||
|
|
||||||
|
#FIXME: tidy the index of the formatted df
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
|
||||||
|
|
134
uq_ml_models/UQ_BNB.py
Normal file
134
uq_ml_models/UQ_BNB.py
Normal file
|
@ -0,0 +1,134 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Wed May 18 06:03:24 2022
|
||||||
|
|
||||||
|
@author: tanu
|
||||||
|
"""
|
||||||
|
#%% RandomForest + hyperparam: BaseEstimator: ClfSwitcher()
|
||||||
|
class ClfSwitcher(BaseEstimator):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
estimator = SGDClassifier(),
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
A Custom BaseEstimator that can switch between classifiers.
|
||||||
|
:param estimator: sklearn object - The classifier
|
||||||
|
"""
|
||||||
|
self.estimator = estimator
|
||||||
|
|
||||||
|
def fit(self, X, y=None, **kwargs):
|
||||||
|
self.estimator.fit(X, y)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def predict(self, X, y=None):
|
||||||
|
return self.estimator.predict(X)
|
||||||
|
|
||||||
|
def predict_proba(self, X):
|
||||||
|
return self.estimator.predict_proba(X)
|
||||||
|
|
||||||
|
def score(self, X, y):
|
||||||
|
return self.estimator.score(X, y)
|
||||||
|
|
||||||
|
parameters = [
|
||||||
|
{
|
||||||
|
'clf__estimator': [BernoulliNB()]
|
||||||
|
, 'clf__estimator__alpha': [0, 1]
|
||||||
|
, 'clf__estimator__binarize':['None', 0]
|
||||||
|
, 'clf__estimator__fit_prior': [True]
|
||||||
|
, 'clf__estimator__class_prior': ['None']
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
# Create pipeline
|
||||||
|
pipeline = Pipeline([
|
||||||
|
('pre', MinMaxScaler()),
|
||||||
|
('clf', ClfSwitcher()),
|
||||||
|
])
|
||||||
|
|
||||||
|
# Grid search i.e hyperparameter tuning and refitting on mcc
|
||||||
|
gscv_bnb = GridSearchCV(pipeline
|
||||||
|
, parameters
|
||||||
|
#, scoring = 'f1', refit = 'f1'
|
||||||
|
, scoring = mcc_score_fn, refit = 'mcc'
|
||||||
|
, cv = skf_cv
|
||||||
|
, **njobs
|
||||||
|
, return_train_score = False
|
||||||
|
, verbose = 3)
|
||||||
|
|
||||||
|
# Fit
|
||||||
|
gscv_bnb_fit = gscv_bnb.fit(X, y)
|
||||||
|
|
||||||
|
gscv_bnb_fit_be_mod = gscv_bnb_fit.best_params_
|
||||||
|
gscv_bnb_fit_be_res = gscv_bnb_fit.cv_results_
|
||||||
|
|
||||||
|
print('Best model:\n', gscv_bnb_fit_be_mod)
|
||||||
|
print('Best models score:\n', gscv_bnb_fit.best_score_, ':' , round(gscv_bnb_fit.best_score_, 2))
|
||||||
|
|
||||||
|
print('\nMean test score from fit results:', round(mean(gscv_bnb_fit_be_re['mean_test_mcc']),2))
|
||||||
|
print('\nMean test score from fit results:', round(np.nanmean(gscv_bnb_fit_be_res['mean_test_mcc']),2))
|
||||||
|
|
||||||
|
######################################
|
||||||
|
# Blind test
|
||||||
|
######################################
|
||||||
|
|
||||||
|
# See how it does on the BLIND test
|
||||||
|
#print('\nBlind test score, mcc:', )
|
||||||
|
|
||||||
|
test_predict = gscv_bnb_fit.predict(X_bts)
|
||||||
|
print(test_predict)
|
||||||
|
print(np.array(y_bts))
|
||||||
|
y_btsf = np.array(y_bts)
|
||||||
|
|
||||||
|
print(accuracy_score(y_btsf, test_predict))
|
||||||
|
print(matthews_corrcoef(y_btsf, test_predict))
|
||||||
|
|
||||||
|
# create a dict with all scores
|
||||||
|
bnb_bts_dict = {#'best_model': list(gscv_bnb_fit_be_mod.items())
|
||||||
|
'bts_fscore' : None
|
||||||
|
, 'bts_mcc' : None
|
||||||
|
, 'bts_precision': None
|
||||||
|
, 'bts_recall' : None
|
||||||
|
, 'bts_accuracy' : None
|
||||||
|
, 'bts_roc_auc' : None
|
||||||
|
, 'bts_jaccard' : None }
|
||||||
|
bnb_bts_dict
|
||||||
|
bnb_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2)
|
||||||
|
bnb_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2)
|
||||||
|
bnb_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2)
|
||||||
|
bnb_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2)
|
||||||
|
bnb_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2)
|
||||||
|
bnb_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2)
|
||||||
|
bnb_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
|
||||||
|
bnb_bts_dict
|
||||||
|
|
||||||
|
# Create a df from dict with all scores
|
||||||
|
pd.DataFrame.from_dict(bnb_bts_dict, orient = 'index', columns = 'best_model')
|
||||||
|
|
||||||
|
bnb_bts_df = pd.DataFrame.from_dict(bnb_bts_dict,orient = 'index')
|
||||||
|
bnb_bts_df.columns = ['Logistic_Regression']
|
||||||
|
print(bnb_bts_df)
|
||||||
|
|
||||||
|
# Create df with best model params
|
||||||
|
model_params = pd.Series(['best_model_params', list(gscv_bnb_fit_be_mod.items() )])
|
||||||
|
model_params_df = model_params.to_frame()
|
||||||
|
model_params_df
|
||||||
|
model_params_df.columns = ['Logistic_Regression']
|
||||||
|
model_params_df.columns
|
||||||
|
|
||||||
|
# Combine the df of scores and the best model params
|
||||||
|
bnb_bts_df.columns
|
||||||
|
bnb_output = pd.concat([model_params_df, bnb_bts_df], axis = 0)
|
||||||
|
bnb_output
|
||||||
|
|
||||||
|
# Format the combined df
|
||||||
|
# Drop the best_model_params row from bnb_output
|
||||||
|
bnb_df = bnb_output.drop([0], axis = 0)
|
||||||
|
bnb_df
|
||||||
|
|
||||||
|
#FIXME: tidy the index of the formatted df
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
|
||||||
|
|
137
uq_ml_models/UQ_DT.py
Normal file
137
uq_ml_models/UQ_DT.py
Normal file
|
@ -0,0 +1,137 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Wed May 18 06:03:24 2022
|
||||||
|
|
||||||
|
@author: tanu
|
||||||
|
"""
|
||||||
|
#%% RandomForest + hyperparam: BaseEstimator: ClfSwitcher()
|
||||||
|
class ClfSwitcher(BaseEstimator):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
estimator = SGDClassifier(),
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
A Custom BaseEstimator that can switch between classifiers.
|
||||||
|
:param estimator: sklearn object - The classifier
|
||||||
|
"""
|
||||||
|
self.estimator = estimator
|
||||||
|
|
||||||
|
def fit(self, X, y=None, **kwargs):
|
||||||
|
self.estimator.fit(X, y)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def predict(self, X, y=None):
|
||||||
|
return self.estimator.predict(X)
|
||||||
|
|
||||||
|
def predict_proba(self, X):
|
||||||
|
return self.estimator.predict_proba(X)
|
||||||
|
|
||||||
|
def score(self, X, y):
|
||||||
|
return self.estimator.score(X, y)
|
||||||
|
|
||||||
|
parameters = [
|
||||||
|
{
|
||||||
|
'clf__estimator': [DecisionTreeClassifier(**rs
|
||||||
|
, **njobs)]
|
||||||
|
, 'clf__estimator__max_depth': [None, 2, 4, 6, 8, 10, 12, 16, 20]
|
||||||
|
, 'clf__estimator__class_weight':['balanced','balanced_subsample']
|
||||||
|
, 'clf__estimator__criterion': ['gini', 'entropy', 'log_loss']
|
||||||
|
, 'clf__estimator__max_features': [None, 'sqrt', 'log2']
|
||||||
|
, 'clf__estimator__min_samples_leaf': [1, 2, 3, 4, 5, 10]
|
||||||
|
, 'clf__estimator__min_samples_split': [2, 5, 15, 20]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
# Create pipeline
|
||||||
|
pipeline = Pipeline([
|
||||||
|
('pre', MinMaxScaler()),
|
||||||
|
('clf', ClfSwitcher()),
|
||||||
|
])
|
||||||
|
|
||||||
|
# Grid search i.e hyperparameter tuning and refitting on mcc
|
||||||
|
gscv_dt = GridSearchCV(pipeline
|
||||||
|
, parameters
|
||||||
|
#, scoring = 'f1', refit = 'f1'
|
||||||
|
, scoring = mcc_score_fn, refit = 'mcc'
|
||||||
|
, cv = skf_cv
|
||||||
|
, **njobs
|
||||||
|
, return_train_score = False
|
||||||
|
, verbose = 3)
|
||||||
|
|
||||||
|
# Fit
|
||||||
|
gscv_dt_fit = gscv_dt.fit(X, y)
|
||||||
|
|
||||||
|
gscv_dt_fit_be_mod = gscv_dt_fit.best_params_
|
||||||
|
gscv_dt_fit_be_res = gscv_dt_fit.cv_results_
|
||||||
|
|
||||||
|
print('Best model:\n', gscv_dt_fit_be_mod)
|
||||||
|
print('Best models score:\n', gscv_dt_fit.best_score_, ':' , round(gscv_dt_fit.best_score_, 2))
|
||||||
|
|
||||||
|
print('\nMean test score from fit results:', round(mean(gscv_dt_fit_be_re['mean_test_mcc']),2))
|
||||||
|
print('\nMean test score from fit results:', round(np.nanmean(gscv_dt_fit_be_res['mean_test_mcc']),2))
|
||||||
|
|
||||||
|
######################################
|
||||||
|
# Blind test
|
||||||
|
######################################
|
||||||
|
|
||||||
|
# See how it does on the BLIND test
|
||||||
|
#print('\nBlind test score, mcc:', )
|
||||||
|
|
||||||
|
test_predict = gscv_dt_fit.predict(X_bts)
|
||||||
|
print(test_predict)
|
||||||
|
print(np.array(y_bts))
|
||||||
|
y_btsf = np.array(y_bts)
|
||||||
|
|
||||||
|
print(accuracy_score(y_btsf, test_predict))
|
||||||
|
print(matthews_corrcoef(y_btsf, test_predict))
|
||||||
|
|
||||||
|
# create a dict with all scores
|
||||||
|
dt_bts_dict = {#'best_model': list(gscv_dt_fit_be_mod.items())
|
||||||
|
'bts_fscore' : None
|
||||||
|
, 'bts_mcc' : None
|
||||||
|
, 'bts_precision': None
|
||||||
|
, 'bts_recall' : None
|
||||||
|
, 'bts_accuracy' : None
|
||||||
|
, 'bts_roc_auc' : None
|
||||||
|
, 'bts_jaccard' : None }
|
||||||
|
dt_bts_dict
|
||||||
|
dt_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2)
|
||||||
|
dt_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2)
|
||||||
|
dt_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2)
|
||||||
|
dt_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2)
|
||||||
|
dt_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2)
|
||||||
|
dt_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2)
|
||||||
|
dt_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
|
||||||
|
dt_bts_dict
|
||||||
|
|
||||||
|
# Create a df from dict with all scores
|
||||||
|
pd.DataFrame.from_dict(dt_bts_dict, orient = 'index', columns = 'best_model')
|
||||||
|
|
||||||
|
dt_bts_df = pd.DataFrame.from_dict(dt_bts_dict,orient = 'index')
|
||||||
|
dt_bts_df.columns = ['Logistic_Regression']
|
||||||
|
print(dt_bts_df)
|
||||||
|
|
||||||
|
# Create df with best model params
|
||||||
|
model_params = pd.Series(['best_model_params', list(gscv_dt_fit_be_mod.items() )])
|
||||||
|
model_params_df = model_params.to_frame()
|
||||||
|
model_params_df
|
||||||
|
model_params_df.columns = ['Logistic_Regression']
|
||||||
|
model_params_df.columns
|
||||||
|
|
||||||
|
# Combine the df of scores and the best model params
|
||||||
|
dt_bts_df.columns
|
||||||
|
dt_output = pd.concat([model_params_df, dt_bts_df], axis = 0)
|
||||||
|
dt_output
|
||||||
|
|
||||||
|
# Format the combined df
|
||||||
|
# Drop the best_model_params row from dt_output
|
||||||
|
dt_df = dt_output.drop([0], axis = 0)
|
||||||
|
dt_df
|
||||||
|
|
||||||
|
#FIXME: tidy the index of the formatted df
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
|
||||||
|
|
136
uq_ml_models/UQ_GBC.py
Normal file
136
uq_ml_models/UQ_GBC.py
Normal file
|
@ -0,0 +1,136 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Wed May 18 06:03:24 2022
|
||||||
|
|
||||||
|
@author: tanu
|
||||||
|
"""
|
||||||
|
#%% RandomForest + hyperparam: BaseEstimator: ClfSwitcher()
|
||||||
|
class ClfSwitcher(BaseEstimator):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
estimator = SGDClassifier(),
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
A Custom BaseEstimator that can switch between classifiers.
|
||||||
|
:param estimator: sklearn object - The classifier
|
||||||
|
"""
|
||||||
|
self.estimator = estimator
|
||||||
|
|
||||||
|
def fit(self, X, y=None, **kwargs):
|
||||||
|
self.estimator.fit(X, y)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def predict(self, X, y=None):
|
||||||
|
return self.estimator.predict(X)
|
||||||
|
|
||||||
|
def predict_proba(self, X):
|
||||||
|
return self.estimator.predict_proba(X)
|
||||||
|
|
||||||
|
def score(self, X, y):
|
||||||
|
return self.estimator.score(X, y)
|
||||||
|
|
||||||
|
parameters = [
|
||||||
|
{
|
||||||
|
'clf__estimator': [GradientBoostingClassifier(**rs)]
|
||||||
|
, 'clf__estimator__n_estimators' : [10, 100, 200, 500, 1000]
|
||||||
|
, 'clf__estimator__n_estimators' : [10, 100, 1000]
|
||||||
|
, 'clf__estimator__learning_rate': [0.001, 0.01, 0.1]
|
||||||
|
, 'clf__estimator__subsample' : [0.5, 0.7, 1.0]
|
||||||
|
, 'clf__estimator__max_depth' : [3, 7, 9]
|
||||||
|
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
# Create pipeline
|
||||||
|
pipeline = Pipeline([
|
||||||
|
('pre', MinMaxScaler()),
|
||||||
|
('clf', ClfSwitcher()),
|
||||||
|
])
|
||||||
|
|
||||||
|
# Grid search i.e hyperparameter tuning and refitting on mcc
|
||||||
|
gscv_gbc = GridSearchCV(pipeline
|
||||||
|
, parameters
|
||||||
|
#, scoring = 'f1', refit = 'f1'
|
||||||
|
, scoring = mcc_score_fn, refit = 'mcc'
|
||||||
|
, cv = skf_cv
|
||||||
|
, **njobs
|
||||||
|
, return_train_score = False
|
||||||
|
, verbose = 3)
|
||||||
|
|
||||||
|
# Fit
|
||||||
|
gscv_gbc_fit = gscv_gbc.fit(X, y)
|
||||||
|
|
||||||
|
gscv_gbc_fit_be_mod = gscv_gbc_fit.best_params_
|
||||||
|
gscv_gbc_fit_be_res = gscv_gbc_fit.cv_results_
|
||||||
|
|
||||||
|
print('Best model:\n', gscv_gbc_fit_be_mod)
|
||||||
|
print('Best models score:\n', gscv_gbc_fit.best_score_, ':' , round(gscv_gbc_fit.best_score_, 2))
|
||||||
|
|
||||||
|
print('\nMean test score from fit results:', round(mean(gscv_gbc_fit_be_re['mean_test_mcc']),2))
|
||||||
|
print('\nMean test score from fit results:', round(np.nanmean(gscv_gbc_fit_be_res['mean_test_mcc']),2))
|
||||||
|
|
||||||
|
######################################
|
||||||
|
# Blind test
|
||||||
|
######################################
|
||||||
|
|
||||||
|
# See how it does on the BLIND test
|
||||||
|
#print('\nBlind test score, mcc:', )
|
||||||
|
|
||||||
|
test_predict = gscv_gbc_fit.predict(X_bts)
|
||||||
|
print(test_predict)
|
||||||
|
print(np.array(y_bts))
|
||||||
|
y_btsf = np.array(y_bts)
|
||||||
|
|
||||||
|
print(accuracy_score(y_btsf, test_predict))
|
||||||
|
print(matthews_corrcoef(y_btsf, test_predict))
|
||||||
|
|
||||||
|
# create a dict with all scores
|
||||||
|
gbc_bts_dict = {#'best_model': list(gscv_gbc_fit_be_mod.items())
|
||||||
|
'bts_fscore' : None
|
||||||
|
, 'bts_mcc' : None
|
||||||
|
, 'bts_precision': None
|
||||||
|
, 'bts_recall' : None
|
||||||
|
, 'bts_accuracy' : None
|
||||||
|
, 'bts_roc_auc' : None
|
||||||
|
, 'bts_jaccard' : None }
|
||||||
|
gbc_bts_dict
|
||||||
|
gbc_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2)
|
||||||
|
gbc_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2)
|
||||||
|
gbc_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2)
|
||||||
|
gbc_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2)
|
||||||
|
gbc_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2)
|
||||||
|
gbc_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2)
|
||||||
|
gbc_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
|
||||||
|
gbc_bts_dict
|
||||||
|
|
||||||
|
# Create a df from dict with all scores
|
||||||
|
pd.DataFrame.from_dict(gbc_bts_dict, orient = 'index', columns = 'best_model')
|
||||||
|
|
||||||
|
gbc_bts_df = pd.DataFrame.from_dict(gbc_bts_dict,orient = 'index')
|
||||||
|
gbc_bts_df.columns = ['Logistic_Regression']
|
||||||
|
print(gbc_bts_df)
|
||||||
|
|
||||||
|
# Create df with best model params
|
||||||
|
model_params = pd.Series(['best_model_params', list(gscv_gbc_fit_be_mod.items() )])
|
||||||
|
model_params_df = model_params.to_frame()
|
||||||
|
model_params_df
|
||||||
|
model_params_df.columns = ['Logistic_Regression']
|
||||||
|
model_params_df.columns
|
||||||
|
|
||||||
|
# Combine the df of scores and the best model params
|
||||||
|
gbc_bts_df.columns
|
||||||
|
gbc_output = pd.concat([model_params_df, gbc_bts_df], axis = 0)
|
||||||
|
gbc_output
|
||||||
|
|
||||||
|
# Format the combined df
|
||||||
|
# Drop the best_model_params row from gbc_output
|
||||||
|
gbc_df = gbc_output.drop([0], axis = 0)
|
||||||
|
gbc_df
|
||||||
|
|
||||||
|
#FIXME: tidy the index of the formatted df
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
|
||||||
|
|
132
uq_ml_models/UQ_GNB.py
Normal file
132
uq_ml_models/UQ_GNB.py
Normal file
|
@ -0,0 +1,132 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Wed May 18 06:03:24 2022
|
||||||
|
|
||||||
|
@author: tanu
|
||||||
|
"""
|
||||||
|
#%% RandomForest + hyperparam: BaseEstimator: ClfSwitcher()
|
||||||
|
class ClfSwitcher(BaseEstimator):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
estimator = SGDClassifier(),
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
A Custom BaseEstimator that can switch between classifiers.
|
||||||
|
:param estimator: sklearn object - The classifier
|
||||||
|
"""
|
||||||
|
self.estimator = estimator
|
||||||
|
|
||||||
|
def fit(self, X, y=None, **kwargs):
|
||||||
|
self.estimator.fit(X, y)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def predict(self, X, y=None):
|
||||||
|
return self.estimator.predict(X)
|
||||||
|
|
||||||
|
def predict_proba(self, X):
|
||||||
|
return self.estimator.predict_proba(X)
|
||||||
|
|
||||||
|
def score(self, X, y):
|
||||||
|
return self.estimator.score(X, y)
|
||||||
|
|
||||||
|
parameters = [
|
||||||
|
{
|
||||||
|
'clf__estimator': [GaussianNB(**rs)]
|
||||||
|
, 'clf__estimator__priors': [None]
|
||||||
|
, 'clf__estimator__var_smoothing': np.logspace(0,-9, num=100)
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
# Create pipeline
|
||||||
|
pipeline = Pipeline([
|
||||||
|
('pre', MinMaxScaler()),
|
||||||
|
('clf', ClfSwitcher()),
|
||||||
|
])
|
||||||
|
|
||||||
|
# Grid search i.e hyperparameter tuning and refitting on mcc
|
||||||
|
gscv_gnb = GridSearchCV(pipeline
|
||||||
|
, parameters
|
||||||
|
#, scoring = 'f1', refit = 'f1'
|
||||||
|
, scoring = mcc_score_fn, refit = 'mcc'
|
||||||
|
, cv = skf_cv
|
||||||
|
, **njobs
|
||||||
|
, return_train_score = False
|
||||||
|
, verbose = 3)
|
||||||
|
|
||||||
|
# Fit
|
||||||
|
gscv_gnb_fit = gscv_gnb.fit(X, y)
|
||||||
|
|
||||||
|
gscv_gnb_fit_be_mod = gscv_gnb_fit.best_params_
|
||||||
|
gscv_gnb_fit_be_res = gscv_gnb_fit.cv_results_
|
||||||
|
|
||||||
|
print('Best model:\n', gscv_gnb_fit_be_mod)
|
||||||
|
print('Best models score:\n', gscv_gnb_fit.best_score_, ':' , round(gscv_gnb_fit.best_score_, 2))
|
||||||
|
|
||||||
|
print('\nMean test score from fit results:', round(mean(gscv_gnb_fit_be_re['mean_test_mcc']),2))
|
||||||
|
print('\nMean test score from fit results:', round(np.nanmean(gscv_gnb_fit_be_res['mean_test_mcc']),2))
|
||||||
|
|
||||||
|
######################################
|
||||||
|
# Blind test
|
||||||
|
######################################
|
||||||
|
|
||||||
|
# See how it does on the BLIND test
|
||||||
|
#print('\nBlind test score, mcc:', )
|
||||||
|
|
||||||
|
test_predict = gscv_gnb_fit.predict(X_bts)
|
||||||
|
print(test_predict)
|
||||||
|
print(np.array(y_bts))
|
||||||
|
y_btsf = np.array(y_bts)
|
||||||
|
|
||||||
|
print(accuracy_score(y_btsf, test_predict))
|
||||||
|
print(matthews_corrcoef(y_btsf, test_predict))
|
||||||
|
|
||||||
|
# create a dict with all scores
|
||||||
|
gnb_bts_dict = {#'best_model': list(gscv_gnb_fit_be_mod.items())
|
||||||
|
'bts_fscore' : None
|
||||||
|
, 'bts_mcc' : None
|
||||||
|
, 'bts_precision': None
|
||||||
|
, 'bts_recall' : None
|
||||||
|
, 'bts_accuracy' : None
|
||||||
|
, 'bts_roc_auc' : None
|
||||||
|
, 'bts_jaccard' : None }
|
||||||
|
gnb_bts_dict
|
||||||
|
gnb_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2)
|
||||||
|
gnb_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2)
|
||||||
|
gnb_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2)
|
||||||
|
gnb_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2)
|
||||||
|
gnb_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2)
|
||||||
|
gnb_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2)
|
||||||
|
gnb_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
|
||||||
|
gnb_bts_dict
|
||||||
|
|
||||||
|
# Create a df from dict with all scores
|
||||||
|
pd.DataFrame.from_dict(gnb_bts_dict, orient = 'index', columns = 'best_model')
|
||||||
|
|
||||||
|
gnb_bts_df = pd.DataFrame.from_dict(gnb_bts_dict,orient = 'index')
|
||||||
|
gnb_bts_df.columns = ['Logistic_Regression']
|
||||||
|
print(gnb_bts_df)
|
||||||
|
|
||||||
|
# Create df with best model params
|
||||||
|
model_params = pd.Series(['best_model_params', list(gscv_gnb_fit_be_mod.items() )])
|
||||||
|
model_params_df = model_params.to_frame()
|
||||||
|
model_params_df
|
||||||
|
model_params_df.columns = ['Logistic_Regression']
|
||||||
|
model_params_df.columns
|
||||||
|
|
||||||
|
# Combine the df of scores and the best model params
|
||||||
|
gnb_bts_df.columns
|
||||||
|
gnb_output = pd.concat([model_params_df, gnb_bts_df], axis = 0)
|
||||||
|
gnb_output
|
||||||
|
|
||||||
|
# Format the combined df
|
||||||
|
# Drop the best_model_params row from gnb_output
|
||||||
|
gnb_df = gnb_output.drop([0], axis = 0)
|
||||||
|
gnb_df
|
||||||
|
|
||||||
|
#FIXME: tidy the index of the formatted df
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
|
||||||
|
|
132
uq_ml_models/UQ_GPC.py
Normal file
@@ -0,0 +1,132 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 18 06:03:24 2022

@author: tanu
"""
#%% GaussianProcessClassifier + hyperparam: BaseEstimator: ClfSwitcher()
# Model-specific imports; X, y, X_bts, y_bts, skf_cv, rs, njobs, mcc_score_fn and the
# sklearn pipeline/metric imports are assumed to come from the project's shared setup.
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel

class ClfSwitcher(BaseEstimator):
    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)

parameters = [
    {
        'clf__estimator': [GaussianProcessClassifier(**rs)]
        , 'clf__estimator__kernel': [1*RBF(), 1*DotProduct(), 1*Matern(), 1*RationalQuadratic(), 1*WhiteKernel()]
    }
]

# Create pipeline
pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

# Grid search i.e. hyperparameter tuning and refitting on mcc
gscv_gpc = GridSearchCV(pipeline
                        , parameters
                        #, scoring = 'f1', refit = 'f1'
                        , scoring = mcc_score_fn, refit = 'mcc'
                        , cv = skf_cv
                        , **njobs
                        , return_train_score = False
                        , verbose = 3)

# Fit
gscv_gpc_fit = gscv_gpc.fit(X, y)

gscv_gpc_fit_be_mod = gscv_gpc_fit.best_params_
gscv_gpc_fit_be_res = gscv_gpc_fit.cv_results_

print('Best model:\n', gscv_gpc_fit_be_mod)
print('Best model score:\n', gscv_gpc_fit.best_score_, ':', round(gscv_gpc_fit.best_score_, 2))

#print('\nMean test score from fit results:', round(mean(gscv_gpc_fit_be_res['mean_test_mcc']),2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_gpc_fit_be_res['mean_test_mcc']),2))

######################################
# Blind test
######################################

# See how it does on the BLIND test
#print('\nBlind test score, mcc:', )

test_predict = gscv_gpc_fit.predict(X_bts)
print(test_predict)
print(np.array(y_bts))
y_btsf = np.array(y_bts)

print(accuracy_score(y_btsf, test_predict))
print(matthews_corrcoef(y_btsf, test_predict))

# create a dict with all scores
gpc_bts_dict = {#'best_model': list(gscv_gpc_fit_be_mod.items())
                'bts_fscore'     : None
                , 'bts_mcc'      : None
                , 'bts_precision': None
                , 'bts_recall'   : None
                , 'bts_accuracy' : None
                , 'bts_roc_auc'  : None
                , 'bts_jaccard'  : None }
gpc_bts_dict
gpc_bts_dict['bts_fscore']    = round(f1_score(y_bts, test_predict),2)
gpc_bts_dict['bts_mcc']       = round(matthews_corrcoef(y_bts, test_predict),2)
gpc_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2)
gpc_bts_dict['bts_recall']    = round(recall_score(y_bts, test_predict),2)
gpc_bts_dict['bts_accuracy']  = round(accuracy_score(y_bts, test_predict),2)
gpc_bts_dict['bts_roc_auc']   = round(roc_auc_score(y_bts, test_predict),2)
gpc_bts_dict['bts_jaccard']   = round(jaccard_score(y_bts, test_predict),2)
gpc_bts_dict

# Create a df from dict with all scores
pd.DataFrame.from_dict(gpc_bts_dict, orient = 'index', columns = ['best_model'])

gpc_bts_df = pd.DataFrame.from_dict(gpc_bts_dict, orient = 'index')
gpc_bts_df.columns = ['Gaussian_Process']
print(gpc_bts_df)

# Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_gpc_fit_be_mod.items() )])
model_params_df = model_params.to_frame()
model_params_df
model_params_df.columns = ['Gaussian_Process']
model_params_df.columns

# Combine the df of scores and the best model params
gpc_bts_df.columns
gpc_output = pd.concat([model_params_df, gpc_bts_df], axis = 0)
gpc_output

# Format the combined df
# Drop the best_model_params row from gpc_output
gpc_df = gpc_output.drop([0], axis = 0)
gpc_df

#FIXME: tidy the index of the formatted df

###############################################################################
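The per-model scripts in this commit all reference X, y, X_bts, y_bts, skf_cv and the scorer dictionaries without defining them, so they appear to assume a shared setup session. A minimal sketch of that assumed setup follows; the variable names match the scripts, but the fold count and the way the blind-test split is produced are illustrative assumptions, not values taken from the repository:

# Sketch only: shared objects the UQ_*.py scripts appear to assume (illustrative values)
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (make_scorer, accuracy_score, f1_score, matthews_corrcoef
                             , precision_score, recall_score, roc_auc_score, jaccard_score)

rs           = {'random_state': 42}
njobs        = {'n_jobs': 10}
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
skf_cv       = StratifiedKFold(n_splits = 10, shuffle = True, **rs)

# X, y         : training features and labels from the project's data script
# X_bts, y_bts : the held-out blind test set (e.g. blind_test_df features and 'dst_mode')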
136
uq_ml_models/UQ_KNN.py
Normal file
@@ -0,0 +1,136 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 18 06:03:24 2022

@author: tanu
"""
#%% KNeighborsClassifier + hyperparam: BaseEstimator: ClfSwitcher()
# Model-specific import; X, y, X_bts, y_bts, skf_cv, rs, njobs and mcc_score_fn
# are assumed to come from the project's shared setup.
from sklearn.neighbors import KNeighborsClassifier

class ClfSwitcher(BaseEstimator):
    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)

parameters = [
    {
        'clf__estimator': [KNeighborsClassifier(**njobs)] # KNeighborsClassifier has no random_state parameter
        #, 'clf__estimator__n_neighbors': range(1, 21, 2)
        , 'clf__estimator__n_neighbors': [5, 7, 11]
        , 'clf__estimator__metric' : ['euclidean', 'manhattan', 'minkowski']
        , 'clf__estimator__weights' : ['uniform', 'distance']
    }
]

# Create pipeline
pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

# Grid search i.e. hyperparameter tuning and refitting on mcc
gscv_knn = GridSearchCV(pipeline
                        , parameters
                        #, scoring = 'f1', refit = 'f1'
                        , scoring = mcc_score_fn, refit = 'mcc'
                        , cv = skf_cv
                        , **njobs
                        , return_train_score = False
                        , verbose = 3)

# Fit
gscv_knn_fit = gscv_knn.fit(X, y)

gscv_knn_fit_be_mod = gscv_knn_fit.best_params_
gscv_knn_fit_be_res = gscv_knn_fit.cv_results_

print('Best model:\n', gscv_knn_fit_be_mod)
print('Best model score:\n', gscv_knn_fit.best_score_, ':', round(gscv_knn_fit.best_score_, 2))

#print('\nMean test score from fit results:', round(mean(gscv_knn_fit_be_res['mean_test_mcc']),2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_knn_fit_be_res['mean_test_mcc']),2))

######################################
# Blind test
######################################

# See how it does on the BLIND test
#print('\nBlind test score, mcc:', )

test_predict = gscv_knn_fit.predict(X_bts)
print(test_predict)
print(np.array(y_bts))
y_btsf = np.array(y_bts)

print(accuracy_score(y_btsf, test_predict))
print(matthews_corrcoef(y_btsf, test_predict))

# create a dict with all scores
knn_bts_dict = {#'best_model': list(gscv_knn_fit_be_mod.items())
                'bts_fscore'     : None
                , 'bts_mcc'      : None
                , 'bts_precision': None
                , 'bts_recall'   : None
                , 'bts_accuracy' : None
                , 'bts_roc_auc'  : None
                , 'bts_jaccard'  : None }
knn_bts_dict
knn_bts_dict['bts_fscore']    = round(f1_score(y_bts, test_predict),2)
knn_bts_dict['bts_mcc']       = round(matthews_corrcoef(y_bts, test_predict),2)
knn_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2)
knn_bts_dict['bts_recall']    = round(recall_score(y_bts, test_predict),2)
knn_bts_dict['bts_accuracy']  = round(accuracy_score(y_bts, test_predict),2)
knn_bts_dict['bts_roc_auc']   = round(roc_auc_score(y_bts, test_predict),2)
knn_bts_dict['bts_jaccard']   = round(jaccard_score(y_bts, test_predict),2)
knn_bts_dict

# Create a df from dict with all scores
pd.DataFrame.from_dict(knn_bts_dict, orient = 'index', columns = ['best_model'])

knn_bts_df = pd.DataFrame.from_dict(knn_bts_dict, orient = 'index')
knn_bts_df.columns = ['KNN']
print(knn_bts_df)

# Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_knn_fit_be_mod.items() )])
model_params_df = model_params.to_frame()
model_params_df
model_params_df.columns = ['KNN']
model_params_df.columns

# Combine the df of scores and the best model params
knn_bts_df.columns
knn_output = pd.concat([model_params_df, knn_bts_df], axis = 0)
knn_output

# Format the combined df
# Drop the best_model_params row from knn_output
knn_df = knn_output.drop([0], axis = 0)
knn_df

#FIXME: tidy the index of the formatted df

###############################################################################
207
uq_ml_models/UQ_LR.py
Normal file
@@ -0,0 +1,207 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon May 16 05:59:12 2022

@author: tanu
"""
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 11:09:50 2022

@author: tanu
"""
#%% Import libs
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

from sklearn.base import BaseEstimator
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from xgboost import XGBClassifier
# Needed for the scorers below (missing in the original file)
from sklearn.metrics import (make_scorer, accuracy_score, f1_score, matthews_corrcoef
                             , precision_score, recall_score, roc_auc_score, jaccard_score)

rs = {'random_state': 42}
njobs = {'n_jobs': 10}
#%% Get train-test split and scoring functions
# X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
#                                                      , num_df_wtgt['mutation_class']
#                                                      , test_size = 0.33
#                                                      , random_state = 2
#                                                      , shuffle = True
#                                                      , stratify = num_df_wtgt['mutation_class'])

# NOTE (assumed): X, y, X_bts, y_bts, blind_test_df and skf_cv are expected to
# come from the project's shared data/setup script.
y.to_frame().value_counts().plot(kind = 'bar')
blind_test_df['dst_mode'].to_frame().value_counts().plot(kind = 'bar')

scoring_fn = ({'accuracy'    : make_scorer(accuracy_score)
               , 'fscore'    : make_scorer(f1_score)
               , 'mcc'       : make_scorer(matthews_corrcoef)
               , 'precision' : make_scorer(precision_score)
               , 'recall'    : make_scorer(recall_score)
               , 'roc_auc'   : make_scorer(roc_auc_score)
               , 'jaccard'   : make_scorer(jaccard_score)
               })

mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
jacc_score_fn = {'jcc': make_scorer(jaccard_score)}

#%% Logistic Regression + hyperparam: BaseEstimator: ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)

parameters = [
    {
        'clf__estimator': [LogisticRegression(**rs)],
        #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'clf__estimator__C': np.logspace(0, 4, 10),
        'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'],
        'clf__estimator__l1_ratio': [0.5], # required when penalty is 'elasticnet'; 0.5 is an assumed placeholder
        'clf__estimator__max_iter': list(range(100,800,100)),
        'clf__estimator__solver': ['saga']
    },
    {
        'clf__estimator': [LogisticRegression(**rs)],
        #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'clf__estimator__C': np.logspace(0, 4, 10),
        'clf__estimator__penalty': ['l2', 'none'],
        'clf__estimator__max_iter': list(range(100,800,100)),
        'clf__estimator__solver': ['newton-cg', 'lbfgs', 'sag']
    },
    {
        'clf__estimator': [LogisticRegression(**rs)],
        #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'clf__estimator__C': np.logspace(0, 4, 10),
        'clf__estimator__penalty': ['l1', 'l2'],
        'clf__estimator__max_iter': list(range(100,800,100)),
        'clf__estimator__solver': ['liblinear']
    }
]

# Create pipeline
pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

# Grid search i.e. hyperparameter tuning and refitting on mcc
gscv_lr = GridSearchCV(pipeline
                       , parameters
                       #, scoring = 'f1', refit = 'f1'
                       , scoring = mcc_score_fn, refit = 'mcc'
                       , cv = skf_cv
                       , **njobs
                       , return_train_score = False
                       , verbose = 3)

# Fit
gscv_lr_fit = gscv_lr.fit(X, y)
gscv_lr_fit_be_mod = gscv_lr_fit.best_params_
gscv_lr_fit_be_res = gscv_lr_fit.cv_results_

print('Best model:\n', gscv_lr_fit_be_mod)
print('Best model score:\n', gscv_lr_fit.best_score_, ':', round(gscv_lr_fit.best_score_, 2))

#print('\nMean test score from fit results:', round(mean(gscv_lr_fit_be_res['mean_test_mcc']),2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_lr_fit_be_res['mean_test_mcc']),2))

######################################
# Blind test
######################################
# See how it does on the BLIND test
#print('\nBlind test score, mcc:', )

test_predict = gscv_lr_fit.predict(X_bts)
print(test_predict)
print(np.array(y_bts))
y_btsf = np.array(y_bts)

print(accuracy_score(y_bts, test_predict))
print(matthews_corrcoef(y_bts, test_predict))

# create a dict with all scores
lr_bts_dict = {#'best_model': list(gscv_lr_fit_be_mod.items())
               'bts_fscore'      : None
               , 'bts_mcc'       : None
               , 'bts_precision' : None
               , 'bts_recall'    : None
               , 'bts_accuracy'  : None
               , 'bts_roc_auc'   : None
               , 'bts_jaccard'   : None }
lr_bts_dict
lr_bts_dict['bts_fscore']    = round(f1_score(y_bts, test_predict),2)
lr_bts_dict['bts_mcc']       = round(matthews_corrcoef(y_bts, test_predict),2)
lr_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2)
lr_bts_dict['bts_recall']    = round(recall_score(y_bts, test_predict),2)
lr_bts_dict['bts_accuracy']  = round(accuracy_score(y_bts, test_predict),2)
lr_bts_dict['bts_roc_auc']   = round(roc_auc_score(y_bts, test_predict),2)
lr_bts_dict['bts_jaccard']   = round(jaccard_score(y_bts, test_predict),2)
lr_bts_dict

# Create a df from dict with all scores
pd.DataFrame.from_dict(lr_bts_dict, orient = 'index', columns = ['best_model'])

lr_bts_df = pd.DataFrame.from_dict(lr_bts_dict, orient = 'index')
lr_bts_df.columns = ['Logistic_Regression']
print(lr_bts_df)

# d2 = {'best_model_params': list(gscv_lr_fit_be_mod.items() )}
# d2
# def Merge(dict1, dict2):
#     res = {**dict1, **dict2}
#     return res
# d3 = Merge(d2, lr_bts_dict)
# d3

# Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_lr_fit_be_mod.items() )])
model_params_df = model_params.to_frame()
model_params_df
model_params_df.columns = ['Logistic_Regression']
model_params_df.columns

# Combine the df of scores and the best model params
lr_bts_df.columns
lr_output = pd.concat([model_params_df, lr_bts_df], axis = 0)
lr_output

# Format the combined df
# Drop the best_model_params row from lr_output
lr_df = lr_output.drop([0], axis = 0)
lr_df

#FIXME: tidy the index of the formatted df

###############################################################################
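Once the grid search above has refit on MCC, the winning pipeline can be reused directly; a small usage sketch, assuming the fit in UQ_LR.py completed and using the same names:

# Sketch only: inspect and reuse the refit pipeline from the grid search above
best_pipe = gscv_lr_fit.best_estimator_                 # MinMaxScaler + ClfSwitcher
best_lr   = best_pipe.named_steps['clf'].estimator      # the winning LogisticRegression
print(best_lr.get_params())
bts_probs = best_pipe.predict_proba(X_bts)[:, 1]        # scaling + prediction in one step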
137
uq_ml_models/UQ_MLP.py
Normal file
@@ -0,0 +1,137 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 18 06:03:24 2022

@author: tanu
"""
#%% MLPClassifier + hyperparam: BaseEstimator: ClfSwitcher()
# Model-specific import; X, y, X_bts, y_bts, skf_cv, rs, njobs and mcc_score_fn
# are assumed to come from the project's shared setup.
from sklearn.neural_network import MLPClassifier

class ClfSwitcher(BaseEstimator):
    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)

parameters = [
    {
        'clf__estimator': [MLPClassifier(**rs, max_iter = 500)] # MLPClassifier has no n_jobs parameter
        , 'clf__estimator__hidden_layer_sizes': [(1,), (2,), (3,)]
        # NOTE: max_features, min_samples_leaf and min_samples_split are tree/forest
        # parameters and do not exist on MLPClassifier, so they are not searched here.
    }
]

# Create pipeline
pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

# Grid search i.e. hyperparameter tuning and refitting on mcc
gscv_mlp = GridSearchCV(pipeline
                        , parameters
                        #, scoring = 'f1', refit = 'f1'
                        , scoring = mcc_score_fn, refit = 'mcc'
                        , cv = skf_cv
                        , **njobs
                        , return_train_score = False
                        , verbose = 3)

# Fit
gscv_mlp_fit = gscv_mlp.fit(X, y)

gscv_mlp_fit_be_mod = gscv_mlp_fit.best_params_
gscv_mlp_fit_be_res = gscv_mlp_fit.cv_results_

print('Best model:\n', gscv_mlp_fit_be_mod)
print('Best model score:\n', gscv_mlp_fit.best_score_, ':', round(gscv_mlp_fit.best_score_, 2))

#print('\nMean test score from fit results:', round(mean(gscv_mlp_fit_be_res['mean_test_mcc']),2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_mlp_fit_be_res['mean_test_mcc']),2))

######################################
# Blind test
######################################

# See how it does on the BLIND test
#print('\nBlind test score, mcc:', )

test_predict = gscv_mlp_fit.predict(X_bts)
print(test_predict)
print(np.array(y_bts))
y_btsf = np.array(y_bts)

print(accuracy_score(y_btsf, test_predict))
print(matthews_corrcoef(y_btsf, test_predict))

# create a dict with all scores
mlp_bts_dict = {#'best_model': list(gscv_mlp_fit_be_mod.items())
                'bts_fscore'     : None
                , 'bts_mcc'      : None
                , 'bts_precision': None
                , 'bts_recall'   : None
                , 'bts_accuracy' : None
                , 'bts_roc_auc'  : None
                , 'bts_jaccard'  : None }
mlp_bts_dict
mlp_bts_dict['bts_fscore']    = round(f1_score(y_bts, test_predict),2)
mlp_bts_dict['bts_mcc']       = round(matthews_corrcoef(y_bts, test_predict),2)
mlp_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2)
mlp_bts_dict['bts_recall']    = round(recall_score(y_bts, test_predict),2)
mlp_bts_dict['bts_accuracy']  = round(accuracy_score(y_bts, test_predict),2)
mlp_bts_dict['bts_roc_auc']   = round(roc_auc_score(y_bts, test_predict),2)
mlp_bts_dict['bts_jaccard']   = round(jaccard_score(y_bts, test_predict),2)
mlp_bts_dict

# Create a df from dict with all scores
pd.DataFrame.from_dict(mlp_bts_dict, orient = 'index', columns = ['best_model'])

mlp_bts_df = pd.DataFrame.from_dict(mlp_bts_dict, orient = 'index')
mlp_bts_df.columns = ['MLP']
print(mlp_bts_df)

# Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_mlp_fit_be_mod.items() )])
model_params_df = model_params.to_frame()
model_params_df
model_params_df.columns = ['MLP']
model_params_df.columns

# Combine the df of scores and the best model params
mlp_bts_df.columns
mlp_output = pd.concat([model_params_df, mlp_bts_df], axis = 0)
mlp_output

# Format the combined df
# Drop the best_model_params row from mlp_output
mlp_df = mlp_output.drop([0], axis = 0)
mlp_df

#FIXME: tidy the index of the formatted df

###############################################################################
131
uq_ml_models/UQ_QDA.py
Normal file
@@ -0,0 +1,131 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 18 06:03:24 2022

@author: tanu
"""
#%% QuadraticDiscriminantAnalysis + hyperparam: BaseEstimator: ClfSwitcher()
# Model-specific import; X, y, X_bts, y_bts, skf_cv, rs, njobs and mcc_score_fn
# are assumed to come from the project's shared setup.
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

class ClfSwitcher(BaseEstimator):
    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)

parameters = [
    {
        'clf__estimator': [QuadraticDiscriminantAnalysis()]
    }
]

# Create pipeline
pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

# Grid search i.e. hyperparameter tuning and refitting on mcc
gscv_qda = GridSearchCV(pipeline
                        , parameters
                        #, scoring = 'f1', refit = 'f1'
                        , scoring = mcc_score_fn, refit = 'mcc'
                        , cv = skf_cv
                        , **njobs
                        , return_train_score = False
                        , verbose = 3)

# Fit
gscv_qda_fit = gscv_qda.fit(X, y)

gscv_qda_fit_be_mod = gscv_qda_fit.best_params_
gscv_qda_fit_be_res = gscv_qda_fit.cv_results_

print('Best model:\n', gscv_qda_fit_be_mod)
print('Best model score:\n', gscv_qda_fit.best_score_, ':', round(gscv_qda_fit.best_score_, 2))

#print('\nMean test score from fit results:', round(mean(gscv_qda_fit_be_res['mean_test_mcc']),2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_qda_fit_be_res['mean_test_mcc']),2))

######################################
# Blind test
######################################

# See how it does on the BLIND test
#print('\nBlind test score, mcc:', )

test_predict = gscv_qda_fit.predict(X_bts)
print(test_predict)
print(np.array(y_bts))
y_btsf = np.array(y_bts)

print(accuracy_score(y_btsf, test_predict))
print(matthews_corrcoef(y_btsf, test_predict))

# create a dict with all scores
qda_bts_dict = {#'best_model': list(gscv_qda_fit_be_mod.items())
                'bts_fscore'     : None
                , 'bts_mcc'      : None
                , 'bts_precision': None
                , 'bts_recall'   : None
                , 'bts_accuracy' : None
                , 'bts_roc_auc'  : None
                , 'bts_jaccard'  : None }
qda_bts_dict
qda_bts_dict['bts_fscore']    = round(f1_score(y_bts, test_predict),2)
qda_bts_dict['bts_mcc']       = round(matthews_corrcoef(y_bts, test_predict),2)
qda_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2)
qda_bts_dict['bts_recall']    = round(recall_score(y_bts, test_predict),2)
qda_bts_dict['bts_accuracy']  = round(accuracy_score(y_bts, test_predict),2)
qda_bts_dict['bts_roc_auc']   = round(roc_auc_score(y_bts, test_predict),2)
qda_bts_dict['bts_jaccard']   = round(jaccard_score(y_bts, test_predict),2)
qda_bts_dict

# Create a df from dict with all scores
pd.DataFrame.from_dict(qda_bts_dict, orient = 'index', columns = ['best_model'])

qda_bts_df = pd.DataFrame.from_dict(qda_bts_dict, orient = 'index')
qda_bts_df.columns = ['QDA']
print(qda_bts_df)

# Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_qda_fit_be_mod.items() )])
model_params_df = model_params.to_frame()
model_params_df
model_params_df.columns = ['QDA']
model_params_df.columns

# Combine the df of scores and the best model params
qda_bts_df.columns
qda_output = pd.concat([model_params_df, qda_bts_df], axis = 0)
qda_output

# Format the combined df
# Drop the best_model_params row from qda_output
qda_df = qda_output.drop([0], axis = 0)
qda_df

#FIXME: tidy the index of the formatted df

###############################################################################
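The QDA grid above searches nothing beyond the estimator itself. If a genuine search is wanted, QuadraticDiscriminantAnalysis does expose a regularisation parameter; a minimal sketch, with the candidate values being illustrative assumptions:

# Sketch only: an optional regularisation search for QDA (values are illustrative)
parameters = [
    {
        'clf__estimator': [QuadraticDiscriminantAnalysis()]
        , 'clf__estimator__reg_param': [0.0, 0.1, 0.5, 0.9]
    }
]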
132
uq_ml_models/UQ_RC.py
Normal file
@@ -0,0 +1,132 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 18 06:03:24 2022

@author: tanu
"""
#%% RidgeClassifier + hyperparam: BaseEstimator: ClfSwitcher()
# Model-specific import; X, y, X_bts, y_bts, skf_cv, rs, njobs and mcc_score_fn
# are assumed to come from the project's shared setup.
from sklearn.linear_model import RidgeClassifier

class ClfSwitcher(BaseEstimator):
    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)

parameters = [
    {
        'clf__estimator': [RidgeClassifier(**rs)] # RidgeClassifier has no n_jobs parameter
        , 'clf__estimator__alpha': [0.1, 0.2, 0.5, 0.8, 1.0]
    }
]

# Create pipeline
pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

# Grid search i.e. hyperparameter tuning and refitting on mcc
gscv_rc = GridSearchCV(pipeline
                       , parameters
                       #, scoring = 'f1', refit = 'f1'
                       , scoring = mcc_score_fn, refit = 'mcc'
                       , cv = skf_cv
                       , **njobs
                       , return_train_score = False
                       , verbose = 3)

# Fit
gscv_rc_fit = gscv_rc.fit(X, y)

gscv_rc_fit_be_mod = gscv_rc_fit.best_params_
gscv_rc_fit_be_res = gscv_rc_fit.cv_results_

print('Best model:\n', gscv_rc_fit_be_mod)
print('Best model score:\n', gscv_rc_fit.best_score_, ':', round(gscv_rc_fit.best_score_, 2))

#print('\nMean test score from fit results:', round(mean(gscv_rc_fit_be_res['mean_test_mcc']),2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_rc_fit_be_res['mean_test_mcc']),2))

######################################
# Blind test
######################################

# See how it does on the BLIND test
#print('\nBlind test score, mcc:', )

test_predict = gscv_rc_fit.predict(X_bts)
print(test_predict)
print(np.array(y_bts))
y_btsf = np.array(y_bts)

print(accuracy_score(y_btsf, test_predict))
print(matthews_corrcoef(y_btsf, test_predict))

# create a dict with all scores
rc_bts_dict = {#'best_model': list(gscv_rc_fit_be_mod.items())
               'bts_fscore'     : None
               , 'bts_mcc'      : None
               , 'bts_precision': None
               , 'bts_recall'   : None
               , 'bts_accuracy' : None
               , 'bts_roc_auc'  : None
               , 'bts_jaccard'  : None }
rc_bts_dict
rc_bts_dict['bts_fscore']    = round(f1_score(y_bts, test_predict),2)
rc_bts_dict['bts_mcc']       = round(matthews_corrcoef(y_bts, test_predict),2)
rc_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2)
rc_bts_dict['bts_recall']    = round(recall_score(y_bts, test_predict),2)
rc_bts_dict['bts_accuracy']  = round(accuracy_score(y_bts, test_predict),2)
rc_bts_dict['bts_roc_auc']   = round(roc_auc_score(y_bts, test_predict),2)
rc_bts_dict['bts_jaccard']   = round(jaccard_score(y_bts, test_predict),2)
rc_bts_dict

# Create a df from dict with all scores
pd.DataFrame.from_dict(rc_bts_dict, orient = 'index', columns = ['best_model'])

rc_bts_df = pd.DataFrame.from_dict(rc_bts_dict, orient = 'index')
rc_bts_df.columns = ['Ridge_Classifier']
print(rc_bts_df)

# Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_rc_fit_be_mod.items() )])
model_params_df = model_params.to_frame()
model_params_df
model_params_df.columns = ['Ridge_Classifier']
model_params_df.columns

# Combine the df of scores and the best model params
rc_bts_df.columns
rc_output = pd.concat([model_params_df, rc_bts_df], axis = 0)
rc_output

# Format the combined df
# Drop the best_model_params row from rc_output
rc_df = rc_output.drop([0], axis = 0)
rc_df

#FIXME: tidy the index of the formatted df

###############################################################################
140
uq_ml_models/UQ_RF.py
Normal file
@@ -0,0 +1,140 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 18 06:03:24 2022

@author: tanu
"""
#%% RandomForest + hyperparam: BaseEstimator: ClfSwitcher()
# Model-specific import; X, y, X_bts, y_bts, skf_cv, rs, njobs and mcc_score_fn
# are assumed to come from the project's shared setup.
from sklearn.ensemble import RandomForestClassifier

class ClfSwitcher(BaseEstimator):
    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)

parameters = [
    {
        'clf__estimator': [RandomForestClassifier(**rs
                                                  , **njobs
                                                  , bootstrap = True
                                                  , oob_score = True)],
        'clf__estimator__max_depth': [4, 6, 8, 10, 12, 16, 20, None]
        , 'clf__estimator__class_weight': ['balanced', 'balanced_subsample']
        , 'clf__estimator__n_estimators': [10, 25, 50, 100]
        , 'clf__estimator__criterion': ['gini', 'entropy', 'log_loss']
        , 'clf__estimator__max_features': ['sqrt', 'log2', None] # default is sqrt
        , 'clf__estimator__min_samples_leaf': [1, 2, 3, 4, 5, 10]
        , 'clf__estimator__min_samples_split': [2, 5, 15, 20]
    }
]

# Create pipeline
pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

# Grid search i.e. hyperparameter tuning and refitting on mcc
gscv_rf = GridSearchCV(pipeline
                       , parameters
                       #, scoring = 'f1', refit = 'f1'
                       , scoring = mcc_score_fn, refit = 'mcc'
                       , cv = skf_cv
                       , **njobs
                       , return_train_score = False
                       , verbose = 3)

# Fit
gscv_rf_fit = gscv_rf.fit(X, y)

gscv_rf_fit_be_mod = gscv_rf_fit.best_params_
gscv_rf_fit_be_res = gscv_rf_fit.cv_results_

print('Best model:\n', gscv_rf_fit_be_mod)
print('Best model score:\n', gscv_rf_fit.best_score_, ':', round(gscv_rf_fit.best_score_, 2))

#print('\nMean test score from fit results:', round(mean(gscv_rf_fit_be_res['mean_test_mcc']),2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_rf_fit_be_res['mean_test_mcc']),2))

######################################
# Blind test
######################################

# See how it does on the BLIND test
#print('\nBlind test score, mcc:', )

test_predict = gscv_rf_fit.predict(X_bts)
print(test_predict)
print(np.array(y_bts))
y_btsf = np.array(y_bts)

print(accuracy_score(y_btsf, test_predict))
print(matthews_corrcoef(y_btsf, test_predict))

# create a dict with all scores
rf_bts_dict = {#'best_model': list(gscv_rf_fit_be_mod.items())
               'bts_fscore'     : None
               , 'bts_mcc'      : None
               , 'bts_precision': None
               , 'bts_recall'   : None
               , 'bts_accuracy' : None
               , 'bts_roc_auc'  : None
               , 'bts_jaccard'  : None }
rf_bts_dict
rf_bts_dict['bts_fscore']    = round(f1_score(y_bts, test_predict),2)
rf_bts_dict['bts_mcc']       = round(matthews_corrcoef(y_bts, test_predict),2)
rf_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2)
rf_bts_dict['bts_recall']    = round(recall_score(y_bts, test_predict),2)
rf_bts_dict['bts_accuracy']  = round(accuracy_score(y_bts, test_predict),2)
rf_bts_dict['bts_roc_auc']   = round(roc_auc_score(y_bts, test_predict),2)
rf_bts_dict['bts_jaccard']   = round(jaccard_score(y_bts, test_predict),2)
rf_bts_dict

# Create a df from dict with all scores
pd.DataFrame.from_dict(rf_bts_dict, orient = 'index', columns = ['best_model'])

rf_bts_df = pd.DataFrame.from_dict(rf_bts_dict, orient = 'index')
rf_bts_df.columns = ['Random_Forest']
print(rf_bts_df)

# Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_rf_fit_be_mod.items() )])
model_params_df = model_params.to_frame()
model_params_df
model_params_df.columns = ['Random_Forest']
model_params_df.columns

# Combine the df of scores and the best model params
rf_bts_df.columns
rf_output = pd.concat([model_params_df, rf_bts_df], axis = 0)
rf_output

# Format the combined df
# Drop the best_model_params row from rf_output
rf_df = rf_output.drop([0], axis = 0)
rf_df

#FIXME: tidy the index of the formatted df

###############################################################################
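Because the random forest grid above fixes oob_score = True, the out-of-bag estimate of the winning forest is also available after the refit; a small sketch, assuming gscv_rf_fit from the UQ_RF.py script above:

# Sketch only: read the out-of-bag score off the best forest found above
best_rf = gscv_rf_fit.best_estimator_.named_steps['clf'].estimator
print('OOB score of best forest:', round(best_rf.oob_score_, 2))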
135
uq_ml_models/UQ_SVC.py
Normal file
@@ -0,0 +1,135 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 18 06:03:24 2022

@author: tanu
"""
#%% SVC + hyperparam: BaseEstimator: ClfSwitcher()
# Model-specific import; X, y, X_bts, y_bts, skf_cv, rs, njobs and mcc_score_fn
# are assumed to come from the project's shared setup.
from sklearn.svm import SVC

class ClfSwitcher(BaseEstimator):
    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)

parameters = [
    {
        'clf__estimator': [SVC(**rs)] # SVC has no n_jobs parameter
        , 'clf__estimator__kernel': ['linear', 'poly', 'rbf', 'sigmoid'] # 'precomputed' needs a precomputed kernel matrix, so it is left out
        , 'clf__estimator__C' : [50, 10, 1.0, 0.1, 0.01]
        , 'clf__estimator__gamma': ['scale', 'auto']
    }
]

# Create pipeline
pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

# Grid search i.e. hyperparameter tuning and refitting on mcc
gscv_svc = GridSearchCV(pipeline
                        , parameters
                        #, scoring = 'f1', refit = 'f1'
                        , scoring = mcc_score_fn, refit = 'mcc'
                        , cv = skf_cv
                        , **njobs
                        , return_train_score = False
                        , verbose = 3)

# Fit
gscv_svc_fit = gscv_svc.fit(X, y)

gscv_svc_fit_be_mod = gscv_svc_fit.best_params_
gscv_svc_fit_be_res = gscv_svc_fit.cv_results_

print('Best model:\n', gscv_svc_fit_be_mod)
print('Best model score:\n', gscv_svc_fit.best_score_, ':', round(gscv_svc_fit.best_score_, 2))

#print('\nMean test score from fit results:', round(mean(gscv_svc_fit_be_res['mean_test_mcc']),2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_svc_fit_be_res['mean_test_mcc']),2))

######################################
# Blind test
######################################

# See how it does on the BLIND test
#print('\nBlind test score, mcc:', )

test_predict = gscv_svc_fit.predict(X_bts)
print(test_predict)
print(np.array(y_bts))
y_btsf = np.array(y_bts)

print(accuracy_score(y_btsf, test_predict))
print(matthews_corrcoef(y_btsf, test_predict))

# create a dict with all scores
svc_bts_dict = {#'best_model': list(gscv_svc_fit_be_mod.items())
                'bts_fscore'     : None
                , 'bts_mcc'      : None
                , 'bts_precision': None
                , 'bts_recall'   : None
                , 'bts_accuracy' : None
                , 'bts_roc_auc'  : None
                , 'bts_jaccard'  : None }
svc_bts_dict
svc_bts_dict['bts_fscore']    = round(f1_score(y_bts, test_predict),2)
svc_bts_dict['bts_mcc']       = round(matthews_corrcoef(y_bts, test_predict),2)
svc_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2)
svc_bts_dict['bts_recall']    = round(recall_score(y_bts, test_predict),2)
svc_bts_dict['bts_accuracy']  = round(accuracy_score(y_bts, test_predict),2)
svc_bts_dict['bts_roc_auc']   = round(roc_auc_score(y_bts, test_predict),2)
svc_bts_dict['bts_jaccard']   = round(jaccard_score(y_bts, test_predict),2)
svc_bts_dict

# Create a df from dict with all scores
pd.DataFrame.from_dict(svc_bts_dict, orient = 'index', columns = ['best_model'])

svc_bts_df = pd.DataFrame.from_dict(svc_bts_dict, orient = 'index')
svc_bts_df.columns = ['SVC']
print(svc_bts_df)

# Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_svc_fit_be_mod.items() )])
model_params_df = model_params.to_frame()
model_params_df
model_params_df.columns = ['SVC']
model_params_df.columns

# Combine the df of scores and the best model params
svc_bts_df.columns
svc_output = pd.concat([model_params_df, svc_bts_df], axis = 0)
svc_output

# Format the combined df
# Drop the best_model_params row from svc_output
svc_df = svc_output.drop([0], axis = 0)
svc_df

#FIXME: tidy the index of the formatted df

###############################################################################
135
uq_ml_models/UQ_XGB.py
Normal file
@@ -0,0 +1,135 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 18 06:03:24 2022

@author: tanu
"""
#%% XGBoost + hyperparam: BaseEstimator: ClfSwitcher()
# Model-specific import; X, y, X_bts, y_bts, skf_cv, rs, njobs and mcc_score_fn
# are assumed to come from the project's shared setup.
from xgboost import XGBClassifier

class ClfSwitcher(BaseEstimator):
    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)

parameters = [
    {
        'clf__estimator': [XGBClassifier(**rs, **njobs)]
        , 'clf__estimator__learning_rate': [0.01, 0.05, 0.1, 0.2]
        , 'clf__estimator__max_depth': [4, 6, 8, 10, 12, 16, 20]
        # NOTE: min_samples_leaf and max_features are sklearn tree parameter names,
        # not native XGBoost parameters, so they likely have no effect here.
        , 'clf__estimator__min_samples_leaf': [4, 8, 12, 16, 20]
        , 'clf__estimator__max_features': ['auto', 'sqrt']
    }
]

# Create pipeline
pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

# Grid search i.e. hyperparameter tuning and refitting on mcc
gscv_xgb = GridSearchCV(pipeline
                        , parameters
                        #, scoring = 'f1', refit = 'f1'
                        , scoring = mcc_score_fn, refit = 'mcc'
                        , cv = skf_cv
                        , **njobs
                        , return_train_score = False
                        , verbose = 3)

# Fit
gscv_xgb_fit = gscv_xgb.fit(X, y)

gscv_xgb_fit_be_mod = gscv_xgb_fit.best_params_
gscv_xgb_fit_be_res = gscv_xgb_fit.cv_results_

print('Best model:\n', gscv_xgb_fit_be_mod)
print('Best model score:\n', gscv_xgb_fit.best_score_, ':', round(gscv_xgb_fit.best_score_, 2))

#print('\nMean test score from fit results:', round(mean(gscv_xgb_fit_be_res['mean_test_mcc']),2))
print('\nMean test score from fit results:', round(np.nanmean(gscv_xgb_fit_be_res['mean_test_mcc']),2))

######################################
# Blind test
######################################

# See how it does on the BLIND test
#print('\nBlind test score, mcc:', )

test_predict = gscv_xgb_fit.predict(X_bts)
print(test_predict)
print(np.array(y_bts))
y_btsf = np.array(y_bts)

print(accuracy_score(y_btsf, test_predict))
print(matthews_corrcoef(y_btsf, test_predict))

# create a dict with all scores
xgb_bts_dict = {#'best_model': list(gscv_xgb_fit_be_mod.items())
                'bts_fscore'     : None
                , 'bts_mcc'      : None
                , 'bts_precision': None
                , 'bts_recall'   : None
                , 'bts_accuracy' : None
                , 'bts_roc_auc'  : None
                , 'bts_jaccard'  : None }
xgb_bts_dict
xgb_bts_dict['bts_fscore']    = round(f1_score(y_bts, test_predict),2)
xgb_bts_dict['bts_mcc']       = round(matthews_corrcoef(y_bts, test_predict),2)
xgb_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2)
xgb_bts_dict['bts_recall']    = round(recall_score(y_bts, test_predict),2)
xgb_bts_dict['bts_accuracy']  = round(accuracy_score(y_bts, test_predict),2)
xgb_bts_dict['bts_roc_auc']   = round(roc_auc_score(y_bts, test_predict),2)
xgb_bts_dict['bts_jaccard']   = round(jaccard_score(y_bts, test_predict),2)
xgb_bts_dict

# Create a df from dict with all scores
pd.DataFrame.from_dict(xgb_bts_dict, orient = 'index', columns = ['best_model'])

xgb_bts_df = pd.DataFrame.from_dict(xgb_bts_dict, orient = 'index')
xgb_bts_df.columns = ['XGBoost']
print(xgb_bts_df)

# Create df with best model params
model_params = pd.Series(['best_model_params', list(gscv_xgb_fit_be_mod.items() )])
model_params_df = model_params.to_frame()
model_params_df
model_params_df.columns = ['XGBoost']
model_params_df.columns

# Combine the df of scores and the best model params
xgb_bts_df.columns
xgb_output = pd.concat([model_params_df, xgb_bts_df], axis = 0)
xgb_output

# Format the combined df
# Drop the best_model_params row from xgb_output
xgb_df = xgb_output.drop([0], axis = 0)
xgb_df

#FIXME: tidy the index of the formatted df

###############################################################################