#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
UQ_LR_FS_p1.py

Logistic Regression with hyperparameter tuning (GridSearchCV) and feature
selection (RFECV / SelectFromModel / SequentialFeatureSelector), evaluated
on a blind test set.

Created on Mon May 16 05:59:12 2022
@author: tanu
"""
#%% Import libs
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator
from sklearn.ensemble import (AdaBoostClassifier, ExtraTreesClassifier,
                              GradientBoostingClassifier, RandomForestClassifier)
from sklearn.feature_selection import (RFE, RFECV, SelectFromModel,
                                       SequentialFeatureSelector)
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import (accuracy_score, confusion_matrix, f1_score,
                             jaccard_score, make_scorer, matthews_corrcoef,
                             precision_score, recall_score, roc_auc_score)
from sklearn.model_selection import (GridSearchCV, RepeatedStratifiedKFold,
                                     StratifiedKFold)
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier

rs = {'random_state': 42}
njobs = {'n_jobs': 10}

# CV strategies used below. The original defines skf_cv and rskf_cv in an
# upstream script; these are assumed equivalents.
skf_cv = StratifiedKFold(n_splits = 10, shuffle = True, **rs)
rskf_cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, **rs)
#%% Data
# NOTE: X, y (training data), X_bts, y_bts (blind test data) and
# blind_test_df are assumed to be loaded by an upstream script.
y.to_frame().value_counts().plot(kind = 'bar')
blind_test_df['dst_mode'].to_frame().value_counts().plot(kind = 'bar')
scoring_fn = ({'accuracy'  : make_scorer(accuracy_score)
               , 'fscore'    : make_scorer(f1_score)
               , 'mcc'       : make_scorer(matthews_corrcoef)
               , 'precision' : make_scorer(precision_score)
               , 'recall'    : make_scorer(recall_score)
               , 'roc_auc'   : make_scorer(roc_auc_score)
               , 'jaccard'   : make_scorer(jaccard_score)
               })
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
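# Hedged usage sketch (not in the original): scoring_fn is a multi-metric
# dict, so it can be passed to cross_validate (shown below) or to
# GridSearchCV(scoring = scoring_fn, refit = 'mcc'). The quick cv = 3 here
# is illustrative only.
from sklearn.model_selection import cross_validate
cv_demo = cross_validate(LogisticRegression(**rs), X, y
                         , cv = 3
                         , scoring = scoring_fn)
print('Mean CV MCC (demo):', round(np.mean(cv_demo['test_mcc']), 2))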
#%% Logistic Regression + hyperparams + feature selection
model_lr = LogisticRegression(**rs)
model_rfecv = RFECV(estimator = model_lr
                    , cv = rskf_cv
                    #, cv = 10
                    , scoring = 'matthews_corrcoef'
                    )
# model_rfecv = SequentialFeatureSelector(estimator = model_lr
#                                         , n_features_to_select = 'auto'
#                                         , tol = None
#                                         #, cv = 10
#                                         , cv = rskf_cv
#                                         #, direction ='backward'
#                                         , direction ='forward'
#                                         , **njobs)

# param_grid = [
#     { 'C': np.logspace(0, 4, 10),
#       'penalty': ['l1', 'l2'],
#       'max_iter': [100],
#       'solver': ['saga']
#     }#,
#     # { 'C': [1],
#     #   'penalty': ['l1'],
#     #   'max_iter': [100],
#     #   'solver': ['saga']
#     # }
# ]
param_grid2 = [
    {   # saga handles l1, l2 and no penalty
        'C': np.logspace(0, 4, 10),
        'penalty': ['none', 'l1', 'l2'],
        'max_iter': list(range(100, 800, 100)),
        'solver': ['saga']
    },
    {   # elasticnet (saga only) additionally requires l1_ratio, otherwise
        # LogisticRegression raises an error; these values are assumed,
        # not from the original grid
        'C': np.logspace(0, 4, 10),
        'penalty': ['elasticnet'],
        'l1_ratio': [0.25, 0.5, 0.75],
        'max_iter': list(range(100, 800, 100)),
        'solver': ['saga']
    },
    {
        'C': np.logspace(0, 4, 10),
        'penalty': ['l2', 'none'],
        'max_iter': list(range(100, 800, 100)),
        'solver': ['newton-cg', 'lbfgs', 'sag']
    },
    {
        'C': np.logspace(0, 4, 10),
        'penalty': ['l1', 'l2'],
        'max_iter': list(range(100, 800, 100)),
        'solver': ['liblinear']
    }
]
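# Hedged sanity check (not in the original): the number of candidate
# hyperparameter settings the grid search will try, i.e. the sum over the
# sub-grids of the product of their value-list lengths.
from sklearn.model_selection import ParameterGrid
print('Grid candidates:', len(ParameterGrid(param_grid2)))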
#-------------------------------------------------------------------------------
# Grid search CV + FS
gscv_lr = GridSearchCV(model_lr
                       , param_grid2
                       , scoring = mcc_score_fn, refit = 'mcc'
                       , cv = skf_cv
                       , return_train_score = False
                       , verbose = 3
                       , **njobs)
#------------------------------------------------------------------------------
# Create pipeline: scale -> select features -> tune and fit the classifier
pipeline = Pipeline([('pre', MinMaxScaler())
                     #, ('feature_selection', sfs_selector)
                     , ('feature_selection', model_rfecv)
                     , ('clf', gscv_lr)])
# Fit (Pipeline.fit returns self, so lr_fs and pipeline are the same object)
lr_fs = pipeline.fit(X, y)
# Predict on the blind test set
test_predict = pipeline.predict(X_bts)
print(test_predict)
print(np.array(y_bts))
#y_btsf = np.array(y_bts)
print(accuracy_score(y_bts, test_predict))
print(matthews_corrcoef(y_bts, test_predict))
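# Hedged sketch (not in the original): inspect the fitted pipeline.
# 'feature_selection' and 'clf' are the step names used above; n_features_,
# get_support() and best_params_ are standard sklearn attributes.
fitted_rfecv = lr_fs.named_steps['feature_selection']
print('RFECV optimal number of features:', fitted_rfecv.n_features_)
print('RFECV-selected features:', list(X.columns[fitted_rfecv.get_support()]))
print('Best LR params:', lr_fs.named_steps['clf'].best_params_)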
###############################################################################
######################################
# Blind test
######################################
# Create a dict with all blind-test scores (test_predict computed above)
lr_bts_dict = {'bts_fscore'    : None
               , 'bts_mcc'       : None
               , 'bts_precision' : None
               , 'bts_recall'    : None
               , 'bts_accuracy'  : None
               , 'bts_roc_auc'   : None
               , 'bts_jaccard'   : None}
lr_bts_dict
lr_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2)
lr_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2)
lr_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2)
lr_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2)
lr_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2)
# NOTE: roc_auc from hard class labels; predict_proba would give a smoother AUC
lr_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2)
lr_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
lr_bts_dict
# Create a df from dict with all scores
lr_bts_df = pd.DataFrame.from_dict(lr_bts_dict,orient = 'index')
lr_bts_df.columns = ['Logistic_Regression']
print(lr_bts_df)
# d2 = {'best_model_params': list(gscv_lr_fit_be_mod.items() )}
# d2
# def Merge(dict1, dict2):
#     res = {**dict1, **dict2}
#     return res
# d3 = Merge(d2, lr_bts_dict)
# d3
# Create df with best model params (from the fitted grid search inside the pipeline)
gscv_lr_fit_be_mod = lr_fs.named_steps['clf'].best_params_
model_params = pd.Series(['best_model_params', list(gscv_lr_fit_be_mod.items())])
model_params_df = model_params.to_frame()
model_params_df
model_params_df.columns = ['Logistic_Regression']
model_params_df.columns
# Combine the df of scores and the best model params
lr_bts_df.columns
lr_output = pd.concat([model_params_df, lr_bts_df], axis = 0)
lr_output
# Format the combined df
# Drop the best_model_params row from lr_output
lr_df = lr_output.drop([0], axis = 0)
lr_df
#FIXME: tidy the index of the formatted df
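# One way to address the FIXME above (a sketch, not from the original):
# give the surviving best-model-params row a readable label instead of the
# integer index 1 left over from the Series.
lr_df = lr_df.rename(index = {1: 'best_model_params'})
print(lr_df)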
###############################################################################
# FIXME: confusion matrix
print(confusion_matrix(y_bts, test_predict))
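# A labelled confusion matrix, sketching one way to resolve the FIXME above.
# Binary 0/1 labels are assumed; adjust to the actual class labels.
cm = confusion_matrix(y_bts, test_predict)
cm_df = pd.DataFrame(cm
                     , index = ['actual_0', 'actual_1']
                     , columns = ['predicted_0', 'predicted_1'])
print(cm_df)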
#%% Feature selection
#####################
# Feature selection: AFTER model selection?
# ADD that within the loop
# https://towardsdatascience.com/5-feature-selection-method-from-scikit-learn-you-should-know-ed4d116e4172
#####################
# (RFECV, SelectFromModel and SequentialFeatureSelector are imported at the top)
# RFE: ranks features by the model's coef_ / feature_importances_
rfe_selector = RFECV(estimator = LogisticRegression(**rs
                                                    , penalty = 'l1'
                                                    , solver = 'saga'
                                                    , max_iter = 100
                                                    , C = 1.0)
                     #, min_features_to_select = 1 # default
                     , step = 1
                     , cv = 10)
rfe_selector.fit(X, y)
rfe_fs = X.columns[rfe_selector.get_support()]
print('\nFeatures selected from Recursive Feature Elimination:', len(rfe_fs)
      , '\nThese are:', rfe_fs)
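# Hedged extra (not in the original): RFECV also exposes a per-feature
# ranking_, where 1 marks a selected feature and larger values mark features
# eliminated earlier.
rfe_ranking = pd.Series(rfe_selector.ranking_, index = X.columns).sort_values()
print('\nRFECV feature ranking:\n', rfe_ranking)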
# SFM: thresholds features on the model's coef_ / feature_importances_
sfm_selector = SelectFromModel(estimator = LogisticRegression(**rs
                                                              , penalty = 'l1'
                                                              , solver = 'saga'
                                                              , max_iter = 100
                                                              , C = 1.0)
                               , threshold = "median" # median of the importances (here |coef_|)
                               , max_features = None)
sfm_selector.fit(X, y)
sfm_fs = X.columns[sfm_selector.get_support()]
print('\nFeatures selected from Select From Model:', len(sfm_fs)
      , '\nThese are:', sfm_fs)
# SFS: greedy CV-based selection (does not need coef_ / feature_importances_)
sfs_selector = SequentialFeatureSelector(estimator = LogisticRegression(**rs
                                                                        , penalty = 'l1'
                                                                        , solver = 'saga'
                                                                        , max_iter = 100
                                                                        , C = 1.0)
                                         , n_features_to_select = 'auto'
                                         , tol = None
                                         , cv = 10
                                         #, cv = skf_cv
                                         #, direction = 'backward'
                                         , direction = 'forward'
                                         , **njobs)
sfs_selector.fit(X, y)
sfsb_fs = X.columns[sfs_selector.get_support()]
print('\nFeatures selected from Sequential Feature Selector (Greedy, Forward):', len(sfsb_fs)
      , '\nThese are:', sfsb_fs)
# Features selected from Sequential Feature Selector (Greedy, Backward): 7 [cv = skf_cv]
# These are: Index(['ligand_distance', 'duet_stability_change', 'ddg_foldx', 'deepddg',
#                   'contacts', 'rd_values', 'snap2_score'])
# Features selected from Sequential Feature Selector (Greedy, Backward): 7 [cv = 10]
# These are: Index(['ligand_distance', 'deepddg', 'contacts', 'rsa', 'kd_values',
#                   'rd_values', 'maf'])
#-----
# Features selected from Sequential Feature Selector (Greedy, Forward): 6 [cv = skf_cv]
# These are: Index(['ligand_distance', 'ddg_dynamut2', 'rsa', 'kd_values', 'rd_values', 'maf'])
# Features selected from Sequential Feature Selector (Greedy, Forward): 6 [cv = 10]
# These are: Index(['duet_stability_change', 'deepddg', 'ddg_dynamut2', 'rsa', 'kd_values', 'maf'])
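# Hedged comparison sketch (not in the original): set overlap between the
# three selectors' choices, using rfe_fs, sfm_fs and sfsb_fs from above.
rfe_set, sfm_set, sfs_set = set(rfe_fs), set(sfm_fs), set(sfsb_fs)
print('\nSelected by all three selectors:', sorted(rfe_set & sfm_set & sfs_set))
print('Selected by at least two:',
      sorted((rfe_set & sfm_set) | (rfe_set & sfs_set) | (sfm_set & sfs_set)))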
###############################################################################
# IMP: nice eg of including it as part of pipeline
# https://www.tomasbeuzen.com/post/scikit-learn-gridsearch-pipelines/