renamed UQ_LR_FS.py to UQ_LR_FS_p1.py

2022-05-21 04:24:28 +01:00 · 2022-05-21 04:24:28 +01:00 · 3742a5f62d
commit 3742a5f62d
parent e16e82e673
2 changed files with 17 additions and 14 deletions
--- a/UQ_LR_FS.py
+++ b/UQ_LR_FS.py
@@ -1,301 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Mon May 16 05:59:12 2022
-
-@author: tanu
-"""
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Tue Mar 15 11:09:50 2022
-
-@author: tanu
-"""
-#%% Import libs
-import numpy as np
-import pandas as pd
-from sklearn.model_selection import GridSearchCV
-from sklearn import datasets
-from sklearn.ensemble import ExtraTreesClassifier
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.ensemble import AdaBoostClassifier
-from sklearn.ensemble import GradientBoostingClassifier
-from sklearn.svm import SVC
-
-from sklearn.base import BaseEstimator
-from sklearn.naive_bayes import MultinomialNB
-from sklearn.linear_model import SGDClassifier
-from sklearn.pipeline import Pipeline
-from sklearn.model_selection import GridSearchCV
-from sklearn.linear_model import LogisticRegression
-from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
-from xgboost import XGBClassifier
-#####################
-from sklearn.feature_selection import RFE
-from sklearn.feature_selection import RFECV
-from sklearn.linear_model import LogisticRegression
-from sklearn.feature_selection import SelectFromModel
-from sklearn.feature_selection import SequentialFeatureSelector
-
-rs = {'random_state': 42}
-njobs = {'n_jobs': 10}
-#%%
-
-y.to_frame().value_counts().plot(kind = 'bar')
-blind_test_df['dst_mode'].to_frame().value_counts().plot(kind = 'bar')
-
-scoring_fn =  ({'accuracy'      : make_scorer(accuracy_score)
-                 , 'fscore'     : make_scorer(f1_score)
-                 , 'mcc'        : make_scorer(matthews_corrcoef)
-                 ,  'precision' : make_scorer(precision_score)
-                 ,  'recall'    : make_scorer(recall_score)
-                 ,  'roc_auc'   : make_scorer(roc_auc_score)
-                 ,  'jaccard'   : make_scorer(jaccard_score)
-            })    
-
-mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
-jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
-
-#%% Logistic Regression + hyperparam grid search (GridSearchCV) + FS (RFECV); NOTE: no BaseEstimator/ClfSwitcher() is used in this cell
-model_lr = LogisticRegression(**rs)
-model_rfecv = RFECV(estimator = model_lr
-                    , cv = rskf_cv
-                    #, cv = 10
-                    , scoring = 'matthews_corrcoef'
-                    )
-
-# model_rfecv = SequentialFeatureSelector(estimator = model_lr
-#                                           , n_features_to_select = 'auto'
-#                                           , tol = None
-# #                                         , cv = 10
-#                                           , cv = rskf_cv
-# #                                          , direction ='backward'
-#                                           , direction ='forward'
-#                                           , **njobs)
-
-# param_grid = [
-#       { 'C': np.logspace(0, 4, 10),
-#          'penalty': ['l1', 'l2'],
-#          'max_iter': [100],
-#          'solver': ['saga']
-#          }#,
-#      # { 'C': [1],
-#      #    'penalty': ['l1'],
-#      #    'max_iter': [100],
-#      #    'solver': ['saga']
-#      #    }
-# ]    
-
-param_grid2 = [
-    {
-        #'clf__estimator': [LogisticRegression(**rs)],
-        #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
-        'C': np.logspace(0, 4, 10),
-        'penalty': ['none', 'l1', 'l2', 'elasticnet'],
-        'max_iter': list(range(100,800,100)),
-        'solver': ['saga']
-    },
-    {
-        #'clf__estimator': [LogisticRegression(**rs)],
-        #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
-        'C': np.logspace(0, 4, 10),
-        'penalty': ['l2', 'none'],
-        'max_iter': list(range(100,800,100)),
-        'solver': ['newton-cg', 'lbfgs', 'sag']
-    }, 
-    {
-        #'clf__estimator': [LogisticRegression(**rs)],
-        #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
-        'C': np.logspace(0, 4, 10),
-        'penalty': ['l1', 'l2'],
-        'max_iter': list(range(100,800,100)),
-        'solver': ['liblinear']
-    }
-
-]    
-
-#-------------------------------------------------------------------------------
-# Grid search CV + FS
-gscv_lr = GridSearchCV(model_lr
-                    , param_grid2
-                    , scoring = mcc_score_fn, refit = 'mcc'
-                    , cv = skf_cv
-                    , return_train_score = False
-                    , verbose = 3
-                    , **njobs)
-
-#------------------------------------------------------------------------------
-# Create pipeline
-pipeline = Pipeline([('pre', MinMaxScaler())
-                     #, ('feature_selection', sfs_selector)
-                     , ('feature_selection', model_rfecv )
-                     , ('clf', gscv_lr)])
-  
-# Fit
-lr_fs = pipeline.fit(X,y)
-
-pipeline.predict(X_bts)
-lr_fs.predict(X_bts)
-
-test_predict = pipeline.predict(X_bts)
-print(test_predict)
-print(np.array(y_bts))
-#y_btsf = np.array(y_bts)
-
-print(accuracy_score(y_bts, test_predict))
-print(matthews_corrcoef(y_bts, test_predict))
-
-###############################################################################
-#####################
-# Feature selection: AFTER model selection
-# https://towardsdatascience.com/5-feature-selection-method-from-scikit-learn-you-should-know-ed4d116e4172
-
-###############################################################################
-
-######################################
-# Blind test
-######################################
-# See how it does on the BLIND test
-#print('\nBlind test score, mcc:', )) 
-
-#test_predict = gscv_lr_fit.predict(X_bts)
-test_predict =  pipeline.predict(X_bts)
-test_predict_fs = sfs_selector.predict(X_bts)
-
-print(test_predict)
-
-print(accuracy_score(y_bts, test_predict))
-print(matthews_corrcoef(y_bts, test_predict))
-
-# create a dict with all scores
-lr_bts_dict = {#'best_model': list(gscv_lr_fit_be_mod.items())
-               'bts_fscore':None
-               , 'bts_mcc':None
-               , 'bts_precision':None
-               , 'bts_recall':None
-               , 'bts_accuracy':None
-               , 'bts_roc_auc':None
-               , 'bts_jaccard':None }
-lr_bts_dict
-lr_bts_dict['bts_fscore']    = round(f1_score(y_bts, test_predict),2)
-lr_bts_dict['bts_mcc']       = round(matthews_corrcoef(y_bts, test_predict),2)
-lr_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2)
-lr_bts_dict['bts_recall']    = round(recall_score(y_bts, test_predict),2)
-lr_bts_dict['bts_accuracy']  = round(accuracy_score(y_bts, test_predict),2)
-lr_bts_dict['bts_roc_auc']   = round(roc_auc_score(y_bts, test_predict),2)
-lr_bts_dict['bts_jaccard']   = round(jaccard_score(y_bts, test_predict),2)
-lr_bts_dict
-
-# Create a df from dict with all scores
-lr_bts_df = pd.DataFrame.from_dict(lr_bts_dict,orient = 'index')
-lr_bts_df.columns = ['Logistic_Regression']
-print(lr_bts_df)
-
-# d2 = {'best_model_params': list(gscv_lr_fit_be_mod.items() )}
-# d2
-# def Merge(dict1, dict2):
-#     res = {**dict1, **dict2}
-#     return res
-# d3 = Merge(d2, lr_bts_dict)
-# d3
-
-# Create df with best model params
-model_params = pd.Series(['best_model_params',  list(gscv_lr_fit_be_mod.items() )])
-model_params_df = model_params.to_frame()
-model_params_df
-model_params_df.columns = ['Logistic_Regression']
-model_params_df.columns
-
-# Combine the df of scores and the best model params
-lr_bts_df.columns
-lr_output = pd.concat([model_params_df, lr_bts_df], axis = 0)
-lr_output
-
-# Format the combined df
-# Drop the best_model_params row from lr_output
-lr_df = lr_output.drop([0], axis = 0)
-lr_df
-
-#FIXME: tidy the index of the formatted df
-
-###############################################################################
-# FIXME: confusion matrix
-
-print(confusion_matrix(y_bts, test_predict))
-#%% Feature selection
-
-#####################
-# Feature selection: AFTER model selection?
-# ADD that within the loop
-# https://towardsdatascience.com/5-feature-selection-method-from-scikit-learn-you-should-know-ed4d116e4172
-#####################
-from sklearn.feature_selection import RFE
-from sklearn.linear_model import LogisticRegression
-from sklearn.feature_selection import SelectFromModel
-from sklearn.feature_selection import SequentialFeatureSelector
-
-# RFE: ~ model coef or feature_importance
-rfe_selector = RFE(estimator = LogisticRegression(**rs
-                                                  , penalty='l1'
-                                                  , solver='saga'
-                                                  , max_iter = 100
-                                                  , C= 1.0)
-                   , n_features_to_select = None # None => half of the features are selected
-                   , step = 1)
-rfe_selector.fit(X, y)
-rfe_fs = X.columns[rfe_selector.get_support()]
-print('\nFeatures selected from Recursive Feature Elimination:', len(rfe_fs)
-      , '\nThese are:', rfe_fs)
-
-# SFM: ~ model coef or feature_importance
-sfm_selector = SelectFromModel(estimator = LogisticRegression(**rs
-                                                  , penalty='l1'
-                                                  , solver='saga'
-                                                  , max_iter = 100
-                                                  , C= 1.0)
-                               , threshold = "median"
-                               , max_features = None ) # None => no cap on the number of features; the cut-off is the "median" threshold above
-sfm_selector.fit(X, y)
-sfm_fs = X.columns[sfm_selector.get_support()]
-
-print('\nFeatures selected from Select From Model:', len(sfm_fs)
-      , '\nThese are:', sfm_fs)
-
-# SFS: greedy feature selection driven by the cross-validated score of the estimator
-sfs_selector = SequentialFeatureSelector(estimator = LogisticRegression(**rs
-                                                  , penalty='l1'
-                                                  , solver='saga'
-                                                  , max_iter = 100
-                                                  , C = 1.0)
-                                         , n_features_to_select = 'auto'
-                                         , tol = None
-                                         , cv = 10
-                                         #, cv = skf_cv
-#                                         , direction ='backward'
-                                         , direction ='forward'
-
-                                         , **njobs)
-sfs_selector.fit(X, y)
-sfsb_fs = X.columns[sfs_selector.get_support()]
-
-print('\nFeatures selected from Sequential Feature Selector (Greedy):', len(sfsb_fs)
-      , '\nThese are:', sfsb_fs)
-
-#Features selected from Sequential Feature Selector (Greedy, Backward): 7 [CV = SKF_CV]
-#These are: Index(['ligand_distance', 'duet_stability_change', 'ddg_foldx', 'deepddg',
-#      'contacts', 'rd_values', 'snap2_score']
-
-#Features selected from Sequential Feature Selector (Greedy, Backward): 7 [CV=10]
-#These are: Index(['ligand_distance', 'deepddg', 'contacts', 'rsa', 'kd_values',
-#       'rd_values', 'maf']
-
-#-----
-# Features selected from Sequential Feature Selector (Greedy, Forward): 6 [CV = SKF_CV]
-# These are: Index(['ligand_distance', 'ddg_dynamut2', 'rsa', 'kd_values', 'rd_values', 'maf']
-
-# Features selected from Sequential Feature Selector (Greedy, Forward): 6 [CV  = 10]
-#These are: Index(['duet_stability_change', 'deepddg', 'ddg_dynamut2', 'rsa', 'kd_values', 'maf']
-###############################################################################
-# IMP: nice eg of including it as part of pipeline
-# https://www.tomasbeuzen.com/post/scikit-learn-gridsearch-pipelines/