copy of ML dir to an FS-only version

This commit is contained in:
Tanushree Tunstall 2022-05-22 23:30:58 +01:00
parent 52cc16f3fa
commit 80e6b3af96
23 changed files with 3115 additions and 243 deletions

@@ -12,60 +12,20 @@ Created on Tue Mar 15 11:09:50 2022
@author: tanu
"""
#%% Import libs
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.base import BaseEstimator
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.metrics import (make_scorer, accuracy_score, f1_score
                             , matthews_corrcoef, precision_score, recall_score
                             , roc_auc_score, jaccard_score)
from xgboost import XGBClassifier
#####################
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SequentialFeatureSelector
# Attempting feature selection for LR WITHOUT the ClfSwitcher class
#%% Libraries, data and scoring func come from: UQ_pnca_ML.py
# (X, y, X_bts, y_bts, blind_test_df and the CV splitter skf_cv are
#  assumed to be defined there)
rs = {'random_state': 42}
njobs = {'n_jobs': 10}
#%% Quick check: class balance in the training target and the blind-test target
y.to_frame().value_counts().plot(kind = 'bar')
blind_test_df['dst_mode'].to_frame().value_counts().plot(kind = 'bar')
scoring_fn = ({'accuracy' : make_scorer(accuracy_score)
, 'fscore' : make_scorer(f1_score)
, 'mcc' : make_scorer(matthews_corrcoef)
, 'precision' : make_scorer(precision_score)
, 'recall' : make_scorer(recall_score)
, 'roc_auc' : make_scorer(roc_auc_score)
, 'jaccard' : make_scorer(jaccard_score)
})
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
#%% Logistic Regression + hyperparam + FS: Pipeline takes GridSearchCV (not the other way round!)
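# i.e. (sketch of the intended shape, not a runnable cell on its own):
#   Pipeline([('pre', MinMaxScaler())
#             , ('fs', RFECV(...))
#             , ('clf', GridSearchCV(estimator, param_grid))])
# so feature selection runs once per pipeline fit, and the grid search is
# nested inside as the final 'clf' step.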
model_lr = LogisticRegression(**rs)
model_rfecv = RFECV(estimator = model_lr
                    , cv = skf_cv
                    #, cv = 10
                    , scoring = 'matthews_corrcoef'
                    )
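# Sanity-check sketch (assumption, not in the original; assumes X, y from
# UQ_pnca_ML.py): after fitting, RFECV exposes the feature count it kept
# and a boolean support mask
# model_rfecv.fit(X, y)
# print('n_features kept:', model_rfecv.n_features_)
# print(X.columns[model_rfecv.support_])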
# model_sfs = SequentialFeatureSelector(estimator = model_lr
# , n_features_to_select = 'auto'
# , tol = None
# # , cv = 10
@@ -74,23 +34,9 @@ model_rfecv = RFECV(estimator = model_lr
# , direction ='forward'
# , **njobs)
param_grid2 = [
{
#'clf__estimator': [LogisticRegression(**rs)],
#'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
'C': np.logspace(0, 4, 10),
'penalty': ['none', 'l1', 'l2', 'elasticnet'],
'max_iter': list(range(100,800,100)),
@@ -98,7 +44,6 @@ param_grid2 = [
},
{
#'clf__estimator': [LogisticRegression(**rs)],
#'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
'C': np.logspace(0, 4, 10),
'penalty': ['l2', 'none'],
'max_iter': list(range(100,800,100)),
@@ -106,13 +51,24 @@ param_grid2 = [
},
{
#'clf__estimator': [LogisticRegression(**rs)],
#'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
'C': np.logspace(0, 4, 10),
'penalty': ['l1', 'l2'],
'max_iter': list(range(100,800,100)),
'solver': ['liblinear']
}
# lesser params for testing
# { 'C': np.logspace(0, 4, 10),
# 'penalty': ['l1', 'l2'],
# 'max_iter': [100],
# 'solver': ['saga']
# },
# { 'C': [1],
# 'penalty': ['l1'],
# 'max_iter': [100],
# 'solver': ['saga']
# }
]
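# NOTE (sketch, assumption): the actual GridSearchCV call is elided from this
# hunk; the `gscv_lr` object used below is assumed to be wired roughly as
# gscv_lr = GridSearchCV(model_lr
#                        , param_grid2
#                        , cv = skf_cv
#                        , scoring = mcc_score_fn, refit = 'mcc'
#                        , **njobs)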
#-------------------------------------------------------------------------------
@@ -127,24 +83,21 @@ gscv_lr = GridSearchCV(model_lr
#------------------------------------------------------------------------------
# Create pipeline
pipeline2 = Pipeline([('pre', MinMaxScaler())
                      #, ('feature_selection', sfs_selector)
                      , ('feature_selection', model_rfecv)
                      , ('clf', gscv_lr)])
# Fit
pipeline2.fit(X,y)
pipeline2.predict(X_bts)
# Assigning fit and then running predict: sanity check
#lr_fs = pipeline2.fit(X,y)
#lr_fs.predict(X_bts)
test_predict = pipeline2.predict(X_bts)
print(test_predict)
print(np.array(y_bts))
#y_btsf = np.array(y_bts)
print(accuracy_score(y_bts, test_predict))
print(matthews_corrcoef(y_bts, test_predict))
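# Sketch (assumption, not in the original): pull the fitted RFECV step back
# out of the pipeline to see which features survived selection
# fs_step = pipeline2.named_steps['feature_selection']
# print('Features kept:', fs_step.n_features_)
# print(X.columns[fs_step.get_support()])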
###############################################################################
#####################
@@ -160,13 +113,12 @@ print(matthews_corrcoef(y_bts, test_predict))
#print('\nBlind test score, mcc:')
#test_predict = gscv_lr_fit.predict(X_bts)
test_predict = pipeline2.predict(X_bts)
print(test_predict)
print(accuracy_score(y_bts, test_predict))
print(matthews_corrcoef(y_bts, test_predict))
print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, test_predict),2))
print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, test_predict),2))
# create a dict with all scores
lr_bts_dict = {#'best_model': list(gscv_lr_fit_be_mod.items())
@@ -237,7 +189,7 @@ from sklearn.feature_selection import SequentialFeatureSelector
# RFE/RFECV: ranks features by the model's coef_ (or feature_importances_)
rfe_selector = RFECV(estimator = LogisticRegression(**rs
                                                    , penalty='l2'
                                                    , solver='saga'
                                                    , max_iter = 100
                                                    , C= 1.0)
@@ -249,6 +201,30 @@ rfe_fs = X.columns[rfe_selector.get_support()]
print('\nFeatures selected from Recursive Feature Elimination:', len(rfe_fs)
, '\nThese are:', rfe_fs)
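# Sketch (assumption, not in the original): RFECV also exposes a per-feature
# ranking (1 == kept), handy for seeing how close the dropped features were
# print(pd.DataFrame({'feature': X.columns, 'rank': rfe_selector.ranking_})
#       .sort_values('rank'))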
# blind test
TEST_PREDICT = rfe_selector.predict(X_bts)
print(TEST_PREDICT)
print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, TEST_PREDICT),2))
print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, TEST_PREDICT),2))
# Adding preprocessing to the pipeline changes the numbers: MinMaxScaler is
# fitted first, so RFECV re-selects features on the scaled data
pipe = Pipeline([
    ('pre', MinMaxScaler())
    #, ('fs', model_rfecv)
    , ('fs', rfe_selector)
    , ('clf', LogisticRegression(**rs))])
pipe.fit(X,y)
tp = pipe.predict(X_bts)
print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2))
print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2))
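# Sketch (assumption): since Pipeline does not clone its steps, `rfe_selector`
# was refit on the scaled data inside `pipe`; compare against the unscaled run
# print('Unscaled RFECV kept:', len(rfe_fs))
# print('Scaled RFECV kept  :', pipe.named_steps['fs'].n_features_)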
##################################
# SFM (SelectFromModel): keeps features whose coef_/feature_importances_ pass a threshold
sfm_selector = SelectFromModel(estimator = LogisticRegression(**rs
, penalty='l1'