copy of ML dir to an FS-only version

Tanushree Tunstall 2022-05-22 23:30:58 +01:00
parent 52cc16f3fa
commit 80e6b3af96
23 changed files with 3115 additions and 243 deletions


@@ -13,50 +13,50 @@ Created on Tue Mar 15 11:09:50 2022
@author: tanu
"""
#%% Logistic Regression + hyperparam + FS: BaseEstimator: ClfSwitcher()
model_lr = LogisticRegression(**rs)
model_rfecv = RFECV(estimator = model_lr
, cv = skf_cv
#, cv = 10
, min_features_to_select = 1 # default
, scoring = 'matthews_corrcoef'
)
# model_lr = LogisticRegression(**rs)
# model_rfecv = RFECV(estimator = model_lr
# , cv = skf_cv
# #, cv = 10
# , min_features_to_select = 1 # default
# , scoring = 'matthews_corrcoef'
# )
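#%% [Illustrative sketch, not in the original] A standalone RFECV fit exposes
# the selection directly; assumes X (a DataFrame) and y exist, as they are
# used further down in this script.
model_rfecv.fit(X, y)
print(model_rfecv.n_features_)            # optimal number of features found by CV
print(X.columns[model_rfecv.support_])    # boolean support mask -> selected feature names
print(model_rfecv.ranking_)               # 1 = selected; higher = eliminated earlier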
param_grid2 = [
{
#'clf__estimator': [LogisticRegression(**rs)],
#'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
'C': np.logspace(0, 4, 10),
'penalty': ['none', 'l1', 'l2', 'elasticnet'],
'max_iter': list(range(100,800,100)),
'solver': ['saga']
},
{
#'clf__estimator': [LogisticRegression(**rs)],
#'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
'C': np.logspace(0, 4, 10),
'penalty': ['l2', 'none'],
'max_iter': list(range(100,800,100)),
'solver': ['newton-cg', 'lbfgs', 'sag']
},
{
#'clf__estimator': [LogisticRegression(**rs)],
#'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
'C': np.logspace(0, 4, 10),
'penalty': ['l1', 'l2'],
'max_iter': list(range(100,800,100)),
'solver': ['liblinear']
}
# param_grid2 = [
# {
# #'clf': [LogisticRegression(**rs)],
# #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
# 'C': np.logspace(0, 4, 10),
# 'penalty': ['none', 'l1', 'l2', 'elasticnet'],
# 'max_iter': list(range(100,800,100)),
# 'solver': ['saga']
# },
# {
# #'clf': [LogisticRegression(**rs)],
# #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
# 'C': np.logspace(0, 4, 10),
# 'penalty': ['l2', 'none'],
# 'max_iter': list(range(100,800,100)),
# 'solver': ['newton-cg', 'lbfgs', 'sag']
# },
# {
# #'clf': [LogisticRegression(**rs)],
# #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
# 'C': np.logspace(0, 4, 10),
# 'penalty': ['l1', 'l2'],
# 'max_iter': list(range(100,800,100)),
# 'solver': ['liblinear']
# }
]
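# [Note added for clarity] param_grid2 is split into three dicts because
# scikit-learn solvers support different penalties: 'saga' handles all four,
# 'newton-cg'/'lbfgs'/'sag' only 'l2'/'none', and 'liblinear' only 'l1'/'l2'.
# Separate dicts stop GridSearchCV from fitting unsupported combinations,
# which would otherwise error out, e.g.:
#   LogisticRegression(penalty='l1', solver='lbfgs').fit(X, y)   # ValueError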
#-------------------------------------------------------------------------------
# Grid search CV + FS
gscv_lr = GridSearchCV(estimator = model_lr
, param_grid = param_grid2
, scoring = mcc_score_fn, refit = 'mcc'
, cv = skf_cv
, return_train_score = False
, verbose = 3
, **njobs)
# ]
# #-------------------------------------------------------------------------------
# # Grid search CV + FS
# gscv_lr = GridSearchCV(estimator = model_lr
# , param_grid = param_grid2
# , scoring = mcc_score_fn, refit = 'mcc'
# , cv = skf_cv
# , return_train_score = False
# , verbose = 3
# , **njobs)
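# [Illustrative sketch, not in the original] mcc_score_fn and njobs are used
# above but defined elsewhere in this repo; plausible definitions would be:
from sklearn.metrics import make_scorer, matthews_corrcoef
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}  # dict form, so refit = 'mcc' can key into it
njobs = {'n_jobs': 10}                                  # unpacked above as **njobs; the value is a guess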
#------------------------------------------------------------------------------
################
@@ -64,27 +64,27 @@ gscv_lr = GridSearchCV(estimator = model_lr
# Cannot get BEST model out
################
# Create pipeline
pipeline = Pipeline([('pre', MinMaxScaler())
#, ('fs', sfs_selector)
, ('fs', model_rfecv )
, ('clf', gscv_lr)])
# pipeline = Pipeline([('pre', MinMaxScaler())
# #, ('fs', sfs_selector)
# , ('fs', model_rfecv )
# , ('clf', gscv_lr)])
# Fit (don't assign the fit)
#lr_fs_fit = pipeline.fit(X,y)
pipeline.fit(X,y)
# # Fit # dont assign fit
# #lr_fs_fit = pipeline.fit(X,y)
# pipeline.fit(X,y)
pipeline.best_params_ # fails: Pipeline has no best_params_ (the "Cannot get BEST model out" problem)
# pipeline.best_params_
#https://github.com/scikit-learn/scikit-learn/issues/7536
n_fs = gscv_lr.best_estimator_.n_features_in_
n_fs
# #https://github.com/scikit-learn/scikit-learn/issues/7536
# n_fs = gscv_lr.best_estimator_.n_features_in_
# n_fs
sel_features = X.columns[pipeline.named_steps['fs'].get_support()]
print('\nNo. of features selected with RFECV for model'
, pipeline.named_steps['clf'].estimator
, ':', n_fs
, '\nThese are:', sel_features
)
# sel_features = X.columns[pipeline.named_steps['fs'].get_support()]
# print('\nNo. of features selected with RFECV for model'
# , pipeline.named_steps['clf'].estimator
# , ':', n_fs
# , '\nThese are:', sel_features
# )
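# [Note added for clarity] the attempt above nests GridSearchCV *inside* the
# Pipeline, so pipeline.best_params_ raises AttributeError: a Pipeline does
# not surface the search results of an inner step. The fitted search is only
# reachable through the step itself, e.g.:
#   pipeline.named_steps['clf'].best_params_    # the inner GridSearchCV
# The "THIS ONE" section below inverts the nesting (Pipeline inside
# GridSearchCV), which is the arrangement scikit-learn supports cleanly.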
##############################################################
# THIS ONE
#########
@@ -106,28 +106,45 @@ param_grid2 = [
{'fs__min_features_to_select': [1]
, 'fs__cv': [skf_cv]
#, 'fs__scoring': ['matthews_corrcoef']},
#, 'fs__scoring': [mcc_score_fn]}
},
# {
# #'clf': [LogisticRegression(**rs)],
# 'clf__C': np.logspace(0, 4, 10),
# 'clf__penalty': ['none', 'l1', 'l2', 'elasticnet'],
# 'clf__max_iter': list(range(100,800,100)),
# 'clf__solver': ['saga']
# },
# {
# #'clf': [LogisticRegression(**rs)],
# 'clf__C': np.logspace(0, 4, 10),
# 'clf__penalty': ['l2', 'none'],
# 'clf__max_iter': list(range(100,800,100)),
# 'clf__solver': ['newton-cg', 'lbfgs', 'sag']
# },
# {
# #'clf': [LogisticRegression(**rs)],
# 'clf__C': np.logspace(0, 4, 10),
# 'clf__penalty': ['l1', 'l2'],
# 'clf__max_iter': list(range(100,800,100)),
# 'clf__solver': ['liblinear']
# }
{
#'clf': [LogisticRegression(**rs)],
'clf__C': np.logspace(0, 4, 10),
'clf__penalty': ['l2'],
'clf__max_iter': list(range(100,200,100)),
#'clf__solver': ['newton-cg', 'lbfgs', 'sag']
'clf__solver': ['sag']
},
{
#'clf__estimator': [LogisticRegression(**rs)],
'clf__C': np.logspace(0, 4, 10),
'clf__penalty': ['l1', 'l2'],
'clf__max_iter': [100],
'clf__solver': ['liblinear']
},
{ #'clf': [LogisticRegression(**rs)],
'clf__C': np.logspace(0, 4, 10),
'clf__penalty': ['l2'],
'clf__max_iter':[100],
'clf__solver': ['saga']
}
]
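# [Illustrative sketch, not in the original] the 'fs__'/'clf__' prefixes route
# each grid entry to the matching step of the pipeline built in step 4 below;
# the full list of routable parameter names can be printed with:
#   sorted(pipeline.get_params().keys())   # e.g. 'fs__cv', 'fs__min_features_to_select', 'clf__C', ...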
# step 4: create pipeline
pipeline = Pipeline([
@@ -149,12 +166,34 @@ gs_final = GridSearchCV(pipeline
gs_final.fit(X,y)
gs_final.best_params_
gs_final.best_score_
gs_final.best_estimator_
# assign the fit (now commented out: gs_final above is already fitted)
#gsfit = gs_final.fit(X,y)
#gsfit.best_estimator_
#gsfit.best_params_
#gsfit.best_score_
test_predict = gs_final.predict(X_bts)
print(test_predict)
print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, test_predict),2))
print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, test_predict),2))
# Now get the features out
all_features = gs_final.feature_names_in_
@@ -163,7 +202,6 @@ all_features = gs_final.feature_names_in_
sel_features = X.columns[gs_final.best_estimator_.named_steps['fs'].get_support()]
n_sf = gs_final.best_estimator_.named_steps['fs'].n_features_
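# [Illustrative sketch, not in the original] quick consistency check: the
# selector's reported count should match the mask-derived names, and the
# selection is a subset of the model's inputs.
assert n_sf == len(sel_features)
assert len(sel_features) <= len(all_features)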
# get model name
model_name = gs_final.best_estimator_.named_steps['clf']
b_model_params = gs_final.best_params_
@@ -179,4 +217,37 @@ print('\n========================================'
, '\nThese are:', sel_features, '\n\n'
, '\nBest Model hyperparams:', b_model_params
)
######################################
# Blind test
######################################
# See how it does on the BLIND test
#print('\nBlind test score, mcc:', ))
#test_predict = gscv_lr_fit.predict(X_bts)
test_predict = gs_final.predict(X_bts)
print(test_predict)
print(accuracy_score(y_bts, test_predict))
print(matthews_corrcoef(y_bts, test_predict))
# create a dict with all scores
lr_bts_dict = {#'best_model': list(gscv_lr_fit_be_mod.items())
'bts_fscore':None
, 'bts_mcc':None
, 'bts_precision':None
, 'bts_recall':None
, 'bts_accuracy':None
, 'bts_roc_auc':None
, 'bts_jaccard':None }
lr_bts_dict
lr_bts_dict['bts_fscore'] = round(f1_score(y_bts, test_predict),2)
lr_bts_dict['bts_mcc'] = round(matthews_corrcoef(y_bts, test_predict),2)
lr_bts_dict['bts_precision'] = round(precision_score(y_bts, test_predict),2)
lr_bts_dict['bts_recall'] = round(recall_score(y_bts, test_predict),2)
lr_bts_dict['bts_accuracy'] = round(accuracy_score(y_bts, test_predict),2)
lr_bts_dict['bts_roc_auc'] = round(roc_auc_score(y_bts, test_predict),2)
lr_bts_dict['bts_jaccard'] = round(jaccard_score(y_bts, test_predict),2)
lr_bts_dict
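# [Illustrative sketch, not in the original] the same dict can be filled in a
# single pass, avoiding the repeated round(..., 2) calls above; these are the
# same sklearn.metrics functions already used in this script.
score_fns = {'bts_fscore': f1_score, 'bts_mcc': matthews_corrcoef
             , 'bts_precision': precision_score, 'bts_recall': recall_score
             , 'bts_accuracy': accuracy_score, 'bts_roc_auc': roc_auc_score
             , 'bts_jaccard': jaccard_score}
lr_bts_dict = {k: round(fn(y_bts, test_predict), 2) for k, fn in score_fns.items()}
lr_bts_dict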