copy of ML dir to an FS-only version

This commit is contained in:
Tanushree Tunstall 2022-05-22 23:30:58 +01:00
parent 52cc16f3fa
commit 80e6b3af96
23 changed files with 3115 additions and 243 deletions

@@ -12,60 +12,20 @@ Created on Tue Mar 15 11:09:50 2022
@author: tanu
"""
#%% Import libs
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.base import BaseEstimator
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.metrics import (make_scorer, accuracy_score, f1_score
                             , matthews_corrcoef, precision_score, recall_score
                             , roc_auc_score, jaccard_score)
from xgboost import XGBClassifier
#####################
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SequentialFeatureSelector
# Attempting feature selection for LR WITHOUT the ClfSwitcher class
#%% Libraries, data and scoring func come from: UQ_pnca_ML.py
# (X, y, X_bts, y_bts, blind_test_df and the CV splitter skf_cv are
#  assumed to be defined there)
rs = {'random_state': 42}
njobs = {'n_jobs': 10}
#%% Quick check: class balance in the training target and the blind-test target
y.to_frame().value_counts().plot(kind = 'bar')
blind_test_df['dst_mode'].to_frame().value_counts().plot(kind = 'bar')
scoring_fn = ({'accuracy' : make_scorer(accuracy_score)
, 'fscore' : make_scorer(f1_score)
, 'mcc' : make_scorer(matthews_corrcoef)
, 'precision' : make_scorer(precision_score)
, 'recall' : make_scorer(recall_score)
, 'roc_auc' : make_scorer(roc_auc_score)
, 'jaccard' : make_scorer(jaccard_score)
})
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
jacc_score_fn = {'jcc': make_scorer(jaccard_score)}
#%% Logistic Regression + hyperparam + FS: Pipeline takes GridSearchCV (not the other way round!)
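# i.e. (sketch of the intended shape, not a runnable cell on its own):
#   Pipeline([('pre', MinMaxScaler())
#             , ('fs', RFECV(...))
#             , ('clf', GridSearchCV(estimator, param_grid))])
# so feature selection runs once per pipeline fit, and the grid search is
# nested inside as the final 'clf' step.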
model_lr = LogisticRegression(**rs)
model_rfecv = RFECV(estimator = model_lr
                    , cv = skf_cv
                    #, cv = 10
                    , scoring = 'matthews_corrcoef'
                    )
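# Sanity-check sketch (assumption, not in the original; assumes X, y from
# UQ_pnca_ML.py): after fitting, RFECV exposes the feature count it kept
# and a boolean support mask
# model_rfecv.fit(X, y)
# print('n_features kept:', model_rfecv.n_features_)
# print(X.columns[model_rfecv.support_])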
# model_sfs = SequentialFeatureSelector(estimator = model_lr
# , n_features_to_select = 'auto'
# , tol = None
# # , cv = 10
@@ -74,23 +34,9 @@ model_rfecv = RFECV(estimator = model_lr
# , direction ='forward'
# , **njobs)
param_grid2 = [
{
#'clf__estimator': [LogisticRegression(**rs)],
#'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
'C': np.logspace(0, 4, 10),
'penalty': ['none', 'l1', 'l2', 'elasticnet'],
'max_iter': list(range(100,800,100)),
@@ -98,7 +44,6 @@ param_grid2 = [
},
{
#'clf__estimator': [LogisticRegression(**rs)],
#'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
'C': np.logspace(0, 4, 10),
'penalty': ['l2', 'none'],
'max_iter': list(range(100,800,100)),
@@ -106,13 +51,24 @@ param_grid2 = [
},
{
#'clf__estimator': [LogisticRegression(**rs)],
#'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
'C': np.logspace(0, 4, 10),
'penalty': ['l1', 'l2'],
'max_iter': list(range(100,800,100)),
'solver': ['liblinear']
}
# lesser params for testing
# { 'C': np.logspace(0, 4, 10),
# 'penalty': ['l1', 'l2'],
# 'max_iter': [100],
# 'solver': ['saga']
# },
# { 'C': [1],
# 'penalty': ['l1'],
# 'max_iter': [100],
# 'solver': ['saga']
# }
]
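# NOTE (sketch, assumption): the actual GridSearchCV call is elided from this
# hunk; the `gscv_lr` object used below is assumed to be wired roughly as
# gscv_lr = GridSearchCV(model_lr
#                        , param_grid2
#                        , cv = skf_cv
#                        , scoring = mcc_score_fn, refit = 'mcc'
#                        , **njobs)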
#-------------------------------------------------------------------------------
@@ -127,24 +83,21 @@ gscv_lr = GridSearchCV(model_lr
#------------------------------------------------------------------------------
# Create pipeline
pipeline2 = Pipeline([('pre', MinMaxScaler())
                      #, ('feature_selection', sfs_selector)
                      , ('feature_selection', model_rfecv)
                      , ('clf', gscv_lr)])
# Fit
pipeline2.fit(X,y)
pipeline2.predict(X_bts)
# Assigning fit and then running predict: sanity check
#lr_fs = pipeline2.fit(X,y)
#lr_fs.predict(X_bts)
test_predict = pipeline2.predict(X_bts)
print(test_predict)
print(np.array(y_bts))
#y_btsf = np.array(y_bts)
print(accuracy_score(y_bts, test_predict))
print(matthews_corrcoef(y_bts, test_predict))
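# Sketch (assumption, not in the original): pull the fitted RFECV step back
# out of the pipeline to see which features survived selection
# fs_step = pipeline2.named_steps['feature_selection']
# print('Features kept:', fs_step.n_features_)
# print(X.columns[fs_step.get_support()])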
###############################################################################
#####################
@@ -160,13 +113,12 @@ print(matthews_corrcoef(y_bts, test_predict))
#print('\nBlind test score, mcc:')
#test_predict = gscv_lr_fit.predict(X_bts)
test_predict = pipeline2.predict(X_bts)
print(test_predict)
print(accuracy_score(y_bts, test_predict))
print(matthews_corrcoef(y_bts, test_predict))
print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, test_predict),2))
print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, test_predict),2))
# create a dict with all scores
lr_bts_dict = {#'best_model': list(gscv_lr_fit_be_mod.items())
@@ -237,7 +189,7 @@ from sklearn.feature_selection import SequentialFeatureSelector
# RFE/RFECV: ranks features by the model's coef_ (or feature_importances_)
rfe_selector = RFECV(estimator = LogisticRegression(**rs
                                                    , penalty='l2'
                                                    , solver='saga'
                                                    , max_iter = 100
                                                    , C= 1.0)
@@ -249,6 +201,30 @@ rfe_fs = X.columns[rfe_selector.get_support()]
print('\nFeatures selected from Recursive Feature Elimination:', len(rfe_fs)
, '\nThese are:', rfe_fs)
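# Sketch (assumption, not in the original): RFECV also exposes a per-feature
# ranking (1 == kept), handy for seeing how close the dropped features were
# print(pd.DataFrame({'feature': X.columns, 'rank': rfe_selector.ranking_})
#       .sort_values('rank'))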
# blind test
TEST_PREDICT = rfe_selector.predict(X_bts)
print(TEST_PREDICT)
print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, TEST_PREDICT),2))
print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, TEST_PREDICT),2))
# Adding preprocessing to the pipeline changes the numbers: MinMaxScaler is
# fitted first, so RFECV re-selects features on the scaled data
pipe = Pipeline([
    ('pre', MinMaxScaler())
    #, ('fs', model_rfecv)
    , ('fs', rfe_selector)
    , ('clf', LogisticRegression(**rs))])
pipe.fit(X,y)
tp = pipe.predict(X_bts)
print('\nMCC on Blind test:' , round(matthews_corrcoef(y_bts, tp),2))
print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, tp),2))
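# Sketch (assumption): since Pipeline does not clone its steps, `rfe_selector`
# was refit on the scaled data inside `pipe`; compare against the unscaled run
# print('Unscaled RFECV kept:', len(rfe_fs))
# print('Scaled RFECV kept  :', pipe.named_steps['fs'].n_features_)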
##################################
# SFM (SelectFromModel): keeps features whose coef_/feature_importances_ pass a threshold
sfm_selector = SelectFromModel(estimator = LogisticRegression(**rs
, penalty='l1'