ML_AI_training/base_estimator2.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 09:50:37 2022
@author: tanu
"""
#https://stackoverflow.com/questions/50272416/gridsearch-on-model-and-classifiers
#%%
# https://github.com/davidsbatista/machine-learning-notebooks/blob/master/hyperparameter-across-models.ipynb
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.base import BaseEstimator
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import (accuracy_score, f1_score, jaccard_score,
                             make_scorer, matthews_corrcoef, precision_score,
                             recall_score, roc_auc_score)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.svm import SVC
from xgboost import XGBClassifier
#%% My numerical data
# num_df_wtgt and numerical_FN are prepared upstream (not defined in this file)
X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
                                                    , num_df_wtgt['mutation_class']
                                                    , test_size = 0.33
                                                    , random_state = 2
                                                    , shuffle = True
                                                    , stratify = num_df_wtgt['mutation_class'])

# Quick look at the class balance in each split
y_train.to_frame().value_counts().plot(kind = 'bar')
y_test.to_frame().value_counts().plot(kind = 'bar')
scoring_fn = {'accuracy'  : make_scorer(accuracy_score)
              , 'fscore'   : make_scorer(f1_score)
              , 'mcc'      : make_scorer(matthews_corrcoef)
              , 'precision': make_scorer(precision_score)
              , 'recall'   : make_scorer(recall_score)
              , 'roc_auc'  : make_scorer(roc_auc_score)
              #, 'jaccard' : make_scorer(jaccard_score)
              }
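# Caveat: make_scorer(roc_auc_score) scores hard class predictions; to rank by
# predicted probabilities (the usual ROC-AUC), use the built-in string scorer
# 'roc_auc', or needs_proba=True (sklearn < 1.4 signature; newer versions use
# response_method='predict_proba'):
#scoring_fn['roc_auc'] = make_scorer(roc_auc_score, needs_proba = True)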
#%% ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - the classifier
        """
        self.estimator = estimator

    def fit(self, X, y = None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y = None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)
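# Quick sanity check (a minimal sketch): GridSearchCV switches classifiers by
# calling set_params(estimator = ...), which BaseEstimator provides for free
# because 'estimator' is an __init__ parameter.
demo_switcher = ClfSwitcher()
demo_switcher.set_params(estimator = MultinomialNB())
print(type(demo_switcher.estimator).__name__)  # MultinomialNB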
parameters = [
    {
        'clf__estimator': [SGDClassifier()],  # SVM if hinge loss / logreg if log loss
        #'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
        #'tfidf__stop_words': ['english', None],
        'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
        'clf__estimator__max_iter': [50, 80],
        'clf__estimator__tol': [1e-4],
        'clf__estimator__loss': ['hinge', 'log', 'modified_huber'],  # 'log' is 'log_loss' in sklearn >= 1.1
    },
    {
        'clf__estimator': [MultinomialNB()],
        #'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
        #'tfidf__stop_words': [None],
        'clf__estimator__alpha': (1e-2, 1e-3, 1e-1),
    },
    # {
    #     'clf__estimator': [LogisticRegression()],
    #     #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    #     'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'],
    #     'clf__estimator__max_iter': list(range(100, 800, 100)),
    #     'clf__estimator__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    # },
]
pipeline = Pipeline([
('pre', MinMaxScaler()),
('clf', ClfSwitcher()),
])
gscv = GridSearchCV(pipeline
, parameters
, cv=5
, n_jobs=12
, return_train_score=False
, verbose=3)
# Fit
gscv.fit(X_train, y_train)
print('Best model:\n', gscv.best_params_)
print('Best model score:\n', gscv.best_score_)
print('Test set score:\n', gscv.score(X_test, y_test))  # how it does on held-out data
mod_pred = gscv.predict(X_test)
fscore = f1_score(y_test, mod_pred)
print('Test set F1 score:\n', fscore)
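# A fuller look at the refitted best model (a sketch; both helpers are from
# sklearn.metrics):
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, mod_pred))
print(classification_report(y_test, mod_pred))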
#%% GridSearchCV: single model
#https://stackoverflow.com/questions/71079357/invalid-parameter-clf-learning-rate-for-estimator-pipeline
pipe_xgb = Pipeline([('clf', XGBClassifier(random_state = 42, use_label_encoder = False))])

grid_params_xgb = [{'clf__max_depth': [2, 4],
                    'clf__n_estimators': [50, 100],
                    'clf__learning_rate': [0.0001, 0.001]}]

gs_xgb = GridSearchCV(estimator = pipe_xgb,
                      param_grid = grid_params_xgb,
                      scoring = 'accuracy',
                      cv = 10,
                      n_jobs = 5)
gs_xgb.fit(X_train, y_train)
y_predict = gs_xgb.predict(X_test)
print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_predict))
print('Best model:\n', gs_xgb.best_params_)
print('Best model score:\n', gs_xgb.best_score_)
# Best model:
# {'clf__learning_rate': 0.0001, 'clf__max_depth': 2, 'clf__n_estimators': 50}
# NOTE: takes time to run!
#%% GridSearchCV: LogisticRegression
# Note: in a Pipeline, '__' joins the step name to the parameter name
# (e.g. 'clf__max_depth'); underscores inside the parameter name stay single.
# With ClfSwitcher there is one extra nesting level, so '__' appears twice:
# 'clf__estimator__max_iter'.
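# To list the valid grid keys for any pipeline (a quick sketch, using pipe_xgb
# from the cell above):
print(sorted(k for k in pipe_xgb.get_params().keys() if k.startswith('clf__')))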
pipe_log_reg = Pipeline([('clf', LogisticRegression(random_state = 42))])

# Incompatible penalty/solver combinations fail to fit; GridSearchCV scores
# them as NaN (with a FitFailedWarning) rather than aborting the search.
grid_params_log_reg = [{
    #'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'clf__penalty': ['none', 'l1', 'l2', 'elasticnet'],
    'clf__max_iter': list(range(100, 800, 100)),
    'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
}]

gs_log_reg = GridSearchCV(estimator = pipe_log_reg
                          , param_grid = grid_params_log_reg
                          , scoring = 'accuracy'  # single string scorer: works
                          #, scoring = scoring_fn, refit = False  # with refit=False, predict() is unavailable (see notes below)
                          #, scoring = ['accuracy', 'f1', 'recall']
                          #, refit = 'recall'
                          , cv = 10
                          , n_jobs = 5)
gs_log_reg.fit(X_train, y_train)
y_predict = gs_log_reg.predict(X_test)
gs_log_reg_fit_res = gs_log_reg.cv_results_  # dict of arrays, one row per parameter combination
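# cv_results_ is easiest to inspect as a DataFrame (a sketch; these column
# names are standard GridSearchCV output for single-metric scoring):
cv_res_df = pd.DataFrame(gs_log_reg_fit_res)
print(cv_res_df[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]
      .sort_values('rank_test_score')
      .head())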
print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_predict))
print('Best model:\n', gs_log_reg.best_params_)
print('Best model score:\n', gs_log_reg.best_score_)
# Note on refit (condensed from the sklearn docs): for multi-metric scoring,
# refit must be set to a scorer key (or a callable) so GridSearchCV knows which
# metric selects the best parameters; only then are best_index_, best_params_
# and best_score_ available, and predict() works directly on the GridSearchCV
# instance. If that is not needed, set refit=False explicitly (True is the
# default). With refit=False you get:
#   "This GridSearchCV instance was initialized with `refit=False`. predict is
#    available only after refitting on the best parameters."
# in which case you can refit manually using best_params_.
#https://stackoverflow.com/questions/57986374/how-to-fix-the-error-for-multi-metric-scoring-for-oneclasssvm-and-gridsearchcv
#https://stackoverflow.com/questions/53973563/using-multiple-metric-evaluation-with-gridsearchcv
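#%% Multi-metric scoring with refit
# A sketch tying the notes above together: pass the scoring_fn dict and name
# the metric that should pick the best parameters via refit (here 'mcc').
gs_multi = GridSearchCV(estimator = pipe_log_reg
                        , param_grid = grid_params_log_reg
                        , scoring = scoring_fn
                        , refit = 'mcc'
                        , cv = 10
                        , n_jobs = 5)
gs_multi.fit(X_train, y_train)
print('Best model (by mcc):\n', gs_multi.best_params_)
print('Best mcc:\n', gs_multi.best_score_)
print('Test set mcc: %.3f' % matthews_corrcoef(y_test, gs_multi.predict(X_test)))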