#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 09:50:37 2022

@author: tanu
"""
# https://stackoverflow.com/questions/50272416/gridsearch-on-model-and-classifiers
#%%
# https://github.com/davidsbatista/machine-learning-notebooks/blob/master/hyperparameter-across-models.ipynb
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.base import BaseEstimator
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from xgboost import XGBClassifier

#%% my numerical data
# num_df_wtgt and numerical_FN are defined in the upstream data-prep script
X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
                                                    , num_df_wtgt['mutation_class']
                                                    , test_size = 0.33
                                                    , random_state = 2
                                                    , shuffle = True
                                                    , stratify = num_df_wtgt['mutation_class'])

# quick look at the class balance in the train and test splits
y_train.to_frame().value_counts().plot(kind = 'bar')
y_test.to_frame().value_counts().plot(kind = 'bar')

# multi-metric scoring dictionary for cross_validate/GridSearchCV
scoring_fn = ({'accuracy'  : make_scorer(accuracy_score)
               , 'fscore'    : make_scorer(f1_score)
               , 'mcc'       : make_scorer(matthews_corrcoef)
               , 'precision' : make_scorer(precision_score)
               , 'recall'    : make_scorer(recall_score)
               , 'roc_auc'   : make_scorer(roc_auc_score)
               #, 'jaccard' : make_scorer(jaccard_score)
               })
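#%% Hedged sketch (not part of the original workflow): the scoring_fn dict above
# is not used by the grid searches below (they score on accuracy only). One way
# to use it is with cross_validate on a baseline pipeline, to get all metrics at
# once. baseline_pipe/baseline_cv are illustrative names; assumes the
# X_train/y_train split defined above.
from sklearn.model_selection import cross_validate

baseline_pipe = Pipeline([('pre', MinMaxScaler())
                          , ('clf', LogisticRegression(random_state = 42))])

baseline_cv = cross_validate(baseline_pipe
                             , X_train
                             , y_train
                             , scoring = scoring_fn
                             , cv = 5)

# mean of each metric (and fit/score times) across the 5 folds
print(pd.DataFrame(baseline_cv).mean())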
#%% ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.

        :param estimator: sklearn object - The classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)


parameters = [
    {
        'clf__estimator': [SGDClassifier()],  # SVM if hinge loss / logreg if log loss
        #'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
        #'tfidf__stop_words': ['english', None],
        'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
        'clf__estimator__max_iter': [50, 80],
        'clf__estimator__tol': [1e-4],
        'clf__estimator__loss': ['hinge', 'log', 'modified_huber'],  # 'log' was renamed 'log_loss' in newer sklearn
    },
    {
        'clf__estimator': [MultinomialNB()],
        #'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
        #'tfidf__stop_words': [None],
        'clf__estimator__alpha': (1e-2, 1e-3, 1e-1),
    },
    # {
    #     'clf__estimator': [LogisticRegression()],
    #     #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    #     'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'],
    #     'clf__estimator__max_iter': list(range(100, 800, 100)),
    #     'clf__estimator__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    # },
]

pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

gscv = GridSearchCV(pipeline
                    , parameters
                    , cv = 5
                    , n_jobs = 12
                    , return_train_score = False
                    , verbose = 3)
#gscv.fit(train_data, train_labels)

# Fit
gscv.fit(X_train, y_train)

print('Best model:\n', gscv.best_params_)
print("Best model's score:\n", gscv.best_score_)

gscv.score(X_test, y_test)  # see how it does on test
mod_pred = gscv.predict(X_test)
fscore = f1_score(y_test, mod_pred)
fscore

#%% GridSearchCV: single model
# https://stackoverflow.com/questions/71079357/invalid-parameter-clf-learning-rate-for-estimator-pipeline
pipe_xgb = Pipeline([('clf', XGBClassifier(random_state = 42, use_label_encoder = False))])

# NOTE: param names must match the estimator's attributes exactly, with '__' used
# only as the step/param separator: 'clf__max_depth', not 'clf__max__depth'
# (the extra '__' raises "Invalid parameter ..." at fit time).
grid_params_xgb = [{'clf__max_depth': [2, 4],
                    'clf__n_estimators': [50, 100],
                    'clf__learning_rate': [0.0001, 0.001]}]

gs_xgb = GridSearchCV(estimator = pipe_xgb,
                      param_grid = grid_params_xgb,
                      scoring = 'accuracy',
                      cv = 10,
                      n_jobs = 5)

gs_xgb.fit(X_train, y_train)
y_predict = gs_xgb.predict(X_test)
print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_predict))
print('Best model:\n', gs_xgb.best_params_)
print("Best model's score:\n", gs_xgb.best_score_)

# Best model:
# {'clf__learning_rate': 0.0001, 'clf__max_depth': 2, 'clf__n_estimators': 50}

# NOTE: takes time to run!
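#%% Hedged sketch (not part of the original workflow): once GridSearchCV has been
# refit with the best params, the tuned pipeline is available as
# gs_xgb.best_estimator_ and the per-candidate CV table as gs_xgb.cv_results_.
# xgb_best/xgb_cv_res are illustrative names; assumes gs_xgb was fitted above.
xgb_best = gs_xgb.best_estimator_

# test-set metrics beyond accuracy for the tuned model
y_pred_best = xgb_best.predict(X_test)
print('Test F1 :', f1_score(y_test, y_pred_best))
print('Test MCC:', matthews_corrcoef(y_test, y_pred_best))

# cv_results_ is a dict of arrays (one entry per hyperparameter combination);
# a DataFrame view ranked by mean CV score is easier to read
xgb_cv_res = pd.DataFrame(gs_xgb.cv_results_)
print(xgb_cv_res[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]
      .sort_values('rank_test_score'))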
#%% LogisticRegression model
# Note on parameter-grid names:
#   - estimator/param names themselves cannot contain '__'
#   - '__' is used only as the separator before the param name (e.g. 'clf__max_iter')
#   - '__' appears twice when using ClfSwitcher (e.g. 'clf__estimator__max_iter')
pipe_log_reg = Pipeline([('clf', LogisticRegression(random_state = 42))])

grid_params_log_reg = [{
    #'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'clf__penalty': ['none', 'l1', 'l2', 'elasticnet'],
    'clf__max_iter': list(range(100, 800, 100)),
    'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
}]
# NOTE: not every penalty/solver pair is valid (e.g. 'l1' with 'lbfgs');
# invalid combinations are scored as NaN and raise FitFailedWarning.

gs_log_reg = GridSearchCV(estimator = pipe_log_reg
                          , param_grid = grid_params_log_reg
                          , scoring = 'accuracy'  # works
                          # , scoring = scoring_fn, refit = False  # problem: cannot predict, no refitted model
                          #, scoring = ['accuracy', 'f1', 'recall']
                          #, refit = 'recall'
                          , cv = 10
                          , n_jobs = 5)

gs_log_reg_fit = gs_log_reg.fit(X_train, y_train)
gs_log_reg_fit_res = gs_log_reg.cv_results_  # per-candidate CV scores (see the sketch at the end)

y_predict = gs_log_reg.predict(X_test)
print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_predict))
print('Best model:\n', gs_log_reg.best_params_)
print("Best model's score:\n", gs_log_reg.best_score_)

# Notes on refit (from the sklearn docs / error messages):
#
# "For multi-metric scoring, the parameter refit must be set to a scorer key or a
#  callable to refit an estimator with the best parameter setting on the whole data
#  and make the best_* attributes available for that metric. If this is not needed,
#  refit should be set to False explicitly. True was passed."
#
# refit : boolean, string, or callable, default=True
#   Refit an estimator using the best found parameters on the whole dataset.
#   For multiple metric evaluation, this needs to be a string denoting the scorer
#   that would be used to find the best parameters for refitting the estimator at
#   the end. Where there are considerations other than maximum score in choosing a
#   best estimator, refit can be set to a function which returns the selected
#   best_index_ given cv_results_. The refitted estimator is made available at the
#   best_estimator_ attribute and permits using predict directly on this
#   GridSearchCV instance. Also for multiple metric evaluation, the attributes
#   best_index_, best_score_ and best_params_ will only be available if refit is
#   set, and all of them will be determined w.r.t. this specific scorer.
#   best_score_ is not returned if refit is callable. See the scoring parameter to
#   know more about multiple metric evaluation.
#
# "This GridSearchCV instance was initialized with refit=False. predict is available
#  only after refitting on the best parameters. You can refit an estimator manually
#  using the best_params_ attribute."
#
# https://stackoverflow.com/questions/57986374/how-to-fix-the-error-for-multi-metric-scoring-for-oneclasssvm-and-gridsearchcv
# PROBLEM: using multiple scoring metrics with GridSearchCV
# https://stackoverflow.com/questions/53973563/using-multiple-metric-evaluation-with-gridsearchcv
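#%% Hedged sketch (not part of the original workflow): one way to use the
# multi-metric scoring_fn dict with GridSearchCV is to set refit to the key of
# the metric that should pick the final model ('mcc' here is an arbitrary
# illustrative choice). gs_multi/multi_res are illustrative names; assumes
# pipe_log_reg, grid_params_log_reg, scoring_fn and the train/test split above.
gs_multi = GridSearchCV(estimator = pipe_log_reg
                        , param_grid = grid_params_log_reg
                        , scoring = scoring_fn
                        , refit = 'mcc'  # refit the best model w.r.t. this scorer key
                        , cv = 10
                        , n_jobs = 5)
gs_multi.fit(X_train, y_train)

# With refit set to a scorer key, predict/best_params_/best_score_ all work,
# and best_score_ refers to the refit metric ('mcc').
print('Best params (by mcc):\n', gs_multi.best_params_)
print('Best CV mcc:', gs_multi.best_score_)
print('Test mcc   :', matthews_corrcoef(y_test, gs_multi.predict(X_test)))

# cv_results_ now holds one 'mean_test_<key>' column per scorer; a DataFrame
# view makes it easy to compare candidates across metrics.
multi_res = pd.DataFrame(gs_multi.cv_results_)
print(multi_res[['params', 'mean_test_accuracy', 'mean_test_fscore',
                 'mean_test_mcc', 'mean_test_recall']]
      .sort_values('mean_test_mcc', ascending = False)
      .head())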