diff --git a/base_estimator2.py b/base_estimator2.py
new file mode 100644
index 0000000..a6f49f6
--- /dev/null
+++ b/base_estimator2.py
@@ -0,0 +1,195 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Mar 15 09:50:37 2022
+
+@author: tanu
+"""
+# https://stackoverflow.com/questions/50272416/gridsearch-on-model-and-classifiers
+
+#%% Import libs
+# https://github.com/davidsbatista/machine-learning-notebooks/blob/master/hyperparameter-across-models.ipynb
+import numpy as np
+import pandas as pd
+from sklearn import datasets
+from sklearn.ensemble import ExtraTreesClassifier
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import AdaBoostClassifier
+from sklearn.ensemble import GradientBoostingClassifier
+from sklearn.svm import SVC
+
+from sklearn.base import BaseEstimator
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.linear_model import SGDClassifier, LogisticRegression
+from sklearn.pipeline import Pipeline
+from sklearn.model_selection import GridSearchCV, train_test_split
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
+from sklearn.metrics import (accuracy_score, f1_score, matthews_corrcoef,
+                             precision_score, recall_score, roc_auc_score,
+                             make_scorer)
+from xgboost import XGBClassifier
+
+#%% My numerical data
+# num_df_wtgt and numerical_FN come from this project's data-loading step
+X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
+                                                    , num_df_wtgt['mutation_class']
+                                                    , test_size = 0.33
+                                                    , random_state = 2
+                                                    , shuffle = True
+                                                    , stratify = num_df_wtgt['mutation_class'])
+
+y_train.to_frame().value_counts().plot(kind = 'bar')
+y_test.to_frame().value_counts().plot(kind = 'bar')
+
+scoring_fn = ({'accuracy'  : make_scorer(accuracy_score)
+               , 'fscore'    : make_scorer(f1_score)
+               , 'mcc'       : make_scorer(matthews_corrcoef)
+               , 'precision' : make_scorer(precision_score)
+               , 'recall'    : make_scorer(recall_score)
+               , 'roc_auc'   : make_scorer(roc_auc_score)
+               #, 'jaccard'   : make_scorer(jaccard_score)
+               })
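+#%% Sketch: multi-metric scoring without GridSearchCV
+# A minimal sketch (editor's illustration, not part of the original workflow):
+# cross_validate accepts the scoring_fn dict directly and has no refit
+# decision to make, which sidesteps the multi-metric problem noted at the
+# bottom of this file.
+from sklearn.model_selection import cross_validate
+
+cv_scores = cross_validate(LogisticRegression(random_state = 2)
+                           , X_train
+                           , y_train
+                           , scoring = scoring_fn
+                           , cv = 5)
+for metric, scores in cv_scores.items():
+    print(metric, ':', round(np.mean(scores), 2)) # e.g. test_mcc, test_fscore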
+#%% ClfSwitcher()
+class ClfSwitcher(BaseEstimator):
+    def __init__(
+        self,
+        estimator = SGDClassifier(),
+    ):
+        """
+        A custom BaseEstimator that can switch between classifiers.
+        :param estimator: sklearn object - the classifier
+        """
+        self.estimator = estimator
+
+    def fit(self, X, y=None, **kwargs):
+        self.estimator.fit(X, y)
+        return self
+
+    def predict(self, X, y=None):
+        return self.estimator.predict(X)
+
+    def predict_proba(self, X):
+        return self.estimator.predict_proba(X)
+
+    def score(self, X, y):
+        return self.estimator.score(X, y)
+
+parameters = [
+    {
+        'clf__estimator': [SGDClassifier()], # SVM if hinge loss / logreg if log loss
+        #'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
+        #'tfidf__stop_words': ['english', None],
+        'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
+        'clf__estimator__max_iter': [50, 80],
+        'clf__estimator__tol': [1e-4],
+        'clf__estimator__loss': ['hinge', 'log', 'modified_huber'],
+    },
+    {
+        'clf__estimator': [MultinomialNB()],
+        #'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
+        #'tfidf__stop_words': [None],
+        'clf__estimator__alpha': (1e-2, 1e-3, 1e-1),
+    },
+
+    # {
+    #     'clf__estimator': [LogisticRegression()],
+    #     #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
+    #     'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'],
+    #     'clf__estimator__max_iter': list(range(100, 800, 100)),
+    #     'clf__estimator__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
+    # },
+]
+
+pipeline = Pipeline([
+    ('pre', MinMaxScaler()),
+    ('clf', ClfSwitcher()),
+])
+
+gscv = GridSearchCV(pipeline
+                    , parameters
+                    , cv=5
+                    , n_jobs=12
+                    , return_train_score=False
+                    , verbose=3)
+
+# Fit
+gscv.fit(X_train, y_train)
+print('Best model:\n', gscv.best_params_)
+print('Best model score:\n', gscv.best_score_)
+gscv.score(X_test, y_test) # see how it does on the test set
+
+mod_pred = gscv.predict(X_test)
+fscore = f1_score(y_test, mod_pred)
+fscore
+
+#%% GridSearchCV: single model
+# https://stackoverflow.com/questions/71079357/invalid-parameter-clf-learning-rate-for-estimator-pipeline
+# Note the single '__' after the step name: 'clf__max_depth', not 'clf__max__depth'
+pipe_xgb = Pipeline([('clf', XGBClassifier(random_state=42, use_label_encoder=False))])
+grid_params_xgb = [{'clf__max_depth': [2, 4],
+                    'clf__n_estimators': [50, 100],
+                    'clf__learning_rate': [0.0001, 0.001]}]
+
+gs_xgb = GridSearchCV(estimator = pipe_xgb,
+                      param_grid = grid_params_xgb,
+                      scoring = 'accuracy',
+                      cv = 10,
+                      n_jobs = 5)
+gs_xgb.fit(X_train, y_train)
+y_predict = gs_xgb.predict(X_test)
+print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_predict))
+
+print('Best model:\n', gs_xgb.best_params_)
+print('Best model score:\n', gs_xgb.best_score_)
+# Best model:
+# {'clf__learning_rate': 0.0001, 'clf__max_depth': 2, 'clf__n_estimators': 50}
+# NOTE: takes time to run!
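+#%% Sketch: cutting down the search time
+# Since the grid search above is slow, a possible alternative (editor's
+# sketch, not part of the original workflow) is RandomizedSearchCV, which
+# samples n_iter candidates from the same parameter space instead of trying
+# every combination.
+from sklearn.model_selection import RandomizedSearchCV
+
+rs_xgb = RandomizedSearchCV(estimator = pipe_xgb,
+                            param_distributions = grid_params_xgb[0],
+                            n_iter = 4, # a random subset of the 8 combinations
+                            scoring = 'accuracy',
+                            cv = 10,
+                            n_jobs = 5,
+                            random_state = 42)
+rs_xgb.fit(X_train, y_train)
+print('Best model (randomised search):\n', rs_xgb.best_params_)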
+#%% GridSearchCV: single model, LogisticRegression
+# Note: estimator (step) names cannot contain '__';
+# '__' separates the step name from the parameter name ('clf__max_iter'),
+# and appears twice when using ClfSwitcher ('clf__estimator__max_iter')
+
+pipe_log_reg = Pipeline([('clf', LogisticRegression(random_state=42))])
+grid_params_log_reg = [{
+    #'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
+    'clf__penalty': ['none', 'l1', 'l2', 'elasticnet'],
+    'clf__max_iter': list(range(100, 800, 100)),
+    'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
+}]
+
+gs_log_reg = GridSearchCV(estimator = pipe_log_reg
+                          , param_grid = grid_params_log_reg
+                          , scoring = 'accuracy' # works
+                          #, scoring = scoring_fn, refit = False # problem: nothing is refit, so predict is unavailable
+                          #, scoring = ['accuracy', 'f1', 'recall']
+                          #, refit = 'recall'
+                          , cv = 10
+                          , n_jobs = 5)
+
+gs_log_reg.fit(X_train, y_train)
+gs_log_reg_fit_res = gs_log_reg.cv_results_ # one entry per candidate; loads into a DataFrame
+
+y_predict = gs_log_reg.predict(X_test)
+print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_predict))
+
+print('Best model:\n', gs_log_reg.best_params_)
+print('Best model score:\n', gs_log_reg.best_score_)
+
+# NOTE: for multi-metric scoring, refit must be set to a scorer key or a
+# callable so that an estimator is refit with the best parameter setting on
+# the whole dataset; the attributes best_index_, best_score_ and best_params_
+# are then defined w.r.t. that specific scorer, and predict works directly on
+# the GridSearchCV instance. If this is not needed, refit should be set to
+# False explicitly; with refit=False, predict is available only after
+# refitting on the best parameters, which you can do manually using the
+# best_params_ attribute.
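+#%% Sketch: multi-metric GridSearchCV with refit set to one scorer key
+# A minimal sketch of the note above (editor's illustration, reusing the
+# scoring_fn dict and the pipeline/grid defined in this file): pass the dict
+# of scorers and name one key in refit, so that best_params_, best_score_
+# and predict all work, each defined w.r.t. that scorer.
+gs_multi = GridSearchCV(estimator = pipe_log_reg
+                        , param_grid = grid_params_log_reg
+                        , scoring = scoring_fn
+                        , refit = 'mcc' # best_* and predict now follow mcc
+                        , cv = 10
+                        , n_jobs = 5)
+gs_multi.fit(X_train, y_train)
+print('Best model (refit on mcc):\n', gs_multi.best_params_)
+print('Best mcc:\n', gs_multi.best_score_)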
+# https://stackoverflow.com/questions/57986374/how-to-fix-the-error-for-multi-metric-scoring-for-oneclasssvm-and-gridsearchcv
+
+# PROBLEM: using multiple scoring metrics with GridSearchCV
+# https://stackoverflow.com/questions/53973563/using-multiple-metric-evaluation-with-gridsearchcv
\ No newline at end of file
diff --git a/base_estimator3.py b/base_estimator3.py
new file mode 100644
index 0000000..6627e78
--- /dev/null
+++ b/base_estimator3.py
@@ -0,0 +1,169 @@
+#%% Import libs
+import numpy as np
+import pandas as pd
+from statistics import mean
+from sklearn import datasets
+from sklearn.ensemble import ExtraTreesClassifier
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import AdaBoostClassifier
+from sklearn.ensemble import GradientBoostingClassifier
+from sklearn.svm import SVC
+
+from sklearn.base import BaseEstimator
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.linear_model import SGDClassifier, LogisticRegression
+from sklearn.pipeline import Pipeline
+from sklearn.model_selection import (GridSearchCV, train_test_split,
+                                     cross_val_score)
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
+from sklearn.metrics import (accuracy_score, f1_score, matthews_corrcoef,
+                             precision_score, recall_score, roc_auc_score,
+                             make_scorer)
+from xgboost import XGBClassifier
+
+rs = {'random_state': 42} # assumption: rs is defined elsewhere in this project
+
+#%% Get train-test split and scoring functions
+X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
+                                                    , num_df_wtgt['mutation_class']
+                                                    , test_size = 0.33
+                                                    , random_state = 2
+                                                    , shuffle = True
+                                                    , stratify = num_df_wtgt['mutation_class'])
+
+y_train.to_frame().value_counts().plot(kind = 'bar')
+y_test.to_frame().value_counts().plot(kind = 'bar')
+
+scoring_fn = ({'accuracy'  : make_scorer(accuracy_score)
+               , 'fscore'    : make_scorer(f1_score)
+               , 'mcc'       : make_scorer(matthews_corrcoef)
+               , 'precision' : make_scorer(precision_score)
+               , 'recall'    : make_scorer(recall_score)
+               , 'roc_auc'   : make_scorer(roc_auc_score)
+               #, 'jaccard'   : make_scorer(jaccard_score)
+               })
+
+#%% Logistic Regression + hyperparam: GridSearch
+# Note: estimator (step) names cannot contain '__';
+# '__' separates the step name from the parameter name,
+# and appears twice when using ClfSwitcher ('clf__estimator__param')
+
+mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
+
+# FIXME: not every solver supports every penalty, so penalty is left out of
+# the grid; consider fixing it to a single value
+grid_params_log_reg = [{
+    #'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
+    #'clf__penalty': ['none', 'l1', 'l2', 'elasticnet'],
+    'clf__max_iter': list(range(100, 800, 100)),
+    'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
+}]
+
+pipe_log_reg = Pipeline([
+    ('pre', MinMaxScaler())
+    , ('clf', LogisticRegression(**rs))])
+
+gs_log_reg = GridSearchCV(pipe_log_reg
+                          , param_grid = grid_params_log_reg
+                          , scoring = 'f1', refit = 'f1' # works
+                          #, scoring = mcc_score_fn, refit = 'mcc'
+                          #, scoring = scoring_fn, refit = False # problem: nothing is refit, so predict is unavailable
+                          , cv = 10
+                          , n_jobs = 10 # based on /proc/cpuinfo
+                          , return_train_score = False
+                          , verbose = 3)
+
+gs_log_reg.fit(X_train, y_train)
+#y_predict = gs_log_reg.predict(X_test)
+#print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_predict))
+print('Best model:\n', gs_log_reg.best_params_)
+print('Best model score:\n', gs_log_reg.best_score_)
+
+# GridSearchCV giving score from the best estimator different from the one
+# indicated in the refit parameter:
+# https://stackoverflow.com/questions/66116996/gridsearchcv-giving-score-from-the-best-estimator-different-from-the-one-indicat
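+#%% Sketch: inspecting cv_results_
+# Earlier comments ask how cv_results_ can be used; a minimal sketch
+# (editor's illustration): it is a dict of arrays with one entry per
+# candidate, so it loads straight into a DataFrame for sorting by rank.
+cv_res_df = pd.DataFrame(gs_log_reg.cv_results_)
+print(cv_res_df[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]
+      .sort_values('rank_test_score')
+      .head())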
+#%% Logistic Regression + hyperparam: BaseEstimator: ClfSwitcher()
+class ClfSwitcher(BaseEstimator):
+    def __init__(
+        self,
+        estimator = SGDClassifier(),
+    ):
+        """
+        A custom BaseEstimator that can switch between classifiers.
+        :param estimator: sklearn object - the classifier
+        """
+        self.estimator = estimator
+
+    def fit(self, X, y=None, **kwargs):
+        self.estimator.fit(X, y)
+        return self
+
+    def predict(self, X, y=None):
+        return self.estimator.predict(X)
+
+    def predict_proba(self, X):
+        return self.estimator.predict_proba(X)
+
+    def score(self, X, y):
+        return self.estimator.score(X, y)
+
+parameters = [
+    {
+        'clf__estimator': [LogisticRegression(**rs)],
+        #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
+        #'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'],
+        'clf__estimator__max_iter': list(range(100, 800, 100)),
+        'clf__estimator__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
+    }
+]
+
+pipeline = Pipeline([
+    ('pre', MinMaxScaler()),
+    ('clf', ClfSwitcher()),
+])
+
+gscv = GridSearchCV(pipeline
+                    , parameters
+                    , scoring = 'f1', refit = 'f1'
+                    , cv = 10
+                    , n_jobs = 10 # based on /proc/cpuinfo
+                    , return_train_score = False
+                    , verbose = 3)
+
+# Fit
+gscv.fit(X_train, y_train)
+print('Best model:\n', gscv.best_params_)
+print('Best model score:\n', gscv.best_score_, ':', round(gscv.best_score_, 2))
+# gscv.score(X_test, y_test) # see how it does on the test set
+# check_score = f1_score(y_train, gscv.predict(X_train))
+# check_score # should match the best score when the same metric is used!
+# mod_pred = gscv.predict(X_test)
+# fscore = f1_score(y_test, mod_pred)
+# fscore
+
+gscv_fit_be_res = gscv.cv_results_ # no need to call fit() a second time
+print('\nMean test score from fit results:', round(mean(gscv_fit_be_res['mean_test_score']), 2))
+# NOTE: this averages mean_test_score over every candidate in the grid,
+# whereas best_score_ is the score of the single best candidate, so the
+# two are not expected to match.
+
+best_model = gscv.best_params_
+best_model.keys()
+best_model.values()
+
+# Sanity check: re-run CV for one fixed parameter setting.
+# NOTE: cross_val_score defaults to accuracy for classifiers and skips the
+# MinMaxScaler step, so cval is not directly comparable to the f1-based
+# best_score_ above.
+cross_val_score(LogisticRegression(random_state = 42
+                                   , solver = 'liblinear'
+                                   , max_iter = 100)
+                , X_train
+                , y_train
+                , cv = 10)
+
+cval = round(mean(cross_val_score(LogisticRegression(random_state = 42
+                                                     , solver = 'liblinear'
+                                                     , max_iter = 100)
+                                  , X_train
+                                  , y_train
+                                  , cv = 10)), 2)
+
+######## Check
+print('Best model score:', round(gscv.best_score_, 2))
+print('Mean test score from fit results:', round(mean(gscv_fit_be_res['mean_test_score']), 2))
+print('Best model cval:', cval)
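+#%% Sketch: multi-metric evaluation on the held-out test set
+# A closing sketch (editor's illustration): since gscv was refit on 'f1',
+# predict works directly, and the metric functions imported above give a
+# multi-metric view on X_test without touching GridSearchCV's scoring.
+test_pred = gscv.predict(X_test)
+print('Test f1 :', round(f1_score(y_test, test_pred), 2))
+print('Test mcc:', round(matthews_corrcoef(y_test, test_pred), 2))
+print('Test acc:', round(accuracy_score(y_test, test_pred), 2))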