#%% Import libs
import numpy as np
import pandas as pd
from statistics import mean

from sklearn import datasets
from sklearn.base import BaseEstimator
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (make_scorer, accuracy_score, f1_score,
                             matthews_corrcoef, precision_score, recall_score,
                             roc_auc_score, jaccard_score)
from xgboost import XGBClassifier

rs = {'random_state': 42}  # shared random-state kwargs, unpacked into the classifiers below

#%% Get train-test split and scoring functions
# num_df_wtgt (numerical features + 'mutation_class' target) and numerical_FN
# (list of numerical feature names) are expected to be defined upstream.
X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
                                                    , num_df_wtgt['mutation_class']
                                                    , test_size = 0.33
                                                    , random_state = 2
                                                    , shuffle = True
                                                    , stratify = num_df_wtgt['mutation_class'])

# Check the class balance of the stratified splits
y_train.to_frame().value_counts().plot(kind = 'bar')
y_test.to_frame().value_counts().plot(kind = 'bar')

scoring_fn = ({'accuracy'    : make_scorer(accuracy_score)
               , 'fscore'    : make_scorer(f1_score)
               , 'mcc'       : make_scorer(matthews_corrcoef)
               , 'precision' : make_scorer(precision_score)
               , 'recall'    : make_scorer(recall_score)
               , 'roc_auc'   : make_scorer(roc_auc_score)
               #, 'jaccard'  : make_scorer(jaccard_score)
               })

#%% Logistic Regression + hyperparam: GridSearch
# Note: pipeline step names must not contain '__'
# '__' separates the step name from its parameter names
# ('__' appears twice, e.g. 'clf__estimator__solver', when using ClfSwitcher)
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}

# FIXME: not every solver/penalty combination is valid;
# see the compatible-grid sketch after this cell
grid_params_log_reg = [{
    #'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    #'clf__penalty': ['none', 'l1', 'l2', 'elasticnet'],
    'clf__max_iter': list(range(100, 800, 100)),
    'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    }]

pipe_log_reg = Pipeline([
    ('pre', MinMaxScaler())
    , ('clf', LogisticRegression(**rs))])

gs_log_reg = GridSearchCV(pipe_log_reg
                          , param_grid = grid_params_log_reg
                          , scoring = 'f1', refit = 'f1' # works
                          #, scoring = mcc_score_fn, refit = 'mcc'
                          #, scoring = scoring_fn, refit = False # with refit = False no best estimator is refit, so .predict() is unavailable
                          , cv = 10
                          , n_jobs = 10 # based on /proc/cpuinfo
                          , return_train_score = False
                          , verbose = 3)

gs_log_reg.fit(X_train, y_train)
#gs_log_reg_fit = gs_log_reg.fit(X_train, y_train)
#gs_log_reg_fit_res = gs_log_reg.cv_results_
#pp.pprint(gs_log_reg_fit_res)

#y_predict = gs_log_reg.predict(X_test)
#print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_predict))

print('Best model:\n', gs_log_reg.best_params_)
print("Best model's score:\n", gs_log_reg.best_score_)

# GridSearchCV giving score from the best estimator different from the one indicated in refit parameter:
# https://stackoverflow.com/questions/66116996/gridsearchcv-giving-score-from-the-best-estimator-different-from-the-one-indicat
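#%% Sketch: solver/penalty-compatible grids (optional, addresses the FIXME above)
# A minimal sketch, not part of the original search: passing a list of dicts to
# param_grid lets each solver see only the penalties it supports, so C and
# penalty can be tuned without invalid combinations. The grid values and the
# name grid_params_log_reg_compat are illustrative assumptions.
grid_params_log_reg_compat = [
    {'clf__solver': ['liblinear'],
     'clf__penalty': ['l1', 'l2'],
     'clf__C': [0.01, 0.1, 1, 10]},
    {'clf__solver': ['newton-cg', 'lbfgs', 'sag'],
     'clf__penalty': ['l2'],
     'clf__C': [0.01, 0.1, 1, 10]},
    {'clf__solver': ['saga'],
     'clf__penalty': ['elasticnet'],
     'clf__l1_ratio': [0.5],  # required when penalty = 'elasticnet'
     'clf__C': [0.01, 0.1, 1, 10]},
]
# If used, pass grid_params_log_reg_compat as param_grid in the GridSearchCV call above.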
#%% Logistic Regression + hyperparam: BaseEstimator: ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A custom BaseEstimator that can switch between classifiers.

        :param estimator: sklearn object - the classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)


parameters = [
    {
        'clf__estimator': [LogisticRegression(**rs)],
        #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        #'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'],
        'clf__estimator__max_iter': list(range(100, 800, 100)),
        'clf__estimator__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
    }
]

pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

gscv = GridSearchCV(pipeline
                    , parameters
                    , scoring = 'f1', refit = 'f1'
                    , cv = 10
                    , n_jobs = 10 # based on /proc/cpuinfo
                    , return_train_score = False
                    , verbose = 3)

# Fit
gscv.fit(X_train, y_train)

print('Best model:\n', gscv.best_params_)
print("Best model's score:\n", gscv.best_score_, ':', round(gscv.best_score_, 2))

# gscv.score(X_test, y_test) # see how it does on test
# check_score = f1_score(y_train, gscv.predict(X_train))
# check_score # should be the same as the best score when the same metric is used!

# mod_pred = gscv.predict(X_test)
# fscore = f1_score(y_test, mod_pred)
# fscore

# .fit() returns the fitted GridSearchCV itself, so this second fit is redundant but harmless
gscv_fit_be = gscv.fit(X_train, y_train)
gscv_fit_be_res = gscv_fit_be.cv_results_
print('\nMean test score from fit results:', round(mean(gscv_fit_be_res['mean_test_score']), 2))

best_model = gscv.best_params_
best_model.keys()
best_model.values()

# Note: this runs on the raw features (no MinMaxScaler) and uses the default
# scoring (accuracy), so it is not directly comparable to the F1-based best_score_
cross_val_score(LogisticRegression(random_state = 42
                                   , solver = 'liblinear'
                                   , max_iter = 100)
                , X_train
                , y_train
                , cv = 10)

cval = round(mean(cross_val_score(LogisticRegression(random_state = 42
                                                     , solver = 'liblinear'
                                                     , max_iter = 100)
                                  , X_train
                                  , y_train
                                  , cv = 10)), 2)

######## Cross-check the scores
print("Best model's score:", round(gscv.best_score_, 2))
print('Mean test score from fit results:', round(mean(gscv_fit_be_res['mean_test_score']), 2))
print("Best model's cval:", cval)
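#%% Sketch: held-out test-set evaluation (optional)
# A minimal sketch of the evaluation hinted at in the commented-out lines above:
# score the refit best pipeline on the untouched test split. Assumes gscv has
# already been fit as in the cell above.
test_pred = gscv.predict(X_test)  # predictions from the refit best pipeline
print('Test F1 :', round(f1_score(y_test, test_pred), 2))
print('Test MCC:', round(matthews_corrcoef(y_test, test_pred), 2))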