#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon May 16 05:59:12 2022

@author: tanu
"""
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 11:09:50 2022

@author: tanu
"""
# Logistic-regression hyperparameter search (refit on MCC) followed by an
# evaluation of the best model on a blind test set.
#
# NOTE(review): X, y, X_bts, y_bts, blind_test_df and rskf_cv are used below
# but are not defined in the visible part of this script; presumably they come
# from an earlier cell / interactive session -- confirm before running this
# file standalone.

#%% Import libs
import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn.base import BaseEstimator
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
# The metric functions below are used throughout this script but were never
# imported in the original import section.
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    jaccard_score,
    make_scorer,
    matthews_corrcoef,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Shared keyword-argument bundles, unpacked into estimators / the grid search.
rs = {'random_state': 42}
njobs = {'n_jobs': 10}

#%% Get train-test split and scoring functions
# Bar plots of the value counts of the training target and of the
# 'dst_mode' column of the blind test frame.
y.to_frame().value_counts().plot(kind = 'bar')
blind_test_df['dst_mode'].to_frame().value_counts().plot(kind = 'bar')

# Full set of scorers (not passed to the grid search below, which only
# uses mcc_score_fn).
scoring_fn = ({'accuracy'  : make_scorer(accuracy_score)
               , 'fscore'    : make_scorer(f1_score)
               , 'mcc'       : make_scorer(matthews_corrcoef)
               , 'precision' : make_scorer(precision_score)
               , 'recall'    : make_scorer(recall_score)
               , 'roc_auc'   : make_scorer(roc_auc_score)
               , 'jaccard'   : make_scorer(jaccard_score)
               })

mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
jacc_score_fn = {'jcc': make_scorer(jaccard_score)}

#%% Logistic Regression + hyperparam: BaseEstimator: ClfSwitcher()
class ClfSwitcher(BaseEstimator):
    """Thin wrapper that delegates to a swappable classifier.

    Placing this wrapper in a Pipeline step lets a parameter grid replace
    the wrapped classifier itself (``<step>__estimator``) as well as tune
    its hyperparameters (``<step>__estimator__<param>``).
    """

    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.

        :param estimator: sklearn object - The classifier
        """
        # NOTE(review): the default SGDClassifier() is created once, when the
        # class body is evaluated, so every ClfSwitcher() built without an
        # explicit estimator shares that one instance. Kept as-is to preserve
        # the constructor's default.
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped estimator on (X, y) and return ``self``.

        Extra keyword arguments are forwarded to the wrapped estimator's
        ``fit`` (the original accepted them but silently dropped them).
        """
        self.estimator.fit(X, y, **kwargs)
        return self

    def predict(self, X, y=None):
        """Return the wrapped estimator's predictions for X (``y`` is unused)."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped estimator's ``predict_proba`` output for X."""
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped estimator's own ``score`` on (X, y)."""
        return self.estimator.score(X, y)


# One grid per solver family, each listing the penalties tried with it.
# NOTE(review): 'elasticnet' is searched without an l1_ratio and 'none' is
# passed as a string; depending on the installed scikit-learn version these
# candidates may fail to fit or be rejected -- confirm against the
# LogisticRegression documentation. The grid is deliberately left unchanged.
parameters = [
    {
        'clf__estimator': [LogisticRegression(**rs)],
        #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'clf__estimator__C': np.logspace(0, 4, 10),
        'clf__estimator__penalty': ['none', 'l1', 'l2', 'elasticnet'],
        'clf__estimator__max_iter': list(range(100,800,100)),
        'clf__estimator__solver': ['saga']
    },
    {
        'clf__estimator': [LogisticRegression(**rs)],
        #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'clf__estimator__C': np.logspace(0, 4, 10),
        'clf__estimator__penalty': ['l2', 'none'],
        'clf__estimator__max_iter': list(range(100,800,100)),
        'clf__estimator__solver': ['newton-cg', 'lbfgs', 'sag']
    },
    {
        'clf__estimator': [LogisticRegression(**rs)],
        #'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'clf__estimator__C': np.logspace(0, 4, 10),
        'clf__estimator__penalty': ['l1', 'l2'],
        'clf__estimator__max_iter': list(range(100,800,100)),
        'clf__estimator__solver': ['liblinear']
    }
]

# Create pipeline
pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

# Grid search i.e hyperparameter tuning and refitting on mcc
gscv_lr = GridSearchCV(pipeline
                       , parameters
                       #, scoring = 'f1', refit = 'f1'
                       , scoring = mcc_score_fn, refit = 'mcc'
                       #, cv = skf_cv
                       , cv = rskf_cv
                       , **njobs
                       , return_train_score = False
                       , verbose = 3)

# Fit
gscv_lr_fit = gscv_lr.fit(X, y)
gscv_lr_fit_be_mod = gscv_lr_fit.best_params_
gscv_lr_fit_be_res = gscv_lr_fit.cv_results_

print('Best model:\n', gscv_lr_fit_be_mod)
print('Best models score:\n', gscv_lr_fit.best_score_, ':' , round(gscv_lr_fit.best_score_, 2))

#print('\nMean test score from fit results:', round(mean(gscv_lr_fit_be_res['mean_test_mcc']),2))
# nanmean: NaN entries in mean_test_mcc are skipped rather than propagated.
print('\nMean test score from fit results:', round(np.nanmean(gscv_lr_fit_be_res['mean_test_mcc']),2))

###############################################################################
######################################
# Blind test
######################################
# See how it does on the BLIND test
#print('\nBlind test score, mcc:', ))
test_predict = gscv_lr_fit.predict(X_bts)
print(test_predict)
print(np.array(y_bts))
y_btsf = np.array(y_bts)

print(accuracy_score(y_bts, test_predict))
print(matthews_corrcoef(y_bts, test_predict))

# create a dict with all scores
# NOTE(review): bts_roc_auc is computed from the predicted labels, not from
# probabilities / decision scores -- confirm this is intended.
lr_bts_dict = {#'best_model': list(gscv_lr_fit_be_mod.items())
    'bts_fscore'      : round(f1_score(y_bts, test_predict),2)
    , 'bts_mcc'       : round(matthews_corrcoef(y_bts, test_predict),2)
    , 'bts_precision' : round(precision_score(y_bts, test_predict),2)
    , 'bts_recall'    : round(recall_score(y_bts, test_predict),2)
    , 'bts_accuracy'  : round(accuracy_score(y_bts, test_predict),2)
    , 'bts_roc_auc'   : round(roc_auc_score(y_bts, test_predict),2)
    , 'bts_jaccard'   : round(jaccard_score(y_bts, test_predict),2)
}

# Create a df from dict with all scores
lr_bts_df = pd.DataFrame.from_dict(lr_bts_dict,orient = 'index')
lr_bts_df.columns = ['Logistic_Regression']
print(lr_bts_df)

# d2 = {'best_model_params': lis(gscv_lr_fit_be_mod.items() )}
# d2
# def Merge(dict1, dict2):
#     res = {**dict1, **dict2}
#     return res
# d3 = Merge(d2, lr_bts_dict)
# d3

# Create df with best model params
# The Series has two rows: row 0 holds the literal label string
# 'best_model_params', row 1 holds the list of (param, value) pairs.
model_params = pd.Series(['best_model_params', list(gscv_lr_fit_be_mod.items())])
model_params_df = model_params.to_frame()
model_params_df.columns = ['Logistic_Regression']

# Combine the df of scores and the best model params
lr_output = pd.concat([model_params_df, lr_bts_df], axis = 0)

# Format the combined df
# Drop row 0 (the 'best_model_params' label string) from lr_output; the
# params list stays behind under integer index 1.
lr_df = lr_output.drop([0], axis = 0)
#FIXME: tidy the index of the formatted df

###############################################################################
# FIXME: confusion matrix
print(confusion_matrix(y_bts, test_predict))
cm = confusion_matrix(y_bts, test_predict)