#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 4 15:25:33 2022

@author: tanu
"""
#%%
import os
import sys
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
#%%
# Shared seed so every stochastic estimator below is reproducible.
rs = {'random_state': 42}
# Done: add preprocessing step with one hot encoder
# TODO: supply stratified K-fold cv train and test data
# TODO: get accuracy and other scores through K-fold cv


# Multiple Classification - Model Pipeline
def MultClassPipeline2(X_train, X_test, y_train, y_test, input_df):
    """Fit a fixed set of classifiers and score each one on the test split.

    Every classifier is wrapped in its own ``Pipeline`` whose first step is a
    ``ColumnTransformer`` that one-hot encodes the categorical columns and
    min-max scales the numerical columns; any other column is passed through
    unchanged (``remainder='passthrough'``).

    Parameters
    ----------
    X_train, X_test
        Feature tables for fitting and for evaluation. The preprocessing
        step selects columns using the column labels of ``input_df``, so
        both must expose those labels.
    y_train, y_test
        Target labels for the two splits. The F1 / precision / recall calls
        use scikit-learn's defaults, so a binary target is expected.
    input_df
        DataFrame used only to decide which columns are numerical
        (``int64`` / ``float64``) and which are categorical
        (``object`` / ``bool``). Presumably the feature frame the splits
        were drawn from -- confirm against the caller.

    Returns
    -------
    pipelines : list
        The fitted ``Pipeline`` objects, in the same order as the rows of
        ``scores_df``.
    scores_df : pandas.DataFrame
        One row per model with columns ``Model``, ``F1_Score``, ``MCC``,
        ``Precision``, ``Recall``, ``Accuracy`` and ``ROC_AUC``.
    """
    # Determine categorical and numerical features from the dtypes of input_df.
    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns

    log_reg = LogisticRegression(**rs)
    nb = BernoulliNB()
    knn = KNeighborsClassifier()
    svm = SVC(**rs)
    mlp = MLPClassifier(max_iter=500, **rs)
    dt = DecisionTreeClassifier(**rs)
    et = ExtraTreesClassifier(**rs)
    rf = RandomForestClassifier(**rs)
    # max_features='auto' was removed from scikit-learn; for classifiers it
    # was an alias of 'sqrt', so spelling it out keeps the same model.
    rf2 = RandomForestClassifier(
        min_samples_leaf=50,
        n_estimators=150,
        bootstrap=True,
        oob_score=True,
        n_jobs=-1,
        random_state=42,
        max_features='sqrt',
    )
    xgb = XGBClassifier(**rs, verbosity=0)

    clfs = [
        ('Logistic Regression', log_reg),
        ('Naive Bayes', nb),
        ('K-Nearest Neighbors', knn),
        ('SVM', svm),
        ('MLP', mlp),
        ('Decision Tree', dt),
        ('Extra Trees', et),
        ('Random Forest', rf),
        ('Random Forest2', rf2),
        ('XGBoost', xgb),
    ]

    score_columns = ['Model', 'F1_Score', 'MCC', 'Precision', 'Recall',
                     'Accuracy', 'ROC_AUC']
    pipelines = []
    # Rows are gathered in a list and turned into a DataFrame once at the
    # end: DataFrame.append no longer exists in current pandas, and growing
    # a frame row by row copies it on every iteration.
    score_rows = []

    for clf_name, clf in clfs:
        # Define the data preparation for the columns. A fresh transformer
        # is built per model so that no fitted state is shared between
        # pipelines.
        # NOTE(review): OneHotEncoder() raises on categories not seen during
        # fit; consider handle_unknown='ignore' if X_test can contain them.
        t = [('cat', OneHotEncoder(), categorical_ix),
             ('num', MinMaxScaler(), numerical_ix)]
        col_transform = ColumnTransformer(transformers=t,
                                          remainder='passthrough')
        pipeline = Pipeline(steps=[('prep', col_transform),
                                   ('classifier', clf)])

        pipeline.fit(X_train, y_train)

        # Model predictions
        y_pred = pipeline.predict(X_test)

        pipelines.append(pipeline)
        score_rows.append({
            'Model': clf_name,
            'F1_Score': f1_score(y_test, y_pred),
            'MCC': matthews_corrcoef(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'Accuracy': accuracy_score(y_test, y_pred),
            # NOTE(review): computed from hard class predictions, not from
            # probabilities / decision scores -- kept as-is so reported
            # values do not change; confirm this is intended.
            'ROC_AUC': roc_auc_score(y_test, y_pred),
        })

    scores_df = pd.DataFrame(score_rows, columns=score_columns)
    return pipelines, scores_df