#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 4 15:25:33 2022
@author: tanu
"""
#%%
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             roc_auc_score, f1_score, matthews_corrcoef)
#%%
rs = {'random_state': 42}  # shared keyword args for a reproducible random state

# DONE: add preprocessing step with one-hot encoder
# TODO: supply stratified K-fold CV train and test data (see sketch below)
# TODO: get accuracy and other scores through K-fold CV
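# A minimal sketch of one way to address the two TODOs above, assuming a
# binary target; StratifiedKFold and cross_validate come from scikit-learn.
# The helper name `stratified_cv_scores` and the metric list are illustrative
# additions, not part of the original pipeline.
def stratified_cv_scores(estimator, X, y, n_splits=10):
    from sklearn.model_selection import StratifiedKFold, cross_validate
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, **rs)
    cv_results = cross_validate(estimator, X, y, cv=skf,
                                scoring=['accuracy', 'f1', 'precision',
                                         'recall', 'roc_auc',
                                         'matthews_corrcoef'])
    # cross_validate returns per-fold score arrays keyed 'test_<metric>';
    # average them into a single summary value per metric
    return {metric: scores.mean()
            for metric, scores in cv_results.items()
            if metric.startswith('test_')}
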
# Multiple classifiers - model pipeline
def MultClassPipeline2(X_train, X_test, y_train, y_test, input_df):
    # Determine numerical and categorical features from the input dataframe
    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
    log_reg = LogisticRegression(**rs)
    nb = BernoulliNB()
    knn = KNeighborsClassifier()
    svm = SVC(**rs)
    mlp = MLPClassifier(max_iter=500, **rs)
    dt = DecisionTreeClassifier(**rs)
    et = ExtraTreesClassifier(**rs)
    rf = RandomForestClassifier(**rs)
    rf2 = RandomForestClassifier(
        min_samples_leaf=50,
        n_estimators=150,
        bootstrap=True,
        oob_score=True,
        n_jobs=-1,
        random_state=42,
        max_features='sqrt')  # 'auto' was removed in scikit-learn 1.3; 'sqrt' is its old meaning for classifiers
    xgb = XGBClassifier(**rs, verbosity=0)
    clfs = [
        ('Logistic Regression', log_reg),
        ('Naive Bayes', nb),
        ('K-Nearest Neighbors', knn),
        ('SVM', svm),
        ('MLP', mlp),
        ('Decision Tree', dt),
        ('Extra Trees', et),
        ('Random Forest', rf),
        ('Random Forest2', rf2),
        ('XGBoost', xgb)
    ]

    pipelines = []
    # Collect one row of scores per model; DataFrame.append() was removed
    # in pandas 2.0, so build the frame once at the end instead
    score_rows = []
    for clf_name, clf in clfs:
        # Define the data preparation for the columns. A fresh ColumnTransformer
        # is built per classifier so every fitted pipeline keeps its own copy
        # of the fitted preprocessing step.
        t = [('cat', OneHotEncoder(handle_unknown='ignore'), categorical_ix),
             ('num', MinMaxScaler(), numerical_ix)]
        col_transform = ColumnTransformer(transformers=t,
                                          remainder='passthrough')
        pipeline = Pipeline(steps=[('prep', col_transform),
                                   ('classifier', clf)])

        pipeline.fit(X_train, y_train)

        # Model predictions
        y_pred = pipeline.predict(X_test)

        # F1-score
        fscore = f1_score(y_test, y_pred)
        # Matthews correlation coefficient
        mcc = matthews_corrcoef(y_test, y_pred)
        # Precision
        pres = precision_score(y_test, y_pred)
        # Recall
        recall = recall_score(y_test, y_pred)
        # Accuracy
        accu = accuracy_score(y_test, y_pred)
        # ROC AUC (computed from hard labels here; predict_proba would give a
        # smoother estimate for classifiers that support it)
        roc_auc = roc_auc_score(y_test, y_pred)
        pipelines.append(pipeline)
        score_rows.append({
            'Model': clf_name,
            'F1_Score': fscore,
            'MCC': mcc,
            'Precision': pres,
            'Recall': recall,
            'Accuracy': accu,
            'ROC_AUC': roc_auc
        })

    scores_df = pd.DataFrame(score_rows,
                             columns=['Model', 'F1_Score', 'MCC', 'Precision',
                                      'Recall', 'Accuracy', 'ROC_AUC'])
    return pipelines, scores_df
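
#%%
# Example usage -- a hedged sketch, not part of the original file. It builds a
# small synthetic dataframe with two numeric columns, one categorical column
# and a binary target; any real dataset with matching dtypes works the same way.
if __name__ == '__main__':
    import numpy as np
    rng = np.random.default_rng(42)
    n = 200
    demo_df = pd.DataFrame({
        'num_a': rng.normal(size=n),
        'num_b': rng.uniform(size=n),
        'cat_a': rng.choice(['x', 'y', 'z'], size=n)
    })
    # Binary target loosely driven by num_a so the models have signal to learn
    y = (demo_df['num_a'] + rng.normal(scale=0.5, size=n) > 0).astype(int)
    X_train, X_test, y_train, y_test = train_test_split(
        demo_df, y, test_size=0.3, stratify=y, **rs)
    demo_pipelines, demo_scores = MultClassPipeline2(
        X_train, X_test, y_train, y_test, demo_df)
    print(demo_scores)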