diff --git a/MultClassPipe3.py b/MultClassPipe3.py
new file mode 100644
index 0000000..d30a85d
--- /dev/null
+++ b/MultClassPipe3.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Mar  4 15:25:33 2022
+
+@author: tanu
+"""
+#%%
+import os, sys
+import pandas as pd
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from sklearn.naive_bayes import BernoulliNB
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
+from sklearn.neural_network import MLPClassifier
+from sklearn.pipeline import Pipeline
+from xgboost import XGBClassifier
+from sklearn.compose import ColumnTransformer
+from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
+
+from sklearn.model_selection import cross_validate
+from sklearn.model_selection import train_test_split
+from sklearn.model_selection import StratifiedKFold
+
+from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
+from statistics import mean, stdev, median, mode
+#%%
+rs = {'random_state': 42}
+# Done: add preprocessing step with one hot encoder
+# TODO: supply stratified K-fold cv train and test data
+# TODO: get accuracy and other scores through K-fold cv
+
+# Multiple Classification - Model Pipeline
+def MultClassPipeSKF(input_df, y_targetF, var_type = 'mixed', skf_splits = 10):
+    '''
+    @param input_df: input features
+    @type: pd.DataFrame (column names and dtypes are used to build the column
+           transformer; rows are selected with .iloc for each stratified fold)
+
+    @param y_targetF: target (or output) feature, assumed binary
+    @type: pd.Series (or single-column df), indexed like input_df
+
+    @param var_type: one of 'numerical', 'categorical' or 'mixed';
+           selects the preprocessing applied to the columns
+
+    @param skf_splits: number of stratified K-fold splits
+
+    returns: df of mean scores per model across the skf folds
+    '''
+    # Determine categorical and numerical features
+    numerical_ix   = input_df.select_dtypes(include = ['int64', 'float64']).columns
+    categorical_ix = input_df.select_dtypes(include = ['object', 'bool']).columns
+
+    # Determine preprocessing steps ~ var_type
+    # handle_unknown = 'ignore' stops OneHotEncoder failing when a test fold
+    # contains a category level absent from the training fold
+    if var_type == 'numerical':
+        t = [('num', MinMaxScaler(), numerical_ix)]
+
+    elif var_type == 'categorical':
+        t = [('cat', OneHotEncoder(handle_unknown = 'ignore'), categorical_ix)]
+
+    elif var_type == 'mixed':
+        t = [('cat', OneHotEncoder(handle_unknown = 'ignore'), categorical_ix)
+             , ('num', MinMaxScaler(), numerical_ix)]
+
+    else:
+        raise ValueError("var_type must be 'numerical', 'categorical' or 'mixed'")
+
+    col_transform = ColumnTransformer(transformers = t
+                                      , remainder = 'passthrough')
+
+    #%% Define classification models to run
+    log_reg = LogisticRegression(**rs)
+    nb      = BernoulliNB()
+    knn     = KNeighborsClassifier()
+    svm     = SVC(**rs)
+    mlp     = MLPClassifier(max_iter = 500, **rs)
+    dt      = DecisionTreeClassifier(**rs)
+    et      = ExtraTreesClassifier(**rs)
+    rf      = RandomForestClassifier(**rs)
+    rf2     = RandomForestClassifier(
+                         min_samples_leaf = 50,
+                         n_estimators = 150,
+                         bootstrap = True,
+                         oob_score = True,
+                         n_jobs = -1,
+                         random_state = 42,
+                         max_features = 'sqrt') # 'sqrt' replaces the deprecated 'auto' (same for classifiers)
+
+    xgb = XGBClassifier(**rs, verbosity = 0)
+
+    clfs = [
+          ('Logistic Regression' , log_reg)
+        , ('Naive Bayes'          , nb)
+        , ('K-Nearest Neighbors'  , knn)
+        , ('SVM'                  , svm)
+        , ('MLP'                  , mlp)
+        , ('Decision Tree'        , dt)
+        , ('Extra Trees'          , et)
+        , ('Random Forest'        , rf)
+        , ('Random Forest2'       , rf2)
+        , ('XGBoost'              , xgb)
+        ]
+
+    skf = StratifiedKFold(n_splits = skf_splits
+                          , shuffle = True
+                          , **rs)
+
+    # Initialise score containers per model so that each model's fold scores
+    # are kept separate before averaging
+    score_names = ['F1_score', 'MCC', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC']
+    fold_scores = {clf_name: {s: [] for s in score_names} for clf_name, clf in clfs}
+
+    for train_index, test_index in skf.split(input_df, y_targetF):
+        x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
+        y_train_fold, y_test_fold = y_targetF.iloc[train_index], y_targetF.iloc[test_index]
+
+        for clf_name, clf in clfs:
+            print('\nRunning classifier:', clf_name)
+
+            model_pipeline = Pipeline(steps = [('prep'       , col_transform)
+                                               , ('classifier' , clf)])
+
+            model_pipeline.fit(x_train_fold, y_train_fold)
+            y_pred_fold = model_pipeline.predict(x_test_fold)
+
+            #----------------
+            # Model metrics (binary averaging) for this model and fold
+            #----------------
+            fold_scores[clf_name]['F1_score'].append(f1_score(y_test_fold, y_pred_fold))
+            fold_scores[clf_name]['MCC'].append(matthews_corrcoef(y_test_fold, y_pred_fold))
+            fold_scores[clf_name]['Precision'].append(precision_score(y_test_fold, y_pred_fold))
+            fold_scores[clf_name]['Recall'].append(recall_score(y_test_fold, y_pred_fold))
+            fold_scores[clf_name]['Accuracy'].append(accuracy_score(y_test_fold, y_pred_fold))
+            fold_scores[clf_name]['ROC_AUC'].append(roc_auc_score(y_test_fold, y_pred_fold))
+
+    # Mean of each metric across the skf folds, one row per model
+    clf_scores_df = pd.DataFrame([{'Model': clf_name
+                                   , **{s: mean(v) for s, v in scores.items()}}
+                                  for clf_name, scores in fold_scores.items()])
+    return clf_scores_df
\ No newline at end of file
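
A minimal usage sketch for context, assuming a feature table my_df with a binary target column (my_df, my_data.csv and target are placeholder names, not from the repo):

# example_usage.py (hypothetical) -- calls the pipeline defined in MultClassPipe3.py
import pandas as pd
from MultClassPipe3 import MultClassPipeSKF

my_df = pd.read_csv('my_data.csv')          # assumed: features and target in one table
X     = my_df.drop(columns = ['target'])    # numeric and/or categorical feature columns
y     = my_df['target']                     # binary target, e.g. 0/1

# 10-fold stratified CV over all ten classifiers; returns mean scores per model
clf_scores_df = MultClassPipeSKF(input_df   = X
                                 , y_targetF  = y
                                 , var_type   = 'mixed'
                                 , skf_splits = 10)
print(clf_scores_df)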