#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar  4 15:25:33 2022

@author: tanu
"""
#%%
import os, sys
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
from statistics import mean, stdev, median, mode
#%%
rs = {'random_state': 42}
# Done: add preprocessing step with one hot encoder
# Done: supply stratified K-fold cv train and test data
# Done: get accuracy and other scores through K-fold cv

# Score columns of the returned table, in output order.
# NOTE: 'ROC_curve' holds the ROC AUC value; the column name is kept as it
# was in the original result row so existing consumers keep working.
_SCORE_COLUMNS = ['F1_score', 'MCC', 'Precision', 'Recall', 'Accuracy', 'ROC_curve']


def _build_preprocessor(input_df, var_type):
    """Return the ColumnTransformer that matches ``var_type`` for ``input_df``."""
    # Determine categorical and numerical features from the column dtypes
    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns

    num_step = ('num', MinMaxScaler(), numerical_ix)
    # handle_unknown='ignore': a category that only occurs in a test fold
    # would otherwise make transform() raise during cross-validation.
    cat_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_ix)

    steps_by_type = {
        'numerical': [num_step],
        'categorical': [cat_step],
        'mixed': [cat_step, num_step],
    }
    # isinstance check first: an unhashable value (e.g. a list) must produce
    # the ValueError below, not a TypeError from the dict lookup.
    if not isinstance(var_type, str) or var_type not in steps_by_type:
        raise ValueError(
            "var_type must be one of 'numerical', 'categorical' or 'mixed'"
            f", got {var_type!r}")

    return ColumnTransformer(transformers=steps_by_type[var_type],
                             remainder='passthrough')


def _build_classifiers():
    """Return the (display name, unfitted estimator) pairs to evaluate."""
    return [
        ('Logistic Regression', LogisticRegression(**rs)),
        ('K-Nearest Neighbors', KNeighborsClassifier()),
        ('SVM', SVC(**rs)),
        ('MLP', MLPClassifier(max_iter=500, **rs)),
        ('Decision Tree', DecisionTreeClassifier(**rs)),
        ('Extra Trees', ExtraTreesClassifier(**rs)),
        ('Random Forest', RandomForestClassifier(**rs)),
        ('Naive Bayes', BernoulliNB()),
        # Disabled candidates, kept for reference ('sqrt' replaces the
        # 'auto' max_features value, which newer scikit-learn rejects):
        # ('Random Forest2', RandomForestClassifier(
        #     min_samples_leaf=50, n_estimators=150, bootstrap=True,
        #     oob_score=True, n_jobs=-1, random_state=42,
        #     max_features='sqrt')),
        # ('XGBoost', XGBClassifier(**rs, verbosity=0)),
    ]


def _score_fold(y_true, y_pred):
    """Return one fold's metrics as a dict keyed by ``_SCORE_COLUMNS``."""
    return {
        'F1_score': f1_score(y_true, y_pred),
        'MCC': matthews_corrcoef(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'Accuracy': accuracy_score(y_true, y_pred),
        # Computed from hard class predictions, not from probabilities.
        'ROC_curve': roc_auc_score(y_true, y_pred),
    }


# Multiple Classification - Model Pipeline
def MultClassPipeSKF(input_df, y_targetF, var_type='mixed', skf_splits=10):
    '''
    Evaluate several classifiers with stratified K-fold cross-validation.

    Every model is wrapped in a pipeline (column preprocessing followed by
    the classifier), fitted on the training part of each fold and scored on
    the held-out part. The per-fold scores are averaged per model.

    @param input_df: input features
    @type: df (column dtypes decide which columns are scaled / one-hot
           encoded; columns not selected are passed through unchanged)

    @param y_targetF: target (or output) feature, one label per row
    @type: series, single-column df or np.array
           NOTE(review): the metrics are called with scikit-learn's
           defaults, so a binary target is presumably expected - confirm.

    @param var_type: which preprocessing to apply, one of
           'numerical'   (MinMaxScaler on int64/float64 columns),
           'categorical' (OneHotEncoder on object/bool columns),
           'mixed'       (both)
    @type: str

    @param skf_splits: number of stratified folds
    @type: int

    returns a df with one row per model and the columns
    Model, F1_score, MCC, Precision, Recall, Accuracy, ROC_curve
    (each score is the mean over the folds)

    raises ValueError if var_type is not one of the accepted values
    '''
    col_transform = _build_preprocessor(input_df, var_type)
    clfs = _build_classifiers()

    skf = StratifiedKFold(n_splits=skf_splits, shuffle=True, **rs)

    # Flatten the target into a positionally indexed Series so arrays,
    # Series and single-column DataFrames can all be sliced with .iloc.
    y_target = pd.Series(np.asarray(y_targetF).ravel())

    # Per-model list with one score dict per fold
    fold_scores = {clf_name: [] for clf_name, _ in clfs}

    for train_index, test_index in skf.split(input_df, y_target):
        x_train_fold = input_df.iloc[train_index]
        x_test_fold = input_df.iloc[test_index]
        y_train_fold = y_target.iloc[train_index]
        y_test_fold = y_target.iloc[test_index]

        for clf_name, clf in clfs:
            print('\nRunning the following classification models', clf_name)

            # fit() refits every step, so the shared transformer and
            # estimator objects carry no state over from the previous fold.
            model_pipeline = Pipeline(steps=[('prep', col_transform),
                                             ('classifier', clf)])
            model_pipeline.fit(x_train_fold, y_train_fold)
            y_pred_fold = model_pipeline.predict(x_test_fold)

            fold_scores[clf_name].append(_score_fold(y_test_fold, y_pred_fold))

    # Average each metric over the folds, one output row per model
    rows = []
    for clf_name, per_fold in fold_scores.items():
        row = {'Model': clf_name}
        for metric in _SCORE_COLUMNS:
            row[metric] = float(np.mean([scores[metric] for scores in per_fold]))
        rows.append(row)

    return pd.DataFrame(rows, columns=['Model'] + _SCORE_COLUMNS)