#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 4 15:25:33 2022

@author: tanu
"""
#%%
import os, sys
import pandas as pd
import numpy as np
import pprint as pp
import random

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
from statistics import mean, stdev, median, mode

#%%
rs = {'random_state': 42}
njobs = {'n_jobs': 10}

# Done: add preprocessing step with one-hot encoder
# TODO: supply stratified K-fold CV train and test data via skf
# TODO: get accuracy and other scores through K-fold CV

# Multiple Classification - Model Pipeline
def MultClassPipeSKFLoop(input_df, target, skf_cv, var_type=['numerical', 'categorical', 'mixed']):
    '''
    @param input_df: input features
    @type input_df: df with input features WITHOUT the target variable

    @param target: target (or output) feature
    @type target: df or np.array or Series

    @param skf_cv: number of splits, or a StratifiedKFold object so that
                   shuffle and a random state can be passed in
    @type skf_cv: int or StratifiedKFold()

    @param var_type: one of 'numerical', 'categorical' or 'mixed'; determines
                     which column transform to apply (MinMaxScaler and/or
                     one-hot encoder)
    @type var_type: str

    returns a dict containing multiple classification scores for each model
    and each stratified K-fold
    '''
    # Determine categorical and numerical features
    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns

    # Determine preprocessing steps ~ var_type
    if var_type == 'numerical':
        t = [('num', MinMaxScaler(), numerical_ix)]

    if var_type == 'categorical':
        t = [('cat', OneHotEncoder(), categorical_ix)]

    if var_type == 'mixed':
        t = [('cat', OneHotEncoder(), categorical_ix),
             ('num', MinMaxScaler(), numerical_ix)]

    col_transform = ColumnTransformer(transformers=t, remainder='passthrough')

    #%% Define classification models to run
    log_reg = LogisticRegression(**rs)
    nb = BernoulliNB()
    knn = KNeighborsClassifier()
    svm = SVC(**rs)
    mlp = MLPClassifier(max_iter=500, **rs)
    dt = DecisionTreeClassifier(**rs)
    et = ExtraTreesClassifier(**rs)
    rf = RandomForestClassifier(**rs)
    rf2 = RandomForestClassifier(min_samples_leaf=50,
                                 n_estimators=150,
                                 bootstrap=True,
                                 oob_score=True,
                                 **njobs,
                                 **rs,
                                 # 'sqrt' is what 'auto' meant for classifiers;
                                 # 'auto' was removed in newer scikit-learn
                                 max_features='sqrt')
    # use_label_encoder is deprecated/ignored in newer xgboost versions
    xgb = XGBClassifier(**rs, verbosity=0, use_label_encoder=False)

    classification_metrics = {
        'F1_score': []
        , 'MCC': []
        , 'Precision': []
        , 'Recall': []
        , 'Accuracy': []
        , 'ROC_AUC': []
    }

    # NOTE: the duplicate ('Naive Bayes', nb) entry has been dropped; a
    # repeated model name would collide in fold_dict and run the model twice.
    models = [
        ('Logistic Regression', log_reg)
        , ('Naive Bayes', nb)
        , ('K-Nearest Neighbors', knn)
        , ('SVM', svm)
        , ('MLP', mlp)
        , ('Decision Tree', dt)
        , ('Extra Trees', et)
        , ('Random Forest', rf)
        , ('Random Forest2', rf2)
        , ('XGBoost', xgb)
    ]
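    # A minimal alternative sketch (an assumption, not part of the original
    # flow): for a single pipeline, sklearn's cross_validate (already imported
    # above) can collect several of these scores per fold without the manual
    # split loop, e.g.:
    #
    #   pipe = Pipeline(steps=[('prep', col_transform), ('classifier', log_reg)])
    #   scores = cross_validate(pipe, input_df, target, cv=skf_cv,
    #                           scoring=['accuracy', 'f1', 'roc_auc'])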
    # Example splitter that could be passed in as skf_cv:
    # skf = StratifiedKFold(n_splits=10, shuffle=True, **rs)

    fold_no = 1
    fold_dict = {}
    for model_name, model in models:
        fold_dict.update({model_name: {}})

    #scores_df = pd.DataFrame()

    for train_index, test_index in skf_cv.split(input_df, target):
        x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
        y_train_fold, y_test_fold = target.iloc[train_index], target.iloc[test_index]
        #print("Fold: ", fold_no, len(train_index), len(test_index))

        for model_name, model in models:
            print("\nStart of model", model_name, "\nLoop no.", fold_no)
            model_pipeline = Pipeline(steps=[('prep', col_transform),
                                             ('classifier', model)])
            model_pipeline.fit(x_train_fold, y_train_fold)
            y_pred_fold = model_pipeline.predict(x_test_fold)

            #----------------
            # Model metrics (default binary averaging; assumes a binary target)
            #----------------
            fscore = f1_score(y_test_fold, y_pred_fold)
            mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
            pres = precision_score(y_test_fold, y_pred_fold)
            recall = recall_score(y_test_fold, y_pred_fold)
            #pres = precision_score(y_test_fold, y_pred_fold, zero_division=0)
            #recall = recall_score(y_test_fold, y_pred_fold, zero_division=0)
            accu = accuracy_score(y_test_fold, y_pred_fold)
            # note: AUC from hard class predictions; predict_proba would give
            # a threshold-free AUC
            roc_auc = roc_auc_score(y_test_fold, y_pred_fold)

            fold = "fold_" + str(fold_no)
            fold_dict[model_name].update({fold: {}})
            #pp.pprint(fold_dict)
            print("\nEnd of model", model_name, "\nLoop no.", fold_no)

            # Seed this fold's entry with the metric names from the template
            # (a fresh copy, so entries never share one mutable dict), then
            # record one scalar score per metric.
            fold_dict[model_name][fold].update(dict.fromkeys(classification_metrics, None))
            fold_dict[model_name][fold].update({'F1_score': fscore})
            fold_dict[model_name][fold].update({'MCC': mcc})
            fold_dict[model_name][fold].update({'Precision': pres})
            fold_dict[model_name][fold].update({'Recall': recall})
            fold_dict[model_name][fold].update({'Accuracy': accu})
            fold_dict[model_name][fold].update({'ROC_AUC': roc_auc})

        # Increment once per outer CV split, not once per model, so fold
        # labels line up across models.
        fold_no += 1

    return fold_dict
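
#%% Usage sketch (an assumption, not from the original script): run the loop
# on a small synthetic binary-classification frame. The dataset, column names
# and variable names below are illustrative only.
if __name__ == '__main__':
    from sklearn.datasets import make_classification

    # Build a tiny all-numeric dataset so var_type='numerical' applies
    X_arr, y_arr = make_classification(n_samples=200, n_features=5,
                                       random_state=42)
    X_df = pd.DataFrame(X_arr, columns=[f'f{i}' for i in range(5)])
    y_ser = pd.Series(y_arr)

    skf = StratifiedKFold(n_splits=3, shuffle=True, **rs)
    scores = MultClassPipeSKFLoop(X_df, y_ser, skf, var_type='numerical')

    # Per-fold scores for one model, e.g. scores['Logistic Regression']['fold_1']
    pp.pprint(scores['Logistic Regression'])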