123 lines
4.3 KiB
Python
Executable file
123 lines
4.3 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
Created on Fri Mar 4 15:25:33 2022
|
|
|
|
@author: tanu
|
|
"""
|
|
#%%
|
|
import os, sys
|
|
import pandas as pd
|
|
from sklearn.linear_model import LogisticRegression
|
|
from sklearn.naive_bayes import BernoulliNB
|
|
from sklearn.neighbors import KNeighborsClassifier
|
|
from sklearn.svm import SVC
|
|
from sklearn.tree import DecisionTreeClassifier
|
|
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
|
|
from sklearn.neural_network import MLPClassifier
|
|
from sklearn.pipeline import Pipeline
|
|
from xgboost import XGBClassifier
|
|
from sklearn.compose import ColumnTransformer
|
|
from sklearn.preprocessing import StandardScaler
|
|
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
|
|
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
|
|
#%%
|
|
# Shared keyword arguments splatted into every estimator that accepts a seed,
# so all stochastic models are reproducible with the same random_state.
rs = dict(random_state=42)
|
|
# Done: add preprocessing step with one hot encoder
|
|
# TODO: supply stratified K-fold cv train and test data
|
|
# TODO: get accuracy and other scores through K-fold cv
|
|
|
|
# Multiple Classification - Model Pipeline
|
|
def MultClassPipeline2(X_train, X_test, y_train, y_test, input_df):
    """Fit several classifiers behind a common preprocessing step and score them.

    Every classifier is wrapped in its own ``Pipeline`` whose first step is a
    ``ColumnTransformer`` that one-hot encodes the ``object``/``bool`` columns
    and min-max scales the ``int64``/``float64`` columns found in
    ``input_df``; any remaining columns are passed through unchanged. Each
    pipeline is fitted on the training split and evaluated on the test split.

    Parameters
    ----------
    X_train, X_test
        Feature tables for fitting and evaluation. They are indexed by the
        column labels derived from ``input_df``, so they must contain those
        columns.
    y_train, y_test
        Target labels for the two splits. The metric functions are called
        with scikit-learn's default arguments (no ``average=`` is passed), so
        a binary target is presumably expected -- confirm against callers.
    input_df
        DataFrame used only to decide which columns are categorical and
        which are numerical (by dtype).

    Returns
    -------
    pipelines : list
        The fitted ``Pipeline`` objects, in the same order as the models
        listed in ``scores_df``.
    scores_df : pandas.DataFrame
        One row per model with columns ``Model``, ``F1_Score``, ``MCC``,
        ``Precision``, ``Recall``, ``Accuracy`` and ``ROC_AUC``.
    """
    # Determine categorical and numerical features from the column dtypes.
    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns

    log_reg = LogisticRegression(**rs)
    nb = BernoulliNB()
    knn = KNeighborsClassifier()
    svm = SVC(**rs)
    mlp = MLPClassifier(max_iter=500, **rs)
    dt = DecisionTreeClassifier(**rs)
    et = ExtraTreesClassifier(**rs)
    rf = RandomForestClassifier(**rs)
    rf2 = RandomForestClassifier(
        min_samples_leaf=50,
        n_estimators=150,
        bootstrap=True,
        oob_score=True,
        n_jobs=-1,
        random_state=42,
        # 'auto' was removed in scikit-learn 1.3; for classifiers it was an
        # alias of 'sqrt', so this keeps the same behaviour.
        max_features='sqrt',
    )
    xgb = XGBClassifier(**rs, verbosity=0)

    clfs = [
        ('Logistic Regression', log_reg),
        ('Naive Bayes', nb),
        ('K-Nearest Neighbors', knn),
        ('SVM', svm),
        ('MLP', mlp),
        ('Decision Tree', dt),
        ('Extra Trees', et),
        ('Random Forest', rf),
        ('Random Forest2', rf2),
        ('XGBoost', xgb),
    ]

    score_columns = ['Model', 'F1_Score', 'MCC', 'Precision', 'Recall',
                     'Accuracy', 'ROC_AUC']
    pipelines = []
    score_rows = []

    for clf_name, clf in clfs:
        # Build a fresh transformer per model: fitting mutates it, and every
        # returned pipeline should own its preprocessing state.
        t = [
            ('cat', OneHotEncoder(), categorical_ix),
            ('num', MinMaxScaler(), numerical_ix),
        ]
        col_transform = ColumnTransformer(transformers=t,
                                          remainder='passthrough')
        pipeline = Pipeline(steps=[('prep', col_transform),
                                   ('classifier', clf)])

        pipeline.fit(X_train, y_train)

        # Model predictions
        y_pred = pipeline.predict(X_test)

        pipelines.append(pipeline)

        # NOTE(review): ROC_AUC is computed from hard label predictions, not
        # probabilities or decision scores; kept as in the original.
        score_rows.append({
            'Model': clf_name,
            'F1_Score': f1_score(y_test, y_pred),
            'MCC': matthews_corrcoef(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'Accuracy': accuracy_score(y_test, y_pred),
            'ROC_AUC': roc_auc_score(y_test, y_pred),
        })

    # DataFrame.append was removed in pandas 2.0; build the table in one go
    # from the collected rows instead of growing it inside the loop.
    scores_df = pd.DataFrame(score_rows, columns=score_columns)

    return pipelines, scores_df
|
|
|