Added MultClassPipe2.py, which includes a one-hot encoder
This commit is contained in:
parent
b637ebc6d2
commit
dd8fd5b8ac
2 changed files with 216 additions and 0 deletions
127
MultClassPipe2.py
Normal file
127
MultClassPipe2.py
Normal file
|
@ -0,0 +1,127 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Fri Mar 4 15:25:33 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
#%%
|
||||
import os, sys
|
||||
import pandas as pd
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.naive_bayes import BernoulliNB
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.svm import SVC
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
|
||||
from sklearn.neural_network import MLPClassifier
|
||||
from sklearn.pipeline import Pipeline
|
||||
from xgboost import XGBClassifier
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
|
||||
#%%
|
||||
# Shared seed, unpacked as keyword arguments into the estimators that accept it.
rs = dict(random_state=42)
|
||||
# Done: add preprocessing step with one hot encoder
|
||||
# TODO: supply stratified K-fold cv train and test data
|
||||
# TODO: get accuracy and other scores through K-fold cv
|
||||
|
||||
# Multiple Classification - Model Pipeline
|
||||
def MultClassPipeline2(X_train, X_test, y_train, y_test, input_df):
    """Fit ten classifier pipelines with shared preprocessing and score each one.

    Every pipeline one-hot encodes the categorical columns and min-max scales
    the numerical columns, then fits a single classifier on the training
    split and scores its predictions on the test split.

    Parameters
    ----------
    X_train, X_test
        Feature sets used for fitting and for prediction. Columns are selected
        by the labels found in ``input_df``, so these are presumably
        DataFrames carrying the same columns -- confirm against the caller.
    y_train, y_test
        Targets for the two splits. The metrics are called with
        scikit-learn's default arguments, so a binary target is presumably
        expected -- confirm against the caller.
    input_df : pandas.DataFrame
        Used only to decide which columns are numerical (``int64`` /
        ``float64``) and which are categorical (``object`` / ``bool``).

    Returns
    -------
    pipelines : list
        The fitted pipelines, in the same order as the rows of ``scores_df``.
    scores_df : pandas.DataFrame
        One row per model with the columns ``Model``, ``F1_Score``,
        ``Precision``, ``Recall``, ``Accuracy`` and ``ROC_AUC``.
    """
    # Determine categorical and numerical features from the dtypes.
    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns

    log_reg = LogisticRegression(**rs)
    nb = BernoulliNB()
    knn = KNeighborsClassifier()
    svm = SVC(**rs)
    mlp = MLPClassifier(max_iter=500, **rs)
    dt = DecisionTreeClassifier(**rs)
    et = ExtraTreesClassifier(**rs)
    rf = RandomForestClassifier(**rs)
    rf2 = RandomForestClassifier(
        min_samples_leaf=50,
        n_estimators=150,
        bootstrap=True,
        oob_score=True,
        n_jobs=-1,
        random_state=42,
        # 'auto' meant sqrt(n_features) for classifiers and was removed in
        # scikit-learn 1.3; 'sqrt' is the same setting spelled explicitly.
        max_features='sqrt',
    )
    xgb = XGBClassifier(**rs, verbosity=0)

    clfs = [
        ('Logistic Regression', log_reg),
        ('Naive Bayes', nb),
        ('K-Nearest Neighbors', knn),
        ('SVM', svm),
        ('MLP', mlp),
        ('Decision Tree', dt),
        ('Extra Trees', et),
        ('Random Forest', rf),
        ('Random Forest2', rf2),
        ('XGBoost', xgb),
    ]

    score_columns = ['Model', 'F1_Score', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC']
    pipelines = []
    score_rows = []

    for clf_name, clf in clfs:
        # A fresh transformer per pipeline, so each returned pipeline owns
        # its own fitted preprocessing state.
        col_transform = ColumnTransformer(
            transformers=[
                ('cat', OneHotEncoder(), categorical_ix),
                ('num', MinMaxScaler(), numerical_ix),
            ],
            remainder='passthrough',
        )
        pipeline = Pipeline(steps=[
            ('prep', col_transform),
            ('classifier', clf),
        ])

        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        # NOTE(review): ROC AUC is computed from hard class predictions, not
        # from probabilities or decision scores -- confirm this is intended.
        score_rows.append({
            'Model': clf_name,
            'F1_Score': f1_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'Accuracy': accuracy_score(y_test, y_pred),
            'ROC_AUC': roc_auc_score(y_test, y_pred),
        })
        pipelines.append(pipeline)

    # DataFrame.append was removed in pandas 2.0; build the frame once from
    # the collected rows instead of growing it inside the loop.
    scores_df = pd.DataFrame(score_rows, columns=score_columns)

    return pipelines, scores_df
|
||||
|
89
my_data10.py
Normal file
89
my_data10.py
Normal file
|
@ -0,0 +1,89 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat Mar 5 12:57:32 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
#%%
|
||||
# Data etc. currently come from my_data6.py and/or my_data5.py
|
||||
#%%
|
||||
# Import everything this script uses, so it no longer depends on names left
# behind by an earlier interactive session. The data objects themselves
# (feature frames, targets) are still expected to exist already.
import os

from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Work from the training repository; presumably this is what makes the local
# MultClassPipe2 module importable below -- confirm.
homedir = os.path.expanduser("~")
os.chdir(os.path.join(homedir, "git", "ML_AI_training"))

# my function
from MultClassPipe2 import MultClassPipeline2
|
||||
#%% try combinations
|
||||
#import sys, os
|
||||
#os.system("imports.py")
|
||||
def precision(y_true, y_pred):
    """Return the precision of ``y_pred`` against ``y_true``, with label 1 as the positive class."""
    score = precision_score(y_true, y_pred, pos_label=1)
    return score
|
||||
def recall(y_true, y_pred):
    """Return the recall of ``y_pred`` against ``y_true``, with label 1 as the positive class."""
    score = recall_score(y_true, y_pred, pos_label=1)
    return score
|
||||
def f1(y_true, y_pred):
    """Return the F1 score of ``y_pred`` against ``y_true``, with label 1 as the positive class."""
    score = f1_score(y_true, y_pred, pos_label=1)
    return score
|
||||
|
||||
#%%
|
||||
|
||||
# Quick look at the feature frames; as bare expressions these only display
# a value when the cell is run interactively.
numerical_features_df.shape
categorical_features_df.shape
all_features_df.shape
all_features_df.dtypes
#%%
target = target1
#target = target3

# The same test fraction and seed are used for all three feature sets.
_split_opts = {'test_size': 0.33, 'random_state': 42}

# Numerical features only
X_trainN, X_testN, y_trainN, y_testN = train_test_split(
    numerical_features_df, target, **_split_opts)

# Categorical features only
X_trainC, X_testC, y_trainC, y_testC = train_test_split(
    categorical_features_df, target, **_split_opts)

# All features
X_train, X_test, y_train, y_test = train_test_split(
    all_features_df, target, **_split_opts)
|
||||
#%%
|
||||
|
||||
|
||||
|
||||
#%% with feature selection
|
||||
|
||||
# Determine categorical and numerical features
|
||||
input_df = numerical_features_df.copy()
#input_df = categorical_features_df
#input_df = all_features_df

# Column groups are taken from all_features_df, the frame X_train/X_test were
# split from, so the transformer's column names match the data it is fitted on.
numerical_ix = all_features_df.select_dtypes(include=['int64', 'float64']).columns
numerical_ix
categorical_ix = all_features_df.select_dtypes(include=['object', 'bool']).columns
categorical_ix

# prepare data: scale the numerical columns, one-hot encode the categorical ones
t = [
    ('num', MinMaxScaler(), numerical_ix),
    ('cat', OneHotEncoder(), categorical_ix),
]

col_transform = ColumnTransformer(transformers=t, remainder='passthrough')

# model pipeline
model = Pipeline(steps=[
    ('prep', col_transform),
    ('classifier', LogisticRegression()),
])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred

# Recursive feature elimination with 10-fold CV.
# RFECV ranks features through the estimator's coef_ / feature_importances_,
# which a Pipeline does not expose, and it hands the estimator column-sliced
# arrays that the name-based ColumnTransformer cannot select from. So the
# selection runs on the already-preprocessed matrix with the bare classifier.
# NOTE(review): the preprocessing was fitted on the full training split, so
# the CV folds inside RFECV share scaler/encoder statistics -- confirm this
# is acceptable for the experiment.
X_train_prep = model.named_steps['prep'].transform(X_train)

selector_log = RFECV(estimator=LogisticRegression(), cv=10, step=1)

# fit_transform returns the reduced feature matrix, not the selector itself.
selector_log_x = selector_log.fit_transform(X_train_prep, y_train)

# The support mask and ranking live on the fitted selector. They are indexed
# over the transformed columns (scaled numerical columns, then one-hot
# columns), not over the raw input columns.
print(selector_log.get_support())
X_trainN.columns

print(selector_log.ranking_)
|
Loading…
Add table
Add a link
Reference in a new issue