commit 69d0c1b557 (parent d733b980ba)
Author: Tanushree Tunstall
Date:   2022-03-10 19:20:02 +00:00

5 changed files with 607 additions and 31 deletions


@@ -92,15 +92,17 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
     clfs = [
           ('Logistic Regression' , log_reg)
-        , ('Naive Bayes'         , nb)
+        #, ('Naive Bayes'        , nb)
         , ('K-Nearest Neighbors' , knn)
         , ('SVM'                 , svm)
         , ('MLP'                 , mlp)
         , ('Decision Tree'       , dt)
         , ('Extra Trees'         , et)
         , ('Random Forest'       , rf)
-        , ('Random Forest2'      , rf2)
-        , ('XGBoost'             , xgb)
+        , ('Naive Bayes'         , nb)
+        #, ('Random Forest2'     , rf2)
+        #, ('XGBoost'            , xgb)
          ]
     skf = StratifiedKFold(n_splits = skf_splits
@@ -112,17 +114,20 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
     Y = y_targetF
     # Initialise score metrics list to store skf results
-    fscoreL = []
-    mccL = []
-    presL = []
-    recallL = []
-    accuL = []
-    roc_aucL = []
+    # fscoreL = []
+    # mccL = []
+    # presL = []
+    # recallL = []
+    # accuL = []
+    # roc_aucL = []
+    skf_dict = {}
+    #scores_df = pd.DataFrame()
     for train_index, test_index in skf.split(input_df, y_targetF):
         x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
         y_train_fold, y_test_fold = y_targetF.iloc[train_index], y_targetF.iloc[test_index]
+        #fscoreL = {}
     # for train_index, test_index in skf.split(X_array, Y):
     #     print('\nSKF train index:', train_index
     #           , '\nSKF test index:', test_index)
@@ -139,7 +144,7 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
                                              , ('classifier' , clf)])
             # model_pipeline = Pipeline(steps=[('prep' , MinMaxScaler())
             #                                  , ('classifier' , clf)])
             model_pipeline.fit(x_train_fold, y_train_fold)
@@ -150,33 +155,34 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
             #----------------
             # F1-Score
             fscore = f1_score(y_test_fold, y_pred_fold)
-            fscoreL.append(fscore)
-            fscoreM = mean(fscoreL)
+            fscoreL[clf_name].append(fscore)
+            print('fscoreL Len: ', len(fscoreL))
+            #fscoreM = mean(fscoreL[clf])
             # Matthews correlation coefficient
             mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
-            mccL.append(mcc)
+            mccL[clf_name].append(mcc)
             mccM = mean(mccL)
-            # Precision
-            pres = precision_score(y_test_fold, y_pred_fold)
-            presL.append(pres)
-            presM = mean(presL)
-            # Recall
-            recall = recall_score(y_test_fold, y_pred_fold)
-            recallL.append(recall)
-            recallM = mean(recallL)
-            # Accuracy
-            accu = accuracy_score(y_test_fold, y_pred_fold)
-            accuL.append(accu)
-            accuM = mean(accuL)
-            # ROC_AUC
-            roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
-            roc_aucL.append(roc_auc)
-            roc_aucM = mean(roc_aucL)
+            # # Precision
+            # pres = precision_score(y_test_fold, y_pred_fold)
+            # presL.append(pres)
+            # presM = mean(presL)
+            # # Recall
+            # recall = recall_score(y_test_fold, y_pred_fold)
+            # recallL.append(recall)
+            # recallM = mean(recallL)
+            # # Accuracy
+            # accu = accuracy_score(y_test_fold, y_pred_fold)
+            # accuL.append(accu)
+            # accuM = mean(accuL)
+            # # ROC_AUC
+            # roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
+            # roc_aucL.append(roc_auc)
+            # roc_aucM = mean(roc_aucL)
             clf_scores_df = clf_scores_df.append({'Model'    : clf_name
                                                  ,'F1_score' : fscoreM
@@ -186,4 +192,6 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical'
                                                  , 'Accuracy' : accuM
                                                  , 'ROC_curve': roc_aucM}
                                                  , ignore_index = True)
-    return clf_scores_df
+    return(clf_scores_df)
+    #scores_df = scores_df.append(clf_scores_df)
+    # return clf_scores_df
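The hunk above starts keying the score lists by classifier name (fscoreL[clf_name].append(...)), but most metrics are now commented out and neither fscoreL as a dict nor fscoreM is initialised in the shown lines, so this version looks like it would fail with a NameError unless those names are defined elsewhere. A minimal sketch of the pattern the diff appears to be moving towards, reusing the clfs, skf, col_transform, input_df and y_targetF names from the function (illustrative only, not the committed code):

# Hypothetical sketch: one list of fold scores per classifier, averaged after CV.
from statistics import mean

score_store = {clf_name: {'F1_score': [], 'MCC': []} for clf_name, _ in clfs}

for train_index, test_index in skf.split(input_df, y_targetF):
    x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
    y_train_fold, y_test_fold = y_targetF.iloc[train_index], y_targetF.iloc[test_index]
    for clf_name, clf in clfs:
        model_pipeline = Pipeline(steps=[('prep', col_transform), ('classifier', clf)])
        model_pipeline.fit(x_train_fold, y_train_fold)
        y_pred_fold = model_pipeline.predict(x_test_fold)
        score_store[clf_name]['F1_score'].append(f1_score(y_test_fold, y_pred_fold))
        score_store[clf_name]['MCC'].append(matthews_corrcoef(y_test_fold, y_pred_fold))

# Average over folds only once the CV loop has finished
mean_scores = {name: {metric: mean(vals) for metric, vals in metrics.items()}
               for name, metrics in score_store.items()}

Keeping one container per classifier also prevents the running means from mixing scores of different models, which is what the flat fscoreL/mccL lists do.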

loopity_loop.py (new file, 172 lines)

@@ -0,0 +1,172 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 4 15:25:33 2022
@author: tanu
"""
#%%
import os, sys
import pandas as pd
import numpy as np
import pprint as pp
import random
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
from statistics import mean, stdev, median, mode
#%%
rs = {'random_state': 42}
# Done: add preprocessing step with one hot encoder
# TODO: supply stratified K-fold cv train and test data
# TODO: get accuracy and other scores through K-fold cv
# Multiple Classification - Model Pipeline
def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical','mixed'], skf_splits = 10):
    '''
    @param input_df: input features
    @type: df (gets converted to np.array for stratified Kfold, and helps identify names to apply column transformation)
    @param y_targetF: target (or output) feature
    @type: df or np.array

    Returns a nested dict of per-fold classification scores for each model.
    '''
    # Determine categorical and numerical features
    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
    numerical_ix
    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
    categorical_ix

    # Determine preprocessing steps ~ var_type
    if var_type == 'numerical':
        t = [('num', MinMaxScaler(), numerical_ix)]
    if var_type == 'categorical':
        t = [('cat', OneHotEncoder(), categorical_ix)]
    if var_type == 'mixed':
        t = [('cat', OneHotEncoder(), categorical_ix)
             , ('num', MinMaxScaler(), numerical_ix)]

    col_transform = ColumnTransformer(transformers = t
                                      , remainder='passthrough')

    #%% Define classification models to run
    log_reg = LogisticRegression(**rs)
    nb = BernoulliNB()
    knn = KNeighborsClassifier()
    svm = SVC(**rs)
    mlp = MLPClassifier(max_iter = 500, **rs)
    dt = DecisionTreeClassifier(**rs)
    et = ExtraTreesClassifier(**rs)
    rf = RandomForestClassifier(**rs)
    rf2 = RandomForestClassifier(
                     min_samples_leaf = 50,
                     n_estimators = 150,
                     bootstrap = True,
                     oob_score = True,
                     n_jobs = -1,
                     random_state = 42,
                     max_features = 'auto')
    xgb = XGBClassifier(**rs, verbosity = 0)

    classification_metrics = {
        'F1_score': []
        ,'MCC': []
        ,'Precision': []
        ,'Recall': []
        ,'Accuracy': []
        ,'ROC_curve': []
        }

    models = [
          ('Logistic Regression'  , log_reg)
        #, ('Naive Bayes'          , nb)
        , ('K-Nearest Neighbors'  , knn)
        # , ('SVM'                 , svm)
        # , ('MLP'                 , mlp)
        # , ('Decision Tree'       , dt)
        # , ('Extra Trees'         , et)
        # , ('Random Forest'       , rf)
        # , ('Naive Bayes'         , nb)
        #, ('Random Forest2'       , rf2)
        #, ('XGBoost'              , xgb)
        ]

    skf = StratifiedKFold(n_splits = skf_splits
                          , shuffle = True
                          , **rs)

    skf_dict = {}
    fold_no = 1
    fold_dict = {}
    for model_name, model in models:
        fold_dict.update({model_name: {}})

    #scores_df = pd.DataFrame()
    for train_index, test_index in skf.split(input_df, y_targetF):
        x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
        y_train_fold, y_test_fold = y_targetF.iloc[train_index], y_targetF.iloc[test_index]
        #print("Fold: ", fold_no, len(train_index), len(test_index))

        # for keys in skf_dict:
        for model_name, model in models:
            print("start of model", model_name, " loop", fold_no)
            #skf_dict.update({model_name: classification_metrics })
            model_pipeline = Pipeline(steps=[('prep'       , col_transform)
                                             , ('classifier' , model)])
            model_pipeline.fit(x_train_fold, y_train_fold)
            y_pred_fold = model_pipeline.predict(x_test_fold)

            #----------------
            # Model metrics
            #----------------
            score = f1_score(y_test_fold, y_pred_fold)
            mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
            fold = ("fold_" + str(fold_no))
            fold_dict[model_name].update({fold: {}})
            pp.pprint(fold_dict)
            print("end of model", model_name, " loop", fold_no)
            fold_dict[model_name][fold].update(classification_metrics)
            #fold_dict[model_name][fold]['F1_score'].append(score)
            fold_dict[model_name][fold].update({'F1_score': score})
            fold_dict[model_name][fold].update({'MCC': mcc})
        fold_no += 1

    #pp.pprint(skf_dict)
    return(fold_dict)
t3_res = MultClassPipeSKF(input_df = numerical_features_df
, y_targetF = target1
, var_type = 'numerical'
, skf_splits = 10)
#pp.pprint(t3_res)
#print(t3_res)
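MultClassPipeSKF in loopity_loop.py returns the raw nested fold_dict without averaging across folds, and each fold's entry is first seeded from the single shared classification_metrics template before the F1/MCC values overwrite it. A small post-processing sketch for turning a result such as t3_res into per-model means (a hedged, illustrative helper, not part of the commit):

# Hypothetical helper: average the per-fold F1_score and MCC values stored
# in a fold_dict of the shape {model: {fold: {metric: value}}}.
from statistics import mean

def summarise_fold_dict(fold_dict, metrics = ('F1_score', 'MCC')):
    summary = {}
    for model_name, folds in fold_dict.items():
        summary[model_name] = {m: mean(fold_scores[m] for fold_scores in folds.values())
                               for m in metrics}
    return summary

# e.g. pp.pprint(summarise_fold_dict(t3_res))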

my_data11.py (new file, 195 lines)

@@ -0,0 +1,195 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 5 12:57:32 2022
@author: tanu
"""
#%%
# Data, etc for now comes from my_data6.py and/or my_data5.py
#%% Specify dir and import functions
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/")
#%% Try combinations
#import sys, os
#os.system("imports.py")
def precision(y_true, y_pred):
    return precision_score(y_true, y_pred, pos_label = 1)
def recall(y_true, y_pred):
    return recall_score(y_true, y_pred, pos_label = 1)
def f1(y_true, y_pred):
    return f1_score(y_true, y_pred, pos_label = 1)
#%% Check df features
numerical_features_df.shape
categorical_features_df.shape
all_features_df.shape
all_features_df.dtypes
#%% Simple train and test data splits
target = target1
#target = target3
X_trainN, X_testN, y_trainN, y_testN = train_test_split(numerical_features_df,
target,
test_size = 0.33,
random_state = 42)
X_trainC, X_testC, y_trainC, y_testC = train_test_split(categorical_features_df,
target,
test_size = 0.33,
random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(all_features_df,
target,
test_size = 0.33,
random_state = 42)
#%% Stratified K-fold: Single model
input_df = numerical_features_df
#X_array = np.array(input_df)
var_type = 'numerical'
input_df = all_features_df
#X_array = np.array(input_df)
var_type = 'mixed'
input_df = categorical_features_df
#X_array = np.array(input_df)
var_type = 'categorical'
y_targetF = target1
#==============================================================================
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
numerical_ix
categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
categorical_ix
# Determine preprocessing steps ~ var_type
if var_type == 'numerical':
    t = [('num', MinMaxScaler(), numerical_ix)]
if var_type == 'categorical':
    t = [('cat', OneHotEncoder(), categorical_ix)]
if var_type == 'mixed':
    t = [('cat', OneHotEncoder(), categorical_ix)
         , ('num', MinMaxScaler(), numerical_ix)]
###############################################################################
col_transform = ColumnTransformer(transformers = t
, remainder='passthrough')
###############################################################################
rs = {'random_state': 42}
del(model1)
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
, ('log_reg', LogisticRegression(class_weight = 'balanced')) ])
# model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
# , ('log_reg', LogisticRegression(**rs)) ])
del(model1)
nb = BernoulliNB()
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
, ('nb', nb) ])
del(model1)
knn = KNeighborsClassifier()
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
, ('knn', knn) ])
del(model1)
svm = SVC(**rs)
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
, ('svm', svm) ])
del(model1)
mlp = MLPClassifier(max_iter = 500, **rs)
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
, ('mlp', mlp) ])
del(model1)
dt = DecisionTreeClassifier(**rs)
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
, ('dt', dt) ])
del(model1)
et = ExtraTreesClassifier(**rs)
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
, ('et', et) ])
del(model1)
rf = RandomForestClassifier(**rs)
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
, ('rf', rf) ])
###############################################################################
#%% run
del(mm)
skf = StratifiedKFold(n_splits = 10
, shuffle = True
, **rs)
#X_array = np.array(numerical_features_df)
#Y = target1
model_scores_df = pd.DataFrame()
fscoreL = []
mccL = []
presL = []
recallL = []
accuL = []
roc_aucL = []
# for train_index, test_index in skf.split(X_array, Y):
# x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
# y_train_fold, y_test_fold = Y[train_index], Y[test_index]
for train_index, test_index in skf.split(input_df, y_targetF):
    x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
    y_train_fold, y_test_fold = y_targetF.iloc[train_index], y_targetF.iloc[test_index]
    model1.fit(x_train_fold, y_train_fold)
    y_pred_fold = model1.predict(x_test_fold)

    #----------------
    # Model metrics
    #----------------
    # F1-Score
    fscore = f1_score(y_test_fold, y_pred_fold)
    fscoreL.append(fscore)
    fscoreM = mean(fscoreL)

    # Matthews correlation coefficient
    mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
    mccL.append(mcc)
    mccM = mean(mccL)

    # Precision
    pres = precision_score(y_test_fold, y_pred_fold)
    presL.append(pres)
    presM = mean(presL)

    # Recall
    recall = recall_score(y_test_fold, y_pred_fold)
    recallL.append(recall)
    recallM = mean(recallL)

    # Accuracy
    accu = accuracy_score(y_test_fold, y_pred_fold)
    accuL.append(accu)
    accuM = mean(accuL)

    # ROC_AUC
    roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
    roc_aucL.append(roc_auc)
    roc_aucM = mean(roc_aucL)

    model_scores_df = model_scores_df.append({'Model'    : model1.steps[1][0]
                                              ,'F1_score' : fscoreM
                                              , 'MCC'      : mccM
                                              , 'Precision': presM
                                              , 'Recall'   : recallM
                                              , 'Accuracy' : accuM
                                              , 'ROC_curve': roc_aucM}
                                              , ignore_index = True)
print('\nModel metrics:\n', model_scores_df)
mm = model_scores_df.mean()
print('\nModel metrics mean:\n', mm)
print('\nModel metrics:\n', model_scores_df)
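The hand-rolled single-model loop above reproduces what sklearn's cross_validate already provides. A hedged alternative sketch, reusing the model1, skf, input_df and y_targetF objects defined in this script (MCC is not a built-in scorer string, so it is wired in with make_scorer); illustrative only, not part of the commit:

# Hypothetical alternative: let cross_validate collect the per-fold scores
# for one pipeline instead of the manual SKF loop.
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, matthews_corrcoef

scoring = {'F1_score'  : 'f1'
           , 'MCC'      : make_scorer(matthews_corrcoef)
           , 'Precision': 'precision'
           , 'Recall'   : 'recall'
           , 'Accuracy' : 'accuracy'
           , 'ROC_curve': 'roc_auc'}

cv_results = cross_validate(model1, input_df, y_targetF, cv = skf, scoring = scoring)
for metric in scoring:
    print(metric, cv_results['test_' + metric].mean())

Note that the built-in 'roc_auc' scorer uses probability or decision-function scores rather than the hard predictions used above, so its values will not match roc_auc_score(y_test_fold, y_pred_fold) exactly.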

skf_mm.py (new file, 161 lines)

@@ -0,0 +1,161 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 10 10:33:15 2022
@author: tanu
"""
#%% Stratified KFold: Multiple_models:
input_df = numerical_features_df
#X_array = np.array(input_df)
var_type = 'numerical'
input_df = all_features_df
#X_array = np.array(input_df)
var_type = 'mixed'
input_df = categorical_features_df
#X_array = np.array(input_df)
var_type = 'categorical'
targetF = target1
#==============================================================================
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
numerical_ix
categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
categorical_ix
# Determine preprocessing steps ~ var_type
if var_type == 'numerical':
    t = [('num', MinMaxScaler(), numerical_ix)]
if var_type == 'categorical':
    t = [('cat', OneHotEncoder(), categorical_ix)]
if var_type == 'mixed':
    t = [('cat', OneHotEncoder(), categorical_ix)
         , ('num', MinMaxScaler(), numerical_ix)]
###############################################################################
col_transform = ColumnTransformer(transformers = t
, remainder='passthrough')
###############################################################################
rs = {'random_state': 42}
#log_reg = LogisticRegression(**rs)
log_reg = LogisticRegression(class_weight = 'balanced')
nb = BernoulliNB()
rf = RandomForestClassifier(**rs)
clfs = [('Logistic Regression', log_reg)
,('Naive Bayes' , nb)
, ('Random Forest' , rf)
]
#seed_skf = 42
skf = StratifiedKFold(n_splits = 10
, shuffle = True
#, random_state = seed_skf
, **rs)
#scores_df = pd.DataFrame()
fscoreL = []
mccL = []
presL = []
recallL = []
accuL = []
roc_aucL = []
# X_array = np.array(input_df)
# Y = np.array(target1)
# Y = target1
for train_index, test_index in skf.split(input_df, targetF):
    print('\nSKF train index:', train_index
          , '\nSKF test index:', test_index)
    x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
    y_train_fold, y_test_fold = targetF.iloc[train_index], targetF.iloc[test_index]

    # for train_index, test_index in skf.split(X_array, Y):
    #     print('\nSKF train index:', train_index
    #           , '\nSKF test index:', test_index)
    #     x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
    #     y_train_fold, y_test_fold = Y[train_index], Y[test_index]

    clf_scores_df = pd.DataFrame()
    for clf_name, clf in clfs:
        # model2 = Pipeline(steps=[('preprocess', MinMaxScaler())
        #                          , ('classifier', clf)])
        model2 = Pipeline(steps=[('preprocess', col_transform)
                                 , ('classifier', clf)])
        model2.fit(x_train_fold, y_train_fold)
        y_pred_fold = model2.predict(x_test_fold)

        #----------------
        # Model metrics
        #----------------
        # F1-Score
        fscore = f1_score(y_test_fold, y_pred_fold)
        fscoreL.append(fscore)
        # print('fscoreL Len: ', len(fscoreL))
        fscoreM = mean(fscoreL)

        # Matthews correlation coefficient
        mcc = matthews_corrcoef(y_test_fold, y_pred_fold)
        mccL.append(mcc)
        mccM = mean(mccL)

        # Precision
        pres = precision_score(y_test_fold, y_pred_fold)
        presL.append(pres)
        presM = mean(presL)

        # Recall
        recall = recall_score(y_test_fold, y_pred_fold)
        recallL.append(recall)
        recallM = mean(recallL)

        # Accuracy
        accu = accuracy_score(y_test_fold, y_pred_fold)
        accuL.append(accu)
        accuM = mean(accuL)

        # ROC_AUC
        roc_auc = roc_auc_score(y_test_fold, y_pred_fold)
        roc_aucL.append(roc_auc)
        roc_aucM = mean(roc_aucL)

        clf_scores_df = clf_scores_df.append({'Model'    : clf_name
                                              ,'F1_score' : fscoreM
                                              , 'MCC'      : mccM
                                              , 'Precision': presM
                                              , 'Recall'   : recallM
                                              , 'Accuracy' : accuM
                                              , 'ROC_curve': roc_aucM}
                                              , ignore_index = True)
    #scores_df = scores_df.append(clf_scores_df)
#%% Call functions
tN_res = MultClassPipeline(X_trainN, X_testN, y_trainN, y_testN)
tN_res
t2_res = MultClassPipeline2(X_train, X_test, y_train, y_test, input_df = all_features_df)
t2_res
#CHECK: numbers are awfully close to each other!
t3_res = MultClassPipeSKF(input_df = numerical_features_df
, y_targetF = target1
, var_type = 'numerical'
, skf_splits = 10)
t3_res
#CHECK: numbers are awfully close to each other!
t4_res = MultClassPipeSKF(input_df = all_features_df
, y_targetF = target1
, var_type = 'mixed'
, skf_splits = 10)
t4_res
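Both the in-file clfs loop above and the original MultClassPipeSKF initialise fscoreL, mccL and friends once, outside the classifier loop, so every classifier's "mean" is computed over all classifiers and all folds seen so far; that blending is the most likely reason the t3_res/t4_res numbers look awfully close to each other. A quick hedged diagnostic, assuming the objects defined above are still in scope (not part of the commit):

# Hypothetical check: after the loops, each flat metric list holds one entry
# per classifier per fold, confirming the reported means are blended.
print(len(fscoreL), skf.get_n_splits() * len(clfs))   # the two numbers should match

# A per-classifier container (as sketched for MultClassPipeSKF earlier on this
# page) keeps the scores separate, e.g.
# scores = {clf_name: {'F1_score': [], 'MCC': []} for clf_name, _ in clfs}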

untitled21.py (new file, 40 lines)

@@ -0,0 +1,40 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 10 18:06:34 2022
@author: tanu
"""
models = [
('Logistic Regression' , log_reg)
, ('K-Nearest Neighbors', knn)
]
classification_metrics = {
'F1_score': []
,'MCC': []
,'Precision': []
,'Recall': []
,'Accuracy': []
,'ROC_curve': []
}
folds=[1,2]
fold_no=1
fold_dict={}
for model_name, model in models:
    fold_dict.update({model_name: {}})

for f in folds:
    fold = ("fold_" + str(fold_no))
    for model_name, model in models:
        print("start of model", model_name, "fold: ", fold)
        fold_dict[model_name].update({fold: {}})
        fold_dict[model_name][fold].update(classification_metrics)
        print("end of model", model_name, "fold: ", fold)
        fold_dict[model_name][fold].update({'F1_score': random.randrange(1,10)})
    fold_no += 1
pp.pprint(fold_dict)
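One behaviour this scratch test shares with loopity_loop.py is worth flagging: update(classification_metrics) copies references to the same six list objects into every model/fold entry, so appending to any of those lists (as the commented-out append in loopity_loop.py would do) mutates them everywhere at once; it only works here because the lists are later overwritten with scalars. A hedged illustration of the difference, using hypothetical names rather than the committed code:

# Hypothetical illustration of the shared-template pitfall.
import copy

template = {'F1_score': []}

shared = {'fold_1': {}}
shared['fold_1'].update(template)            # copies a reference to the SAME list
shared['fold_1']['F1_score'].append(0.9)
print(template['F1_score'])                  # [0.9] -- the template list was mutated too

template2 = {'F1_score': []}
safe = {'fold_1': copy.deepcopy(template2)}  # each fold gets its own fresh list
safe['fold_1']['F1_score'].append(0.9)
print(template2['F1_score'])                 # []  -- the template stays empty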