From 69d0c1b557fd30a6f025a72074f964618768a0c2 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 10 Mar 2022 19:20:02 +0000 Subject: [PATCH] dict --- MultClassPipe3.py | 70 +++++++++-------- loopity_loop.py | 172 ++++++++++++++++++++++++++++++++++++++++ my_data11.py | 195 ++++++++++++++++++++++++++++++++++++++++++++++ skf_mm.py | 161 ++++++++++++++++++++++++++++++++++++++ untitled21.py | 40 ++++++++++ 5 files changed, 607 insertions(+), 31 deletions(-) create mode 100644 loopity_loop.py create mode 100644 my_data11.py create mode 100644 skf_mm.py create mode 100644 untitled21.py diff --git a/MultClassPipe3.py b/MultClassPipe3.py index d30a85d..b5570ae 100644 --- a/MultClassPipe3.py +++ b/MultClassPipe3.py @@ -92,15 +92,17 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical' clfs = [ ('Logistic Regression' , log_reg) - , ('Naive Bayes' , nb) + #, ('Naive Bayes' , nb) , ('K-Nearest Neighbors', knn) , ('SVM' , svm) , ('MLP' , mlp) , ('Decision Tree' , dt) , ('Extra Trees' , et) , ('Random Forest' , rf) - , ('Random Forest2' , rf2) - , ('XGBoost' , xgb) + , ('Naive Bayes' , nb) + + #, ('Random Forest2' , rf2) + #, ('XGBoost' , xgb) ] skf = StratifiedKFold(n_splits = skf_splits @@ -112,17 +114,20 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical' Y = y_targetF # Initialise score metrics list to store skf results - fscoreL = [] - mccL = [] - presL = [] - recallL = [] - accuL = [] - roc_aucL = [] + # fscoreL = [] + # mccL = [] + # presL = [] + # recallL = [] + # accuL = [] + # roc_aucL = [] + skf_dict = {} + #scores_df = pd.DataFrame() for train_index, test_index in skf.split(input_df, y_targetF): x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index] y_train_fold, y_test_fold = y_targetF.iloc[train_index], y_targetF.iloc[test_index] - + #fscoreL = {} + # for train_index, test_index in skf.split(X_array, Y): # print('\nSKF train index:', train_index # , '\nSKF test index:', test_index) @@ -139,7 +144,7 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical' , ('classifier' , clf)]) # model_pipeline = Pipeline(steps=[('prep' , MinMaxScaler()) - # , ('classifier' , clf)]) + # , ('classifier' , clf)]) model_pipeline.fit(x_train_fold, y_train_fold) @@ -150,33 +155,34 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical' #---------------- # F1-Score fscore = f1_score(y_test_fold, y_pred_fold) - fscoreL.append(fscore) - fscoreM = mean(fscoreL) + fscoreL[clf_name].append(fscore) + print('fscoreL Len: ', len(fscoreL)) + #fscoreM = mean(fscoreL[clf]) # Matthews correlation coefficient mcc = matthews_corrcoef(y_test_fold, y_pred_fold) - mccL.append(mcc) + mccL[clf_name].append(mcc) mccM = mean(mccL) - # Precision - pres = precision_score(y_test_fold, y_pred_fold) - presL.append(pres) - presM = mean(presL) + # # Precision + # pres = precision_score(y_test_fold, y_pred_fold) + # presL.append(pres) + # presM = mean(presL) - # Recall - recall = recall_score(y_test_fold, y_pred_fold) - recallL.append(recall) - recallM = mean(recallL) + # # Recall + # recall = recall_score(y_test_fold, y_pred_fold) + # recallL.append(recall) + # recallM = mean(recallL) - # Accuracy - accu = accuracy_score(y_test_fold, y_pred_fold) - accuL.append(accu) - accuM = mean(accuL) + # # Accuracy + # accu = accuracy_score(y_test_fold, y_pred_fold) + # accuL.append(accu) + # accuM = mean(accuL) - # ROC_AUC - roc_auc = roc_auc_score(y_test_fold, y_pred_fold) - roc_aucL.append(roc_auc) - roc_aucM = mean(roc_aucL) + # # ROC_AUC + # roc_auc = roc_auc_score(y_test_fold, y_pred_fold) + # roc_aucL.append(roc_auc) + # roc_aucM = mean(roc_aucL) clf_scores_df = clf_scores_df.append({'Model' : clf_name ,'F1_score' : fscoreM @@ -186,4 +192,6 @@ def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical' , 'Accuracy' : accuM , 'ROC_curve': roc_aucM} , ignore_index = True) - return clf_scores_df \ No newline at end of file + return(clf_scores_df) + #scores_df = scores_df.append(clf_scores_df) +# return clf_scores_df \ No newline at end of file diff --git a/loopity_loop.py b/loopity_loop.py new file mode 100644 index 0000000..936cc6d --- /dev/null +++ b/loopity_loop.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Mar 4 15:25:33 2022 + +@author: tanu +""" +#%% +import os, sys +import pandas as pd +import numpy as np +import pprint as pp +import random +from sklearn.linear_model import LogisticRegression +from sklearn.naive_bayes import BernoulliNB +from sklearn.neighbors import KNeighborsClassifier +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier +from sklearn.neural_network import MLPClassifier +from sklearn.pipeline import Pipeline +from xgboost import XGBClassifier +from sklearn.compose import ColumnTransformer +from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import MinMaxScaler, OneHotEncoder + +from sklearn.model_selection import cross_validate +from sklearn.model_selection import train_test_split +from sklearn.model_selection import StratifiedKFold + +from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score +from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef +from statistics import mean, stdev, median, mode +#%% +rs = {'random_state': 42} +# Done: add preprocessing step with one hot encoder +# TODO: supply stratified K-fold cv train and test data +# TODO: get accuracy and other scores through K-fold cv + +# Multiple Classification - Model Pipeline +def MultClassPipeSKF(input_df, y_targetF, var_type = ['numerical', 'categorical','mixed'], skf_splits = 10): + + ''' + @ param input_df: input features + @ type: df (gets converted to np.array for stratified Kfold, and helps identify names to apply column transformation) + + @param y_outputF: target (or output) feature + @type: df or np.array + + + returns + multiple classification model scores + + ''' + # Determine categorical and numerical features + numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns + numerical_ix + categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns + categorical_ix + + # Determine preprocessing steps ~ var_type + if var_type == 'numerical': + t = [('num', MinMaxScaler(), numerical_ix)] + + if var_type == 'categorical': + t = [('cat', OneHotEncoder(), categorical_ix)] + + if var_type == 'mixed': + t = [('cat', OneHotEncoder(), categorical_ix) + , ('num', MinMaxScaler(), numerical_ix)] + + col_transform = ColumnTransformer(transformers = t + , remainder='passthrough') + +#%% Define classification models to run + log_reg = LogisticRegression(**rs) + nb = BernoulliNB() + knn = KNeighborsClassifier() + svm = SVC(**rs) + mlp = MLPClassifier(max_iter = 500, **rs) + dt = DecisionTreeClassifier(**rs) + et = ExtraTreesClassifier(**rs) + rf = RandomForestClassifier(**rs) + rf2 = RandomForestClassifier( + min_samples_leaf = 50, + n_estimators = 150, + bootstrap = True, + oob_score = True, + n_jobs = -1, + random_state = 42, + max_features = 'auto') + + xgb = XGBClassifier(**rs, verbosity = 0) + classification_metrics = { + 'F1_score': [] + ,'MCC': [] + ,'Precision': [] + ,'Recall': [] + ,'Accuracy': [] + ,'ROC_curve': [] + } + models = [ + ('Logistic Regression' , log_reg) + #, ('Naive Bayes' , nb) + , ('K-Nearest Neighbors', knn) + # , ('SVM' , svm) + # , ('MLP' , mlp) + # , ('Decision Tree' , dt) + # , ('Extra Trees' , et) + # , ('Random Forest' , rf) + # , ('Naive Bayes' , nb) + + #, ('Random Forest2' , rf2) + #, ('XGBoost' , xgb) + ] + + skf = StratifiedKFold(n_splits = skf_splits + , shuffle = True + , **rs) + + skf_dict = {} + fold_no = 1 + fold_dict={} + + + for model_name, model in models: + fold_dict.update({ model_name: {}}) + + #scores_df = pd.DataFrame() + for train_index, test_index in skf.split(input_df, y_targetF): + x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index] + y_train_fold, y_test_fold = y_targetF.iloc[train_index], y_targetF.iloc[test_index] + #print("Fold: ", fold_no, len(train_index), len(test_index)) + + # for keys in skf_dict: + + for model_name, model in models: + print("start of model", model_name, " loop", fold_no) + #skf_dict.update({model_name: classification_metrics }) + model_pipeline = Pipeline(steps=[('prep' , col_transform) + , ('classifier' , model)]) + model_pipeline.fit(x_train_fold, y_train_fold) + y_pred_fold = model_pipeline.predict(x_test_fold) + + #---------------- + # Model metrics + #---------------- + score=f1_score(y_test_fold, y_pred_fold) + mcc = matthews_corrcoef(y_test_fold, y_pred_fold) + + fold=("fold_"+str(fold_no)) + + fold_dict[model_name].update({fold: {}}) + pp.pprint(fold_dict) + print("end of model", model_name, " loop", fold_no) + + fold_dict[model_name][fold].update(classification_metrics) + #fold_dict[model_name][fold]['F1_score'].append(score) + fold_dict[model_name][fold].update({'F1_score': score}) + fold_dict[model_name][fold].update({'MCC': mcc}) + + fold_no +=1 + #pp.pprint(skf_dict) + + return(fold_dict) + +t3_res = MultClassPipeSKF(input_df = numerical_features_df + , y_targetF = target1 + , var_type = 'numerical' + , skf_splits = 10) +#pp.pprint(t3_res) +#print(t3_res) diff --git a/my_data11.py b/my_data11.py new file mode 100644 index 0000000..e007cd1 --- /dev/null +++ b/my_data11.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Sat Mar 5 12:57:32 2022 + +@author: tanu +""" +#%% +# Data, etc for now comes from my_data6.py and/or my_data5.py +#%% Specify dir and import functions +homedir = os.path.expanduser("~") +os.chdir(homedir + "/git/ML_AI_training/") +#%% Try combinations +#import sys, os +#os.system("imports.py") +def precision(y_true,y_pred): + return precision_score(y_true,y_pred,pos_label = 1) +def recall(y_true,y_pred): + return recall_score(y_true, y_pred, pos_label = 1) +def f1(y_true,y_pred): + return f1_score(y_true, y_pred, pos_label = 1) + +#%% Check df features +numerical_features_df.shape +categorical_features_df.shape +all_features_df.shape +all_features_df.dtypes +#%% Simple train and test data splits +target = target1 +#target = target3 +X_trainN, X_testN, y_trainN, y_testN = train_test_split(numerical_features_df, + target, + test_size = 0.33, + random_state = 42) + +X_trainC, X_testC, y_trainC, y_testC = train_test_split(categorical_features_df, + target, + test_size = 0.33, + random_state = 42) + +X_train, X_test, y_train, y_test = train_test_split(all_features_df, + target, + test_size = 0.33, + random_state = 42) +#%% Stratified K-fold: Single model +input_df = numerical_features_df +#X_array = np.array(input_df) +var_type = 'numerical' + +input_df = all_features_df +#X_array = np.array(input_df) +var_type = 'mixed' + +input_df = categorical_features_df +#X_array = np.array(input_df) +var_type = 'categorical' + +y_targetF = target1 +#============================================================================== +numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns +numerical_ix + +categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns +categorical_ix + +# Determine preprocessing steps ~ var_type +if var_type == 'numerical': + t = [('num', MinMaxScaler(), numerical_ix)] + +if var_type == 'categorical': + t = [('cat', OneHotEncoder(), categorical_ix)] + +if var_type == 'mixed': + t = [('cat', OneHotEncoder(), categorical_ix) + , ('num', MinMaxScaler(), numerical_ix)] + +############################################################################### +col_transform = ColumnTransformer(transformers = t + , remainder='passthrough') + +############################################################################### +rs = {'random_state': 42} +del(model1) + +model1 = Pipeline(steps = [('preprocess', MinMaxScaler()) + , ('log_reg', LogisticRegression(class_weight = 'unbalanced')) ]) + +# model1 = Pipeline(steps = [('preprocess', MinMaxScaler()) +# , ('log_reg', LogisticRegression(**rs)) ]) + +del(model1) +nb = BernoulliNB() +model1 = Pipeline(steps = [('preprocess', MinMaxScaler()) + , ('nb', nb) ]) + +del(model1) +knn = KNeighborsClassifier() +model1 = Pipeline(steps = [('preprocess', MinMaxScaler()) + , ('knn', knn) ]) +del(model1) +svm = SVC(**rs) +model1 = Pipeline(steps = [('preprocess', MinMaxScaler()) + , ('svm', svm) ]) +del(model1) +mlp = MLPClassifier(max_iter = 500, **rs) +model1 = Pipeline(steps = [('preprocess', MinMaxScaler()) + , ('mlp', mlp) ]) +del(model1) +dt = DecisionTreeClassifier(**rs) +model1 = Pipeline(steps = [('preprocess', MinMaxScaler()) + , ('dt', dt) ]) +del(model1) +et = ExtraTreesClassifier(**rs) +model1 = Pipeline(steps = [('preprocess', MinMaxScaler()) + , ('et', et) ]) +del(model1) +rf = RandomForestClassifier(**rs) +model1 = Pipeline(steps = [('preprocess', MinMaxScaler()) + , ('rf', rf) ]) +############################################################################### +#%% run +del(mm) + +skf = StratifiedKFold(n_splits = 10 + , shuffle = True + , **rs) + +#X_array = np.array(numerical_features_df) +#Y = target1 + +model_scores_df = pd.DataFrame() +fscoreL = [] +mccL = [] +presL = [] +recallL = [] +accuL = [] +roc_aucL = [] + +# for train_index, test_index in skf.split(X_array, Y): +# x_train_fold, x_test_fold = X_array[train_index], X_array[test_index] +# y_train_fold, y_test_fold = Y[train_index], Y[test_index] +for train_index, test_index in skf.split(input_df, y_targetF): + x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index] + y_train_fold, y_test_fold = y_targetF.iloc[train_index], y_targetF.iloc[test_index] + + model1.fit(x_train_fold, y_train_fold) + y_pred_fold = model1.predict(x_test_fold) + + #---------------- + # Model metrics + #---------------- + # F1-Score + fscore = f1_score(y_test_fold, y_pred_fold) + fscoreL.append(fscore) + fscoreM = mean(fscoreL) + + # Matthews correlation coefficient + mcc = matthews_corrcoef(y_test_fold, y_pred_fold) + mccL.append(mcc) + mccM = mean(mccL) + + # Precision + pres = precision_score(y_test_fold, y_pred_fold) + presL.append(pres) + presM = mean(presL) + + # Recall + recall = recall_score(y_test_fold, y_pred_fold) + recallL.append(recall) + recallM = mean(recallL) + + # Accuracy + accu = accuracy_score(y_test_fold, y_pred_fold) + accuL.append(accu) + accuM = mean(accuL) + + # ROC_AUC + roc_auc = roc_auc_score(y_test_fold, y_pred_fold) + roc_aucL.append(roc_auc) + roc_aucM = mean(roc_aucL) + + model_scores_df = model_scores_df.append({'Model' : model1.steps[1][0] + ,'F1_score' : fscoreM + , 'MCC' : mccM + , 'Precision': presM + , 'Recall' : recallM + , 'Accuracy' : accuM + , 'ROC_curve': roc_aucM} + , ignore_index = True) +print('\nModel metrics:\n', model_scores_df) +mm = model_scores_df.mean() + +print('\nModel metrics mean:\n', mm) + +print('\nModel metrics:\n', model_scores_df) diff --git a/skf_mm.py b/skf_mm.py new file mode 100644 index 0000000..603e614 --- /dev/null +++ b/skf_mm.py @@ -0,0 +1,161 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu Mar 10 10:33:15 2022 + +@author: tanu +""" +#%% Stratified KFold: Multiple_models: +input_df = numerical_features_df +#X_array = np.array(input_df) +var_type = 'numerical' + +input_df = all_features_df +#X_array = np.array(input_df) +var_type = 'mixed' + +input_df = categorical_features_df +#X_array = np.array(input_df) +var_type = 'categorical' + +targetF = target1 +#============================================================================== +numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns +numerical_ix + +categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns +categorical_ix +# Determine preprocessing steps ~ var_type +if var_type == 'numerical': + t = [('num', MinMaxScaler(), numerical_ix)] + +if var_type == 'categorical': + t = [('cat', OneHotEncoder(), categorical_ix)] + +if var_type == 'mixed': + t = [('cat', OneHotEncoder(), categorical_ix) + , ('num', MinMaxScaler(), numerical_ix)] + +############################################################################### +col_transform = ColumnTransformer(transformers = t + , remainder='passthrough') + +############################################################################### +rs = {'random_state': 42} + +#log_reg = LogisticRegression(**rs) +log_reg = LogisticRegression(class_weight = 'balanced') +nb = BernoulliNB() +rf = RandomForestClassifier(**rs) + +clfs = [('Logistic Regression', log_reg) + ,('Naive Bayes' , nb) + , ('Random Forest' , rf) + ] + +#seed_skf = 42 +skf = StratifiedKFold(n_splits = 10 + , shuffle = True + #, random_state = seed_skf + , **rs) +#scores_df = pd.DataFrame() +fscoreL = [] +mccL = [] +presL = [] +recallL = [] +accuL = [] +roc_aucL = [] + +# X_array = np.array(input_df) +# Y = np.array(target1) +# Y = target1 + +for train_index, test_index in skf.split(input_df, targetF): + print('\nSKF train index:', train_index + , '\nSKF test index:', test_index) + x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index] + y_train_fold, y_test_fold = targetF.iloc[train_index], targetF.iloc[test_index] +# for train_index, test_index in skf.split(X_array, Y): +# print('\nSKF train index:', train_index +# , '\nSKF test index:', test_index) + # x_train_fold, x_test_fold = X_array[train_index], X_array[test_index] + # y_train_fold, y_test_fold = Y[train_index], Y[test_index] + + + clf_scores_df = pd.DataFrame() + for clf_name, clf in clfs: + # model2 = Pipeline(steps=[('preprocess', MinMaxScaler()) + # , ('classifier', clf)]) + model2 = Pipeline(steps=[('preprocess', col_transform) + , ('classifier', clf)]) + + model2.fit(x_train_fold, y_train_fold) + y_pred_fold = model2.predict(x_test_fold) + + #---------------- + # Model metrics + #---------------- + # F1-Score + fscore = f1_score(y_test_fold, y_pred_fold) + fscoreL.append(fscore) +# print('fscoreL Len: ', len(fscoreL)) + fscoreM = mean(fscoreL) + + # Matthews correlation coefficient + mcc = matthews_corrcoef(y_test_fold, y_pred_fold) + mccL.append(mcc) + mccM = mean(mccL) + + # Precision + pres = precision_score(y_test_fold, y_pred_fold) + presL.append(pres) + presM = mean(presL) + + # Recall + recall = recall_score(y_test_fold, y_pred_fold) + recallL.append(recall) + recallM = mean(recallL) + + # Accuracy + accu = accuracy_score(y_test_fold, y_pred_fold) + accuL.append(accu) + accuM = mean(accuL) + + # ROC_AUC + roc_auc = roc_auc_score(y_test_fold, y_pred_fold) + roc_aucL.append(roc_auc) + roc_aucM = mean(roc_aucL) + + clf_scores_df = clf_scores_df.append({'Model': clf_name + ,'F1_score' : fscoreM + , 'MCC' : mccM + , 'Precision': presM + , 'Recall' : recallM + , 'Accuracy' : accuM + , 'ROC_curve': roc_aucM} + , ignore_index = True) + #scores_df = scores_df.append(clf_scores_df) + + +#%% Call functions + +tN_res = MultClassPipeline(X_trainN, X_testN, y_trainN, y_testN) +tN_res + +t2_res = MultClassPipeline2(X_train, X_test, y_train, y_test, input_df = all_features_df) +t2_res + +#CHECK: numbers are awfully close to each other! + +t3_res = MultClassPipeSKF(input_df = numerical_features_df + , y_targetF = target1 + , var_type = 'numerical' + , skf_splits = 10) +t3_res + +#CHECK: numbers are awfully close to each other! +t4_res = MultClassPipeSKF(input_df = all_features_df + , y_targetF = target1 + , var_type = 'mixed' + , skf_splits = 10) +t4_res \ No newline at end of file diff --git a/untitled21.py b/untitled21.py new file mode 100644 index 0000000..a3b23be --- /dev/null +++ b/untitled21.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu Mar 10 18:06:34 2022 + +@author: tanu +""" +models = [ + ('Logistic Regression' , log_reg) + , ('K-Nearest Neighbors', knn) + ] + +classification_metrics = { + 'F1_score': [] + ,'MCC': [] + ,'Precision': [] + ,'Recall': [] + ,'Accuracy': [] + ,'ROC_curve': [] + } + +folds=[1,2] +fold_no=1 +fold_dict={} +for model_name, model in models: + fold_dict.update({model_name: {}}) + +for f in folds: + fold=("fold_"+str(fold_no)) + for model_name, model in models: + print("start of model", model_name, "fold: ", fold) + fold_dict[model_name].update({fold: {}}) + fold_dict[model_name][fold].update(classification_metrics) + + print("end of model", model_name, "fold: ", fold) + fold_dict[model_name][fold].update({'F1_score': random.randrange(1,10)}) + fold_no +=1 + pp.pprint(fold_dict) + + \ No newline at end of file