From 42c8c47e2d3f548db50aae83717b5778e19e38d2 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall
Date: Thu, 26 May 2022 07:39:47 +0100
Subject: [PATCH] added UQ_MultClassPipe4.py and UQ_imbalance.py

---
 UQ_MultClassPipe4.py | 243 +++++++++++++++++++++++++++++++++++++++++++
 UQ_imbalance.py      |  68 ++++++++++++
 2 files changed, 311 insertions(+)
 create mode 100644 UQ_MultClassPipe4.py
 create mode 100644 UQ_imbalance.py

diff --git a/UQ_MultClassPipe4.py b/UQ_MultClassPipe4.py
new file mode 100644
index 0000000..634ebca
--- /dev/null
+++ b/UQ_MultClassPipe4.py
@@ -0,0 +1,243 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Mar 4 15:25:33 2022
+
+@author: tanu
+"""
+#%% Imports (deduplicated: numpy, SVC, XGBClassifier, MLPClassifier,
+# SGDClassifier, cross_validate and the preprocessing scalers were each
+# imported more than once in the original)
+import os, sys
+import itertools
+import pandas as pd
+import numpy as np
+import pprint as pp
+import seaborn as sns
+import matplotlib.pyplot as plt
+from statistics import mean, stdev, median, mode
+#from copy import deepcopy
+
+from sklearn.linear_model import (LogisticRegression, LinearRegression,
+                                  RidgeClassifier, SGDClassifier,
+                                  PassiveAggressiveClassifier)
+from sklearn.naive_bayes import BernoulliNB, MultinomialNB
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,
+                              AdaBoostClassifier)
+from sklearn.neural_network import MLPClassifier
+from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis,
+                                           QuadraticDiscriminantAnalysis)
+from xgboost import XGBClassifier
+
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
+from sklearn.compose import ColumnTransformer, make_column_transformer
+from sklearn.pipeline import Pipeline, make_pipeline
+from sklearn.feature_selection import RFE, RFECV
+
+from sklearn.metrics import (confusion_matrix, accuracy_score, precision_score,
+                             recall_score, roc_auc_score, roc_curve, f1_score,
+                             matthews_corrcoef, jaccard_score, make_scorer,
+                             classification_report, average_precision_score)
+
+from sklearn.model_selection import (cross_validate, train_test_split,
+                                     StratifiedKFold, RepeatedStratifiedKFold)
+
+from imblearn.over_sampling import RandomOverSampler, SMOTE
+from imblearn.under_sampling import EditedNearestNeighbours
+from imblearn.combine import SMOTEENN
+# Alias imblearn's Pipeline so it no longer shadows sklearn's Pipeline
+from imblearn.pipeline import Pipeline as ImbPipeline
+#from sklearn.datasets import make_classification
+
+print(np.__version__)
+print(pd.__version__)
+
+#%%
+rs = {'random_state': 42}
+njobs = {'n_jobs': 10}
+
+# NOTE: f1, precision, recall and jaccard use the binary default
+# (average = 'binary') and will raise on multiclass targets; a multiclass
+# variant is sketched below.
+scoring_fn = ({ 'fscore'    : make_scorer(f1_score)
+              , 'mcc'       : make_scorer(matthews_corrcoef)
+              , 'precision' : make_scorer(precision_score)
+              , 'recall'    : make_scorer(recall_score)
+              , 'accuracy'  : make_scorer(accuracy_score)
+              , 'roc_auc'   : make_scorer(roc_auc_score)
+              , 'jaccard'   : make_scorer(jaccard_score)
+              })
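+
+# A hedged multiclass variant of the scorer dict, since the file targets
+# multi-class problems. Macro averaging is an illustrative choice here, not
+# the only sensible one ('weighted' etc. may fit better).
+scoring_fn_multiclass = ({ 'fscore'    : make_scorer(f1_score, average = 'macro')
+                         , 'mcc'       : make_scorer(matthews_corrcoef)
+                         , 'precision' : make_scorer(precision_score, average = 'macro')
+                         , 'recall'    : make_scorer(recall_score, average = 'macro')
+                         , 'accuracy'  : make_scorer(accuracy_score)
+                         , 'jaccard'   : make_scorer(jaccard_score, average = 'macro')
+                         # roc_auc omitted: multiclass AUC needs predicted
+                         # probabilities (make_scorer(roc_auc_score,
+                         # needs_proba = True, multi_class = 'ovr')), which
+                         # e.g. SVC without probability = True cannot supply
+                         })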
+
+# Multiple Classification - Model Pipeline
+def MultClassPipeSKFCV(input_df, target, skf_cv, var_type = 'mixed'):
+    '''
+    @param input_df: input features
+    @type: df with input features WITHOUT the target variable
+
+    @param target: target (or output) feature
+    @type: df or np.array or Series
+
+    @param skf_cv: stratified K-fold: an int, or a StratifiedKFold() object
+    to allow shuffle and random state to be passed
+    @type: int or StratifiedKFold()
+
+    @param var_type: one of 'numerical', 'categorical' or 'mixed'; determines
+    which column transform to apply (MinMaxScaler and/or one-hot encoder)
+    @type: str
+
+    returns
+    Dict containing multiple classification scores for each model, as the
+    mean over the stratified K-folds, including training scores
+    '''
+    # Determine categorical and numerical features
+    numerical_ix   = input_df.select_dtypes(include=['int64', 'float64']).columns
+    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
+
+    # Determine preprocessing steps ~ var_type
+    if var_type == 'numerical':
+        t = [('num', MinMaxScaler(), numerical_ix)]
+    elif var_type == 'categorical':
+        t = [('cat', OneHotEncoder(), categorical_ix)]
+    elif var_type == 'mixed':
+        t = [('cat', OneHotEncoder(), categorical_ix)
+             , ('num', MinMaxScaler(), numerical_ix)]
+    else:
+        raise ValueError("var_type must be 'numerical', 'categorical' or 'mixed'")
+
+    col_transform = ColumnTransformer(transformers = t
+                                      , remainder = 'passthrough')
+
+    #%% Specify multiple classification models
+    log_reg = LogisticRegression(**rs)
+    nb      = BernoulliNB()
+    knn     = KNeighborsClassifier()
+    svm     = SVC(**rs)
+    mlp     = MLPClassifier(max_iter = 500, **rs)
+    dt      = DecisionTreeClassifier(**rs)
+    et      = ExtraTreesClassifier(**rs)
+    rf      = RandomForestClassifier(**rs, n_estimators = 1000)
+    rf2     = RandomForestClassifier(min_samples_leaf = 5
+                                     , n_estimators = 100 #10
+                                     , bootstrap = True
+                                     , oob_score = True
+                                     , **njobs
+                                     , **rs
+                                     , max_features = 'auto')
+    xgb = XGBClassifier(**rs, verbosity = 0, use_label_encoder = False)
+    lda = LinearDiscriminantAnalysis()
+    mnb = MultinomialNB()
+    pa  = PassiveAggressiveClassifier(**rs, **njobs)
+    sgd = SGDClassifier(**rs, **njobs)
+
+    models = [('Logistic Regression'   , log_reg)
+              , ('Naive Bayes'         , nb)
+              , ('K-Nearest Neighbors' , knn)
+              , ('SVM'                 , svm)
+              , ('MLP'                 , mlp)
+              # , ('Decision Tree'       , dt)
+              # , ('Extra Trees'         , et)
+              # , ('Random Forest'       , rf)
+              # , ('Random Forest2'      , rf2)
+              # , ('XGBoost'             , xgb)
+              # , ('LDA'                 , lda)
+              # , ('MultinomialNB'       , mnb)
+              # , ('PassiveAggressive'   , pa)
+              # , ('StochasticGDescent'  , sgd)
+              ]
+
+    mm_skf_scoresD = {}
+
+    for model_name, model_fn in models:
+        print('\nModel name:', model_name
+              , '\nModel func:', model_fn)
+
+        model_pipeline = Pipeline([('prep'  , col_transform)
+                                   , ('model' , model_fn)])
+
+        print('Running model pipeline:', model_pipeline)
+        skf_cv_mod = cross_validate(model_pipeline
+                                    , input_df
+                                    , target
+                                    , cv = skf_cv
+                                    , scoring = scoring_fn
+                                    , return_train_score = True)
+        mm_skf_scoresD[model_name] = {}
+        for key, value in skf_cv_mod.items():
+            print('\nkey:', key, '\nvalue:', value)
+            print('\nmean value:', mean(value))
+            mm_skf_scoresD[model_name][key] = round(mean(value), 2)
+    #pp.pprint(mm_skf_scoresD)
+    return(mm_skf_scoresD)
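+
+#%% Usage sketch on synthetic data. This cell is illustrative only: the demo
+# names (X_demo, y_demo, skf_demo, demo_scoresD) are not part of the original
+# analysis, and make_classification simply stands in for real data.
+from sklearn.datasets import make_classification
+
+X_demo, y_demo = make_classification(n_samples = 200
+                                     , n_features = 6
+                                     , n_informative = 4
+                                     , **rs)
+X_demo = pd.DataFrame(X_demo, columns = ['f' + str(i) for i in range(6)])
+
+skf_demo = StratifiedKFold(n_splits = 5, shuffle = True, **rs)
+demo_scoresD = MultClassPipeSKFCV(input_df = X_demo
+                                  , target = y_demo
+                                  , skf_cv = skf_demo
+                                  , var_type = 'numerical')
+pp.pprint(demo_scoresD)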
+
+#%%
+#=========================
+# Blind test: BTS results
+#=========================
+# Build the final results with all scores for a model on the held-out blind
+# test set. Wrapped here as a helper function (the name is ours) so that the
+# blind-test data (X_bts, y_bts) and the pipeline are passed in explicitly
+# rather than relied on as globals, which left them undefined in the original.
+def BlindTestScores(model_name, model_pipeline, input_df, target, X_bts, y_bts):
+    #bts_predict = gscv_fs.predict(X_bts)
+    model_pipeline.fit(input_df, target)
+    bts_predict = model_pipeline.predict(X_bts)
+
+    bts_mcc_score = round(matthews_corrcoef(y_bts, bts_predict), 2)
+    print('\nMCC on Blind test:'     , bts_mcc_score)
+    print('\nAccuracy on Blind test:', round(accuracy_score(y_bts, bts_predict), 2))
+
+    # Diff b/w train and bts test scores
+    # train_test_diff = train_bscore - bts_mcc_score
+    # print('\nDiff b/w train and blind test score (MCC):', train_test_diff)
+
+    # Create a dict with all blind-test scores
+    lr_btsD = { 'model_name'    : model_name
+              , 'bts_mcc'       : bts_mcc_score
+              , 'bts_fscore'    : round(f1_score(y_bts, bts_predict), 2)
+              , 'bts_precision' : round(precision_score(y_bts, bts_predict), 2)
+              , 'bts_recall'    : round(recall_score(y_bts, bts_predict), 2)
+              , 'bts_accuracy'  : round(accuracy_score(y_bts, bts_predict), 2)
+              , 'bts_roc_auc'   : round(roc_auc_score(y_bts, bts_predict), 2)
+              , 'bts_jaccard'   : round(jaccard_score(y_bts, bts_predict), 2)}
+
+    return(lr_btsD)

diff --git a/UQ_imbalance.py b/UQ_imbalance.py
new file mode 100644
index 0000000..0def695
--- /dev/null
+++ b/UQ_imbalance.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu May 26 05:19:25 2022
+
+@author: tanu
+"""
+#%% https://www.kite.com/blog/python/smote-python-imbalanced-learn-for-oversampling/
+from collections import Counter
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from sklearn.svm import SVC
+from imblearn.over_sampling import SMOTE, RandomOverSampler
+from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours
+from imblearn.combine import SMOTEENN
+
+rs = {'random_state': 42}  # was undefined here; mirrors UQ_MultClassPipe4.py
+#%%############################################################################
+
+def train_SVM(df):
+    # Select the feature columns
+    X = df.loc[:, df.columns != 'label']
+    # Select the label column
+    y = df.label
+
+    # Train an SVM with a linear kernel
+    clf = SVC(kernel='linear')
+    clf.fit(X, y)
+
+    return clf
+
+
+def plot_svm_boundary(clf, df, title):
+    fig, ax = plt.subplots()
+    X0, X1 = df.iloc[:, 0], df.iloc[:, 1]
+
+    x_min, x_max = X0.min() - 1, X0.max() + 1
+    y_min, y_max = X1.min() - 1, X1.max() + 1
+    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))
+
+    # Predict over the mesh grid and draw the decision regions
+    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
+    Z = Z.reshape(xx.shape)
+    out = ax.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
+
+    ax.scatter(X0, X1, c=df.label, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
+    ax.set_ylabel('y')
+    ax.set_xlabel('x')
+    ax.set_title(title)
+    plt.show()
+#%%############################################################################
+# NOTE: X (features) and y (labels) are assumed to be loaded by the calling
+# workflow; they are not defined in this file.
+
+# SMOTE number of neighbours
+#k = 1 (pnca, extra trees baseline is 0.49, numerical only)
+k = 1
+
+sm = SMOTE(sampling_strategy = 'auto', k_neighbors = k, **rs)
+X_sm, y_sm = sm.fit_resample(X, y)
+print(len(X_sm))   #228
+print(Counter(y))
+y_sm_df = y_sm.to_frame()
+y_sm_df.value_counts().plot(kind = 'bar')
+
+oversample = RandomOverSampler(sampling_strategy = 'minority')
+X_ros, y_ros = oversample.fit_resample(X, y)
+print(len(X_ros))  #228
+
+undersample = RandomUnderSampler(sampling_strategy = 'majority')
+X_rus, y_rus = undersample.fit_resample(X, y)
+print(len(X_rus))  #142
+
+sm_enn = SMOTEENN(enn = EditedNearestNeighbours(sampling_strategy = 'all'))
+X_enn, y_enn = sm_enn.fit_resample(X, y)
+print(len(X_enn))  #53
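+
+#%% Hedged usage sketch tying the helpers above to the resampled data, in the
+# style of the Kite tutorial this file follows. It assumes X has exactly two
+# numerical feature columns: plot_svm_boundary predicts over a 2-D mesh grid,
+# so a classifier trained on more than two features would fail here.
+# df_base, df_sm and clf_sm are illustrative names, not part of the original.
+df_base = pd.DataFrame(X).assign(label = np.asarray(y))
+plot_svm_boundary(train_SVM(df_base), df_base
+                  , 'SVM decision boundary, original (imbalanced) data')
+
+df_sm = pd.DataFrame(X_sm).assign(label = y_sm.values)
+clf_sm = train_SVM(df_sm)
+plot_svm_boundary(clf_sm, df_sm, 'SVM decision boundary after SMOTE (k = 1)')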