diff --git a/MultClassPipe2.py b/MultClassPipe2.py
new file mode 100644
index 0000000..e4ea381
--- /dev/null
+++ b/MultClassPipe2.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Mar  4 15:25:33 2022
+
+@author: tanu
+"""
+#%%
+import pandas as pd
+from sklearn.linear_model import LogisticRegression
+from sklearn.naive_bayes import BernoulliNB
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
+from sklearn.neural_network import MLPClassifier
+from sklearn.pipeline import Pipeline
+from xgboost import XGBClassifier
+from sklearn.compose import ColumnTransformer
+from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
+from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
+#%%
+rs = {'random_state': 42}
+# Done: add preprocessing step with one-hot encoder
+# TODO: supply stratified K-fold CV train and test data
+# TODO: get accuracy and other scores through K-fold CV
+
+# Multiple Classification - Model Pipeline
+def MultClassPipeline2(X_train, X_test, y_train, y_test, input_df):
+
+    # Determine categorical and numerical features from the input data
+    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
+    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
+
+    log_reg = LogisticRegression(**rs)
+    nb = BernoulliNB()
+    knn = KNeighborsClassifier()
+    svm = SVC(**rs)
+    mlp = MLPClassifier(max_iter=500, **rs)
+    dt = DecisionTreeClassifier(**rs)
+    et = ExtraTreesClassifier(**rs)
+    rf = RandomForestClassifier(**rs)
+    rf2 = RandomForestClassifier(
+        min_samples_leaf=50,
+        n_estimators=150,
+        bootstrap=True,
+        oob_score=True,
+        n_jobs=-1,
+        random_state=42,
+        max_features='sqrt')  # 'auto' is deprecated; 'sqrt' is its value for classifiers
+
+    xgb = XGBClassifier(**rs, verbosity=0)
+
+    clfs = [
+        ('Logistic Regression', log_reg),
+        ('Naive Bayes', nb),
+        ('K-Nearest Neighbors', knn),
+        ('SVM', svm),
+        ('MLP', mlp),
+        ('Decision Tree', dt),
+        ('Extra Trees', et),
+        ('Random Forest', rf),
+        ('Random Forest2', rf2),
+        ('XGBoost', xgb)
+    ]
+
+    pipelines = []
+    scores = []
+
+    for clf_name, clf in clfs:
+        # Data preparation: one-hot encode categoricals, scale numericals.
+        # handle_unknown='ignore' keeps the encoder from failing on categories
+        # seen only in the test split. Swap in StandardScaler() for
+        # MinMaxScaler() to standardise instead of rescale.
+        t = [('cat', OneHotEncoder(handle_unknown='ignore'), categorical_ix),
+             ('num', MinMaxScaler(), numerical_ix)]
+
+        col_transform = ColumnTransformer(transformers=t,
+                                          remainder='passthrough')
+
+        pipeline = Pipeline(steps=[('prep', col_transform),
+                                   ('classifier', clf)])
+
+        pipeline.fit(X_train, y_train)
+
+        # Model predictions and scores on the held-out test split
+        y_pred = pipeline.predict(X_test)
+
+        pipelines.append(pipeline)
+        scores.append({
+            'Model': clf_name,
+            'F1_Score': f1_score(y_test, y_pred),
+            'Precision': precision_score(y_test, y_pred),
+            'Recall': recall_score(y_test, y_pred),
+            'Accuracy': accuracy_score(y_test, y_pred),
+            'ROC_AUC': roc_auc_score(y_test, y_pred)
+        })
+
+    # DataFrame.append() is removed in pandas 2.x; build the frame in one go
+    scores_df = pd.DataFrame(scores, columns=['Model', 'F1_Score', 'Precision',
+                                              'Recall', 'Accuracy', 'ROC_AUC'])
+
+    return pipelines, scores_df
diff --git a/my_data10.py b/my_data10.py
new file mode 100644
index 0000000..d37ff41
--- /dev/null
+++ b/my_data10.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Mar  5 12:57:32 2022
+
+@author: tanu
+"""
+#%%
+# data (numerical_features_df, categorical_features_df, all_features_df,
+# target1, target3) comes from my_data6.py and/or my_data5.py for now
+import os
+from sklearn.linear_model import LogisticRegression
+from sklearn.pipeline import Pipeline
+from sklearn.compose import ColumnTransformer
+from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
+from sklearn.model_selection import train_test_split
+from sklearn.feature_selection import RFECV
+from sklearn.metrics import precision_score, recall_score, f1_score
+#%%
+homedir = os.path.expanduser("~")
+os.chdir(homedir + "/git/ML_AI_training/")
+
+# my function
+from MultClassPipe2 import MultClassPipeline2
+#%% scorers pinned to the positive class
+def precision(y_true, y_pred):
+    return precision_score(y_true, y_pred, pos_label=1)
+
+def recall(y_true, y_pred):
+    return recall_score(y_true, y_pred, pos_label=1)
+
+def f1(y_true, y_pred):
+    return f1_score(y_true, y_pred, pos_label=1)
+
+#%% sanity-check the input frames
+print(numerical_features_df.shape)
+print(categorical_features_df.shape)
+print(all_features_df.shape)
+print(all_features_df.dtypes)
+#%%
+target = target1
+#target = target3
+X_trainN, X_testN, y_trainN, y_testN = train_test_split(numerical_features_df,
+                                                        target,
+                                                        test_size=0.33,
+                                                        random_state=42)
+
+X_trainC, X_testC, y_trainC, y_testC = train_test_split(categorical_features_df,
+                                                        target,
+                                                        test_size=0.33,
+                                                        random_state=42)
+
+X_train, X_test, y_train, y_test = train_test_split(all_features_df,
+                                                    target,
+                                                    test_size=0.33,
+                                                    random_state=42)
+#%% with feature selection
+
+# Determine categorical and numerical features; keep the frame used here
+# consistent with the split fitted below
+input_df = all_features_df
+#input_df = numerical_features_df
+#input_df = categorical_features_df
+
+numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
+categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
+
+# prepare data
+t = [('num', MinMaxScaler(), numerical_ix),
+     ('cat', OneHotEncoder(), categorical_ix)]
+
+col_transform = ColumnTransformer(transformers=t,
+                                  remainder='passthrough')
+
+# model pipeline
+model = Pipeline(steps=[('prep', col_transform),
+                        ('classifier', LogisticRegression())])
+
+model.fit(X_train, y_train)
+y_pred = model.predict(X_test)
+print(y_pred)
+
+# RFECV needs importance_getter to reach the coefficients inside the
+# pipeline; note that RFE masks columns positionally, so the name-based
+# column selection in the ColumnTransformer may need revisiting
+selector_log = RFECV(estimator=model,
+                     cv=10,
+                     step=1,
+                     importance_getter='named_steps.classifier.coef_')
+
+selector_log_x = selector_log.fit_transform(X_train, y_train)
+
+# get_support() and ranking_ live on the fitted selector, not on the
+# transformed array it returns
+print(selector_log.get_support())
+print(X_trainN.columns)
+print(selector_log.ranking_)
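
Note on the two remaining TODOs in MultClassPipe2.py: a minimal sketch of how stratified K-fold CV scoring could replace the single train/test split, reusing the same preprocessing-plus-classifier pipeline. The helper name run_cv_scores and the fold/shuffle settings are illustrative, not part of this patch.

# Sketch only: stratified K-fold CV scores for one pipeline.
# run_cv_scores is a hypothetical helper, not part of this patch.
from sklearn.model_selection import cross_validate, StratifiedKFold

def run_cv_scores(pipeline, X, y, n_splits=10):
    """Return the mean CV score per metric for one prep+classifier pipeline."""
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    cv_results = cross_validate(pipeline, X, y, cv=skf,
                                scoring=['f1', 'precision', 'recall',
                                         'accuracy', 'roc_auc'])
    # cross_validate keys each per-fold score array as 'test_<scorer>'
    return {metric: cv_results['test_' + metric].mean()
            for metric in ['f1', 'precision', 'recall', 'accuracy', 'roc_auc']}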
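
For reference, a usage sketch tying the two files together, assuming the feature frames and splits from my_data10.py are already in scope:

# Usage sketch: run the classifier comparison on the all-features split
pipelines, scores_df = MultClassPipeline2(X_train, X_test, y_train, y_test,
                                          input_df=all_features_df)
print(scores_df.sort_values('F1_Score', ascending=False))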