added MultClassPipe2.py that has one hot encoder included

2022-03-07 18:27:29 +00:00 · 2022-03-07 18:27:29 +00:00 · dd8fd5b8ac
commit dd8fd5b8ac
parent b637ebc6d2
2 changed files with 216 additions and 0 deletions
--- a/MultClassPipe2.py
+++ b/MultClassPipe2.py
@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Mar  4 15:25:33 2022
+
+@author: tanu
+"""
+#%%
+import os, sys
+import pandas as pd
+from sklearn.linear_model import LogisticRegression
+from sklearn.naive_bayes import BernoulliNB
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
+from sklearn.neural_network import MLPClassifier
+from sklearn.pipeline import Pipeline
+from xgboost import XGBClassifier
+from sklearn.compose import ColumnTransformer
+from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, f1_score
+#%%
+# Shared keyword arguments that pin the random seed of every estimator
+# accepting ``random_state``.
+rs = dict(random_state=42)
+# Done: add preprocessing step with one hot encoder
+# TODO: supply stratified K-fold cv train and test data
+# TODO: get accuracy and other scores through K-fold cv
+
+# Multiple Classification - Model Pipeline
+def MultClassPipeline2(X_train, X_test, y_train, y_test, input_df):
+    """Fit several classifiers behind a shared preprocessing step and score them.
+
+    Each classifier is wrapped in its own Pipeline whose first step one-hot
+    encodes the categorical columns and min-max scales the numerical columns;
+    any remaining column is passed through unchanged.  Every pipeline is
+    fitted on the training split and scored on the test split.
+
+    Parameters
+    ----------
+    X_train, X_test
+        Feature tables used for fitting and for evaluation.  Columns are
+        selected with the column labels taken from ``input_df``, so both
+        tables must carry those labels.
+    y_train, y_test
+        Targets matching ``X_train`` / ``X_test``.
+        NOTE(review): the metrics are called with their default arguments,
+        which in scikit-learn means a binary target -- confirm before using
+        this on a multi-class target.
+    input_df
+        DataFrame used only to decide, by dtype, which columns are numerical
+        (int64/float64) and which are categorical (object/bool).
+
+    Returns
+    -------
+    pipelines : list
+        The fitted Pipeline objects, in the same order as the rows of
+        ``scores_df``.
+    scores_df : pandas.DataFrame
+        One row per model with the columns Model, F1_Score, Precision,
+        Recall, Accuracy and ROC_AUC.
+    """
+    # determine categorical and numerical features
+    numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
+    categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns
+
+    log_reg = LogisticRegression(**rs)
+    nb = BernoulliNB()
+    knn = KNeighborsClassifier()
+    svm = SVC(**rs)
+    mlp = MLPClassifier(max_iter=500, **rs)
+    dt = DecisionTreeClassifier(**rs)
+    et = ExtraTreesClassifier(**rs)
+    rf = RandomForestClassifier(**rs)
+    # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself is no longer
+    # accepted by current scikit-learn releases.
+    rf2 = RandomForestClassifier(
+                          min_samples_leaf=50,
+                          n_estimators=150,
+                          bootstrap=True,
+                          oob_score=True,
+                          n_jobs=-1,
+                          random_state=42,
+                          max_features='sqrt')
+
+    xgb = XGBClassifier(**rs, verbosity=0)
+
+    clfs = [
+            ('Logistic Regression', log_reg),
+            ('Naive Bayes', nb),
+            ('K-Nearest Neighbors', knn),
+            ('SVM', svm),
+            ('MLP', mlp),
+            ('Decision Tree', dt),
+            ('Extra Trees', et),
+            ('Random Forest', rf),
+            ('Random Forest2', rf2),
+            ('XGBoost', xgb)
+            ]
+
+    score_columns = ['Model', 'F1_Score', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC']
+    pipelines = []
+    # Rows are collected in a list and turned into a DataFrame once at the
+    # end: DataFrame.append was removed in pandas 2.0 and copied the whole
+    # frame on every iteration.
+    score_rows = []
+
+    for clf_name, clf in clfs:
+        # define the data preparation for the columns
+        # A fresh transformer per classifier, so every returned pipeline owns
+        # its own fitted preprocessing state.
+        # handle_unknown='ignore': a category that occurs only in X_test is
+        # encoded as all zeros instead of raising at predict time.
+        t = [('cat', OneHotEncoder(handle_unknown='ignore'), categorical_ix)
+             , ('num', MinMaxScaler(), numerical_ix)]
+
+        col_transform = ColumnTransformer(transformers = t
+                                          , remainder='passthrough')
+
+        pipeline = Pipeline(steps=[('prep', col_transform)
+                                   , ('classifier', clf)])
+
+        pipeline.fit(X_train, y_train)
+
+        # Model predictions
+        y_pred  = pipeline.predict(X_test)
+
+        # F1-Score
+        fscore  = f1_score(y_test, y_pred)
+        # Precision
+        pres    = precision_score(y_test, y_pred)
+        # Recall
+        rcall   = recall_score(y_test, y_pred)
+        # Accuracy
+        accu    = accuracy_score(y_test, y_pred)
+        # ROC_AUC
+        # NOTE(review): computed from hard class predictions rather than from
+        # scores/probabilities -- confirm this is the intended definition.
+        roc_auc = roc_auc_score(y_test, y_pred)
+
+        pipelines.append(pipeline)
+
+        score_rows.append({
+                           'Model'     : clf_name,
+                           'F1_Score'  : fscore,
+                           'Precision' : pres,
+                           'Recall'    : rcall,
+                           'Accuracy'  : accu,
+                           'ROC_AUC'   : roc_auc
+                           })
+
+    # Passing ``columns`` keeps the column order (and yields an empty frame
+    # with the right header should the classifier list ever be empty).
+    scores_df = pd.DataFrame(score_rows, columns=score_columns)
+
+    return pipelines, scores_df
+
--- a/my_data10.py
+++ b/my_data10.py
@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Mar  5 12:57:32 2022
+
+@author: tanu
+"""
+#%%
+# data, etc for now  comes from my_data6.py and/or my_data5.py
+#%%
+# ``os`` is not imported anywhere else in this file; import it here so this
+# cell also works when the file is run on its own.
+import os
+
+# Switch to the project checkout under the user's home directory
+# (presumably so that the local MultClassPipe2 import below resolves).
+homedir = os.path.expanduser("~")
+os.chdir(os.path.join(homedir, "git", "ML_AI_training"))
+
+# my function
+from MultClassPipe2 import MultClassPipeline2 
+#%% try combinations
+#import sys, os
+#os.system("imports.py")
+def precision(y_true, y_pred):
+    """Return ``precision_score`` for the given labels with ``pos_label`` fixed to 1."""
+    score = precision_score(y_true, y_pred, pos_label=1)
+    return score
+def recall(y_true, y_pred):
+    """Return ``recall_score`` for the given labels with ``pos_label`` fixed to 1."""
+    score = recall_score(y_true, y_pred, pos_label=1)
+    return score
+def f1(y_true, y_pred):
+    """Return ``f1_score`` for the given labels with ``pos_label`` fixed to 1."""
+    score = f1_score(y_true, y_pred, pos_label=1)
+    return score
+
+#%%
+
+# Quick look at the three feature tables. These are bare expressions: they
+# only evaluate the attribute and assign nothing.
+numerical_features_df.shape
+categorical_features_df.shape
+all_features_df.shape
+all_features_df.dtypes
+#%%
+target = target1
+#target = target3
+
+# The same target, test fraction and seed are used for all three splits;
+# only the feature table differs.
+
+# numerical features only
+X_trainN, X_testN, y_trainN, y_testN = train_test_split(
+    numerical_features_df, target, test_size=0.33, random_state=42)
+
+# categorical features only
+X_trainC, X_testC, y_trainC, y_testC = train_test_split(
+    categorical_features_df, target, test_size=0.33, random_state=42)
+
+# all features
+X_train, X_test, y_train, y_test = train_test_split(
+    all_features_df, target, test_size=0.33, random_state=42)
+#%%
+
+
+
+#%% with feature selection
+
+# Determine categorical and numerical features
+input_df = numerical_features_df.copy()
+#input_df = categorical_features_df
+#input_df = all_features_df
+
+# Column groups are derived from all_features_df, the table that
+# X_train / X_test were split from.
+numerical_ix = all_features_df.select_dtypes(include=['int64', 'float64']).columns
+numerical_ix
+categorical_ix = all_features_df.select_dtypes(include=['object', 'bool']).columns
+categorical_ix
+
+# prepare data
+# handle_unknown='ignore': a category that occurs only in X_test is encoded
+# as all zeros instead of making predict() raise.
+t = [('num', MinMaxScaler(), numerical_ix)
+     , ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_ix)]
+
+col_transform = ColumnTransformer(transformers = t
+                                  , remainder  = 'passthrough')
+
+# model pipeline
+model = Pipeline(steps=[('prep', col_transform)
+                        , ('classifier', LogisticRegression())])
+
+model.fit(X_train, y_train)
+y_pred = model.predict(X_test)
+y_pred
+
+# Recursive feature elimination with 10-fold cross-validation.
+# RFECV needs an estimator that exposes coef_ / feature_importances_ and it
+# hands that estimator column subsets of a plain array.  The full Pipeline
+# satisfies neither (its ColumnTransformer addresses columns by name), so the
+# selector is run on the classifier alone over the already-transformed
+# training matrix.
+# NOTE(review): the scaler/encoder are therefore fitted on all of X_train
+# before the CV folds are drawn -- confirm this is acceptable.
+X_train_prep = model.named_steps['prep'].transform(X_train)
+
+selector_log = RFECV(estimator = LogisticRegression()
+                       , cv = 10
+                       , step = 1)
+
+selector_log_x = selector_log.fit_transform(X_train_prep, y_train)
+
+# The support mask and the ranking live on the fitted selector, not on the
+# array returned by fit_transform.  Both refer to the transformed columns
+# (scaled numeric + one-hot), which need not line up with X_trainN.columns.
+print(selector_log.get_support())
+X_trainN.columns
+
+print(selector_log.ranking_)