renamed hyperparams to gscv
This commit is contained in:
parent a82358dbb4
commit ad5ebad7f8
31 changed files with 4433 additions and 0 deletions
109
earlier_versions/MultClassPipe.py
Normal file
@@ -0,0 +1,109 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 4 15:25:33 2022

@author: tanu
"""
#%%
import os, sys
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
#%%
rs = {'random_state': 42}
# TODO: add preprocessing step with one hot encoder (one possible sketch follows this listing)

# Multiple Classification - Model Pipeline
def MultClassPipeline(X_train, X_test, y_train, y_test):

    log_reg = LogisticRegression(**rs)
    nb = BernoulliNB()
    knn = KNeighborsClassifier()
    svm = SVC(**rs)
    mlp = MLPClassifier(max_iter=500, **rs)
    dt = DecisionTreeClassifier(**rs)
    et = ExtraTreesClassifier(**rs)
    rf = RandomForestClassifier(**rs)
    rf2 = RandomForestClassifier(
        min_samples_leaf=50,
        n_estimators=150,
        bootstrap=True,
        oob_score=True,
        n_jobs=-1,
        random_state=42,
        max_features='sqrt')  # 'auto' was removed for classifiers in scikit-learn 1.3; 'sqrt' is the equivalent

    xgb = XGBClassifier(**rs, verbosity=0)

    clfs = [
        ('Logistic Regression', log_reg),
        ('Naive Bayes', nb),
        ('K-Nearest Neighbors', knn),
        ('SVM', svm),
        ('MLP', mlp),
        ('Decision Tree', dt),
        ('Extra Trees', et),
        ('Random Forest', rf),
        ('Random Forest2', rf2),
        ('XGBoost', xgb)
    ]

    pipelines = []

    scores_df = pd.DataFrame(columns=['Model', 'F1_Score', 'MCC', 'Precision', 'Recall', 'Accuracy', 'ROC_AUC'])

    for clf_name, clf in clfs:

        pipeline = Pipeline(steps=[
            ('scaler', MinMaxScaler()),
            #('scaler', StandardScaler()),
            ('classifier', clf)
        ])
        pipeline.fit(X_train, y_train)

        # Model predictions
        y_pred = pipeline.predict(X_test)

        # F1-Score
        fscore = f1_score(y_test, y_pred)
        # Precision
        pres = precision_score(y_test, y_pred)
        # Recall
        recall = recall_score(y_test, y_pred)
        # Accuracy
        accu = accuracy_score(y_test, y_pred)
        # ROC_AUC (computed from hard class labels here, not predicted probabilities)
        roc_auc = roc_auc_score(y_test, y_pred)
        # Matthews correlation coefficient
        mcc = matthews_corrcoef(y_test, y_pred)

        pipelines.append(pipeline)

        # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead
        scores_df = pd.concat([scores_df,
                               pd.DataFrame([{'Model'     : clf_name,
                                              'F1_Score'  : fscore,
                                              'MCC'       : mcc,
                                              'Precision' : pres,
                                              'Recall'    : recall,
                                              'Accuracy'  : accu,
                                              'ROC_AUC'   : roc_auc}])],
                              ignore_index=True)

    return pipelines, scores_df
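The TODO near the top of the file asks for a one-hot-encoding preprocessing step. A minimal sketch of one way that could slot into the same Pipeline pattern, assuming hypothetical categorical and numeric column names (mutation_class, stability and affinity are placeholders, not columns from this repository):

# Hedged sketch only: column names below are placeholders, not part of this commit.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

categorical_cols = ['mutation_class']         # hypothetical categorical feature(s)
numeric_cols     = ['stability', 'affinity']  # hypothetical numeric features

preprocess = ColumnTransformer(transformers=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('scale',  MinMaxScaler(),                         numeric_cols)
])

pipeline = Pipeline(steps=[
    ('preprocess', preprocess),
    ('classifier', LogisticRegression(random_state=42))
])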
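For orientation, a minimal usage sketch of MultClassPipeline; the synthetic data and the 80/20 split below are illustrative assumptions, not part of this commit:

# Hypothetical driver script; assumes MultClassPipe.py is importable.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Synthetic binary-classification data standing in for the real feature matrix
X, y = make_classification(n_samples=500, n_features=20, random_state=42)
X = pd.DataFrame(X)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

pipelines, scores_df = MultClassPipeline(X_train, X_test, y_train, y_test)
print(scores_df.sort_values('MCC', ascending=False))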