added MultClassPipe2.py that has one hot encoder included

2022-03-07 18:27:29 +00:00 · 2022-03-07 18:27:29 +00:00 · dd8fd5b8ac
commit dd8fd5b8ac
parent b637ebc6d2
2 changed files with 216 additions and 0 deletions
--- a/my_data10.py
+++ b/my_data10.py
@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Mar  5 12:57:32 2022
+
+@author: tanu
+"""
+#%%
+# Data etc. currently come from my_data6.py and/or my_data5.py.
+#%%
+homedir = os.path.expanduser("~")
+os.chdir(homedir + "/git/ML_AI_training/")
+
+# my function
+from MultClassPipe2 import MultClassPipeline2 
+#%% try combinations
+#import sys, os
+#os.system("imports.py")
+def precision(y_true,y_pred):
+    return precision_score(y_true,y_pred,pos_label = 1)
+def recall(y_true,y_pred):
+    return recall_score(y_true, y_pred, pos_label = 1)
+def f1(y_true,y_pred):
+    return f1_score(y_true, y_pred, pos_label = 1)
+
+#%%
+
+numerical_features_df.shape
+categorical_features_df.shape
+all_features_df.shape
+all_features_df.dtypes
+#%%
+target = target1
+#target = target3
+X_trainN, X_testN, y_trainN, y_testN = train_test_split(numerical_features_df, 
+                                                    target, 
+                                                    test_size = 0.33, 
+                                                    random_state = 42)
+
+X_trainC, X_testC, y_trainC, y_testC = train_test_split(categorical_features_df, 
+                                                    target, 
+                                                    test_size = 0.33, 
+                                                    random_state = 42)
+
+X_train, X_test, y_train, y_test = train_test_split(all_features_df, 
+                                                    target, 
+                                                    test_size = 0.33, 
+                                                    random_state = 42)
+#%%
+
+
+
+#%% with feature selection
+
+# Determine categorical and numerical features
+input_df = numerical_features_df.copy()
+#input_df = categorical_features_df
+#input_df = all_features_df
+
+numerical_ix = all_features_df.select_dtypes(include=['int64', 'float64']).columns
+numerical_ix
+categorical_ix = all_features_df.select_dtypes(include=['object', 'bool']).columns
+categorical_ix
+
+# prepare data
+t = [('num', MinMaxScaler(), numerical_ix)
+     , ('cat', OneHotEncoder(), categorical_ix)]
+      
+col_transform = ColumnTransformer(transformers = t
+                                  , remainder  = 'passthrough')
+
+# model pipeline
+model = Pipeline(steps=[('prep', col_transform)
+                        , ('classifier', LogisticRegression())])
+
+model.fit(X_train, y_train)
+y_pred = model.predict(X_test)
+y_pred
+
+selector_log = RFECV(estimator = model
+                       , cv = 10
+                       , step = 1)
+
+selector_log_x = selector_log.fit_transform(X_train, y_train)
+
+print(selector_log_x.get_support())
+X_trainN.columns
+
+print(selector_logistic_x.ranking_)