ML_AI_training/my_data10.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar  5 12:57:32 2022

@author: tanu
"""
#%%
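# NOTE: the data frames, targets and sklearn names used below are not defined
# in this file; for now they come from my_data6.py and/or my_data5.py and are
# expected to already be in the namespace when these cells are run.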
#%%
# Work from the project checkout so that project-local modules can be imported.
# `os` is imported here because the file has no other active import of it.
import os

homedir = os.path.expanduser("~")
os.chdir(os.path.join(homedir, "git", "ML_AI_training"))

# my function
from MultClassPipe2 import MultClassPipeline2
#%% try combinations
#import sys, os
#os.system("imports.py")
def precision(y_true, y_pred):
    """Precision of ``y_pred`` against ``y_true``, taking label 1 as positive.

    Thin wrapper around ``precision_score`` (expected to already be in scope,
    presumably from sklearn.metrics) with ``pos_label`` fixed to 1.
    """
    score = precision_score(y_true, y_pred, pos_label=1)
    return score
def recall(y_true, y_pred):
    """Recall of ``y_pred`` against ``y_true``, taking label 1 as positive.

    Thin wrapper around ``recall_score`` (expected to already be in scope,
    presumably from sklearn.metrics) with ``pos_label`` fixed to 1.
    """
    score = recall_score(y_true, y_pred, pos_label=1)
    return score
def f1(y_true, y_pred):
    """F1 score of ``y_pred`` against ``y_true``, taking label 1 as positive.

    Thin wrapper around ``f1_score`` (expected to already be in scope,
    presumably from sklearn.metrics) with ``pos_label`` fixed to 1.
    """
    score = f1_score(y_true, y_pred, pos_label=1)
    return score

#%%

# Quick interactive look at the three feature frames (dimensions, then the
# column dtypes of the combined frame). These are bare expressions: their
# values are not stored or printed, so they are only useful when the cell is
# evaluated in an interactive console.
numerical_features_df.shape
categorical_features_df.shape
all_features_df.shape
all_features_df.dtypes
#%%
# Outcome to model; switch the assignment below to try the other target.
target = target1
#target = target3

# Every split uses the same hold-out fraction and the same seed
# (presumably so the three feature sets are split on the same rows).
_split_opts = {'test_size': 0.33, 'random_state': 42}

# numerical features only
X_trainN, X_testN, y_trainN, y_testN = train_test_split(
    numerical_features_df, target, **_split_opts)

# categorical features only
X_trainC, X_testC, y_trainC, y_testC = train_test_split(
    categorical_features_df, target, **_split_opts)

# numerical + categorical features together
X_train, X_test, y_train, y_test = train_test_split(
    all_features_df, target, **_split_opts)
#%%


#%% with feature selection

# Determine categorical and numerical features.
# NOTE: input_df is only a placeholder for the alternative inputs listed
# below; the pipeline itself is fitted on X_train / X_test, which are split
# from all_features_df.
input_df = numerical_features_df.copy()
#input_df = categorical_features_df
#input_df = all_features_df

numerical_ix = all_features_df.select_dtypes(include=['int64', 'float64']).columns
numerical_ix
categorical_ix = all_features_df.select_dtypes(include=['object', 'bool']).columns
categorical_ix

# prepare data: scale the numerical columns to [0, 1], one-hot encode the
# categorical ones, and pass any remaining column through untouched
t = [('num', MinMaxScaler(), numerical_ix)
     , ('cat', OneHotEncoder(), categorical_ix)]

col_transform = ColumnTransformer(transformers = t
                                  , remainder  = 'passthrough')

# model pipeline: pre-processing followed by logistic regression
model = Pipeline(steps=[('prep', col_transform)
                        , ('classifier', LogisticRegression())])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred

# Recursive feature elimination with cross-validation.
# RFECV needs an estimator that exposes coef_ / feature_importances_ and it
# eliminates columns of the matrix it is given. A Pipeline exposes neither,
# and the one-hot step changes the number of columns, so the selector is run
# on a fresh LogisticRegression over the already pre-processed training
# matrix instead of on the whole pipeline.
# NOTE(review): the pre-processing was fitted on all of X_train, so the CV
# folds inside RFECV share the scaler/encoder statistics.
fitted_prep = model.named_steps['prep']
X_train_prep = fitted_prep.transform(X_train)
prep_feature_names = fitted_prep.get_feature_names_out()

selector_log = RFECV(estimator = LogisticRegression()
                       , cv = 10
                       , step = 1)

# selector_log_x is the reduced training matrix (selected columns only)
selector_log_x = selector_log.fit_transform(X_train_prep, y_train)

# The support mask and ranking live on the fitted selector, not on the
# transformed array it returns.
selected_mask = selector_log.get_support()
print(selected_mask)
# names of the retained (pre-processed) features
print(prep_feature_names[selected_mask])

print(selector_log.ranking_)