ML_AI_training/my_datap11.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar  5 12:57:32 2022

@author: tanu
"""
#%%
# Data etc. currently come from my_data6.py and/or my_data5.py
#%% Specify dir and import functions
# `os` is not imported anywhere else in this file, so import it here to keep
# this cell runnable on its own.
import os

# Run from the project checkout under the user's home directory.
homedir = os.path.expanduser("~")
os.chdir(os.path.join(homedir, "git", "ML_AI_training"))
#%% Try combinations
#import sys, os
#os.system("imports.py")
def precision(y_true, y_pred):
    """Return `precision_score` for `y_pred` against `y_true`, with label 1 as the positive class."""
    score = precision_score(y_true, y_pred, pos_label=1)
    return score
def recall(y_true, y_pred):
    """Return `recall_score` for `y_pred` against `y_true`, with label 1 as the positive class."""
    score = recall_score(y_true, y_pred, pos_label=1)
    return score
def f1(y_true, y_pred):
    """Return `f1_score` for `y_pred` against `y_true`, with label 1 as the positive class."""
    score = f1_score(y_true, y_pred, pos_label=1)
    return score

#%% Check df features
# Report the dimensions of each feature table and the column dtypes of the
# combined one. Printed explicitly because bare expressions produce no output
# when the file is executed as a script.
print('numerical_features_df shape:', numerical_features_df.shape)
print('categorical_features_df shape:', categorical_features_df.shape)
print('all_features_df shape:', all_features_df.shape)
print('all_features_df dtypes:\n', all_features_df.dtypes)
#%% Simple train and test data splits
# Outcome used for all three hold-out splits below.
target = target1
#target = target3

# Every split uses the same test fraction and the same seed.
_split_opts = {'test_size': 0.33, 'random_state': 42}

# Numerical features only
X_trainN, X_testN, y_trainN, y_testN = train_test_split(
    numerical_features_df, target, **_split_opts)

# Categorical features only
X_trainC, X_testC, y_trainC, y_testC = train_test_split(
    categorical_features_df, target, **_split_opts)

# All features
X_train, X_test, y_train, y_test = train_test_split(
    all_features_df, target, **_split_opts)
#%% Stratified K-fold: Single model
# Candidate feature tables, keyed by the kind of variables they hold.
feature_sets = {'numerical'    : numerical_features_df
                , 'mixed'      : all_features_df
                , 'categorical': categorical_features_df}

# Choose ONE feature set by editing var_type. Previously the three options
# were assigned one after another, so only the last ('categorical') took
# effect when the cell was run as a whole; that is kept as the default here.
var_type = 'categorical'
input_df = feature_sets[var_type]

# Outcome used for the stratified K-fold evaluation.
y_targetF = target1
#==============================================================================
# Column labels grouped by dtype; they decide which transformer a column gets.
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns

# Determine preprocessing steps ~ var_type
if var_type == 'numerical':
    t = [('num', MinMaxScaler(), numerical_ix)]
elif var_type == 'categorical':
    t = [('cat', OneHotEncoder(), categorical_ix)]
elif var_type == 'mixed':
    t = [('cat', OneHotEncoder(), categorical_ix)
         , ('num', MinMaxScaler(), numerical_ix)]
else:
    # Fail here with a clear message instead of a NameError on `t` below.
    raise ValueError(f"Unknown var_type {var_type!r}; expected 'numerical', "
                     "'categorical' or 'mixed'")

###############################################################################
# Columns not named in `t` are passed through unchanged.
col_transform = ColumnTransformer(transformers = t
                                   , remainder='passthrough')

###############################################################################
# Seed passed to every estimator constructed with **rs below.
rs = {'random_state': 42}

# Candidate classifiers (names kept so they remain available interactively).
nb      = BernoulliNB()
knn     = KNeighborsClassifier()
svm     = SVC(**rs)
mlp     = MLPClassifier(max_iter = 500, **rs)
dt      = DecisionTreeClassifier(**rs)
et      = ExtraTreesClassifier(**rs)
rf      = RandomForestClassifier(**rs)

# Pipeline step name -> estimator.
# class_weight was 'unbalanced', which is not an option scikit-learn accepts;
# 'balanced' is the supported string value.
# NOTE(review): confirm 'balanced' (rather than no weighting) is what was meant.
classifiers = {'log_reg': LogisticRegression(class_weight = 'balanced')
               , 'nb'   : nb
               , 'knn'  : knn
               , 'svm'  : svm
               , 'mlp'  : mlp
               , 'dt'   : dt
               , 'et'   : et
               , 'rf'   : rf}

# Choose ONE classifier by editing model_key. 'rf' is the model the previous
# delete-and-reassign chain ended on. Plain reassignment replaces the old
# del(model1) calls, which raised NameError when model1 did not exist yet.
model_key = 'rf'

# NOTE(review): col_transform is built in this file but never used here; the
# pipeline applies MinMaxScaler to every input column. Confirm this is intended
# for non-numerical inputs.
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
                           , (model_key, classifiers[model_key])])
###############################################################################
#%% run
# Stratified 10-fold cross-validation of `model1` on `input_df` / `y_targetF`.
# One row of scores is recorded per fold and the folds are averaged afterwards.
skf = StratifiedKFold(n_splits = 10
                      , shuffle = True
                      , **rs)

# Per-fold scores, one list per metric.
fscoreL      = []
mccL         = []
presL        = []
recallL      = []
accuL        = []
roc_aucL     = []

for train_index, test_index in skf.split(input_df, y_targetF):
    x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
    y_train_fold, y_test_fold = y_targetF.iloc[train_index], y_targetF.iloc[test_index]

    model1.fit(x_train_fold, y_train_fold)
    y_pred_fold  = model1.predict(x_test_fold)

    #----------------
    # Model metrics
    #----------------
    # Scores are appended directly so no local named `recall` shadows the
    # recall() helper defined near the top of this file.
    fscoreL.append(f1_score(y_test_fold, y_pred_fold))
    mccL.append(matthews_corrcoef(y_test_fold, y_pred_fold))
    presL.append(precision_score(y_test_fold, y_pred_fold))
    recallL.append(recall_score(y_test_fold, y_pred_fold))
    accuL.append(accuracy_score(y_test_fold, y_pred_fold))
    # NOTE(review): ROC AUC is computed from predicted class labels, not from
    # probabilities or decision scores - confirm this is intended.
    roc_aucL.append(roc_auc_score(y_test_fold, y_pred_fold))

# One row per fold, holding that fold's own scores. Previously each row held
# the running mean up to that fold, so the mean below averaged running means.
# Built in one step because DataFrame.append no longer exists in pandas >= 2.0.
model_scores_df = pd.DataFrame({'Model'      : model1.steps[1][0]
                                , 'F1_score' : fscoreL
                                , 'MCC'      : mccL
                                , 'Precision': presL
                                , 'Recall'   : recallL
                                , 'Accuracy' : accuL
                                , 'ROC_curve': roc_aucL})
print('\nModel metrics:\n', model_scores_df)

# numeric_only skips the string 'Model' column, which cannot be averaged.
mm = model_scores_df.mean(numeric_only = True)

# Cross-validated means under their previous names.
fscoreM, mccM, presM, recallM, accuM, roc_aucM = (
    mm[col] for col in ('F1_score', 'MCC', 'Precision'
                        , 'Recall', 'Accuracy', 'ROC_curve'))

print('\nModel metrics mean:\n', mm)