# ML_AI_training/my_data10.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar  5 12:57:32 2022

@author: tanu
"""
#%%
# Data etc. currently come from my_data6.py and/or my_data5.py
#%% Specify dir and import functions
# Switch the working directory to the project checkout under the user's home
# directory (presumably so that relative paths in the companion scripts
# resolve -- confirm against how those scripts load their data).
# `os` is imported here because this file has no import section of its own;
# the only import line below is commented out.
import os

homedir = os.path.expanduser("~")
os.chdir(os.path.join(homedir, "git", "ML_AI_training"))
#%% Try combinations
#import sys, os
#os.system("imports.py")
def precision(y_true, y_pred):
    """Return the precision of ``y_pred`` against ``y_true``.

    Label ``1`` is treated as the positive class.
    """
    positive_label = 1
    return precision_score(y_true, y_pred, pos_label=positive_label)
def recall(y_true, y_pred):
    """Return the recall of ``y_pred`` against ``y_true``.

    Label ``1`` is treated as the positive class.
    """
    positive_label = 1
    return recall_score(y_true, y_pred, pos_label=positive_label)
def f1(y_true, y_pred):
    """Return the F1 score of ``y_pred`` against ``y_true``.

    Label ``1`` is treated as the positive class.
    """
    positive_label = 1
    return f1_score(y_true, y_pred, pos_label=positive_label)

#%% Check df features
# Quick sanity check of the feature frames before modelling. These frames are
# not created in this file (presumably built by my_data5.py / my_data6.py).
# The lines are bare expressions: they only show a value when evaluated
# interactively and have no effect when the file is run as a script.
numerical_features_df.shape    # dimensions of the numerical-only features
categorical_features_df.shape  # dimensions of the categorical-only features
all_features_df.shape          # dimensions of the combined feature set
all_features_df.dtypes         # per-column dtypes of the combined feature set
#%% Simple train and test data splits
# Hold-out splits for the three feature sets. Every split uses the same
# target, the same test fraction (33%) and the same seed.
target = target1
# alternative target: target = target3

_split_opts = {'test_size': 0.33, 'random_state': 42}

# Numerical features only
X_trainN, X_testN, y_trainN, y_testN = train_test_split(
    numerical_features_df, target, **_split_opts)

# Categorical features only
X_trainC, X_testC, y_trainC, y_testC = train_test_split(
    categorical_features_df, target, **_split_opts)

# All features (numerical + categorical)
X_train, X_test, y_train, y_test = train_test_split(
    all_features_df, target, **_split_opts)
#%% Stratified K-fold: Single model

# Single model evaluated with stratified 10-fold cross-validation:
# min-max scale the features, then fit a class-weight-balanced logistic
# regression. Per-fold metrics are collected and averaged once at the end.
model1 = Pipeline(steps = [('preprocess', MinMaxScaler())
                           , ('log_reg', LogisticRegression(class_weight = 'balanced'))])

# Candidate classifiers; defined here but not used by the single-model loop
# in this cell.
rs = {'random_state': 42}
log_reg = LogisticRegression(**rs)
nb = BernoulliNB()
clfs = [('Logistic Regression', log_reg)
        , ('Naive Bayes', nb)]

seed_skf = 42
skf = StratifiedKFold(n_splits = 10
                      , shuffle = True
                      , random_state = seed_skf)

# skf.split() yields *positional* indices. Indexing a pandas Series with
# Y[idx] is label-based, which silently picks the wrong rows (or raises)
# whenever the index is not 0..n-1, so the target is converted to an array
# just like the features.
X_array = np.array(numerical_features_df)
Y = target1
Y_array = np.asarray(Y)

# One entry per fold for each metric
fscoreL      = []
mccL         = []
presL        = []
recallL      = []
accuL        = []
roc_aucL     = []

for train_index, test_index in skf.split(X_array, Y_array):
    x_train_fold, x_test_fold = X_array[train_index], X_array[test_index]
    y_train_fold, y_test_fold = Y_array[train_index], Y_array[test_index]

    model1.fit(x_train_fold, y_train_fold)
    y_pred_fold = model1.predict(x_test_fold)

    #----------------
    # Model metrics (per fold)
    #----------------
    fscoreL.append(f1_score(y_test_fold, y_pred_fold))
    mccL.append(matthews_corrcoef(y_test_fold, y_pred_fold))
    presL.append(precision_score(y_test_fold, y_pred_fold))
    # Appended directly rather than stored in a variable called `recall`,
    # which would overwrite the recall() helper defined earlier in the file.
    recallL.append(recall_score(y_test_fold, y_pred_fold))
    accuL.append(accuracy_score(y_test_fold, y_pred_fold))
    # NOTE(review): ROC AUC is computed from hard class predictions here;
    # roc_auc_score is normally given probabilities or decision scores.
    # Left unchanged so results stay comparable -- confirm intent.
    roc_aucL.append(roc_auc_score(y_test_fold, y_pred_fold))

# Average each metric over the folds (once, after the loop)
fscoreM  = mean(fscoreL)
mccM     = mean(mccL)
presM    = mean(presL)
recallM  = mean(recallL)
accuM    = mean(accuL)
roc_aucM = mean(roc_aucL)

# Built directly from a list of row dicts: DataFrame.append() was removed in
# pandas 2.0.
model_scores_df = pd.DataFrame([{'Model'      : model1.steps[1][0]
                                 , 'F1_score' : fscoreM
                                 , 'MCC'      : mccM
                                 , 'Precision': presM
                                 , 'Recall'   : recallM
                                 , 'Accuracy' : accuM
                                 , 'ROC_curve': roc_aucM}])
print('\nModel metrics:', model_scores_df)
#%% stratified KFold: Multiple_models:
# Several classifiers compared with stratified 10-fold cross-validation.
# Each classifier is wrapped in a pipeline whose preprocessing depends on the
# kind of features selected below. The result is `clf_scores_df`: one row per
# classifier holding that classifier's mean score over all folds.

# Available feature sets, keyed by the preprocessing they need.
feature_sets = {'numerical'    : numerical_features_df
                , 'mixed'      : all_features_df
                , 'categorical': categorical_features_df}

# Change var_type to switch the input. 'categorical' is the selection that
# was effectively active before (it was the last of three assignments).
var_type = 'categorical'
input_df = feature_sets[var_type]   # KeyError on an unknown var_type
#X_array = np.array(input_df)
Y = target1

#=================
# Columns routed to each transformer, selected by dtype
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns

# Determine preprocessing steps ~ var_type
if var_type == 'numerical':
    t = [('num', MinMaxScaler(), numerical_ix)]
elif var_type == 'categorical':
    t = [('cat', OneHotEncoder(), categorical_ix)]
else:
    # 'mixed' -- any other value was already rejected by the lookup above
    t = [('cat', OneHotEncoder(), categorical_ix)
         , ('num', MinMaxScaler(), numerical_ix)]

##############################
# Columns not matched by a transformer are passed through unchanged
col_transform = ColumnTransformer(transformers = t
                                  , remainder = 'passthrough')

rs = {'random_state': 42}

#log_reg = LogisticRegression(**rs)
log_reg = LogisticRegression(class_weight = 'balanced')
nb = BernoulliNB()
rf = RandomForestClassifier(**rs)

clfs = [('Logistic Regression', log_reg)
        , ('Naive Bayes'        , nb)
        , ('Random Forest'      , rf)
        ]

skf = StratifiedKFold(n_splits = 10
                      , shuffle = True
                      , **rs)

# Output column name -> scoring function called as fn(y_true, y_pred).
# NOTE(review): ROC AUC is computed from hard class predictions; roc_auc_score
# is normally given probabilities or decision scores. Left unchanged so results
# stay comparable -- confirm intent.
metric_fns = {'F1_score'   : f1_score
              , 'MCC'      : matthews_corrcoef
              , 'Precision': precision_score
              , 'Recall'   : recall_score
              , 'Accuracy' : accuracy_score
              , 'ROC_curve': roc_auc_score}

# Per-classifier, per-metric list of fold scores. Keeping the lists separate
# per classifier matters: sharing one list across classifiers makes every
# reported mean a blend of all models, so the rows come out nearly identical.
fold_scores = {clf_name: {metric: [] for metric in metric_fns}
               for clf_name, _ in clfs}

for train_index, test_index in skf.split(input_df, Y):
    print('\nSKF train index:', train_index
          , '\nSKF test index:', test_index)
    # skf.split() yields positional indices, hence .iloc
    x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
    y_train_fold, y_test_fold = Y.iloc[train_index], Y.iloc[test_index]

    for clf_name, clf in clfs:
        model2 = Pipeline(steps = [('preprocess', col_transform)
                                   , ('classifier', clf)])

        model2.fit(x_train_fold, y_train_fold)
        y_pred_fold = model2.predict(x_test_fold)

        #----------------
        # Model metrics (this classifier, this fold)
        #----------------
        for metric, metric_fn in metric_fns.items():
            fold_scores[clf_name][metric].append(metric_fn(y_test_fold, y_pred_fold))

# One row per classifier with its mean over all folds. Built from a list of
# row dicts because DataFrame.append() was removed in pandas 2.0.
clf_scores_df = pd.DataFrame([
    {'Model': clf_name
     , **{metric: mean(values) for metric, values in scores.items()}}
    for clf_name, scores in fold_scores.items()])


#%% Call functions

# Run the multi-classifier helpers (defined outside this file) on the splits
# and feature frames prepared above. The bare names after each call only
# display the result when the cell is evaluated interactively.

# Numerical-only hold-out split
tN_res = MultClassPipeline(X_trainN
                           , X_testN
                           , y_trainN
                           , y_testN)
tN_res

# All-features hold-out split
t2_res = MultClassPipeline2(X_train
                            , X_test
                            , y_train
                            , y_test
                            , input_df = all_features_df)
t2_res

#CHECK: numbers are awfully close to each other!

# Stratified K-fold runs share the same target and number of splits
_skf_common = {'y_targetF': target1, 'skf_splits': 10}

t3_res = MultClassPipeSKF(input_df = numerical_features_df
                          , var_type = 'numerical'
                          , **_skf_common)
t3_res

#CHECK: numbers are awfully close to each other!
t4_res = MultClassPipeSKF(input_df = all_features_df
                          , var_type = 'mixed'
                          , **_skf_common)
t4_res