# ML_AI_training/earlier_versions/skf_mm.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 10 10:33:15 2022

@author: tanu
"""
#%% Stratified KFold: Multiple_models:
# Candidate feature sets, keyed by the var_type label that drives the
# preprocessing choice further down. Selecting through this mapping keeps
# input_df and var_type in sync; previously they were assigned three times in
# a row and only the last pair took effect.
feature_sets = {
    'numerical': numerical_features_df,
    'mixed': all_features_df,
    'categorical': categorical_features_df,
}

# Change this single value to switch the feature set used below.
var_type = 'categorical'
input_df = feature_sets[var_type]

# Target passed to the stratified split and used for model fitting.
targetF = target1
#==============================================================================
# Split the columns of input_df by dtype so each group gets its own transformer.
numerical_ix = input_df.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = input_df.select_dtypes(include=['object', 'bool']).columns

# Determine preprocessing steps ~ var_type
# handle_unknown='ignore' lets the encoder cope with a category that occurs in
# a test fold but not in the matching training fold; the default ('error')
# would abort cross-validation with a ValueError in that case.
if var_type == 'numerical':
    t = [('num', MinMaxScaler(), numerical_ix)]
elif var_type == 'categorical':
    t = [('cat', OneHotEncoder(handle_unknown='ignore'), categorical_ix)]
elif var_type == 'mixed':
    t = [('cat', OneHotEncoder(handle_unknown='ignore'), categorical_ix),
         ('num', MinMaxScaler(), numerical_ix)]
else:
    # Fail here with a clear message instead of a NameError on `t` below.
    raise ValueError(
        "var_type must be 'numerical', 'categorical' or 'mixed', got "
        + repr(var_type))

###############################################################################
# Columns not claimed by any transformer are passed through unchanged.
col_transform = ColumnTransformer(transformers=t,
                                  remainder='passthrough')

###############################################################################
# Seed keyword shared by the estimators / splitter that are given **rs below.
rs = dict(random_state=42)

# Classifiers under comparison. Note that log_reg is built without **rs;
# only the random forest and the splitter receive the shared seed.
log_reg = LogisticRegression(class_weight='balanced')
nb = BernoulliNB()
rf = RandomForestClassifier(**rs)

# (display name, estimator) pairs iterated for every fold.
clfs = [
    ('Logistic Regression', log_reg),
    ('Naive Bayes', nb),
    ('Random Forest', rf),
]

# 10-fold stratified splitter; shuffling is seeded through rs.
skf = StratifiedKFold(n_splits=10, shuffle=True, **rs)
#scores_df  = pd.DataFrame()
# Result-table column name -> scoring function called as fn(y_true, y_pred).
metric_fns = {
    'F1_score': f1_score,
    'MCC': matthews_corrcoef,
    'Precision': precision_score,
    'Recall': recall_score,
    'Accuracy': accuracy_score,
    # NOTE(review): scored on hard class predictions (model.predict), as
    # before; confirm whether probability scores were intended for ROC AUC.
    'ROC_curve': roc_auc_score,
}

# Per-fold scores are kept separately for every (model, metric) pair.
# Sharing one list per metric across all models made each reported "mean" a
# running average over every model and fold seen so far, which pulls the
# numbers for different models towards each other.
fold_scores = {clf_name: {metric: [] for metric in metric_fns}
               for clf_name, _ in clfs}

for train_index, test_index in skf.split(input_df, targetF):
    print('\nSKF train index:', train_index
          , '\nSKF test index:', test_index)
    x_train_fold, x_test_fold = input_df.iloc[train_index], input_df.iloc[test_index]
    y_train_fold, y_test_fold = targetF.iloc[train_index], targetF.iloc[test_index]

    for clf_name, clf in clfs:
        # Preprocessing lives inside the pipeline, so it is re-fitted on the
        # training part of each fold only.
        model2 = Pipeline(steps=[('preprocess', col_transform),
                                 ('classifier', clf)])

        model2.fit(x_train_fold, y_train_fold)
        y_pred_fold = model2.predict(x_test_fold)

        #----------------
        # Model metrics
        #----------------
        for metric, score_fn in metric_fns.items():
            fold_scores[clf_name][metric].append(
                score_fn(y_test_fold, y_pred_fold))

# One row per model holding the mean of each metric over all folds. The table
# is built once after the loop: re-creating it inside the fold loop kept only
# the last fold's rows, and DataFrame.append no longer exists in pandas >= 2.0.
clf_scores_df = pd.DataFrame(
    [{'Model': clf_name,
      **{metric: mean(scores) for metric, scores in per_metric.items()}}
     for clf_name, per_metric in fold_scores.items()]
)


#%% Call functions

# The MultClassPipeline* helpers are not defined in this section; they must be
# defined or imported before this cell is run. Bare names after each call are
# kept so the result is echoed when run interactively.

# Pre-split train/test data.
tN_res = MultClassPipeline(X_trainN, X_testN, y_trainN, y_testN)
tN_res

t2_res = MultClassPipeline2(
    X_train, X_test, y_train, y_test,
    input_df=all_features_df,
)
t2_res

# CHECK: numbers are awfully close to each other!

# Stratified K-fold on the numerical features.
t3_res = MultClassPipeSKF(
    input_df=numerical_features_df,
    y_targetF=target1,
    var_type='numerical',
    skf_splits=10,
)
t3_res

# CHECK: numbers are awfully close to each other!
# Stratified K-fold on all (mixed) features.
t4_res = MultClassPipeSKF(
    input_df=all_features_df,
    y_targetF=target1,
    var_type='mixed',
    skf_splits=10,
)
t4_res