# ML_AI_training/cross_validate_vs_loopity_loop.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 11:09:50 2022

@author: tanu
"""
#%%
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_wine
from sklearn.model_selection import KFold
#%% Data
# Feature matrix and target. `num_df_wtgt` and `numerical_FN` are not defined
# in this script; they are expected to exist already in the session.
X = num_df_wtgt[numerical_FN]
y = num_df_wtgt['mutation_class']

# Hold out a third of the rows; the fixed seed keeps the split reproducible.
# (The original call was missing its closing parenthesis.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=2,
    shuffle=True,
)

# Keyword-argument bundles reused by the estimators and CV calls below.
rs = {'random_state': 42}
njobs = {'n_jobs': 10}

# Shuffled, seeded 10-fold stratified splitter. The un-shuffled variant is
# StratifiedKFold(n_splits=10, shuffle=False, random_state=None).
skf_cv = StratifiedKFold(n_splits=10, shuffle=True, **rs)
#%% Stratified Kfold: cross_val_score vs cross_validate vs for loop
#https://vitalflux.com/k-fold-cross-validation-python-example/

#Pipeline(steps=[('prep'         , col_transform)
#                 , ('classifier' , model)])

# Two-step pipeline: MinMaxScaler followed by the classifier under comparison.
# Classifiers that were swapped in for the other recorded runs:
#   RandomForestClassifier(min_samples_leaf=50, n_estimators=150,
#                          bootstrap=True, oob_score=True,
#                          **njobs, **rs, max_features='auto')
#   KNeighborsClassifier()
pipeline = make_pipeline(
    MinMaxScaler(),
    BernoulliNB(),
)
#%% Loopity Loop vs cross_val_score (NO SHUFFLE)
# Baseline run: `cv` is a plain integer, so scikit-learn builds the folds
# itself and cross_val_score offers no way to request shuffling here.
# The full feature/target columns are passed (not the train/test split).
scores = cross_val_score(
    pipeline,
    X=num_df_wtgt[numerical_FN],
    y=num_df_wtgt['mutation_class'],
    scoring='f1',
    cv=10,
    **njobs,
)

# Average F1 over the folds, rounded for a quick look in the console.
mean_score = mean(scores)
round(mean_score, 2)

# Conclusion: compared with the Loopity loop, shuffle = F: CONCORDANT (scores agree)!
# 0.67(shuffle = False, as no other option) vs 0.70 (shuffle = T) # RF
# 0.67(shuffle = False, as no other option) vs 0.67 (shuffle = F, and no random state) # RF
# 0.57(shuffle = False, as no other option) vs 0.57 (shuffle = F, and no random state) # NB
# 0.65(shuffle = False, as no other option) vs 0.65 (shuffle = F, and no random state) # KNN
# ROC_AUC DOES NOT match!

#%% Loopity Loop vs cross_val_score (SHUFFLE) ===> YAYYYY!
# Shuffled run: same pipeline, data and metric as the cv=10 run, but the folds
# come from the `skf_cv` splitter object instead of an integer fold count.
scores_sh = cross_val_score(
    pipeline,
    X=num_df_wtgt[numerical_FN],
    y=num_df_wtgt['mutation_class'],
    scoring='f1',
    cv=skf_cv,
    **njobs,
)

# Average F1 over the folds, rounded for a quick look in the console.
mean_score_sh = mean(scores_sh)
round(mean_score_sh, 2)
#%% Loopity Loop vs cross_validate  (NO SHUFFLE)
# Un-shuffled cross_validate run (integer `cv`).
# The result is kept under its own name: assigning it to `skf_cv` would
# overwrite the StratifiedKFold splitter that the shuffled runs pass as `cv`.
skf_cv_nosh = cross_validate(
    pipeline,
    num_df_wtgt[numerical_FN],
    num_df_wtgt['mutation_class'],
    cv=10,
    scoring=scoring_fn,
    return_train_score=True,
)
skf_cv_nosh

# One row per fold; keep only the held-out ("test_*") columns and average
# them across folds.
cvscores_df = pd.DataFrame(skf_cv_nosh)
cvscores_df_test = cvscores_df.filter(like='test_', axis=1)
cvscores_df_test_mean = cvscores_df_test.mean(axis=0)
cvscores_df_test_mean

# Conclusion: compared with the Loopity loop, shuffle = F: CONCORDANT (scores agree)!
#%% Loopity Loop vs cross_validate  (SHUFFLE) ===> YAYYYY!
# https://gitmotion.com/scikit-learn/70010376/add-shuffle-parameter-to-cross-val-score-and-gridsearchcv
# Shuffled cross_validate run: folds are supplied by `skf_cv` rather than an
# integer fold count; train scores are returned alongside the test scores.
skf_cv_sh = cross_validate(
    pipeline,
    num_df_wtgt[numerical_FN],
    num_df_wtgt['mutation_class'],
    cv=skf_cv,
    scoring=scoring_fn,
    return_train_score=True,
)
skf_cv_sh

# Tabulate the per-fold results, keep the "test_*" columns, and take the
# column-wise mean over folds.
cvscores_df_sh = pd.DataFrame(skf_cv_sh)
cvscores_df_test_sh = cvscores_df_sh.filter(like='test_', axis='columns')
cvscores_df_test_mean_sh = cvscores_df_test_sh.mean(axis='index')
cvscores_df_test_mean_sh
#%% Loopty Loop (ALL models + Shuffle) vs cross_validate(ALL models + Shuffle) : SUCCESS!!!!
# One instance of every classifier to compare. Estimators that accept a seed
# get the shared `rs` bundle so repeated runs are reproducible.
log_reg = LogisticRegression(**rs)
nb = BernoulliNB()
knn = KNeighborsClassifier()
svm = SVC(**rs)
mlp = MLPClassifier(max_iter=500, **rs)
dt = DecisionTreeClassifier(**rs)
et = ExtraTreesClassifier(**rs)
rf = RandomForestClassifier(**rs)

# Tuned random forest. `max_features='sqrt'` replaces the former 'auto', which
# meant the same thing for classifiers but was deprecated in scikit-learn 1.1
# and removed in 1.3.
rf2 = RandomForestClassifier(
    min_samples_leaf=50,
    n_estimators=150,
    bootstrap=True,
    oob_score=True,
    max_features='sqrt',
    **njobs,
    **rs,
)

# NOTE(review): `use_label_encoder` is deprecated in recent xgboost releases;
# confirm against the installed version before removing it.
xgb = XGBClassifier(verbosity=0, use_label_encoder=False, **rs)

# (display name, estimator) pairs to evaluate. The display name is used as the
# key of the results dict, so each name appears exactly once; the original
# list contained ('Naive Bayes', nb) twice, which re-ran the same model and
# overwrote its own entry.
models = [
    ('Logistic Regression', log_reg),
    ('Naive Bayes', nb),
    ('K-Nearest Neighbors', knn),
    ('SVM', svm),
    ('MLP', mlp),
    ('Decision Tree', dt),
    ('Extra Trees', et),
    ('Random Forest', rf),
    ('Random Forest2', rf2),
    ('XGBoost', xgb),
]

# Nested results: {model name: {cross_validate key: mean over folds, 2 d.p.}}.
mm_skf_scoresD = {}
for model_name, model_fn in models:
    # Same preprocessing for every model: scale first, then the estimator.
    model_pipeline = Pipeline(
        steps=[
            ('pre', MinMaxScaler()),
            ('model', model_fn),
        ]
    )
    print('Running model pipeline:', model_pipeline)

    skf_cv_mod = cross_validate(
        model_pipeline,
        X,
        y,
        cv=skf_cv,
        scoring=scoring_fn,
        return_train_score=True,
    )

    # Collapse each per-fold array to its mean, echoing the raw values on the
    # way so individual folds can still be inspected in the console.
    mm_skf_scoresD[model_name] = {}
    for key, value in skf_cv_mod.items():
        fold_mean = mean(value)
        print('\nkey:', key, '\nvalue:', value)
        print('\nmean value:', fold_mean)
        mm_skf_scoresD[model_name][key] = round(fold_mean, 2)

pp.pprint(mm_skf_scoresD)

# Build the summary table: one column per model, one row per score key.
mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD)
mm_skf_scores_df_all

# Split the rows into held-out and training scores. Comparing the two helps
# judge how far the test results can be trusted.
mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis='index')
mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis='index')