added proof of concept checks to make sure loopity loop is equivalent to cross_validate with stratified Kfold passed as a cv param

2022-03-17 18:18:43 +00:00 · 2022-03-17 18:18:43 +00:00 · 458a933d73
commit 458a933d73
parent d0c329a1d9
1 changed files with 166 additions and 0 deletions
--- a/practice_cv2.py
+++ b/practice_cv2.py
@ -0,0 +1,166 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 Created on Tue Mar 15 11:09:50 2022
@author: tanu
 """
 #%%
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.datasets import load_wine
 from sklearn.model_selection import KFold
 #%% Data
 # Feature matrix and target column taken from the prepared dataframe.
 # NOTE(review): num_df_wtgt and numerical_FN are not defined in this file;
 # presumably they come from an earlier script/session -- confirm.
 X = num_df_wtgt[numerical_FN]
 y = num_df_wtgt['mutation_class']
 # Shuffled hold-out split with a fixed seed: 67% train / 33% test.
 X_train, X_test, y_train, y_test = train_test_split(X
                                             , y
                                             , test_size    = 0.33
                                             , random_state = 2
                                             , shuffle      = True)
 # Keyword bundles unpacked into estimators and CV helpers further down.
 rs = {'random_state': 42}
 njobs = {'n_jobs': 10}
 # Seeded, shuffled 10-fold stratified splitter; passed as `cv` by the
 # "SHUFFLE" cells and by the all-models loop.
 skf_cv = StratifiedKFold(n_splits = 10
                          #, shuffle = False, random_state= None)
                          , shuffle = True, **rs)
 #%% Stratified Kfold: cross_val_score vs cross_validate vs for loop
 # Reference: https://vitalflux.com/k-fold-cross-validation-python-example/
 #
 # Two-step pipeline: min-max scaling followed by the classifier under test.
 # Classifiers that were swapped into the second slot for the comparisons
 # recorded in the cells below:
 #   RandomForestClassifier(min_samples_leaf = 50, n_estimators = 150
 #                          , bootstrap = True, oob_score = True
 #                          , **njobs, **rs, max_features = 'auto')
 #   KNeighborsClassifier()
 # Alternative construction with named steps (not used here):
 #   Pipeline(steps=[('prep', col_transform), ('classifier', model)])
 pipeline = make_pipeline(MinMaxScaler(), BernoulliNB())
 #%% Loopity Loop vs cross_val_score (NO SHUFFLE)
 # Score the pipeline on the full feature/target data with an integer cv:
 # cv = 10 stands for a 10-fold StratifiedKFold, and cross_val_score has no
 # shuffle argument of its own, so this is the unshuffled baseline.
 scores = cross_val_score(
     pipeline,
     X       = num_df_wtgt[numerical_FN],
     y       = num_df_wtgt['mutation_class'],
     scoring = 'f1',
     cv      = 10,
     **njobs)
 mean_score = mean(scores)
 round(mean_score, 2)  # bare expression: echoes the value in an interactive cell
 # Conclusion: compared with Loopity loop at shuffle = F, the F1 values agree.
 # Recorded mean F1 (this cell vs Loopity loop):
 #   0.67 (shuffle = False, as no other option) vs 0.70 (shuffle = T)                     # RF
 #   0.67 (shuffle = False, as no other option) vs 0.67 (shuffle = F, no random state)    # RF
 #   0.57 (shuffle = False, as no other option) vs 0.57 (shuffle = F, no random state)    # NB
 #   0.65 (shuffle = False, as no other option) vs 0.65 (shuffle = F, no random state)    # KNN
 # ROC_AUC DOES NOT match!
 #%% Loopity Loop vs cross_val_score (SHUFFLE) ===> YAYYYY!
 # Same scoring call as the no-shuffle cell, except that cv is the explicit
 # splitter object skf_cv instead of the integer 10; shuffling is configured
 # on the splitter because cross_val_score takes no shuffle argument.
 scores_sh = cross_val_score(
     pipeline,
     X       = num_df_wtgt[numerical_FN],
     y       = num_df_wtgt['mutation_class'],
     scoring = 'f1',
     cv      = skf_cv,
     **njobs)
 mean_score_sh = mean(scores_sh)
 round(mean_score_sh, 2)  # bare expression: echoes the value in an interactive cell
 #%% Loopity Loop vs cross_validate  (NO SHUFFLE)
 # Multi-metric cross-validation with an integer cv (10 folds, no shuffling).
 # The result is bound to skf_cv_nosh rather than skf_cv: skf_cv holds the
 # StratifiedKFold splitter that later cells pass as `cv`, and rebinding it
 # to this result dict would break them.
 # NOTE(review): scoring_fn is not defined in this file -- confirm where it
 # is set up.
 skf_cv_nosh = cross_validate(pipeline
                      , num_df_wtgt[numerical_FN]
                      , num_df_wtgt['mutation_class']
                      , cv = 10
                      , scoring = scoring_fn
                      #, shuffle = True, **rs
                      , return_train_score=True)
 skf_cv_nosh
 # One row per fold, one column per timing/score entry.
 cvscores_df = pd.DataFrame(skf_cv_nosh)
 # Keep only the held-out ('test_*') columns and average them across folds.
 cvscores_df_test = cvscores_df.filter(like='test_', axis=1)
 cvscores_df_test_mean = cvscores_df_test.mean(axis = 0)
 cvscores_df_test_mean
 # Conclusion: compared with Loopity loop at shuffle = F, the results agree.
 #%% Loopity Loop vs cross_validate  (SHUFFLE) ===> YAYYYY!
 # Background on why shuffling has to come from the cv object:
 # https://gitmotion.com/scikit-learn/70010376/add-shuffle-parameter-to-cross-val-score-and-gridsearchcv
 # Multi-metric cross-validation using skf_cv as the splitter (instead of
 # cv = 10); train scores are returned alongside the test scores.
 skf_cv_sh = cross_validate(
     pipeline,
     num_df_wtgt[numerical_FN],
     num_df_wtgt['mutation_class'],
     cv                 = skf_cv,
     scoring            = scoring_fn,
     return_train_score = True)
 skf_cv_sh
 # Tabulate the per-fold results, keep the 'test_*' columns, then take the
 # column-wise mean over folds.
 cvscores_df_sh = pd.DataFrame(skf_cv_sh)
 cvscores_df_test_sh = cvscores_df_sh.filter(like = 'test_', axis = 1)
 cvscores_df_test_mean_sh = cvscores_df_test_sh.mean(axis = 0)
 cvscores_df_test_mean_sh
 #%% Loopty Loop (ALL models + Shuffle) vs cross_validate(ALL models + Shuffle) : SUCCESS!!!!
 # Candidate classifiers. **rs passes the shared random_state to every
 # estimator constructed with it.
 log_reg = LogisticRegression(**rs)
 nb      = BernoulliNB()
 knn     = KNeighborsClassifier()
 svm     = SVC(**rs)
 mlp     = MLPClassifier(max_iter = 500, **rs)
 dt      = DecisionTreeClassifier(**rs)
 et      = ExtraTreesClassifier(**rs)
 rf      = RandomForestClassifier(**rs)
 # Second random forest with explicit hyperparameters. max_features = 'sqrt'
 # is what 'auto' meant for classifiers; the 'auto' spelling is deprecated
 # in newer scikit-learn releases.
 rf2     = RandomForestClassifier(
                      min_samples_leaf = 50
                      , n_estimators     = 150
                      , bootstrap        = True
                      , oob_score        = True
                      , **njobs
                      , **rs
                      , max_features     = 'sqrt')
 xgb = XGBClassifier(**rs
                     , verbosity = 0, use_label_encoder =False)
 # (display name, estimator) pairs. Each name appears once: the names become
 # the keys of the per-model score dict, so a repeated entry would be fitted
 # again only to overwrite its own result.
 models = [('Logistic Regression', log_reg)
        , ('Naive Bayes'        , nb)
        , ('K-Nearest Neighbors', knn)
        , ('SVM'                , svm)
        , ('MLP'                , mlp)
        , ('Decision Tree'      , dt)
        , ('Extra Trees'        , et)
        , ('Random Forest'      , rf)
        , ('Random Forest2'     , rf2)
        , ('XGBoost'            , xgb)]
 # Mean cross-validation results per model:
 # {model name: {cross_validate key: mean over folds, rounded to 2 dp}}
 mm_skf_scoresD = {}
 for model_name, model_fn in models:
      # Every model gets the same preprocessing: min-max scaling, then the estimator.
      model_pipeline = Pipeline([('pre', MinMaxScaler()), ('model', model_fn)])
      print('Running model pipeline:', model_pipeline)
      skf_cv_mod = cross_validate(
          model_pipeline,
          X,
          y,
          cv                 = skf_cv,
          scoring            = scoring_fn,
          return_train_score = True)
      mm_skf_scoresD[model_name] = {}
      for metric_name, fold_values in skf_cv_mod.items():
           fold_mean = mean(fold_values)
           print('\nkey:', metric_name, '\nvalue:', fold_values)
           print('\nmean value:', fold_mean)
           mm_skf_scoresD[model_name][metric_name] = round(fold_mean, 2)
 pp.pprint(mm_skf_scoresD)
 # Build the summary table: one column per model, one row per result key.
 mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD)
 mm_skf_scores_df_all
 # Split the rows into held-out ('test_*') and training ('train_*') scores;
 # comparing the two helps judge whether the test results can be trusted.
 mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like = 'test_', axis = 0)
 mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like = 'train_', axis = 0)