renamed practice_cv2 to cross_validate_vs_loopity_loop
parent 0c4f1e1e5f
commit a82358dbb4
27 changed files with 0 additions and 4305 deletions
166
cross_validate_vs_loopity_loop.py
Normal file
@@ -0,0 +1,166 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 11:09:50 2022

@author: tanu
"""
#%% Imports
from statistics import mean
import pandas as pd
import pprint as pp

from sklearn.datasets import load_wine
from sklearn.model_selection import (KFold, StratifiedKFold, train_test_split
                                     , cross_val_score, cross_validate)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from xgboost import XGBClassifier
#%% Data
# num_df_wtgt (numerical features + target), numerical_FN (numerical feature
# names) and scoring_fn (the multi-metric scoring used with cross_validate
# below) are assumed to come from the data-loading script run beforehand.
X = num_df_wtgt[numerical_FN]
y = num_df_wtgt['mutation_class']

X_train, X_test, y_train, y_test = train_test_split(X
                                                     , y
                                                     , test_size = 0.33
                                                     , random_state = 2
                                                     , shuffle = True)

rs = {'random_state': 42}
njobs = {'n_jobs': 10}
skf_cv = StratifiedKFold(n_splits = 10
                         #, shuffle = False, random_state = None
                         , shuffle = True, **rs)
#%% Stratified Kfold: cross_val_score vs cross_validate vs for loop
# https://vitalflux.com/k-fold-cross-validation-python-example/

# Pipeline(steps = [('prep'       , col_transform)
#                   , ('classifier' , model)])

pipeline = make_pipeline(MinMaxScaler()
#                        , RandomForestClassifier(min_samples_leaf = 50
#                                                 , n_estimators = 150
#                                                 , bootstrap = True
#                                                 , oob_score = True
#                                                 , **njobs
#                                                 , **rs
#                                                 , max_features = 'auto'))
                         , BernoulliNB())
#                        , KNeighborsClassifier())
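#%% Manual "loopity loop" baseline (sketch)
# The cells below compare cross_val_score()/cross_validate() against a
# hand-rolled StratifiedKFold loop that lives in a separate script. This is
# only a minimal sketch of that loop, assuming the pipeline, X, y and skf_cv
# objects above and a binary 0/1 target (to match scoring = 'f1' below);
# fold_scores_manual is a hypothetical name, not the repo's.
from sklearn.metrics import f1_score
fold_scores_manual = []
for train_index, test_index in skf_cv.split(X, y):
    pipeline.fit(X.iloc[train_index], y.iloc[train_index])
    fold_pred = pipeline.predict(X.iloc[test_index])
    fold_scores_manual.append(f1_score(y.iloc[test_index], fold_pred))
round(mean(fold_scores_manual), 2)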
#%% Loopity Loop vs cross_val_score (NO SHUFFLE)
# Pass the pipeline instance plus the full X and y.
# cv = 10 gives a StratifiedKFold with 10 folds (no shuffling) for a classifier.
scores = cross_val_score(pipeline
                         , X = num_df_wtgt[numerical_FN]
                         , y = num_df_wtgt['mutation_class']
                         #, shuffle = True    # not a cross_val_score parameter
                         , scoring = 'f1'
                         , cv = 10
                         , **njobs)

mean_score = mean(scores)
round(mean_score, 2)

# Conclusion: compared with the Loopity loop (shuffle = F), the scores CONCUR!
# 0.67 (shuffle = False, as no other option) vs 0.70 (shuffle = T)                       # RF
# 0.67 (shuffle = False, as no other option) vs 0.67 (shuffle = F, and no random state)  # RF
# 0.57 (shuffle = False, as no other option) vs 0.57 (shuffle = F, and no random state)  # NB
# 0.65 (shuffle = False, as no other option) vs 0.65 (shuffle = F, and no random state)  # KNN
# ROC_AUC does NOT match!
#%% Loopity Loop vs cross_val_score (SHUFFLE) ===> YAYYYY!
# Same call as above, but the shuffled StratifiedKFold instance is passed via
# cv, since cross_val_score itself has no shuffle parameter.
scores_sh = cross_val_score(pipeline
                            , X = num_df_wtgt[numerical_FN]
                            , y = num_df_wtgt['mutation_class']
                            , scoring = 'f1'
                            #, cv = 10
                            , cv = skf_cv
                            , **njobs)
mean_score_sh = mean(scores_sh)
round(mean_score_sh, 2)
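#%% Fold-by-fold check (sketch)
# A minimal sketch of what "agreement with the loopity loop" means here: with
# the same skf_cv splitter and the same deterministic pipeline, the per-fold
# F1 scores should match, not just their means. fold_scores_manual is the
# hypothetical list from the sketch cell above, not the repo's own loop.
import numpy as np
np.allclose(scores_sh, fold_scores_manual)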
#%% Loopity Loop vs cross_validate (NO SHUFFLE)
# Results stored under their own name so the skf_cv splitter above is not overwritten.
cv_results = cross_validate(pipeline
                            , num_df_wtgt[numerical_FN]
                            , num_df_wtgt['mutation_class']
                            , cv = 10
                            , scoring = scoring_fn   # multi-metric scoring, defined with the data objects
                            #, shuffle = True, **rs  # not cross_validate parameters
                            , return_train_score = True)
cv_results
cvscores_df = pd.DataFrame(cv_results)
cvscores_df_test = cvscores_df.filter(like = 'test_', axis = 1)
cvscores_df_test_mean = cvscores_df_test.mean(axis = 0)
cvscores_df_test_mean

# Conclusion: compared with the Loopity loop (shuffle = F), the scores CONCUR!
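#%% scoring_fn (sketch)
# scoring_fn is assumed to be a multi-metric scoring mapping defined alongside
# the data objects; its actual contents live elsewhere in the repo. A
# hypothetical example of the kind of dict cross_validate() accepts:
example_scoring_fn = {'F1'        : 'f1'
                      , 'MCC'      : 'matthews_corrcoef'
                      , 'ROC_AUC'  : 'roc_auc'
                      , 'Accuracy' : 'accuracy'}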
#%% Loopity Loop vs cross_validate (SHUFFLE) ===> YAYYYY!
# https://gitmotion.com/scikit-learn/70010376/add-shuffle-parameter-to-cross-val-score-and-gridsearchcv
skf_cv_sh = cross_validate(pipeline
                           , num_df_wtgt[numerical_FN]
                           , num_df_wtgt['mutation_class']
                           #, cv = 10
                           , cv = skf_cv
                           , scoring = scoring_fn
                           , return_train_score = True)
skf_cv_sh
cvscores_df_sh = pd.DataFrame(skf_cv_sh)
cvscores_df_test_sh = cvscores_df_sh.filter(like = 'test_', axis = 1)
cvscores_df_test_mean_sh = cvscores_df_test_sh.mean(axis = 0)
cvscores_df_test_mean_sh
#%% Loopity Loop (ALL models + Shuffle) vs cross_validate (ALL models + Shuffle): SUCCESS!!!!
log_reg = LogisticRegression(**rs)
nb      = BernoulliNB()
knn     = KNeighborsClassifier()
svm     = SVC(**rs)
mlp     = MLPClassifier(max_iter = 500, **rs)
dt      = DecisionTreeClassifier(**rs)
et      = ExtraTreesClassifier(**rs)
rf      = RandomForestClassifier(**rs)
rf2     = RandomForestClassifier(min_samples_leaf = 50
                                 , n_estimators = 150
                                 , bootstrap = True
                                 , oob_score = True
                                 , **njobs
                                 , **rs
                                 , max_features = 'auto')
xgb = XGBClassifier(**rs
                    , verbosity = 0, use_label_encoder = False)

models = [('Logistic Regression' , log_reg)
          , ('Naive Bayes'        , nb)
          , ('K-Nearest Neighbors', knn)
          , ('SVM'                , svm)
          , ('MLP'                , mlp)
          , ('Decision Tree'      , dt)
          , ('Extra Trees'        , et)
          , ('Random Forest'      , rf)
          , ('Random Forest2'     , rf2)
          , ('XGBoost'            , xgb)]

mm_skf_scoresD = {}
for model_name, model_fn in models:
    # print('\nModel_name:', model_name
    #       , '\nModel func:', model_fn
    #       , '\nList of models:', models)

    model_pipeline = Pipeline([
        ('pre'     , MinMaxScaler())
        , ('model' , model_fn)])

    print('Running model pipeline:', model_pipeline)
    skf_cv_mod = cross_validate(model_pipeline
                                , X
                                , y
                                , cv = skf_cv
                                , scoring = scoring_fn
                                , return_train_score = True)
    mm_skf_scoresD[model_name] = {}
    for key, value in skf_cv_mod.items():
        print('\nkey:', key, '\nvalue:', value)
        print('\nmean value:', mean(value))
        mm_skf_scoresD[model_name][key] = round(mean(value), 2)
pp.pprint(mm_skf_scoresD)

# construct df
mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD)
mm_skf_scores_df_all
mm_skf_scores_df_test  = mm_skf_scores_df_all.filter(like = 'test_', axis = 0)
mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like = 'train_', axis = 0)  # helps to see if you trust the results
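#%% Train vs test gap (sketch)
# A minimal sketch, using the two frames built above, of how the train scores
# help decide whether to trust the test scores: metrics whose train score sits
# far above the test score point at overfitting. The names mm_train, mm_test
# and train_test_gap are illustrative, not from the repo.
mm_train = mm_skf_scores_df_train.rename(index = lambda i: i.replace('train_', ''))
mm_test  = mm_skf_scores_df_test.rename(index = lambda i: i.replace('test_', ''))
train_test_gap = mm_train - mm_test   # per-metric, per-model gap; large positive values suggest overfitting
train_test_gap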