added proof of concept checks to make sure loopity loop is equivalent to cross_validate with stratified Kfold passed as a cv param
This commit is contained in:
parent
d0c329a1d9
commit
458a933d73
1 changed files with 166 additions and 0 deletions
166
practice_cv2.py
Normal file
166
practice_cv2.py
Normal file
|
@ -0,0 +1,166 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Tue Mar 15 11:09:50 2022
|
||||||
|
|
||||||
|
@author: tanu
|
||||||
|
"""
|
||||||
|
#%%
|
||||||
|
from sklearn.neighbors import KNeighborsClassifier
|
||||||
|
from sklearn.datasets import load_wine
|
||||||
|
from sklearn.model_selection import KFold
|
||||||
|
#%% Data
|
||||||
|
# Feature matrix and target column.
# NOTE(review): num_df_wtgt and numerical_FN are not defined in this file;
# presumably they come from an earlier script run in the same session - confirm.
X = num_df_wtgt[numerical_FN]
y = num_df_wtgt['mutation_class']

# Hold-out split: 33% of the rows go to the test set, shuffled with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X
                                                    , y
                                                    , test_size = 0.33
                                                    , random_state = 2
                                                    , shuffle = True)

# Keyword bundles unpacked into estimators / CV helpers further down.
rs = {'random_state': 42}
njobs = {'n_jobs': 10}

# 10-fold stratified splitter, shuffled and seeded via rs.
skf_cv = StratifiedKFold(n_splits = 10
                         #, shuffle = False, random_state= None)
                         , shuffle = True, **rs)
|
||||||
|
#%% Stratified Kfold: cross_val_score vs cross_validate vs for loop
|
||||||
|
#https://vitalflux.com/k-fold-cross-validation-python-example/
|
||||||
|
|
||||||
|
#Pipeline(steps=[('prep' , col_transform)
|
||||||
|
# , ('classifier' , model)])
|
||||||
|
|
||||||
|
# Two-step pipeline: MinMaxScaler followed by the classifier under test.
# The classifier is swapped by hand between runs; alternatives tried so far:
#   RandomForestClassifier(min_samples_leaf = 50, n_estimators = 150
#                          , bootstrap = True, oob_score = True
#                          , **njobs, **rs, max_features = 'auto')
#   KNeighborsClassifier()
pipeline = make_pipeline(MinMaxScaler(), BernoulliNB())
|
||||||
|
#%% Loopity Loop vs cross_val_score (NO SHUFFLE)
|
||||||
|
# Pass instance of pipeline and training and test data set
|
||||||
|
# cv=10 represents the StratifiedKFold with 10 folds
|
||||||
|
# F1 per fold with an integer cv (no shuffle option is available this way).
scores = cross_val_score(
    pipeline,
    X=num_df_wtgt[numerical_FN],
    y=num_df_wtgt['mutation_class'],
    scoring='f1',
    cv=10,
    **njobs,
)

# Average over the folds; the bare expression displays the rounded value
# when the cell is run interactively.
mean_score = mean(scores)
round(mean_score, 2)
|
||||||
|
|
||||||
|
# Conclusion: compared with the Loopity loop (shuffle = F), the scores are CONCORDANT!
|
||||||
|
# 0.67(shuffle = False, as no other option) vs 0.70 (shuffle = T) # RF
|
||||||
|
# 0.67(shuffle = False, as no other option) vs 0.67 (shuffle = F, and no random state) # RF
|
||||||
|
# 0.57(shuffle = False, as no other option) vs 0.57 (shuffle = F, and no random state) # NB
|
||||||
|
# 0.65(shuffle = False, as no other option) vs 0.65 (shuffle = F, and no random state) # KNN
|
||||||
|
# ROC_AUC DOES NOT match!
|
||||||
|
|
||||||
|
#%% Loopity Loop vs cross_val_score (SHUFFLE) ===> YAYYYY!
|
||||||
|
# Pass instance of pipeline and training and test data set
|
||||||
|
# cv = skf_cv passes the shuffled, seeded StratifiedKFold splitter (10 folds) defined above
|
||||||
|
# F1 per fold, this time splitting with the skf_cv splitter object instead of
# an integer fold count.
scores_sh = cross_val_score(
    pipeline,
    X=num_df_wtgt[numerical_FN],
    y=num_df_wtgt['mutation_class'],
    scoring='f1',
    cv=skf_cv,
    **njobs,
)

# Average over the folds; the bare expression displays the rounded value
# when the cell is run interactively.
mean_score_sh = mean(scores_sh)
round(mean_score_sh, 2)
|
||||||
|
#%% Loopity Loop vs cross_validate (NO SHUFFLE)
|
||||||
|
# cross_validate with an integer cv (unshuffled folds).
# The result dict gets its own name so that `skf_cv` keeps referring to the
# StratifiedKFold splitter that the later cells pass as `cv`.
# NOTE(review): scoring_fn is not defined in this file - confirm where it comes from.
skf_cv_noshuffle = cross_validate(pipeline
                                  , num_df_wtgt[numerical_FN]
                                  , num_df_wtgt['mutation_class']
                                  , cv = 10
                                  , scoring = scoring_fn
                                  #, shuffle = True, **rs
                                  , return_train_score = True)
skf_cv_noshuffle

# One row per fold, one column per timing / score entry.
cvscores_df = pd.DataFrame(skf_cv_noshuffle)

# Keep only the columns whose name contains 'test_' and average them over the folds.
cvscores_df_test = cvscores_df.filter(like='test_', axis=1)
cvscores_df_test_mean = cvscores_df_test.mean(axis = 0)
cvscores_df_test_mean
|
||||||
|
|
||||||
|
# Conclusion: compared with the Loopity loop (shuffle = F), the scores are CONCORDANT!
|
||||||
|
#%% Loopity Loop vs cross_validate (SHUFFLE) ===> YAYYYY!
|
||||||
|
# https://gitmotion.com/scikit-learn/70010376/add-shuffle-parameter-to-cross-val-score-and-gridsearchcv
|
||||||
|
# cross_validate driven by the skf_cv splitter rather than an integer fold count.
skf_cv_sh = cross_validate(
    pipeline,
    num_df_wtgt[numerical_FN],
    num_df_wtgt['mutation_class'],
    cv=skf_cv,
    scoring=scoring_fn,
    return_train_score=True,
)
skf_cv_sh

# Tabulate the per-fold results, keep the 'test_' columns, and average each
# of them over the folds.
cvscores_df_sh = pd.DataFrame(skf_cv_sh)
cvscores_df_test_sh = cvscores_df_sh.filter(like='test_', axis='columns')
cvscores_df_test_mean_sh = cvscores_df_test_sh.mean(axis='index')
cvscores_df_test_mean_sh
|
||||||
|
#%% Loopty Loop (ALL models + Shuffle) vs cross_validate(ALL models + Shuffle) : SUCCESS!!!!
|
||||||
|
# Candidate classifiers. Estimators that take a seed receive the shared one via rs.
log_reg = LogisticRegression(**rs)
nb = BernoulliNB()
knn = KNeighborsClassifier()
svm = SVC(**rs)
mlp = MLPClassifier(max_iter = 500, **rs)
dt = DecisionTreeClassifier(**rs)
et = ExtraTreesClassifier(**rs)
rf = RandomForestClassifier(**rs)
rf2 = RandomForestClassifier(
    min_samples_leaf = 50
    , n_estimators = 150
    , bootstrap = True
    , oob_score = True
    , **njobs
    , **rs
    # 'sqrt' is what 'auto' meant for classifiers; newer scikit-learn
    # releases no longer accept 'auto'.
    , max_features = 'sqrt')
xgb = XGBClassifier(**rs
                    , verbosity = 0, use_label_encoder = False)

# (display name, estimator) pairs consumed by the comparison loop.
# Each display name is listed once: the name is used as a dict key downstream,
# so a repeated entry would only refit the same model and overwrite its scores.
models = [('Logistic Regression', log_reg)
          , ('Naive Bayes'        , nb)
          , ('K-Nearest Neighbors', knn)
          , ('SVM'                , svm)
          , ('MLP'                , mlp)
          , ('Decision Tree'      , dt)
          , ('Extra Trees'        , et)
          , ('Random Forest'      , rf)
          , ('Random Forest2'     , rf2)
          , ('XGBoost'            , xgb)]
|
||||||
|
|
||||||
|
# Cross-validate every model in `models` with identical preprocessing and the
# same splitter, collecting the rounded per-entry means keyed by model name.
mm_skf_scoresD = {}
for model_name, model_fn in models:
    # print('\nModel_name:', model_name
    #       , '\nModel func:', model_fn
    #       , '\nList of models:', models)

    model_pipeline = Pipeline([('pre', MinMaxScaler()), ('model', model_fn)])
    print('Running model pipeline:', model_pipeline)

    skf_cv_mod = cross_validate(
        model_pipeline,
        X,
        y,
        cv=skf_cv,
        scoring=scoring_fn,
        return_train_score=True,
    )

    # Start from an empty entry so a repeated model name is reset, not merged.
    mm_skf_scoresD[model_name] = {}
    for key, value in skf_cv_mod.items():
        mean_value = mean(value)
        print('\nkey:', key, '\nvalue:', value)
        print('\nmean value:', mean_value)
        mm_skf_scoresD[model_name][key] = round(mean_value, 2)
|
||||||
|
# Dump the collected scores.
# NOTE(review): the original nesting of this call is not recoverable from the
# page; it is placed after the model loop - confirm.
pp.pprint(mm_skf_scoresD)

# construct df: one column per model, one row per timing / score entry
mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD)
mm_skf_scores_df_all

# Rows whose label contains 'test_' ...
mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis='index')
# ... and rows whose label contains 'train_' (helps to see if you trust the results).
mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis='index')
|
Loading…
Add table
Add a link
Reference in a new issue