renamed practice_cv2 to cross_validate_vs_loopity_loop
parent 0c4f1e1e5f
commit a82358dbb4
27 changed files with 0 additions and 4305 deletions
166
cross_validate_vs_loopity_loop.py
Normal file
@@ -0,0 +1,166 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 11:09:50 2022

@author: tanu
"""
#%% Imports
from statistics import mean
import pandas as pd
import pprint as pp

from sklearn.datasets import load_wine
from sklearn.model_selection import (KFold, StratifiedKFold, train_test_split
                                     , cross_val_score, cross_validate)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from xgboost import XGBClassifier
#%% Data
# num_df_wtgt (numerical features + target), numerical_FN (numerical feature
# names) and scoring_fn (the multi-metric scoring used with cross_validate
# below) are assumed to come from the data-loading script run beforehand.
X = num_df_wtgt[numerical_FN]
y = num_df_wtgt['mutation_class']

X_train, X_test, y_train, y_test = train_test_split(X
                                                     , y
                                                     , test_size = 0.33
                                                     , random_state = 2
                                                     , shuffle = True)

rs = {'random_state': 42}
njobs = {'n_jobs': 10}
skf_cv = StratifiedKFold(n_splits = 10
                         #, shuffle = False, random_state = None
                         , shuffle = True, **rs)
#%% Stratified Kfold: cross_val_score vs cross_validate vs for loop
# https://vitalflux.com/k-fold-cross-validation-python-example/

# Pipeline(steps = [('prep'       , col_transform)
#                   , ('classifier' , model)])

pipeline = make_pipeline(MinMaxScaler()
#                        , RandomForestClassifier(min_samples_leaf = 50
#                                                 , n_estimators = 150
#                                                 , bootstrap = True
#                                                 , oob_score = True
#                                                 , **njobs
#                                                 , **rs
#                                                 , max_features = 'auto'))
                         , BernoulliNB())
#                        , KNeighborsClassifier())
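#%% Manual "loopity loop" baseline (sketch)
# The cells below compare cross_val_score()/cross_validate() against a
# hand-rolled StratifiedKFold loop that lives in a separate script. This is
# only a minimal sketch of that loop, assuming the pipeline, X, y and skf_cv
# objects above and a binary 0/1 target (to match scoring = 'f1' below);
# fold_scores_manual is a hypothetical name, not the repo's.
from sklearn.metrics import f1_score
fold_scores_manual = []
for train_index, test_index in skf_cv.split(X, y):
    pipeline.fit(X.iloc[train_index], y.iloc[train_index])
    fold_pred = pipeline.predict(X.iloc[test_index])
    fold_scores_manual.append(f1_score(y.iloc[test_index], fold_pred))
round(mean(fold_scores_manual), 2)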
#%% Loopity Loop vs cross_val_score (NO SHUFFLE)
# Pass the pipeline instance plus the full X and y.
# cv = 10 gives a StratifiedKFold with 10 folds (no shuffling) for a classifier.
scores = cross_val_score(pipeline
                         , X = num_df_wtgt[numerical_FN]
                         , y = num_df_wtgt['mutation_class']
                         #, shuffle = True    # not a cross_val_score parameter
                         , scoring = 'f1'
                         , cv = 10
                         , **njobs)

mean_score = mean(scores)
round(mean_score, 2)

# Conclusion: compared with the Loopity loop (shuffle = F), the scores CONCUR!
# 0.67 (shuffle = False, as no other option) vs 0.70 (shuffle = T)                       # RF
# 0.67 (shuffle = False, as no other option) vs 0.67 (shuffle = F, and no random state)  # RF
# 0.57 (shuffle = False, as no other option) vs 0.57 (shuffle = F, and no random state)  # NB
# 0.65 (shuffle = False, as no other option) vs 0.65 (shuffle = F, and no random state)  # KNN
# ROC_AUC does NOT match!
#%% Loopity Loop vs cross_val_score (SHUFFLE) ===> YAYYYY!
# Same call as above, but the shuffled StratifiedKFold instance is passed via
# cv, since cross_val_score itself has no shuffle parameter.
scores_sh = cross_val_score(pipeline
                            , X = num_df_wtgt[numerical_FN]
                            , y = num_df_wtgt['mutation_class']
                            , scoring = 'f1'
                            #, cv = 10
                            , cv = skf_cv
                            , **njobs)
mean_score_sh = mean(scores_sh)
round(mean_score_sh, 2)
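#%% Fold-by-fold check (sketch)
# A minimal sketch of what "agreement with the loopity loop" means here: with
# the same skf_cv splitter and the same deterministic pipeline, the per-fold
# F1 scores should match, not just their means. fold_scores_manual is the
# hypothetical list from the sketch cell above, not the repo's own loop.
import numpy as np
np.allclose(scores_sh, fold_scores_manual)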
#%% Loopity Loop vs cross_validate (NO SHUFFLE)
# Results stored under their own name so the skf_cv splitter above is not overwritten.
cv_results = cross_validate(pipeline
                            , num_df_wtgt[numerical_FN]
                            , num_df_wtgt['mutation_class']
                            , cv = 10
                            , scoring = scoring_fn   # multi-metric scoring, defined with the data objects
                            #, shuffle = True, **rs  # not cross_validate parameters
                            , return_train_score = True)
cv_results
cvscores_df = pd.DataFrame(cv_results)
cvscores_df_test = cvscores_df.filter(like = 'test_', axis = 1)
cvscores_df_test_mean = cvscores_df_test.mean(axis = 0)
cvscores_df_test_mean

# Conclusion: compared with the Loopity loop (shuffle = F), the scores CONCUR!
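#%% scoring_fn (sketch)
# scoring_fn is assumed to be a multi-metric scoring mapping defined alongside
# the data objects; its actual contents live elsewhere in the repo. A
# hypothetical example of the kind of dict cross_validate() accepts:
example_scoring_fn = {'F1'        : 'f1'
                      , 'MCC'      : 'matthews_corrcoef'
                      , 'ROC_AUC'  : 'roc_auc'
                      , 'Accuracy' : 'accuracy'}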
#%% Loopity Loop vs cross_validate (SHUFFLE) ===> YAYYYY!
# https://gitmotion.com/scikit-learn/70010376/add-shuffle-parameter-to-cross-val-score-and-gridsearchcv
skf_cv_sh = cross_validate(pipeline
                           , num_df_wtgt[numerical_FN]
                           , num_df_wtgt['mutation_class']
                           #, cv = 10
                           , cv = skf_cv
                           , scoring = scoring_fn
                           , return_train_score = True)
skf_cv_sh
cvscores_df_sh = pd.DataFrame(skf_cv_sh)
cvscores_df_test_sh = cvscores_df_sh.filter(like = 'test_', axis = 1)
cvscores_df_test_mean_sh = cvscores_df_test_sh.mean(axis = 0)
cvscores_df_test_mean_sh
#%% Loopity Loop (ALL models + Shuffle) vs cross_validate (ALL models + Shuffle): SUCCESS!!!!
log_reg = LogisticRegression(**rs)
nb      = BernoulliNB()
knn     = KNeighborsClassifier()
svm     = SVC(**rs)
mlp     = MLPClassifier(max_iter = 500, **rs)
dt      = DecisionTreeClassifier(**rs)
et      = ExtraTreesClassifier(**rs)
rf      = RandomForestClassifier(**rs)
rf2     = RandomForestClassifier(min_samples_leaf = 50
                                 , n_estimators = 150
                                 , bootstrap = True
                                 , oob_score = True
                                 , **njobs
                                 , **rs
                                 , max_features = 'auto')
xgb = XGBClassifier(**rs
                    , verbosity = 0, use_label_encoder = False)

models = [('Logistic Regression' , log_reg)
          , ('Naive Bayes'        , nb)
          , ('K-Nearest Neighbors', knn)
          , ('SVM'                , svm)
          , ('MLP'                , mlp)
          , ('Decision Tree'      , dt)
          , ('Extra Trees'        , et)
          , ('Random Forest'      , rf)
          , ('Random Forest2'     , rf2)
          , ('XGBoost'            , xgb)]

mm_skf_scoresD = {}
for model_name, model_fn in models:
    # print('\nModel_name:', model_name
    #       , '\nModel func:', model_fn
    #       , '\nList of models:', models)

    model_pipeline = Pipeline([
        ('pre'     , MinMaxScaler())
        , ('model' , model_fn)])

    print('Running model pipeline:', model_pipeline)
    skf_cv_mod = cross_validate(model_pipeline
                                , X
                                , y
                                , cv = skf_cv
                                , scoring = scoring_fn
                                , return_train_score = True)
    mm_skf_scoresD[model_name] = {}
    for key, value in skf_cv_mod.items():
        print('\nkey:', key, '\nvalue:', value)
        print('\nmean value:', mean(value))
        mm_skf_scoresD[model_name][key] = round(mean(value), 2)
pp.pprint(mm_skf_scoresD)

# construct df
mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD)
mm_skf_scores_df_all
mm_skf_scores_df_test  = mm_skf_scores_df_all.filter(like = 'test_', axis = 0)
mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like = 'train_', axis = 0)  # helps to see if you trust the results
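#%% Train vs test gap (sketch)
# A minimal sketch, using the two frames built above, of how the train scores
# help decide whether to trust the test scores: metrics whose train score sits
# far above the test score point at overfitting. The names mm_train, mm_test
# and train_test_gap are illustrative, not from the repo.
mm_train = mm_skf_scores_df_train.rename(index = lambda i: i.replace('train_', ''))
mm_test  = mm_skf_scores_df_test.rename(index = lambda i: i.replace('test_', ''))
train_test_gap = mm_train - mm_test   # per-metric, per-model gap; large positive values suggest overfitting
train_test_gap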