# ML_AI_training/cross_validate_vs_loopity_loop.py
# (file-viewer listing metadata: 166 lines, 6.7 KiB, Python)

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 11:09:50 2022
@author: tanu
"""
#%%
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_wine
from sklearn.model_selection import KFold
#%% Data
# Feature matrix and target, both taken from `num_df_wtgt` (defined outside
# this file; presumably a pandas DataFrame -- confirm against the caller).
X = num_df_wtgt[numerical_FN]
y = num_df_wtgt['mutation_class']

# Hold-out split: 33% of the rows go to the test set, shuffled with a fixed seed.
# NOTE(review): this call was left unterminated in the source, which swallowed
# the following statement. It is closed here after `shuffle`; any trailing
# argument that may have been lost (e.g. `stratify`) is NOT reconstructed.
X_train, X_test, y_train, y_test = train_test_split(X
                                                    , y
                                                    , test_size = 0.33
                                                    , random_state = 2
                                                    , shuffle = True)

# Keyword bundles reused by the estimators and CV helpers further down.
rs = {'random_state': 42}
njobs = {'n_jobs': 10}

# Shuffled, seeded 10-fold stratified splitter; passed as `cv=` by the
# "SHUFFLE" cells below.
skf_cv = StratifiedKFold(n_splits = 10
                         #, shuffle = False, random_state= None)
                         , shuffle = True, **rs)
#%% Stratified Kfold: cross_val_score vs cross_validate vs for loop
# Reference: https://vitalflux.com/k-fold-cross-validation-python-example/
#
# Two-step pipeline: MinMaxScaler (default settings) followed by the
# classifier under test, currently BernoulliNB.
#
# Alternatives kept for reference (swap in as the final step):
#   Pipeline(steps=[('prep', col_transform), ('classifier', model)])
#   RandomForestClassifier(min_samples_leaf = 50, n_estimators = 150,
#                          bootstrap = True, oob_score = True,
#                          **njobs, **rs, max_features = 'auto')
#   KNeighborsClassifier()
pipeline = make_pipeline(
    MinMaxScaler(),
    BernoulliNB(),
)
#%% Loopity Loop vs cross_val_score (NO SHUFFLE)
# F1-score the pipeline on the full feature table with an integer `cv`.
# For a classifier, cv=10 means a 10-fold StratifiedKFold; cross_val_score
# itself has no shuffle option, so the folds are not shuffled here.
scores = cross_val_score(
    pipeline,
    num_df_wtgt[numerical_FN],
    num_df_wtgt['mutation_class'],
    scoring='f1',
    cv=10,
    **njobs,
)
mean_score = mean(scores)
round(mean_score, 2)  # bare expression: echoed when the cell runs interactively

# Conclusion: with shuffle = False the manual loop ("Loopity loop") and
# cross_val_score give matching F1 (the original note read "CONCURRENT").
#   cross_val_score (no shuffle possible) vs manual loop:
#   0.67 vs 0.70 (loop: shuffle = T)                      # RF
#   0.67 vs 0.67 (loop: shuffle = F, and no random state) # RF
#   0.57 vs 0.57 (loop: shuffle = F, and no random state) # NB
#   0.65 vs 0.65 (loop: shuffle = F, and no random state) # KNN
# ROC_AUC DOES NOT match!
#%% Loopity Loop vs cross_val_score (SHUFFLE) ===> YAYYYY!
# Same F1 scoring as the no-shuffle cell, but the folds come from the
# `skf_cv` splitter object rather than an integer (cv = 10), which is how
# shuffling is obtained with cross_val_score.
scores_sh = cross_val_score(
    pipeline,
    num_df_wtgt[numerical_FN],
    num_df_wtgt['mutation_class'],
    scoring='f1',
    cv=skf_cv,
    **njobs,
)
mean_score_sh = mean(scores_sh)
round(mean_score_sh, 2)  # bare expression: echoed when the cell runs interactively
#%% Loopity Loop vs cross_validate (NO SHUFFLE)
# Multi-metric cross-validation with an integer `cv` (unshuffled folds).
#
# The result is stored as `skf_cv_nosh`, NOT `skf_cv`: rebinding `skf_cv`
# here replaced the StratifiedKFold splitter with this results dict, so every
# later `cv = skf_cv` call received a dict instead of a CV splitter.
skf_cv_nosh = cross_validate(pipeline
                             , num_df_wtgt[numerical_FN]
                             , num_df_wtgt['mutation_class']
                             , cv = 10
                             , scoring = scoring_fn
                             #, shuffle = True, **rs
                             , return_train_score = True)
skf_cv_nosh

# One row per fold; keep only the held-out ('test_*') columns and average
# them over the folds.
cvscores_df = pd.DataFrame(skf_cv_nosh)
cvscores_df_test = cvscores_df.filter(like='test_', axis=1)
cvscores_df_test_mean = cvscores_df_test.mean(axis = 0)
cvscores_df_test_mean
# Conclusion: with shuffle = False these scores match the manual loop
# ("Loopity loop"); the original note read "CONCURRENT".
#%% Loopity Loop vs cross_validate (SHUFFLE) ===> YAYYYY!
# https://gitmotion.com/scikit-learn/70010376/add-shuffle-parameter-to-cross-val-score-and-gridsearchcv
# Multi-metric cross-validation with the folds supplied through `cv = skf_cv`
# instead of an integer (cv = 10). This expects `skf_cv` to still be the
# shuffled StratifiedKFold splitter created in the Data cell.
skf_cv_sh = cross_validate(
    pipeline,
    num_df_wtgt[numerical_FN],
    num_df_wtgt['mutation_class'],
    cv=skf_cv,
    scoring=scoring_fn,
    return_train_score=True,
)
skf_cv_sh

# Tabulate the per-fold results, keep the 'test_*' columns, then take the
# column-wise mean across folds.
cvscores_df_sh = pd.DataFrame(skf_cv_sh)
cvscores_df_test_sh = cvscores_df_sh.filter(like='test_', axis='columns')
cvscores_df_test_mean_sh = cvscores_df_test_sh.mean(axis='index')
cvscores_df_test_mean_sh
#%% Loopty Loop (ALL models + Shuffle) vs cross_validate(ALL models + Shuffle) : SUCCESS!!!!
# One instance per classifier to compare. Estimators that accept a seed get
# `random_state` from `rs` so repeated runs are reproducible.
log_reg = LogisticRegression(**rs)
nb = BernoulliNB()
knn = KNeighborsClassifier()
svm = SVC(**rs)
mlp = MLPClassifier(max_iter = 500, **rs)
dt = DecisionTreeClassifier(**rs)
et = ExtraTreesClassifier(**rs)
rf = RandomForestClassifier(**rs)

# Second, regularised random forest (large leaves, 150 trees, OOB scoring).
# max_features: 'auto' meant sqrt(n_features) for classifiers, was deprecated
# in scikit-learn 1.1 and removed in 1.3. 'sqrt' is the same setting and is
# accepted by old and new releases alike.
rf2 = RandomForestClassifier(min_samples_leaf = 50
                             , n_estimators = 150
                             , bootstrap = True
                             , oob_score = True
                             , **njobs
                             , **rs
                             , max_features = 'sqrt')

xgb = XGBClassifier(**rs
                    , verbosity = 0, use_label_encoder =False)
# (display name, estimator) pairs consumed by the per-model CV loop.
# Names must be unique because results are stored in a dict keyed by name.
# The second ('Naive Bayes', nb) entry was dropped: it only re-ran the same
# estimator and overwrote the same key.
models = [('Logistic Regression', log_reg)
          , ('Naive Bayes'        , nb)
          , ('K-Nearest Neighbors', knn)
          , ('SVM'                , svm)
          , ('MLP'                , mlp)
          , ('Decision Tree'      , dt)
          , ('Extra Trees'        , et)
          , ('Random Forest'      , rf)
          , ('Random Forest2'     , rf2)
          , ('XGBoost'            , xgb)]
# Mean CV score per model: {model name: {cross_validate key: mean rounded to 2 dp}}
mm_skf_scoresD = {}

for model_name, model_fn in models:
    # Debug aid, left disabled:
    # print('\nModel_name:', model_name
    #       , '\nModel func:', model_fn
    #       , '\nList of models:', models)

    # Scale the features, then fit the current estimator.
    model_pipeline = Pipeline([('pre', MinMaxScaler()), ('model', model_fn)])
    print('Running model pipeline:', model_pipeline)

    skf_cv_mod = cross_validate(
        model_pipeline,
        X,
        y,
        cv=skf_cv,
        scoring=scoring_fn,
        return_train_score=True,
    )

    # Register the model's entry first, then fill it metric by metric.
    model_scores = {}
    mm_skf_scoresD[model_name] = model_scores
    for metric, fold_values in skf_cv_mod.items():
        print('\nkey:', metric, '\nvalue:', fold_values)
        fold_mean = mean(fold_values)
        print('\nmean value:', fold_mean)
        model_scores[metric] = round(fold_mean, 2)

# NOTE(review): indentation was lost in the source; the summary print is
# assumed to run once, after all models have been scored.
pp.pprint(mm_skf_scoresD)
# construct df
# Outer dict keys (model names) become columns; inner keys (the
# cross_validate result names) become the row index.
mm_skf_scores_df_all = pd.DataFrame.from_dict(mm_skf_scoresD, orient='columns')
mm_skf_scores_df_all

# Split the rows by label: held-out scores vs. training scores.
mm_skf_scores_df_test = mm_skf_scores_df_all.filter(like='test_', axis='index')
# Train scores help to judge whether the test results can be trusted.
mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like='train_', axis='index')