#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 11:09:50 2022

@author: tanu
"""
#%% Imports
import pprint as pp
import pandas as pd
from statistics import mean

from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_wine
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from xgboost import XGBClassifier

#%% Data
# num_df_wtgt, numerical_FN and scoring_fn are expected to be defined by the
# upstream data-prep/config script this cell script is run against.
X = num_df_wtgt[numerical_FN]
y = num_df_wtgt['mutation_class']

X_train, X_test, y_train, y_test = train_test_split(X, y
                                                    , test_size = 0.33
                                                    , random_state = 2
                                                    , shuffle = True)

rs = {'random_state': 42}
njobs = {'n_jobs': 10}

skf_cv = StratifiedKFold(n_splits = 10
                         #, shuffle = False, random_state = None)
                         , shuffle = True, **rs)

#%% Stratified Kfold: cross_val_score vs cross_validate vs for loop
# https://vitalflux.com/k-fold-cross-validation-python-example/

# Pipeline(steps=[('prep'       , col_transform)
#                 , ('classifier' , model)])

pipeline = make_pipeline(MinMaxScaler()
                         # , RandomForestClassifier(min_samples_leaf = 50
                         #                          , n_estimators = 150
                         #                          , bootstrap = True
                         #                          , oob_score = True
                         #                          , **njobs
                         #                          , **rs
                         #                          , max_features = 'auto'))
                         , BernoulliNB())
                         #, KNeighborsClassifier())

#%% Loopity Loop vs cross_val_score (NO SHUFFLE)
# Pass instance of pipeline and the full data set
# cv = 10 means StratifiedKFold with 10 folds and no shuffling
scores = cross_val_score(pipeline
                         , X = num_df_wtgt[numerical_FN]
                         , y = num_df_wtgt['mutation_class']
                         #, shuffle = True
                         , scoring = 'f1'
                         , cv = 10
                         , **njobs)

mean_score = mean(scores)
round(mean_score, 2)

# Conclusion: compared with the manual loop (loopity loop), shuffle = False: results agree!
# 0.67 (shuffle = False, as no other option) vs 0.70 (shuffle = T)                       # RF
# 0.67 (shuffle = False, as no other option) vs 0.67 (shuffle = F, and no random state)  # RF
# 0.57 (shuffle = False, as no other option) vs 0.57 (shuffle = F, and no random state)  # NB
# 0.65 (shuffle = False, as no other option) vs 0.65 (shuffle = F, and no random state)  # KNN
# ROC_AUC DOES NOT match!

#%% Loopity Loop vs cross_val_score (SHUFFLE) ===> YAYYYY!
# Pass instance of pipeline and the full data set
# cv = skf_cv is the shuffled StratifiedKFold defined above
scores_sh = cross_val_score(pipeline
                            , X = num_df_wtgt[numerical_FN]
                            , y = num_df_wtgt['mutation_class']
                            #, shuffle = True
                            , scoring = 'f1'
                            #, cv = 10
                            , cv = skf_cv
                            , **njobs)

mean_score_sh = mean(scores_sh)
round(mean_score_sh, 2)

#%% Loopity Loop vs cross_validate (NO SHUFFLE)
# keep the results in their own variable so the skf_cv splitter is not overwritten
skf_cv_nosh = cross_validate(pipeline
                             , num_df_wtgt[numerical_FN]
                             , num_df_wtgt['mutation_class']
                             , cv = 10
                             , scoring = scoring_fn
                             #, shuffle = True, **rs
                             , return_train_score = True)
skf_cv_nosh

cvscores_df = pd.DataFrame(skf_cv_nosh)
cvscores_df_test = cvscores_df.filter(like = 'test_', axis = 1)
cvscores_df_test_mean = cvscores_df_test.mean(axis = 0)
cvscores_df_test_mean
# Conclusion: compared with the manual loop (loopity loop), shuffle = False: results agree!

#%% Loopity Loop vs cross_validate (SHUFFLE) ===> YAYYYY!
# https://gitmotion.com/scikit-learn/70010376/add-shuffle-parameter-to-cross-val-score-and-gridsearchcv
skf_cv_sh = cross_validate(pipeline
                           , num_df_wtgt[numerical_FN]
                           , num_df_wtgt['mutation_class']
                           #, cv = 10
                           , cv = skf_cv
                           , scoring = scoring_fn
                           , return_train_score = True)
skf_cv_sh

cvscores_df_sh = pd.DataFrame(skf_cv_sh)
cvscores_df_test_sh = cvscores_df_sh.filter(like = 'test_', axis = 1)
cvscores_df_test_mean_sh = cvscores_df_test_sh.mean(axis = 0)
cvscores_df_test_mean_sh
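#%% Reference sketch: a minimal manual StratifiedKFold loop, for comparison with
# the cross_val_score / cross_validate results above. This is not the original
# 'loopity loop' script; it is only a sketch assuming the same X, y, pipeline
# and skf_cv defined above, and F1 as the single metric.
from sklearn.metrics import f1_score

manual_f1 = []
for train_idx, test_idx in skf_cv.split(X, y):
    # refit the same MinMaxScaler + BernoulliNB pipeline on each training fold
    pipeline.fit(X.iloc[train_idx], y.iloc[train_idx])
    fold_pred = pipeline.predict(X.iloc[test_idx])
    manual_f1.append(f1_score(y.iloc[test_idx], fold_pred))

round(mean(manual_f1), 2)  # should broadly agree with mean_score_sh above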
#%% Loopity Loop (ALL models + Shuffle) vs cross_validate (ALL models + Shuffle): SUCCESS!!!!
log_reg = LogisticRegression(**rs)
nb      = BernoulliNB()
knn     = KNeighborsClassifier()
svm     = SVC(**rs)
mlp     = MLPClassifier(max_iter = 500, **rs)
dt      = DecisionTreeClassifier(**rs)
et      = ExtraTreesClassifier(**rs)
rf      = RandomForestClassifier(**rs)
rf2     = RandomForestClassifier(min_samples_leaf = 50
                                 , n_estimators = 150
                                 , bootstrap = True
                                 , oob_score = True
                                 , **njobs
                                 , **rs
                                 , max_features = 'auto')  # 'auto' == 'sqrt' for classifiers; removed in newer sklearn
xgb     = XGBClassifier(**rs, verbosity = 0, use_label_encoder = False)

models = [('Logistic Regression'   , log_reg)
          , ('Naive Bayes'         , nb)
          , ('K-Nearest Neighbors' , knn)
          , ('SVM'                 , svm)
          , ('MLP'                 , mlp)
          , ('Decision Tree'       , dt)
          , ('Extra Trees'         , et)
          , ('Random Forest'       , rf)
          , ('Random Forest2'      , rf2)
          , ('XGBoost'             , xgb)]

mm_skf_scoresD = {}

for model_name, model_fn in models:
    # print('\nModel_name:', model_name
    #       , '\nModel func:', model_fn
    #       , '\nList of models:', models)

    model_pipeline = Pipeline([('pre'     , MinMaxScaler())
                               , ('model' , model_fn)])

    print('Running model pipeline:', model_pipeline)
    skf_cv_mod = cross_validate(model_pipeline
                                , X
                                , y
                                , cv = skf_cv
                                , scoring = scoring_fn
                                , return_train_score = True)

    # store the mean of each returned metric (fit_time, score_time, test_*, train_*)
    mm_skf_scoresD[model_name] = {}
    for key, value in skf_cv_mod.items():
        print('\nkey:', key, '\nvalue:', value)
        print('\nmean value:', mean(value))
        mm_skf_scoresD[model_name][key] = round(mean(value), 2)

pp.pprint(mm_skf_scoresD)

# construct df
mm_skf_scores_df_all = pd.DataFrame(mm_skf_scoresD)
mm_skf_scores_df_all
mm_skf_scores_df_test  = mm_skf_scores_df_all.filter(like = 'test_', axis = 0)
mm_skf_scores_df_train = mm_skf_scores_df_all.filter(like = 'train_', axis = 0)  # helps to see if you trust the results
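#%% Optional sanity check: train vs test gap per model, a rough overfitting flag.
# A sketch only: it assumes scoring_fn includes an 'f1' scorer, so that the rows
# 'train_f1' and 'test_f1' exist in the dataframes built above; adjust the row
# names to whatever metrics scoring_fn actually contains.
f1_gap = (mm_skf_scores_df_train.loc['train_f1']
          - mm_skf_scores_df_test.loc['test_f1'])
f1_gap.sort_values(ascending = False)  # large positive gaps suggest the model overfits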