#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 11:09:50 2022

@author: tanu

Scratch comparison of scikit-learn cross-validation approaches:
``cross_validate`` with string and callable scorers, manual ``KFold`` /
``StratifiedKFold`` loops, and a multi-model comparison table.

NOTE(review): ``num_df_wtgt`` and ``numerical_FN`` are not defined in this
file. They are assumed to already exist in the session (e.g. created by an
earlier script) before the cells that use them are run -- confirm.
"""
import pprint as pp

import pandas as pd
from numpy import mean
from sklearn.datasets import load_wine
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    make_scorer,
    matthews_corrcoef,
    precision_score,
    recall_score,
)
from sklearn.model_selection import (
    KFold,
    StratifiedKFold,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC


def summarise_cv(cv_result, ndigits=2, verbose=True):
    """Return ``{key: rounded mean}`` for each array in a cross_validate result.

    Parameters
    ----------
    cv_result : dict
        Mapping of result name to an array of per-fold values, as returned by
        ``cross_validate``.
    ndigits : int
        Number of decimals the mean is rounded to.
    verbose : bool
        When true, print every key, its per-fold values and their mean.
    """
    summary = {}
    for key, value in cv_result.items():
        if verbose:
            print('\nkey:', key, '\nvalue:', value)
            print('\nmean value:', mean(value))
        summary[key] = round(mean(value), ndigits)
    return summary


#%% Wine data: cross_validate with default, string and callable scorers
wine = load_wine()
X_train, y_train = wine.data, wine.target

model = Pipeline([
    ('pre', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])
model.fit(X_train, y_train)

val = cross_validate(model, X_train, y_train, cv=10)
print(val['test_score'].mean())

# make_scorer takes the metric callable itself, not a dict of scorers.
my_mcc = make_scorer(matthews_corrcoef)

# Scorers for the (presumably binary) mutation_class target used further down.
# 'roc_auc' is the built-in scorer: make_scorer(roc_auc_score) would score the
# hard labels from predict() instead of probabilities / decision values, which
# is why the two ROC AUC figures used to differ.
scoring_fn = {
    'accuracy': make_scorer(accuracy_score),
    'fscore': make_scorer(f1_score),
    'mcc': make_scorer(matthews_corrcoef),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'roc_auc': 'roc_auc',
}

# The wine target has three classes, so the binary-only scorers cannot be
# used; macro-averaged / one-vs-rest variants are mapped to the same result
# keys ('test_f1', 'test_precision', ...).
wine_scoring = {
    'accuracy': 'accuracy',
    'f1': 'f1_macro',
    'precision': 'precision_macro',
    'recall': 'recall_macro',
    'roc_auc': 'roc_auc_ovr',
}

val2 = cross_validate(
    model,
    X_train,
    y_train,
    cv=10,
    scoring=wine_scoring,
    return_train_score=False,
)

print(val2['test_f1'])
print(mean(val2['test_accuracy']))
print(mean(val2['test_f1']))
print(mean(val2['test_precision']))
print(mean(val2['test_recall']))
print(mean(val2['test_roc_auc']))

#%% Same wine evaluation through make_scorer objects
wine_scoring_fn = {
    'accuracy': make_scorer(accuracy_score),
    'fscore': make_scorer(f1_score, average='macro'),
    'mcc': make_scorer(matthews_corrcoef),
    'precision': make_scorer(precision_score, average='macro'),
    'recall': make_scorer(recall_score, average='macro'),
    'roc_auc': 'roc_auc_ovr',
}

val3 = cross_validate(
    model,
    X_train,
    y_train,
    cv=10,
    scoring=wine_scoring_fn,
    return_train_score=False,
)

print(mean(val3['test_accuracy']))
print(mean(val3['test_fscore']))
print(mean(val3['test_mcc']))
print(mean(val3['test_precision']))
print(mean(val3['test_recall']))
print(mean(val3['test_roc_auc']))

#%% Manual fold loops with CV.split
# Full feature matrix / target, named once so the fold loops below cannot
# clobber the data that the later cross_validate call is meant to see.
X_all = num_df_wtgt[numerical_FN]
y_all = num_df_wtgt['mutation_class']

model = Pipeline([
    ('pre', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])

scores = []
cv = KFold(n_splits=10)
for train_index, test_index in cv.split(X_all, y_all):
    X_fold_train, X_fold_test = X_all.iloc[train_index], X_all.iloc[test_index]
    y_fold_train, y_fold_test = y_all.iloc[train_index], y_all.iloc[test_index]
    model.fit(X_fold_train, y_fold_train)
    scores.append(model.score(X_fold_test, y_fold_test))
print(mean(scores))

scores_skf = []
skf = StratifiedKFold(n_splits=10)
for train_index, test_index in skf.split(X_all, y_all):
    X_fold_train, X_fold_test = X_all.iloc[train_index], X_all.iloc[test_index]
    y_fold_train, y_fold_test = y_all.iloc[train_index], y_all.iloc[test_index]
    model.fit(X_fold_train, y_fold_train)
    scores_skf.append(model.score(X_fold_test, y_fold_test))
print(mean(scores_skf))

# Evaluate on the full data so the figure is comparable with the loops above
# (previously this ran on whatever training subset the last fold left behind).
val = cross_validate(model, X_all, y_all, cv=10)
print(val['test_score'].mean())

#%% compare loopity loop vs CV with SKF
rs = {'random_state': 42}

X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    test_size=0.33,
    shuffle=True,
    stratify=y_all,
    **rs,
)

log_reg = LogisticRegression(**rs)
nb = BernoulliNB()
knn = KNeighborsClassifier()
svm = SVC(**rs)

model_single_pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('model', log_reg),
])

skf_cv = cross_validate(
    model_single_pipeline,
    X_all,
    y_all,
    cv=10,
    scoring=scoring_fn,
    return_train_score=True,
)

print(round(mean(skf_cv['test_accuracy']), 2))
print(round(mean(skf_cv['test_fscore']), 2))
print(round(mean(skf_cv['test_mcc']), 2))
print(round(mean(skf_cv['test_precision']), 2))
print(round(mean(skf_cv['test_recall']), 2))
print(round(mean(skf_cv['test_roc_auc']), 2))

# %% Extracting skf_cv mean values and assigning to a dict
models_single = [
    ('Logistic Regression', log_reg),
]

foo_single = {}
# The loop variable is deliberately not called `model`, so the pipeline bound
# to that name earlier is not replaced by a bare estimator.
for model_name, _model_fn in models_single:
    print(model_name)
    foo_single[model_name] = summarise_cv(skf_cv)

pp.pprint(foo_single)

foo_single_df = pd.DataFrame(foo_single)
print(foo_single_df.filter(like='test_', axis=0))

# ONLY for a single score
# Scored on the same scaled pipeline as skf_cv so the numbers are comparable.
cval_score = cross_val_score(
    model_single_pipeline,
    X_all,
    y_all,
    scoring='f1_macro',
    cv=10,
)
print(cval_score)
print(round(mean(cval_score), 2))

# %% Running multiple model with CV
log_reg = LogisticRegression(**rs)
nb = BernoulliNB()
knn = KNeighborsClassifier()
svm = SVC(**rs)

models = [
    ('Logistic Regression', log_reg),
    ('Naive Bayes', nb),
    ('K-Nearest Neighbors', knn),
    ('SVM', svm),
]

foo = {}
for model_name, model_fn in models:
    model_pipeline = Pipeline([
        ('pre', MinMaxScaler()),
        ('model', model_fn),
    ])
    print('Running model pipeline:', model_pipeline)
    skf_cv = cross_validate(
        model_pipeline,
        X_train,
        y_train,
        cv=10,
        scoring=scoring_fn,
        return_train_score=True,
    )
    foo[model_name] = summarise_cv(skf_cv)

pp.pprint(foo)

# construct df
foo_df = pd.DataFrame(foo)
scores_df = foo_df.filter(like='test_', axis=0)

# Three equivalent-looking ways of building the frame, kept for comparison.
a = pd.DataFrame(foo)
b = pd.DataFrame.from_dict(foo)
c = pd.DataFrame.from_records(foo)