# =============================================================================
# diff --git a/MultClassPipe3_CALL.py b/MultClassPipe3_CALL.py
# new file mode 100644, index 0000000..6699707
# =============================================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 11:09:50 2022

@author: tanu

Driver cell: stratified train/test split of the numerical features followed
by one cross-validated scoring run through ``MultClassPipelineCV``.

NOTE(review): ``num_df_wtgt``, ``numerical_FN``, ``rs`` and
``MultClassPipelineCV`` are not defined in this file; they are presumably
provided by the interactive session -- confirm before running standalone.
"""
import pprint as pp

import pandas as pd
from sklearn.model_selection import train_test_split

# stratified shuffle split
X_train, X_test, y_train, y_test = train_test_split(
    num_df_wtgt[numerical_FN],
    num_df_wtgt['mutation_class'],
    test_size=0.33,
    shuffle=True,
    stratify=num_df_wtgt['mutation_class'],
    **rs,
)

# class balance of each partition
y_train.to_frame().value_counts().plot(kind='bar')
y_test.to_frame().value_counts().plot(kind='bar')

# The pipeline used to be invoked twice with identical arguments and the first
# result thrown away; a single call that keeps the scores is enough.
skf_cv_scores = MultClassPipelineCV(
    X_train, X_test, y_train, y_test,
    input_df=num_df_wtgt[numerical_FN],
    var_type='numerical',
)

pp.pprint(skf_cv_scores)

# construct a df and separate the test_* rows from the train_* rows
skf_cv_scores_df = pd.DataFrame(skf_cv_scores)
skf_cv_scores_df
skf_cv_scores_df_test = skf_cv_scores_df.filter(like='test_', axis=0)
skf_cv_scores_df_train = skf_cv_scores_df.filter(like='train_', axis=0)


# =============================================================================
# diff --git a/base_estimator.py b/base_estimator.py
# new file mode 100644, index 0000000..de9ddbb
# =============================================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Created on Tue Mar 15 09:50:37 2022
#
# @author: tanu
#
# https://stackoverflow.com/questions/50272416/gridsearch-on-model-and-classifiers

#%%
# https://github.com/davidsbatista/machine-learning-notebooks/blob/master/hyperparameter-across-models.ipynb
import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn.base import BaseEstimator
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC


#%%
class EstimatorSelectionHelper:
    """Run one ``GridSearchCV`` per estimator and collate the CV results.

    :param models: dict mapping a display name to an unfitted estimator
    :param params: dict mapping the same names to a parameter grid (a dict,
        or a list of dicts) accepted by ``GridSearchCV``
    :raises ValueError: if a model has no entry in ``params``
    """

    def __init__(self, models, params):
        # Fail early and by name instead of with a KeyError half-way
        # through a potentially long series of grid searches.
        missing = [name for name in models if name not in params]
        if missing:
            raise ValueError(
                'Some estimators are missing parameters: %s' % missing)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, **grid_kwargs):
        """Fit a ``GridSearchCV`` for every registered model.

        :param X: feature matrix passed to ``GridSearchCV.fit``
        :param y: target passed to ``GridSearchCV.fit``
        :param grid_kwargs: forwarded verbatim to the ``GridSearchCV``
            constructor (e.g. ``scoring``, ``cv``, ``n_jobs``)
        :return: ``self`` so that calls can be chained
        """
        for key in self.keys:
            print('Running GridSearchCV for %s.' % key)
            grid_search = GridSearchCV(
                self.models[key], self.params[key], **grid_kwargs)
            grid_search.fit(X, y)
            self.grid_searches[key] = grid_search
        print('Done.')
        return self

    def score_summary(self, sort_by='mean_test_score'):
        """Return every fitted search's ``cv_results_`` as one DataFrame.

        The per-parameter ``param_*`` columns and ``rank_test_score`` are
        removed, an ``estimator`` column is added as the first column, and
        rows are sorted by ``sort_by`` in descending order.

        :param sort_by: ``cv_results_`` column used for ordering
        :return: pandas DataFrame with a fresh 0..n-1 index
        """
        frames = []
        for name, grid_search in self.grid_searches.items():
            frame = pd.DataFrame(grid_search.cv_results_)
            # negative look-ahead: keep every column without 'param_' in it
            frame = frame.filter(regex='^(?!.*param_).*$')
            frame['estimator'] = len(frame) * [name]
            frames.append(frame)

        summary = pd.concat(frames).sort_values([sort_by], ascending=False)
        summary = summary.reset_index(drop=True)
        # Name the axis through ``columns=``: the positional form
        # ``drop(labels, 1)`` is rejected by pandas 2.x.
        summary = summary.drop(columns=['rank_test_score'])

        ordered = ['estimator'] + [
            col for col in summary.columns if col != 'estimator']
        return summary[ordered]


#%%
breast_cancer = datasets.load_breast_cancer()
X_cancer = breast_cancer.data
y_cancer = breast_cancer.target

models1 = {
    'ExtraTreesClassifier': ExtraTreesClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
}

params1 = {
    'ExtraTreesClassifier': {'n_estimators': [16, 32]},
    'RandomForestClassifier': [
        {'n_estimators': [16, 32]},
        {'criterion': ['gini', 'entropy'], 'n_estimators': [8, 16]},
    ],
    'AdaBoostClassifier': {'n_estimators': [16, 32]},
    'GradientBoostingClassifier': {
        'n_estimators': [16, 32],
        'learning_rate': [0.8, 1.0],
    },
}

helper1 = EstimatorSelectionHelper(models1, params1)
helper1.fit(X_cancer, y_cancer, scoring='f1', n_jobs=2)
helper1.score_summary()

mm_df = helper1.score_summary()
# COMMENT: not sure which score 'mean_test_score' is the mean of, nor which
# alternatives are available. With scoring='f1' above it is presumably the
# mean F1 over the CV folds -- confirm against the GridSearchCV docs.


#%%
class ClfSwitcher(BaseEstimator):
    """A custom BaseEstimator that can switch between classifiers.

    The wrapped classifier is an ordinary hyper-parameter, so a grid can
    swap it (``clf__estimator``) and tune it (``clf__estimator__<param>``).

    :param estimator: sklearn object - the classifier to delegate to
    """

    def __init__(self, estimator=SGDClassifier()):
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped estimator, forwarding any fit parameters."""
        # kwargs used to be accepted and silently discarded
        self.estimator.fit(X, y, **kwargs)
        return self

    def predict(self, X, y=None):
        """Return the wrapped estimator's predictions (``y`` is ignored)."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped estimator's class probabilities."""
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped estimator's own ``score``."""
        return self.estimator.score(X, y)


parameters = [
    {
        # SVM if hinge loss / logreg if log loss
        'clf__estimator': [SGDClassifier()],
        'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
        'clf__estimator__max_iter': [50, 80],
        'clf__estimator__tol': [1e-4],
        # NOTE(review): newer scikit-learn releases spell 'log' as
        # 'log_loss' -- confirm against the installed version.
        'clf__estimator__loss': ['hinge', 'log', 'modified_huber'],
    },
    {
        'clf__estimator': [MultinomialNB()],
        'clf__estimator__alpha': (1e-2, 1e-3, 1e-1),
    },
]

pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', ClfSwitcher()),
])

gscv = GridSearchCV(
    pipeline,
    parameters,
    cv=5,
    n_jobs=12,
    return_train_score=False,
    verbose=3,
)

#gscv.fit(train_data, train_labels)

#%% my numerical data
# NOTE(review): num_df_wtgt / numerical_FN come from the session (see above).
X_train, X_test, y_train, y_test = train_test_split(
    num_df_wtgt[numerical_FN],
    num_df_wtgt['mutation_class'],
    test_size=0.33,
    random_state=2,
    shuffle=True,
    stratify=num_df_wtgt['mutation_class'],
)

y_train.to_frame().value_counts().plot(kind='bar')
y_test.to_frame().value_counts().plot(kind='bar')

#%%
gscv.fit(X_train, y_train)
print('Best model:\n', gscv.best_params_)
print('Best models score:\n', gscv.best_score_)
gscv.score(X_test, y_test)  # see how it does on test

#===========================================
mod_pred = gscv.predict(X_test)

fscore = f1_score(y_test, mod_pred)
fscore


#%% same as above
# custom classifier
class MyClassifier(BaseEstimator):
    """A custom BaseEstimator that can switch between classifiers.

    Unlike ``ClfSwitcher`` the choice is made by name, and a fresh
    default-configured classifier is built on every ``fit``.

    :param classifier_type: string - the switch for different classifiers
        ('SGDClassifier' or 'MultinomialNB')
    """

    # name -> class; instantiated lazily in fit()
    _CLASSIFIERS = {
        'SGDClassifier': SGDClassifier,
        'MultinomialNB': MultinomialNB,
    }

    def __init__(self, classifier_type: str = 'SGDClassifier'):
        self.classifier_type = classifier_type

    def fit(self, X, y=None):
        """Build the selected classifier and fit it.

        :raises ValueError: if ``classifier_type`` is not a known name
        """
        try:
            factory = self._CLASSIFIERS[self.classifier_type]
        except KeyError:
            raise ValueError('Unkown classifier type.') from None

        self.classifier_ = factory()
        self.classifier_.fit(X, y)
        return self

    def predict(self, X, y=None):
        """Return the fitted classifier's predictions (``y`` is ignored)."""
        return self.classifier_.predict(X)

    def score(self, X, y):
        """Return the fitted classifier's own ``score``."""
        # The fitted model lives in ``classifier_``; this class never had
        # an ``estimator`` attribute, so the old lookup always raised
        # AttributeError.
        return self.classifier_.score(X, y)


pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('clf', MyClassifier()),
])

# MyClassifier exposes a single hyper-parameter, ``classifier_type``. The
# ``clf__estimator...`` grid only fits ClfSwitcher and makes GridSearchCV
# reject the parameter names when paired with this pipeline.
parameter_space = {
    'clf__classifier_type': ['SGDClassifier', 'MultinomialNB'],
}

search = GridSearchCV(pipeline, parameter_space, n_jobs=-1, cv=5)
search.fit(X_train, y_train)
print('Best model:\n', search.best_params_)
# report the score of *this* search (previously printed gscv's score)
print('Best models score:\n', search.best_score_)


# =============================================================================
# diff --git a/practice_cv.py b/practice_cv.py
# new file mode 100644, index 0000000..bfd6734
# =============================================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Created on Tue Mar 15 11:09:50 2022
#
# @author: tanu
from numpy import mean

from sklearn.datasets import load_wine
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

wine = load_wine()
X_train, y_train = wine.data, wine.target
model = Pipeline([
    ('pre', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])
model.fit(X_train, y_train)

val = cross_validate(model, X_train, y_train, cv=10)
val['test_score'].mean()

# A scorer wraps the metric function directly (the previous nested
# dict/make_scorer expression had mismatched brackets and did not parse).
my_mcc = make_scorer(matthews_corrcoef)

# name -> scorer; cross_validate reports each entry as 'test_<name>'
scoring_fn = {
    'accuracy': make_scorer(accuracy_score),
    'fscore': make_scorer(f1_score),
    'mcc': make_scorer(matthews_corrcoef),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'roc_auc': make_scorer(roc_auc_score),
    #'jaccard': make_scorer(jaccard_score),
}

# NOTE(review): the scorers here use each metric's default averaging; if the
# target has more than two classes (load_wine presumably does) the
# f1/precision/recall/roc_auc entries may fail or come back as NaN -- confirm.
val2 = cross_validate(
    model, X_train, y_train,
    cv=10,
    scoring=('accuracy', 'f1', 'precision', 'recall', 'roc_auc'),
    return_train_score=False,
)

val2
print(val2['test_f1'])
print(mean(val2['test_accuracy']))
print(mean(val2['test_f1']))
#print(mean(val2['train_f1']))
print(mean(val2['test_precision']))
#print(mean(val2['train_precision']))
print(mean(val2['test_recall']))
print(mean(val2['test_roc_auc']))

#%%
val3 = cross_validate(
    model,
    X_train,
    y_train,
    cv=10,
    scoring=scoring_fn,
    return_train_score=False,
)

val3
print(mean(val3['test_accuracy']))
print(mean(val3['test_fscore']))
print(mean(val3['test_mcc']))
print(mean(val3['test_precision']))
print(mean(val3['test_recall']))
# --- practice_cv.py (continued): manual CV loops vs cross_validate ----------
# Imports needed by the cells below. They are repeated here so that this part
# runs without relying on names left over from an interactive session.
# NOTE(review): ``num_df_wtgt`` and ``numerical_FN`` are not defined in this
# file and are presumably session variables -- confirm.
import pprint as pp

import pandas as pd
from numpy import mean

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


def _mean_scores(cv_result, ndigits=2):
    """Print every entry of a cross_validate result and return its means.

    :param cv_result: dict of per-fold arrays as returned by cross_validate
    :param ndigits: number of decimals kept for each mean
    :return: dict mapping each key of ``cv_result`` to its rounded mean
    """
    summary = {}
    for key, value in cv_result.items():
        print('\nkey:', key, '\nvalue:', value)
        print('\nmean value:', mean(value))
        summary[key] = round(mean(value), ndigits)
    return summary


print(mean(val3['test_roc_auc']))  # differs

#======================
# with CV.split
# Select features and target once instead of re-indexing the DataFrame four
# times on every fold.
X_all = num_df_wtgt[numerical_FN]
y_all = num_df_wtgt['mutation_class']

scores = []
scores
#best_svr = SVR(kernel='rbf')
model = Pipeline([
    ('pre', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])
cv = KFold(n_splits=10
           #, random_state=42
           #, shuffle=True
           )
for train_index, test_index in cv.split(X_all, y_all):
    #print("Train Index: ", train_index, "\n")
    #print("Test Index: ", test_index)
    X_train, X_test = X_all.iloc[train_index], X_all.iloc[test_index]
    y_train, y_test = y_all.iloc[train_index], y_all.iloc[test_index]
    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))

mean(scores)

################
scores_skf = []
skf = StratifiedKFold(n_splits=10
                      #, shuffle = True
                      #, **r
                      )

for train_index, test_index in skf.split(X_all, y_all):
    #print("Train Index: ", train_index, "\n")
    #print("Test Index: ", test_index)
    X_train, X_test = X_all.iloc[train_index], X_all.iloc[test_index]
    y_train, y_test = y_all.iloc[train_index], y_all.iloc[test_index]
    model.fit(X_train, y_train)
    scores_skf.append(model.score(X_test, y_test))

mean(scores_skf)

# NOTE(review): X_train / y_train are whatever the last stratified fold left
# behind, so this cross-validates on a subset of the data -- confirm intent.
val = cross_validate(model, X_train, y_train, cv=10)
val['test_score'].mean()

#%% compare loopity loop vs CV with SKF
rs = {'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    test_size=0.33,
    shuffle=True,
    stratify=y_all,
    **rs,
)

log_reg = LogisticRegression(**rs)
nb = BernoulliNB()
knn = KNeighborsClassifier()
svm = SVC(**rs)

model_single_pipeline = Pipeline([
    ('pre', MinMaxScaler()),
    ('model', log_reg),
    #, ('model', nb)
    #, ('model', knn)
])

# scored on the full data set rather than on the X_train / y_train split
skf_cv = cross_validate(
    model_single_pipeline,
    #, X_train
    #, y_train
    X_all,
    y_all,
    cv=10,
    scoring=scoring_fn,
    return_train_score=True,
)

skf_cv
print(round(mean(skf_cv['test_accuracy']), 2))
print(round(mean(skf_cv['test_fscore']), 2))
print(round(mean(skf_cv['test_mcc']), 2))
print(round(mean(skf_cv['test_precision']), 2))
print(round(mean(skf_cv['test_recall']), 2))
print(round(mean(skf_cv['test_roc_auc']), 2))  # differs

# %% Extracting skf_cv mean values and assigning to a dict
models_single = [
    ('Logistic Regression', log_reg),
    #, ('Naive Bayes' , nb)
    #, ('K-Nearest Neighbors', knn)
    # , ('SVM' , svm)
]

foo_single = {}
# NOTE(review): the loop variable rebinds ``model``; after the loop it is the
# last estimator in ``models_single`` (unscaled), not the pipeline above.
for model_name, model in models_single:
    print(model_name)
    #model_name_dict = {'model_name': model_name}
    foo_single[model_name] = _mean_scores(skf_cv)
pp.pprint(foo_single)

foo_single_df = pd.DataFrame(foo_single)
foo_single_df
foo_single_df.filter(like='test_', axis=0)

# ONLY for a single score
cval_score = cross_val_score(
    model,
    X_all,
    y_all,
    scoring='f1_macro',
    cv=10,
)
print(cval_score)
print(round(mean(cval_score), 2))

# %% Running multiple model with CV
log_reg = LogisticRegression(**rs)
nb = BernoulliNB()
knn = KNeighborsClassifier()
svm = SVC(**rs)

models = [
    ('Logistic Regression', log_reg),
    ('Naive Bayes', nb),
    ('K-Nearest Neighbors', knn),
    ('SVM', svm),
]

foo = {}
for model_name, model_fn in models:
    # print('\nModel_name:', model_name
    #       , '\nModel func:', model_fn
    #       , '\nList of models:', models)
    model_pipeline = Pipeline([
        ('pre', MinMaxScaler()),
        ('model', model_fn),
    ])
    print('Running model pipeline:', model_pipeline)
    skf_cv = cross_validate(
        model_pipeline,
        X_train,
        y_train,
        cv=10,
        scoring=scoring_fn,
        return_train_score=True,
    )
    foo[model_name] = _mean_scores(skf_cv)
pp.pprint(foo)

# construct df: one column per model, one row per metric
foo_df = pd.DataFrame(foo)
foo_df
scores_df = foo_df.filter(like='test_', axis=0)

# three equivalent-looking ways of building the frame, kept for comparison
a = pd.DataFrame(foo)
b = pd.DataFrame.from_dict(foo)
c = pd.DataFrame.from_records(foo)