added practice and base_estimator for all the confusion in my head

Tanushree Tunstall 2022-03-16 10:12:59 +00:00
parent e28a296d98
commit 97620c1bb0
3 changed files with 513 additions and 0 deletions

MultClassPipe3_CALL.py (new file, 33 lines)

@@ -0,0 +1,33 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 11:09:50 2022
@author: tanu
"""
# stratified shuffle split
# NOTE: this is a call script; it assumes the pipeline script defining MultClassPipelineCV
# has already been run and that rs = {'random_state': 42} (see practice_cv.py) is in scope
X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
                                                    , num_df_wtgt['mutation_class']
                                                    , test_size = 0.33
                                                    , **rs
                                                    , shuffle = True
                                                    , stratify = num_df_wtgt['mutation_class'])
y_train.to_frame().value_counts().plot(kind = 'bar')
y_test.to_frame().value_counts().plot(kind = 'bar')
# run the CV pipeline once and keep the returned scores (the earlier duplicate call was redundant)
skf_cv_scores = MultClassPipelineCV(X_train, X_test, y_train, y_test
                                    , input_df = num_df_wtgt[numerical_FN]
                                    , var_type = 'numerical')
pp.pprint(skf_cv_scores)
# construct a df
skf_cv_scores_df = pd.DataFrame(skf_cv_scores)
skf_cv_scores_df
skf_cv_scores_df_test = skf_cv_scores_df.filter(like='test_', axis=0)
skf_cv_scores_df_train = skf_cv_scores_df.filter(like='train_', axis=0)
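# note (not in the original): filter(..., axis=0) matches on the row index,
# so these two lines keep the test_* and train_* metric rows respectively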

base_estimator.py (new file, 236 lines)

@@ -0,0 +1,236 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 09:50:37 2022
@author: tanu
"""
#https://stackoverflow.com/questions/50272416/gridsearch-on-model-and-classifiers
#%%
# https://github.com/davidsbatista/machine-learning-notebooks/blob/master/hyperparameter-across-models.ipynb
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.base import BaseEstimator
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
#%%
class EstimatorSelectionHelper:
    def __init__(self, models, params):
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, **grid_kwargs):
        for key in self.keys:
            print('Running GridSearchCV for %s.' % key)
            model = self.models[key]
            params = self.params[key]
            grid_search = GridSearchCV(model, params, **grid_kwargs)
            grid_search.fit(X, y)
            self.grid_searches[key] = grid_search
        print('Done.')

    def score_summary(self, sort_by='mean_test_score'):
        frames = []
        for name, grid_search in self.grid_searches.items():
            frame = pd.DataFrame(grid_search.cv_results_)
            frame = frame.filter(regex='^(?!.*param_).*$')
            frame['estimator'] = len(frame)*[name]
            frames.append(frame)
        df = pd.concat(frames)
        df = df.sort_values([sort_by], ascending=False)
        df = df.reset_index()
        df = df.drop(['rank_test_score', 'index'], axis=1)  # axis keyword: positional axis is removed in newer pandas
        columns = df.columns.tolist()
        columns.remove('estimator')
        columns = ['estimator'] + columns
        df = df[columns]
        return df
#%%
breast_cancer = datasets.load_breast_cancer()
X_cancer = breast_cancer.data
y_cancer = breast_cancer.target
models1 = {
'ExtraTreesClassifier': ExtraTreesClassifier(),
'RandomForestClassifier': RandomForestClassifier(),
'AdaBoostClassifier': AdaBoostClassifier(),
'GradientBoostingClassifier': GradientBoostingClassifier()
}
params1 = {
'ExtraTreesClassifier': { 'n_estimators': [16, 32] },
'RandomForestClassifier': [
{ 'n_estimators': [16, 32] },
{'criterion': ['gini', 'entropy'], 'n_estimators': [8, 16]}],
'AdaBoostClassifier': { 'n_estimators': [16, 32] },
'GradientBoostingClassifier': { 'n_estimators': [16, 32], 'learning_rate': [0.8, 1.0] }
}
helper1 = EstimatorSelectionHelper(models1, params1)
helper1.fit(X_cancer, y_cancer, scoring='f1', n_jobs=2)
helper1.score_summary()
mm_df = helper1.score_summary()
# COMMENT: not sure which score this is the mean of, or which scoring options are available
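# Sketch (not in the original): mean_test_score in score_summary() is the CV mean of
# whatever metric was passed via `scoring` in helper1.fit() (here scoring='f1', so it
# is the mean F1 across folds). The built-in string scorers can be listed with
# (assumes sklearn >= 1.0 is installed):
from sklearn.metrics import get_scorer_names
print(sorted(get_scorer_names()))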
#%%
class ClfSwitcher(BaseEstimator):
    def __init__(self, estimator = SGDClassifier()):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)
parameters = [
{
'clf__estimator': [SGDClassifier()], # SVM if hinge loss / logreg if log loss
#'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
#'tfidf__stop_words': ['english', None],
'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
'clf__estimator__max_iter': [50, 80],
'clf__estimator__tol': [1e-4],
'clf__estimator__loss': ['hinge', 'log', 'modified_huber'],
},
{
'clf__estimator': [MultinomialNB()],
#'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
#'tfidf__stop_words': [None],
'clf__estimator__alpha': (1e-2, 1e-3, 1e-1),
},
]
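# note (not in the original): each dict above pins 'clf__estimator' to one model instance,
# which is what lets GridSearchCV swap whole classifiers in and out of the pipeline;
# the nested clf__estimator__<param> names then tune that model's own hyperparameters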
pipeline = Pipeline([
('pre', MinMaxScaler()),
('clf', ClfSwitcher()),
])
gscv = GridSearchCV(pipeline
, parameters
, cv=5
, n_jobs=12
, return_train_score=False
, verbose=3)
#gscv.fit(train_data, train_labels)
#%% my numerical data
X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
, num_df_wtgt['mutation_class']
, test_size = 0.33
, random_state = 2
, shuffle = True
, stratify = num_df_wtgt['mutation_class'])
y_train.to_frame().value_counts().plot(kind = 'bar')
y_test.to_frame().value_counts().plot(kind = 'bar')
#%%
gscv.fit(X_train, y_train)
print('Best model:\n', gscv.best_params_)
print("Best model's score:\n", gscv.best_score_)
gscv.score(X_test, y_test) # see how it does on test
#===========================================
mod_pred = gscv.predict(X_test)
fscore = f1_score(y_test, mod_pred)
fscore
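# A minimal sketch (not in the original) of a fuller test-set report for the refit best
# estimator, using the standard sklearn metrics:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, mod_pred))
print(classification_report(y_test, mod_pred))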
#%% same as above
# custom classifier
class MyClassifier(BaseEstimator):
    def __init__(self, classifier_type: str = 'SGDClassifier'):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param classifier_type: string - The switch for different classifiers
        """
        self.classifier_type = classifier_type

    def fit(self, X, y=None):
        if self.classifier_type == 'SGDClassifier':
            self.classifier_ = SGDClassifier()
        elif self.classifier_type == 'MultinomialNB':
            self.classifier_ = MultinomialNB()
        else:
            raise ValueError('Unknown classifier type.')
        self.classifier_.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.classifier_.predict(X)

    def score(self, X, y):
        return self.classifier_.score(X, y)  # was self.estimator, which MyClassifier never sets
pipeline = Pipeline([
('pre', MinMaxScaler())
#, ('clf', ClfSwitcher()
, ('clf', MyClassifier())
])
# For MyClassifier the only tunable parameter is the classifier_type switch;
# the 'clf__estimator__*' keys from the ClfSwitcher grid above do not exist on
# MyClassifier and would make GridSearchCV raise an invalid-parameter error.
parameter_space = {
    'clf__classifier_type': ['SGDClassifier', 'MultinomialNB']
}
search = GridSearchCV(pipeline , parameter_space, n_jobs=-1, cv=5)
search.fit(X_train, y_train)
print('Best model:\n', search.best_params_)
print("Best model's score:\n", search.best_score_)

practice_cv.py (new file, 244 lines)

@@ -0,0 +1,244 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 11:09:50 2022
@author: tanu
"""
# imports cover the names used later in this script
from numpy import mean
import pandas as pd
import pprint as pp
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_wine
from sklearn.model_selection import (KFold, StratifiedKFold, train_test_split,
                                     cross_validate, cross_val_score)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.metrics import (make_scorer, accuracy_score, f1_score, matthews_corrcoef,
                             precision_score, recall_score, roc_auc_score, jaccard_score)

wine = load_wine()
X_train, y_train = wine.data, wine.target

model = Pipeline([
    ('pre', StandardScaler()),
    ('knn', KNeighborsClassifier())
])
model.fit(X_train,y_train)
from sklearn.model_selection import cross_validate
val = cross_validate(model,X_train,y_train, cv = 10)
val['test_score'].mean()
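# note (not in the original): with no `scoring` argument cross_validate falls back on the
# estimator's own .score(), i.e. mean accuracy for classifiers, so the line above is the
# 10-fold CV accuracy of the scaled KNN pipeline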
my_mcc = make_scorer(matthews_corrcoef)
# for scoring in ({'accuracy' : make_scorer(accuracy_score)
# , 'fscore' : make_scorer(f1_score)
# , 'mcc' : make_scorer(matthews_corrcoef)
# , 'precision' : make_scorer(precision_score)
# , 'recall' : make_scorer(recall_score)
# , 'roc_auc' : make_scorer(roc_auc_score)
# , 'jaccard' : make_scorer(jaccard_score)
# }
# ,'accuracy', 'fscore', 'MCC', 'Precision', 'Recall', 'ROC_AUC', 'jaccard'):
scoring_fn = ({'accuracy' : make_scorer(accuracy_score)
, 'fscore' : make_scorer(f1_score)
, 'mcc' : make_scorer(matthews_corrcoef)
, 'precision' : make_scorer(precision_score)
, 'recall' : make_scorer(recall_score)
, 'roc_auc' : make_scorer(roc_auc_score)
#, 'jaccard' : make_scorer(jaccard_score)
})
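# Note (not in the original): the scorers above assume a binary target. Against a
# multiclass target such as load_wine (3 classes), f1/precision/recall need an explicit
# average; a sketch:
scoring_fn_multiclass = ({'accuracy'  : make_scorer(accuracy_score)
                          , 'fscore'    : make_scorer(f1_score, average = 'macro')
                          , 'precision' : make_scorer(precision_score, average = 'macro')
                          , 'recall'    : make_scorer(recall_score, average = 'macro')
                          })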
val2 = cross_validate(model,X_train,y_train, cv = 10
, scoring=('accuracy', 'f1', 'precision', 'recall', 'roc_auc' )
#, scoring=scoring_fn
, return_train_score=False)
val2
print(val2['test_f1'])
print(mean(val2['test_accuracy']))
print(mean(val2['test_f1']))
#print(mean(val2['train_f1']))
print(mean(val2['test_precision']))
#print(mean(val2['train_precision']))
print(mean(val2['test_recall']))
print(mean(val2['test_roc_auc']))
#%%
val3 = cross_validate(model
, X_train
, y_train
, cv = 10
, scoring = scoring_fn
, return_train_score=False)
val3
print(mean(val3['test_accuracy']))
print(mean(val3['test_fscore']))
print(mean(val3['test_mcc']))
print(mean(val3['test_precision']))
print(mean(val3['test_recall']))
print(mean(val3['test_roc_auc'])) # differs
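# Why 'roc_auc' differs here (a sketch, not from the original): the built-in 'roc_auc'
# string scorer ranks predictions with predict_proba / decision_function, whereas
# make_scorer(roc_auc_score) without needs_proba scores the hard class predictions,
# so the two numbers will not match.
roc_auc_proba = make_scorer(roc_auc_score, needs_proba = True)  # closer to the built-in 'roc_auc'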
#======================
# with CV.split
scores = []
scores
#best_svr = SVR(kernel='rbf')
model = Pipeline([
('pre', StandardScaler()),
('knn', KNeighborsClassifier())
])
cv = KFold(n_splits=10
#, random_state=42
#, shuffle=True)
)
for train_index, test_index in cv.split(num_df_wtgt[numerical_FN]
                                        , num_df_wtgt['mutation_class']):
    #print("Train Index: ", train_index, "\n")
    #print("Test Index: ", test_index)
    X_train, X_test = num_df_wtgt[numerical_FN].iloc[train_index], num_df_wtgt[numerical_FN].iloc[test_index]
    y_train, y_test = num_df_wtgt['mutation_class'].iloc[train_index], num_df_wtgt['mutation_class'].iloc[test_index]
    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))
mean(scores)
################
scores_skf = []
skf = StratifiedKFold(n_splits = 10
#, shuffle = True
#, **r
)
for train_index, test_index in skf.split(num_df_wtgt[numerical_FN]
                                         , num_df_wtgt['mutation_class']):
    #print("Train Index: ", train_index, "\n")
    #print("Test Index: ", test_index)
    X_train, X_test = num_df_wtgt[numerical_FN].iloc[train_index], num_df_wtgt[numerical_FN].iloc[test_index]
    y_train, y_test = num_df_wtgt['mutation_class'].iloc[train_index], num_df_wtgt['mutation_class'].iloc[test_index]
    model.fit(X_train, y_train)
    scores_skf.append(model.score(X_test, y_test))
mean(scores_skf)
val = cross_validate(model, X_train,y_train , cv = 10)
val['test_score'].mean()
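# Sketch (not in the original): for classifiers, cross_validate with cv=10 already uses
# StratifiedKFold(10) under the hood, so passing the skf object explicitly should give
# the same splits as the manual loop above:
val_skf = cross_validate(model
                         , num_df_wtgt[numerical_FN]
                         , num_df_wtgt['mutation_class']
                         , cv = skf)
print(round(val_skf['test_score'].mean(), 2))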
#%% compare loopity loop vs CV with SKF
rs = {'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(num_df_wtgt[numerical_FN]
, num_df_wtgt['mutation_class']
, test_size = 0.33
, **rs
, shuffle = True
, stratify = num_df_wtgt['mutation_class'])
log_reg = LogisticRegression(**rs)
nb = BernoulliNB()
knn = KNeighborsClassifier()
svm = SVC(**rs)
model_single_pipeline = Pipeline([
('pre', MinMaxScaler())
, ('model', log_reg)
#, ('model', nb)
#, ('model', knn)
])
skf_cv = cross_validate(model_single_pipeline
#, X_train
#, y_train
, num_df_wtgt[numerical_FN]
, num_df_wtgt['mutation_class']
, cv = 10
, scoring = scoring_fn
, return_train_score=True)
skf_cv
print(round(mean(skf_cv['test_accuracy']),2))
print(round(mean(skf_cv['test_fscore']),2))
print(round(mean(skf_cv['test_mcc']),2))
print(round(mean(skf_cv['test_precision']),2))
print(round(mean(skf_cv['test_recall']),2))
print(round(mean(skf_cv['test_roc_auc']),2)) # differs
# %% Extracting skf_cv mean values and assigning to a dict
models_single = [
('Logistic Regression' , log_reg)
#, ('Naive Bayes' , nb)
#, ('K-Nearest Neighbors', knn)
# , ('SVM' , svm)
]
foo_single = {}
for model_name, model in models_single:
    print(model_name)
    #model_name_dict = {'model_name': model_name}
    foo_single[model_name] = {}
    for key, value in skf_cv.items():
        print('\nkey:', key, '\nvalue:', value)
        print('\nmean value:', mean(value))
        foo_single[model_name][key] = round(mean(value), 2)
pp.pprint(foo_single)
foo_single_df = pd.DataFrame(foo_single)
foo_single_df
foo_single_df.filter(like='test_', axis=0)
# ONLY for a single score
cval_score = cross_val_score(model
, num_df_wtgt[numerical_FN]
, num_df_wtgt['mutation_class']
, scoring = 'f1_macro'
, cv=10)
print(cval_score)
print(round(mean(cval_score), 2))
# %% Running multiple model with CV
log_reg = LogisticRegression(**rs)
nb = BernoulliNB()
knn = KNeighborsClassifier()
svm = SVC(**rs)
models = [
('Logistic Regression' , log_reg)
, ('Naive Bayes' , nb)
, ('K-Nearest Neighbors', knn)
, ('SVM' , svm)
]
foo = {}
for model_name, model_fn in models:
    # print('\nModel_name:', model_name
    #       , '\nModel func:', model_fn
    #       , '\nList of models:', models)
    model_pipeline = Pipeline([
        ('pre'   , MinMaxScaler())
        , ('model' , model_fn)])
    print('Running model pipeline:', model_pipeline)
    skf_cv = cross_validate(model_pipeline
                            , X_train
                            , y_train
                            , cv = 10
                            , scoring = scoring_fn
                            , return_train_score = True)
    foo[model_name] = {}
    for key, value in skf_cv.items():
        print('\nkey:', key, '\nvalue:', value)
        print('\nmean value:', mean(value))
        foo[model_name][key] = round(mean(value), 2)
pp.pprint(foo)
# construct df
foo_df = pd.DataFrame(foo)
foo_df
scores_df = foo_df.filter(like='test_', axis=0)
a = pd.DataFrame(foo)
b = pd.DataFrame.from_dict(foo)
c = pd.DataFrame.from_records(foo)
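# Note (not in the original): pd.DataFrame(foo) and pd.DataFrame.from_dict(foo) build the
# same frame (model names as columns, CV metric names as the index); a sketch of getting
# the models as rows instead:
d = pd.DataFrame.from_dict(foo, orient = 'index')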