#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 20 13:02:54 2022
@author: tanu
"""
# https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/
#%% LogisticRegression
# example of grid searching key hyperparameters for logistic regression
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
# define dataset
X, y = make_blobs(n_samples=1000, centers=2, n_features=100, cluster_std=20)
# define models and parameters
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers, penalty=penalty, C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)
grid_result = grid_search.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
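# The "summarize results" block above is repeated verbatim in every cell of this
# script. A small helper like the hypothetical one below (not part of the original
# script) could factor it out; each cell could then simply call
# summarize_results(grid_result).
def summarize_results(grid_result):
    """Print the best score/params and each candidate's mean (std) test score."""
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))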
#%% RidgeClassifier
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeClassifier
# define dataset
X, y = make_blobs(n_samples=1000, centers=2, n_features=100, cluster_std=20)
# define models and parameters
model = RidgeClassifier()
alpha = [0.9, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.1, 1.0]
# define grid search
grid = dict(alpha=alpha)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)
grid_result = grid_search.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# NOTES:
# alpha: if all alphas return the same mean test score, which one is chosen?
# GridSearchCV breaks such ties in favour of the first candidate in grid order
# (best_index_ comes from the ranking of mean_test_score, and tied candidates
# keep their original order).
# https://stats.stackexchange.com/questions/166950/alpha-parameter-in-ridge-regression-is-high
# The L2 penalty term in ridge regression is weighted by the regularization
# parameter alpha, so alpha need not be small; but the larger the alpha, the
# less flexible the fit.
# With alpha = 0 the model is just an ordinary least squares regression model.
# The larger the alpha, the stronger the smoothness constraint; the smaller the
# alpha, the larger the magnitude of the coefficients. It could be that the
# model does not fit very well: with a very large alpha, the algorithm more or
# less ignores the independent variables and fits a mean. -- Placidia
# @Placidia, yes, I completely agree with your comment. I was just trying to
# explain the significance of alpha as a parameter (as asked in the question)
# in ridge regression, and how changing it affects the fit and the coefficients.
# Thank you for including the point in the comment.
# ** READ: https://machinelearningcompass.com/machine_learning_models/ridge_regression/
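# A minimal sketch (not in the original) illustrating the note above: a larger
# alpha shrinks the coefficients harder, a smaller alpha leaves them larger.
# It reuses the make_blobs X, y defined in this cell; the alpha values shown
# are illustrative choices.
import numpy as np
for a in [0.01, 1.0, 100.0]:
    clf = RidgeClassifier(alpha=a).fit(X, y)
    print("alpha=%-6g ||coef||_2 = %.4f" % (a, np.linalg.norm(clf.coef_)))
# Expect the coefficient norm to shrink as alpha grows (stronger L2 penalty).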
#%% KNeighborsClassifier
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
# define dataset
X, y = make_blobs(n_samples=1000, centers=2, n_features=100, cluster_std=20)
# define models and parameters
model = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']
#p = [1,2]
# define grid search
grid = dict(n_neighbors=n_neighbors, weights=weights, metric=metric)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)
grid_result = grid_search.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# NOTES:
# https://vitalflux.com/k-nearest-neighbors-explained-with-python-examples/
# https://vitalflux.com/overfitting-underfitting-concepts-interview-questions/
# Larger value of K ==> model may underfit
# Smaller value of K ==> the model may overfit.
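# A minimal sketch (not in the original) illustrating the note above: a very
# small k tends to overfit (train accuracy near 1.0, lower test accuracy), a
# very large k tends to underfit. It reuses the make_blobs X, y from this cell;
# the train/test split and the k values are illustrative choices.
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=1)
for k in [1, 5, 101]:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_tr, y_tr)
    print("k=%-3d train acc=%.3f test acc=%.3f"
          % (k, knn.score(X_tr, y_tr), knn.score(X_te, y_te)))
# k=1 memorises the training set; a very large k smooths the decision boundary
# towards the majority class.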
#%% Support Vector Machine (SVM)
# example of grid searching key hyperparameters for SVC
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
# define dataset
X, y = make_blobs(n_samples=1000, centers=2, n_features=100, cluster_std=20)
# define model and parameters
model = SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]
gamma = ['scale']
# define grid search
grid = dict(kernel=kernel, C=C, gamma=gamma)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)
grid_result = grid_search.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# NOTES:
# https://stats.stackexchange.com/questions/31066/what-is-the-influence-of-c-in-svms-with-linear-kernel
# SVM terms: hyperplane, C and soft margins
# SVM looks for the hyperplane that maximises the margin, i.e. the distance of
# the closest training points (the support vectors) from the hyperplane.
# High C ==> less regularisation ==> more risk of overfitting
# Low C  ==> more regularisation ==> more risk of underfitting
# But if C is a regularization parameter, why does a high C increase
# overfitting, when regularization is generally done to mitigate overfitting,
# i.e. to create a more general model?
# C is a regularisation parameter, but it is essentially attached to
# the data misfit term (the sum of the slack variables) rather than to
# the regularisation term (the margin bit), so a larger value of C
# means less regularisation, rather than more. Alternatively, the usual
# representation of the regularisation parameter can be viewed as 1/C.
# C is a regularization parameter that controls the trade-off between
# achieving a low training error and a low testing error, i.e. the ability
# to generalize the classifier to unseen data.
# The C parameter also controls how outliers are handled:
# low C  ==> we allow more outliers (more margin violations are tolerated)
# high C ==> we allow fewer outliers.
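# A minimal sketch (not in the original) illustrating the notes above: a small C
# means more regularisation (a softer margin with more tolerated violations and
# more support vectors), a large C penalises misclassification harder and fits
# the training data more closely. It reuses the make_blobs X, y from this cell;
# the C values are illustrative choices.
for c in [0.01, 1.0, 100.0]:
    svc = SVC(kernel='rbf', gamma='scale', C=c).fit(X, y)
    print("C=%-6g n_support per class=%s train acc=%.3f"
          % (c, svc.n_support_, svc.score(X, y)))
# Expect fewer support vectors as C grows, and a training accuracy that
# typically rises (less regularisation, closer fit to the training data).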