ML_AI_training/classification_names_params.py

264 lines
No EOL
9.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 18 09:47:48 2022
@author: tanu
"""
#%% Useful links
# https://stackoverflow.com/questions/41844311/list-of-all-classification-algorithms
# https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
# https://github.com/davidsbatista/machine-learning-notebooks/blob/master/hyperparameter-across-models.ipynb
# https://scikit-learn.org/stable/modules/svm.html#classification
# https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/ # [params]
# https://uk.mathworks.com/help/stats/hyperparameter-optimization-in-classification-learner-app.html [ algo]
# As a general rule of thumb, baseline models should be run on the dataset first. H2O AutoML and other AutoML packages do this, but here the aim is to do it with a scikit-learn Pipeline:
# https://codereview.stackexchange.com/questions/256934/model-pipeline-to-run-multiple-classifiers-for-ml-classification
# https://uk.mathworks.com/help/stats/hyperparameter-optimization-in-classification-learner-app.html
# QDA: https://www.geeksforgeeks.org/quadratic-discriminant-analysis/
# Baseline classifiers, each written next to the label it is reported under,
# so a label and its estimator cannot drift out of step.  The two public
# lists `names` and `classifiers` are derived from this single table and keep
# the same order.
_labelled_classifiers = [
    ("Nearest Neighbors", KNeighborsClassifier(3)),
    ("Linear SVM", SVC(kernel="linear", C=0.025)),
    ("RBF SVM", SVC(gamma=2, C=1)),
    ("Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0))),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
    (
        "Random Forest",
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    ),
    ("Neural Net", MLPClassifier(alpha=1, max_iter=1000)),
    ("AdaBoost", AdaBoostClassifier()),
    ("Naive Bayes", GaussianNB()),
    ("QDA", QuadraticDiscriminantAnalysis()),
]
names = [label for label, _ in _labelled_classifiers]
classifiers = [estimator for _, estimator in _labelled_classifiers]
# NOTE Logistic regression
# The choice of solver depends on the penalty chosen. Supported penalties by solver:
# newton-cg - [l2, none]
# lbfgs - [l2, none]
# liblinear - [l1, l2]
# sag - [l2, none]
# saga - [elasticnet, l1, l2, none]
# SVR?
# estimator=SVR(kernel='rbf')
# param_grid={
# 'C': [1.1, 5.4, 170, 1001],
# 'epsilon': [0.0003, 0.007, 0.0109, 0.019, 0.14, 0.05, 8, 0.2, 3, 2, 7],
# 'gamma': [0.7001, 0.008, 0.001, 3.1, 1, 1.3, 5]
# }
#%% Classification algorithms param grid
#%% LogisticRegression()
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# Two-step pipeline: min-max scaling followed by logistic regression.  `rs`
# and `njobs` are keyword dicts defined outside this section.
gs_lr = Pipeline((
    ('pre', MinMaxScaler()),
    ('clf', LogisticRegression(**rs, **njobs)),
))

# Search grid for the 'clf' step (keys use the `<step>__<param>` convention).
# Alternative, finer C grid: np.logspace(-4, 4, 50)
# NOTE(review): this is the full cross product of penalty and solver; not
# every pair is supported (see the solver/penalty table in the scikit-learn
# LogisticRegression docs), so some candidates are expected to fail at fit
# time.
gs_lr_params = dict(
    clf__C=[0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    clf__penalty=['l1', 'l2', 'elasticnet', 'none'],
    clf__solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
)
#%% DecisionTreeClassifier()
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
# Two-step pipeline: min-max scaling followed by a single decision tree.
# `njobs` is deliberately NOT unpacked here: DecisionTreeClassifier has no
# `n_jobs` parameter, so passing it raises TypeError at construction
# (assumes `njobs` is an `n_jobs` keyword dict defined elsewhere - TODO confirm).
gs_dt = Pipeline((
    ('pre', MinMaxScaler()),
    ('clf', DecisionTreeClassifier(**rs)),
))

# Search grid for the 'clf' step.
gs_dt_params = {
    'clf__max_depth': [2, 4, 6, 8, 10],
    'clf__criterion': ['gini', 'entropy'],
    # "auto" meant sqrt(n_features) for tree classifiers and has been removed
    # from recent scikit-learn releases; "sqrt" is the equivalent spelling
    # that works on old and new versions alike.
    'clf__max_features': ['sqrt', None],
    'clf__max_leaf_nodes': [10, 20, 30, 40],
}
#%% KNeighborsClassifier()
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
# Two-step pipeline: min-max scaling followed by k-nearest-neighbours.
# `rs` is deliberately NOT unpacked here: KNeighborsClassifier has no
# `random_state` parameter, so passing it raises TypeError at construction
# (assumes `rs` is a `random_state` keyword dict defined elsewhere - TODO confirm).
gs_knn = Pipeline((
    ('pre', MinMaxScaler()),
    ('clf', KNeighborsClassifier(**njobs)),
))

# Search grid for the 'clf' step.
# Alternative neighbour counts to consider: range(1, 21, 2)
gs_knn_params = {
    'clf__n_neighbors': [3, 7, 10],
    'clf__metric': ['euclidean', 'manhattan', 'minkowski'],
    'clf__weights': ['uniform', 'distance'],
}
#%% RandomForestClassifier()
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# Two-step pipeline: min-max scaling followed by a bootstrapped random forest
# that also records the out-of-bag score.
gs_rf = Pipeline((
    ('pre', MinMaxScaler()),
    ('clf', RandomForestClassifier(
        **rs,
        **njobs,
        bootstrap=True,
        oob_score=True,
    )),
))

# Search grid for the 'clf' step.
gs_rf_params = {
    'clf__max_depth': [4, 6, 8, 10, 12, 16, 20, None],
    'clf__class_weight': ['balanced', 'balanced_subsample'],
    'clf__n_estimators': [10, 100, 1000],
    'clf__criterion': ['gini', 'entropy'],
    # The grid used to be ['auto', 'sqrt'].  For forest classifiers 'auto' was
    # just another name for 'sqrt' (and is rejected by recent scikit-learn),
    # so every candidate was fitted twice.  'log2' replaces the duplicate so
    # the two entries are genuinely different settings.
    'clf__max_features': ['sqrt', 'log2'],
    'clf__min_samples_leaf': [2, 4, 8, 50],
    'clf__min_samples_split': [10, 20],
}
#%% XGBClassifier()
# https://stackoverflow.com/questions/34674797/xgboost-xgbclassifier-defaults-in-python
# Two-step pipeline: min-max scaling followed by an XGBoost classifier.
gs_xgb = Pipeline((
    ('pre', MinMaxScaler()),
    ('clf', XGBClassifier(**rs, **njobs)),
))

# Search grid for the 'clf' step.
# The former 'clf__min_samples_leaf' and 'clf__max_features' entries were
# removed: they are scikit-learn tree parameters, not XGBoost ones, so they
# only multiplied the number of fits without changing the model.
# TODO(review): if leaf-size / feature-subsampling tuning is wanted, the
# XGBoost analogues are `min_child_weight` and `colsample_bytree`.
gs_xgb_params = {
    'clf__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'clf__max_depth': [4, 6, 8, 10, 12, 16, 20],
}
#%% MLPClassifier()
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
# Two-step pipeline: min-max scaling followed by a multi-layer perceptron
# capped at 500 training iterations.
# `njobs` is deliberately NOT unpacked here: MLPClassifier has no `n_jobs`
# parameter (assumes `njobs` is an `n_jobs` keyword dict defined elsewhere -
# TODO confirm).
gs_mlp = Pipeline((
    ('pre', MinMaxScaler()),
    ('clf', MLPClassifier(**rs, max_iter=500)),
))

# Search grid for the 'clf' step.
# The former max_features / min_samples_leaf / min_samples_split entries were
# removed: they are decision-tree parameters that MLPClassifier does not
# have, so setting them on the pipeline raises "Invalid parameter".
gs_mlp_params = {
    # One hidden layer of 1, 2 or 3 units.  Written as one-element tuples:
    # `(1)` is just the integer 1, not a tuple.
    'clf__hidden_layer_sizes': [(1,), (2,), (3,)],
}
#%% RidgeClassifier()
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeClassifier.html
# Two-step pipeline: min-max scaling followed by a ridge classifier.
# `njobs` is deliberately NOT unpacked here: RidgeClassifier has no `n_jobs`
# parameter (assumes `njobs` is an `n_jobs` keyword dict defined elsewhere -
# TODO confirm).
gs_rc = Pipeline((
    ('pre', MinMaxScaler()),  # CHECK if it wants -1 to 1
    ('clf', RidgeClassifier(**rs)),
))

# Search grid for the 'clf' step: regularisation strength only.
gs_rc_params = {
    'clf__alpha': [0.1, 0.2, 0.5, 0.8, 1.0],
}
#%% SVC()
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
# Two-step pipeline: min-max scaling followed by a support vector classifier.
# `njobs` is deliberately NOT unpacked here: SVC has no `n_jobs` parameter
# (assumes `njobs` is an `n_jobs` keyword dict defined elsewhere - TODO confirm).
gs_svc = Pipeline((
    ('pre', MinMaxScaler()),  # CHECK if it wants -1 to 1
    ('clf', SVC(**rs)),
))

# Search grid for the 'clf' step.
gs_svc_params = {
    # The kernel list was previously closed with '}' instead of ']', which is
    # a SyntaxError that stopped the whole file from being imported.
    # 'precomputed' was dropped: that kernel expects X to already be an
    # (n_samples, n_samples) kernel matrix, which is not what a scaler fed
    # with feature rows produces.
    'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'clf__C': [50, 10, 1.0, 0.1, 0.01],
    'clf__gamma': ['scale', 'auto'],
}
#%% BaggingClassifier()
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html
# Two-step pipeline: min-max scaling followed by a bootstrapped bagging
# ensemble that also records the out-of-bag score.
gs_bdt = Pipeline((
    ('pre', MinMaxScaler()),  # CHECK if it wants -1 to 1
    ('clf', BaggingClassifier(
        **rs,
        **njobs,
        bootstrap=True,
        oob_score=True,
    )),
))

# Search grid for the 'clf' step.
# The former 'clf__gamma' entry was removed: gamma is an SVC parameter, not a
# BaggingClassifier one, so setting it on this pipeline is rejected.
gs_bdt_params = {
    'clf__n_estimators': [10, 100, 1000],
    # Candidates must be estimator objects (or None), not their names as
    # strings.  None selects the default base estimator, a decision tree.
    # NOTE(review): scikit-learn 1.2 renamed this parameter to `estimator`
    # and later releases drop the old name - adjust the key to the installed
    # version.
    'clf__base_estimator': [None, SVC(), KNeighborsClassifier()],
}
#%% GradientBoostingClassifier()
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
# Two-step pipeline: min-max scaling followed by gradient-boosted trees.
gs_gb = Pipeline((
    ('pre', MinMaxScaler()),  # CHECK if it wants -1 to 1
    ('clf', GradientBoostingClassifier(**rs)),
))

# Search grid for the 'clf' step.
# Bound to `gs_gb_params` to match the `gs_gb` pipeline; it was previously
# assigned to `gs_bdt_params`, silently replacing the BaggingClassifier grid.
# The duplicated 'clf__n_estimators' key has been collapsed to one entry.
gs_gb_params = {
    'clf__n_estimators': [10, 100, 1000],
    'clf__learning_rate': [0.001, 0.01, 0.1],
    'clf__subsample': [0.5, 0.7, 1.0],
    'clf__max_depth': [3, 7, 9],
}
#%% AdaBoostClassifier()
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html#sklearn.ensemble.AdaBoostClassifier
# Two-step pipeline: min-max scaling followed by AdaBoost.
# Bound to its own names (`gs_abc`, `gs_abc_params`); it previously reused
# `gs_gb` and `gs_bdt_params`, overwriting the gradient-boosting pipeline and
# the bagging grid.
gs_abc = Pipeline((
    ('pre', MinMaxScaler()),  # CHECK if it wants -1 to 1
    ('clf', AdaBoostClassifier(**rs)),
))

# Search grid for the 'clf' step.
gs_abc_params = {
    # `none` was an undefined name (NameError) and n_estimators must be an
    # integer; the library default of 50 stands in for it.
    # TODO(review): confirm the intended values.
    'clf__n_estimators': [1, 2, 50],
    # Key was misspelt 'clf__base_estiamtor'.  Candidates must be estimator
    # objects or None (None = default decision stump); `1*SVC()` is a
    # TypeError and 'None' is just a string.
    # - SVC is built with probability=True so predict_proba is available for
    #   scikit-learn releases whose default boosting algorithm needs it.
    # - KNeighborsClassifier was dropped: AdaBoost requires base estimators
    #   that accept sample weights, which KNN's fit() does not.
    # NOTE(review): scikit-learn 1.2 renamed this parameter to `estimator`
    # and later releases drop the old name - adjust the key to the installed
    # version.
    'clf__base_estimator': [None, SVC(probability=True)],
    # , 'clf___splitter' : ["best", "random"]
}
#%% GaussianProcessClassifier()
# https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessClassifier.html
# Baseline equivalent used earlier in the file: GaussianProcessClassifier(1.0 * RBF(1.0))
# Two-step pipeline: min-max scaling followed by a Gaussian-process classifier.
gs_gpc = Pipeline((
    ('pre', MinMaxScaler()),  # CHECK if it wants -1 to 1
    ('clf', GaussianProcessClassifier(**rs)),
))

# Search grid for the 'clf' step: one candidate per kernel family, each
# built with its default settings and multiplied by the constant 1.
gs_gpc_params = {
    'clf__kernel': [
        1 * kernel_cls()
        for kernel_cls in (RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel)
    ],
}
#%% GaussianNB()
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
# Three-step pipeline: min-max scaling, PCA, then Gaussian naive Bayes.
# `rs` is deliberately NOT unpacked into GaussianNB: it has no `random_state`
# parameter, so passing it raises TypeError at construction (assumes `rs` is
# a `random_state` keyword dict defined elsewhere - TODO confirm).
gs_gnb = Pipeline((
    ('pre', MinMaxScaler()),
    ('pca', PCA()),  # CHECK if it wants -1 to 1
    ('clf', GaussianNB()),
))

# Search grid for the 'clf' step: 100 log-spaced smoothing values from 1 down
# to 1e-9, with class priors left to be estimated from the data.
gs_gnb_params = {
    'clf__priors': [None],
    'clf__var_smoothing': np.logspace(0, -9, num=100),
}
#%% QuadraticDiscriminantAnalysis()
# https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html
# Two-step pipeline: min-max scaling followed by QDA with default settings.
# No search grid is defined for this estimator in this section.
# An optional ('pca', PCA()) step between the two is currently disabled
# (CHECK if it wants -1 to 1).
gs_qda = Pipeline((
    ('pre', MinMaxScaler()),
    ('clf', QuadraticDiscriminantAnalysis()),
))
#%% BernoulliNB()
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html
# Two-step pipeline: min-max scaling followed by Bernoulli naive Bayes.
# Bound to its own names (`gs_bnb`, `gs_bnb_params`); it previously reused
# `gs_gnb` / `gs_gnb_params`, overwriting the GaussianNB pipeline and grid.
# Constructor defaults, for reference:
#   BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
# (this used to be a stray call whose result was thrown away).
gs_bnb = Pipeline((
    ('pre', MinMaxScaler()),
    ('clf', BernoulliNB()),
))

# Search grid for the 'clf' step.
gs_bnb_params = {
    'clf__alpha': [0, 1],
    # None (the object, not the string 'None') means the input is taken to
    # be binary already; 0 thresholds every feature at zero.
    'clf__binarize': [None, 0],
    'clf__fit_prior': [True],
    # None lets the priors be learned from the data; the string 'None' is not
    # a valid prior array.
    'clf__class_prior': [None],
}