#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 16 11:56:57 2022

@author: tanu
"""

# source
#https://machinelearningmastery.com/automate-machine-learning-workflows-pipelines-python-scikit-learn/

###############################################
# Pipeline 1: Data Preparation and Modeling
###############################################

# Standardize the features, fit a linear discriminant analysis model on them,
# and score the two steps together with 10-fold cross-validation.
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# load data
# Passing names= makes read_csv treat the file as having no header row.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
# The first eight columns are the inputs; the ninth ('class') is the target.
X, Y = array[:, 0:8], array[:, 8]

# create pipeline
# Because the scaler is a pipeline step, cross_val_score refits it on the
# training part of every fold rather than on the full dataset.
estimators = [
    ('standardize', StandardScaler()),
    ('lda', LinearDiscriminantAnalysis()),
]
model = Pipeline(estimators)

# evaluate pipeline
# seed is not passed to KFold: the folds are not shuffled (the default), and
# scikit-learn rejects a random_state when shuffle is False.
seed = 7
kfold = KFold(n_splits=10)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

###############################################
# Pipeline 2: Feature Extraction and Modeling
###############################################

# Create a pipeline that extracts features from the data then creates a model.
# Every name used in this section is imported here so the section can be run
# on its own, without first executing the Pipeline 1 section.
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

# load data
# Passing names= makes read_csv treat the file as having no header row.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
# The first eight columns are the inputs; the ninth ('class') is the target.
X = array[:, 0:8]
Y = array[:, 8]

# create feature union
# The union concatenates the outputs of its transformers column-wise:
# 3 principal components plus the 6 columns kept by SelectKBest.
features = []
features.append(('pca', PCA(n_components=3)))
features.append(('select_best', SelectKBest(k=6)))
feature_union = FeatureUnion(features)

# create pipeline
estimators = []
estimators.append(('feature_union', feature_union))
estimators.append(('logistic', LogisticRegression()))
model = Pipeline(estimators)

# evaluate pipeline
seed = 7
# shuffle=True is required for random_state to take effect; current
# scikit-learn raises ValueError when a seed is given with shuffle=False.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

#%%############################################################################
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Iris classification on the dataset that ships with sklearn
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Hold out 25 % of the samples as the test set.
# No random_state is passed, so the split (and the printed accuracy) can
# differ from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Pipe flow:
# PCA (reduce to two components) -> scale the data -> decision tree classifier
# NOTE(review): PCA runs on the unscaled features here; scaling before PCA is
# the more common order -- confirm this ordering is intended.
pipe = Pipeline(
    steps=[
        ('pca', PCA(n_components=2)),
        ('std', StandardScaler()),
        ('decision_tree', DecisionTreeClassifier()),
    ],
    verbose=True,  # report each step as it is fitted
)

# fit every step on the training split
pipe.fit(X_train, y_train)

# score on the held-out split
predictions = pipe.predict(X_test)
print(accuracy_score(y_test, predictions))