#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 16 11:56:57 2022

@author: tanu

Three small scikit-learn Pipeline examples:

1. StandardScaler -> LinearDiscriminantAnalysis, scored with 10-fold CV.
2. FeatureUnion(PCA, SelectKBest) -> LogisticRegression, scored with
   10-fold CV.
3. PCA -> StandardScaler -> DecisionTreeClassifier on the iris data,
   scored on a 25 % hold-out split.

Sections 1 and 2 download the Pima Indians diabetes CSV, so they need
network access.
"""
# source
# https://machinelearningmastery.com/automate-machine-learning-workflows-pipelines-python-scikit-learn/

###############################################
# Pipeline 1: Data Preparation and Modeling
###############################################
# Create a pipeline that standardizes the data then creates a model
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# load data
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
X = array[:, 0:8]  # first eight columns are the predictors
Y = array[:, 8]    # last column ('class') is the target

# create pipeline
estimators = [
    ('standardize', StandardScaler()),
    ('lda', LinearDiscriminantAnalysis()),
]
model = Pipeline(estimators)

# evaluate pipeline
# `seed` is kept for reference only: KFold rejects a random_state when
# shuffle is False (ValueError since scikit-learn 0.24), and the folds are
# deliberately left unshuffled here.
seed = 7
kfold = KFold(n_splits=10, random_state=None)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

###############################################
# Pipeline 2: Feature Extraction and Modeling
###############################################
# Create a pipeline that extracts features from the data then creates a model
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

# load data (reloaded so this section can be run on its own)
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8]

# create feature union: 3 principal components side by side with the
# 6 best-scoring original columns
features = [
    ('pca', PCA(n_components=3)),
    ('select_best', SelectKBest(k=6)),
]
feature_union = FeatureUnion(features)

# create pipeline
estimators = [
    ('feature_union', feature_union),
    ('logistic', LogisticRegression()),
]
model = Pipeline(estimators)

# evaluate pipeline
# Same unshuffled splitter as Pipeline 1: passing random_state=seed without
# shuffle=True raises ValueError on scikit-learn >= 0.24.
seed = 7
kfold = KFold(n_splits=10, random_state=None)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

#%%############################################################################
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier

# import some data within sklearn for iris classification
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Splitting data into train and testing part
# The 25 % of data is test size of the data
# NOTE: no random_state is passed, so the split (and therefore the printed
# accuracy) can differ from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# importing pipes for making the Pipe flow
from sklearn.pipeline import Pipeline

# pipe flow is :
# PCA(Dimension reduction to two) -> Scaling the data -> DecisionTreeClassification
pipe = Pipeline(
    [
        ('pca', PCA(n_components=2)),
        ('std', StandardScaler()),
        ('decision_tree', DecisionTreeClassifier()),
    ],
    verbose=True,  # print each step as it is fitted
)

# fitting the data in the pipe
pipe.fit(X_train, y_train)

# scoring data
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, pipe.predict(X_test)))