#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 17 14:52:55 2022

@author: tanu
"""
from sklearn.datasets import load_boston
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

print(np.__version__)
print(pd.__version__)
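
# NOTE (added): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so the load_boston() calls below assume an older scikit-learn. On newer versions
# the same data can be pulled from the original source, e.g. (sketch, not part of
# the original script, followed by some reshaping of the alternating rows):
#   data_url = "http://lib.stat.cmu.edu/datasets/boston"
#   raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)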

boston = load_boston()
dir(boston)
# ['DESCR', 'data', 'data_module', 'feature_names', 'filename', 'target']
X, y = boston.data, boston.target
df = pd.DataFrame(X, columns = boston.feature_names)
df['MEDV'] = y

sns.scatterplot(x = 'CRIM', y = 'MEDV', data = df)
plt.show()

# Model fitting
# To fit a model using just a single predictor we first extract the training variables.
X_train = df['CRIM']
y_train = y

# Unfortunately, sklearn's various model fitting functions typically expect a
# two-dimensional array for the covariates. Since we have extracted only a
# single feature here, it is only one-dimensional, so we need to reshape the
# X_train values to the appropriate shape. This is not necessary if using more
# than a single feature.
if len(X_train.values.shape) == 1:
    X_train = X_train.values.reshape(-1, 1)

# Create a LinearRegression object: this object belongs to the broader class of
# estimator objects.
model = linear_model.LinearRegression()
model.fit(X_train, y_train)

# We can make predictions from our fitted model with the .predict() method.
new_value = np.array(4.09, ndmin = 2)
model.predict(new_value)
multiple_values = np.array([1, 2, 3], ndmin = 2).T
model.predict(multiple_values)

# Fitted values
# The fitted values of a model are the predicted ŷ for the observations X.
# To get the model fitted values we can simply predict from the model using the
# values used to train it.
fitted = model.predict(X_train)
ax = sns.scatterplot(x = 'CRIM', y = 'MEDV', data = df)
sns.lineplot(x = df['CRIM'], y = fitted, ax = ax)
plt.show()

# Interpreting the coefficients
# The coefficients of the fitted model are kept in the model.coef_ attribute.
# This gives us the expected change in y for a unit change in X.
model.coef_
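
# Added illustration (names below are not in the original): a prediction is just
# intercept_ + coef_ * x, so the single entry of coef_ is the expected change in
# MEDV for a one-unit increase in CRIM.
manual_pred = model.intercept_ + model.coef_[0] * 4.09
# manual_pred matches model.predict(np.array(4.09, ndmin = 2)) from above.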

# 2.3 Multiple linear regression
X_train = df.iloc[:, :3]
grid = sns.PairGrid(data = pd.concat([X_train, pd.Series(y_train, name = "MEDV")], axis = 1))
grid.map_offdiag(sns.scatterplot)
grid.map_diag(sns.histplot)  # sns.distplot is deprecated; histplot draws the diagonal histograms
plt.show()
model.fit(X_train, y_train)
new_values = np.array(X_train.mean(), ndmin = 2)
model.predict(new_values)

# Residuals
# In classical statistics, one of our assumptions is that the residuals are
# normally distributed. A small RSS (residual sum of squares) implies the
# fitted model is closer to the observations.

fitted = model.predict(X_train)
resid = y_train - fitted
# Standardise to remove the effect of measurement scale
resid = (resid - np.mean(resid)) / np.std(resid, ddof = 1)
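
# Added illustration: the RSS mentioned above, computed from the raw
# (unstandardised) residuals.
rss = np.sum((y_train - fitted)**2)
print(rss)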

plt.figure()
for i in range(3):
    xvar = X_train.iloc[:, i]
    ax = plt.subplot(3, 1, i + 1)
    ax.scatter(xvar, resid)
    ax.set_xlabel(boston.feature_names[i])
    ax.set_ylabel("Residuals")
    ax.hlines([-2, 0, 2], np.min(xvar), np.max(xvar))

plt.show()

plt.figure()
ax = plt.subplot(3, 1, 1)
ax.scatter(fitted, resid)
ax.set_xlabel('Fitted values')
ax.set_ylabel('Residuals')

ax = plt.subplot(3, 1, 2)
ax.scatter(fitted, y_train)
ax.set_xlabel('Fitted values')
ax.set_ylabel('Observed values')

ax = plt.subplot(3, 1, 3)
import scipy.stats as stats
stats.probplot(resid, dist = 'norm', plot = ax)
plt.show()

# Scaling data: many types available
# sklearn comes with many preprocessing transformations in the sklearn.preprocessing module.
# Scaling is crucial for many statistical and machine learning algorithms:
#   - k-means and hierarchical clustering: data units and variance play a crucial
#     role in cluster selection
#   - gradient descent optimisation: scaled data allows the weights to update at
#     an equal speed
#   - scaled data allows the regression coefficients to be compared

#########################################################
# Min-max scaling
# DOESN'T change the shape of the distribution
# DOES change the bounds, mean and sd
# NOT often used in linear regression,
# used more in gradient descent optimisation
#
# The sklearn.preprocessing module has a MinMaxScaler() for this
##########################################################

np.random.seed(1)
x_n = np.random.normal(2, 5, 500)
x_t = np.random.standard_t(2, 500)
x_ln = np.random.lognormal(1, 1, 500)
df = pd.DataFrame({'Normal': x_n, 'T': x_t, 'Lognormal': x_ln})

df_long = df.melt(var_name='Distribution')
g = sns.FacetGrid(df_long, col='Distribution', sharex=False)
g.map(plt.hist, 'value', bins = 50)
plt.show()

def min_max(x):
    # scale to [0, 1]; x_min avoids shadowing the built-in min()
    x_min = np.min(x)
    return (x - x_min) / (np.max(x) - x_min)

scaled = df.apply(min_max).melt(var_name='Distribution')

scaled['Scaled'] = True
df_long['Scaled'] = False
full_data = pd.concat([df_long, scaled], axis=0)

g = sns.FacetGrid(full_data, col='Distribution', row='Scaled',
                  sharex=False, sharey=False)
g.map(plt.hist, 'value', bins = 50)
plt.show()

df.apply([np.mean, np.std])
df.apply(min_max).apply([np.mean, np.std])

# sklearn: MinMaxScaler()
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_train_scaled[:1]

##########################################################
# z-score standardisation
# DOESN'T change the shape of the distribution
# Popular in linear models
# DOESN'T affect the predictions,
# but makes the sizes of the coefficients directly comparable
#
# The sklearn.preprocessing module has a StandardScaler() for this
##########################################################

def z_score(x):
    mean = np.mean(x)
    std = np.std(x, ddof=1)
    return (x - mean) / std

scaled = df.apply(z_score).melt(var_name='Distribution')
scaled['Scaled'] = True
full_data = pd.concat([df_long, scaled], axis=0)
g = sns.FacetGrid(full_data, col='Distribution', row='Scaled',
                  sharex=False, sharey=False)
g.map(plt.hist, 'value', bins=50)
plt.show()

###############################################
# Dividing by two standard deviations
# http://www.stat.columbia.edu/~gelman/research/published/standardizing7.pdf
#
# One of the downsides of scaling data by z-scoring is that it is not obvious
# how this should be handled in the case of categorical variables. Gelman
# suggests a rescaling that divides numeric variables by two standard
# deviations, while leaving binary-encoded categorical variables untransformed.
# There is nothing in sklearn for this, so we write our own transformer.
###############################################
from sklearn.base import BaseEstimator, TransformerMixin

class two_sd_scaler(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.stds = 2 * np.std(X, axis=0, ddof=1)
        return self

    def transform(self, X, y=None):
        return X / self.stds
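
# Added usage sketch (not in the original): the custom scaler follows the usual
# fit/transform API, so it can be used like any other sklearn transformer.
# X_train here is still the numeric Boston subset from above.
two_sd = two_sd_scaler()
X_train_2sd = two_sd.fit_transform(X_train)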

# Having preprocessed the data this way we can now fit a model to it in the
# same way as before (here using the min-max scaled X_train_scaled from above).
model2 = linear_model.LinearRegression()
model2.fit(X_train_scaled, y_train)

# When making predictions on new values we also need to pass the new values
# through the same preprocessing step.
new_value = np.array(X_train.mean(), ndmin = 2)
new_scaled = scaler.transform(new_value)
pred = model2.predict(new_scaled)
pred

##########################
# 2.5 Creating a pipeline
##########################
# For any training data set and any data for prediction we will want to apply
# the same scaling transformation and use the same model. We can create a
# sklearn.pipeline.Pipeline() to organise the steps that create the estimator.

from sklearn.pipeline import Pipeline
model = Pipeline(steps = [('preprocess', preprocessing.StandardScaler()),
                          ('regression', linear_model.LinearRegression())])

# Having created the Pipeline object we can fit as before. Calling .fit() now,
# however, will first fit the 'preprocess' step and then the 'regression' step.
# When we predict, the new values will also pass through both stages of the
# pipeline.
model.fit(X_train, y_train)
new_values = np.array(X_train.mean(), ndmin = 2)
model.predict(new_values)

# from sklearn.metrics import accuracy_score
# print(accuracy_score(y_test, model.predict(X_test)))

# 2.6 Preprocessing categorical variables
# One-hot encoding takes a categorical feature with K categories and creates a
# 'one of K' encoding scheme, i.e. a set of binary variables, one per category.
# Consider the toy data:

toy = pd.DataFrame({
    'category': ['a', 'a', 'b', 'c', 'b']
})
enc = preprocessing.OneHotEncoder()
enc.fit(toy)
enc.transform(toy).toarray()
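# Added note: OneHotEncoder sorts the categories, so the columns above are
# (a, b, c) and the rows come out as [1,0,0], [1,0,0], [0,1,0], [0,0,1], [0,1,0].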

# Combining preprocessing steps: we can combine the preprocessing steps into a
# single operation for our Pipeline using a sklearn.compose.ColumnTransformer.
toy = pd.DataFrame({
    'numeric': [1., 2., 3., 4., 5.],
    'category': ['a', 'a', 'b', 'c', 'b']
})

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
numeric_features = ['numeric']
categorical_features = ['category']
preprocessor = ColumnTransformer(transformers=[('num', StandardScaler(), numeric_features),
                                               ('cat', OneHotEncoder(), categorical_features)])

# Fit the preprocessor to the data; the transformer will then apply the
# appropriate preprocessing to each type of variable.
preprocessor.fit(toy)
preprocessor.transform(toy)

# This preprocessing step can then be a step in the pipeline for a regression.
model = Pipeline(steps = [('preprocess', preprocessor),
                          ('regression', linear_model.LinearRegression())])

# Model Assessment and Feature Selection

#%%#####################################################################

# Accuracy score is only for classification problems. For regression problems
# you can use: R2 score, MSE (mean squared error), RMSE (root mean squared error).
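
# Added sketch: regression analogues of accuracy_score, illustrated on the Boston
# fit from above (model2 trained on X_train_scaled). The name reg_pred is
# illustrative and not part of the original script.
from sklearn.metrics import r2_score, mean_squared_error
reg_pred = model2.predict(X_train_scaled)
print(r2_score(y_train, reg_pred))                      # R2
print(mean_squared_error(y_train, reg_pred))            # MSE
print(np.sqrt(mean_squared_error(y_train, reg_pred)))   # RMSE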

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn import preprocessing

# read data
iris = datasets.load_iris()

# assign X and y
X = iris.data
y = iris.target

# split data into training and test parts (25% of the data is held out as the test set)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

# preprocess the data

# scaling: fit the scaler on X_train, then apply the transformation to the data
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

# Choose the required model(s)
model2 = linear_model.LinearRegression()  # classification metrics can't handle a mix of multiclass and continuous targets
model2 = DecisionTreeClassifier()

# fit the model to the (scaled) training data
model2.fit(X_train_scaled, y_train)

# check model performance: the test data must go through the same scaling step
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, model2.predict(scaler.transform(X_test))))

# When making predictions on new values we also need to pass the new values
# through the same preprocessing step. X_train is now a NumPy array, so take the
# per-feature mean with axis=0.
new_value = np.array(X_train.mean(axis = 0), ndmin = 2)
new_scaled = scaler.transform(new_value)
pred = model2.predict(new_scaled)
pred

# Or create a pipeline that transforms the data and then fits a model.

# make a pipeline:
# PCA (dimension reduction to two components) -> scaling -> DecisionTreeClassifier
# (note: scaling is more commonly applied *before* PCA; this pipeline applies PCA first)
pipe1 = Pipeline([('pca', PCA(n_components = 2)),
                  ('std', StandardScaler()),
                  ('decision_tree', DecisionTreeClassifier())],
                 verbose = True)

pipe2 = Pipeline(steps = [('preprocess', preprocessing.StandardScaler()),
                          # ('regression', linear_model.LinearRegression()),
                          ('rf', RandomForestClassifier())])

# fit the pipelines to the TRAINING data [X_train and y_train]
pipe1.fit(X_train, y_train)
pipe2.fit(X_train, y_train)

# model predictions on the TEST data [X_test and y_test]
print(accuracy_score(y_test, pipe1.predict(X_test)))
print(accuracy_score(y_test, pipe2.predict(X_test)))

# classification_report is a function in sklearn.metrics, not a Pipeline method,
# and .predict() already returns class labels (no argmax needed).
from sklearn.metrics import classification_report
print(classification_report(y_test, pipe2.predict(X_test)))

enc = preprocessing.OneHotEncoder()
enc.fit(X_train)
enc.transform(X_train).toarray()

#%%
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

boston = load_boston()

X_train, y_train = pd.DataFrame(boston.data, columns = boston.feature_names), boston.target

model1 = Pipeline(steps = [
    ('pre', MinMaxScaler()),
    ('reg', LinearRegression())])

score_fn = make_scorer(mean_squared_error)
scores = cross_validate(model1, X_train, y_train,
                        scoring = score_fn,
                        cv = 10)
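
# Added note: make_scorer defaults to greater_is_better=True, so the values in
# scores['test_score'] are raw MSEs where smaller is better. The built-in
# scoring='neg_mean_squared_error' would be an equivalent alternative (with the
# sign flipped).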

from itertools import combinations

def train(X):
    # cv is not specified here, so cross_validate falls back to its default 5-fold split
    return cross_validate(model1, X, y_train,
                          scoring = score_fn,
                          # return_train_score = False,
                          return_estimator = True)['test_score']

# Every subset of 12 of the 13 features, i.e. leaving one feature out at a time.
scores = [train(X_train.loc[:, cols]) for cols in combinations(X_train.columns, 12)]
means = [score.mean() for score in scores]
means
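
# Added sketch (illustrative names, not in the original): identify which
# 12-feature subset achieved the smallest mean cross-validated MSE.
subsets = list(combinations(X_train.columns, 12))
best_subset = subsets[int(np.argmin(means))]
print(best_subset)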