
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 17 14:52:55 2022
@author: tanu
"""
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2;
# this script assumes an older scikit-learn version where it is still available.
from sklearn.datasets import load_boston
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
print(np.__version__)
print(pd.__version__)
boston = load_boston()
dir(boston)
#['DESCR', 'data', 'data_module', 'feature_names', 'filename', 'target']
X, y = boston.data, boston.target
df = pd.DataFrame(X, columns = boston.feature_names)
df['MEDV'] = y
sns.scatterplot(x = 'CRIM', y = 'MEDV', data = df)
plt.show()
#Model fitting
#To fit a model using just a single predictor we first extract the training variables.
X_train = df['CRIM']
y_train = y
# Unfortunately, sklearn's various model fitting functions typically expect a
# two-dimensional array for the covariates. Since we have extracted only
# a single feature here, it is one-dimensional. We need to reshape the
# X_train values to the appropriate shape.
# This is not necessary when using more than a single feature.
if len(X_train.values.shape) == 1:
    X_train = X_train.values.reshape(-1, 1)
# Create a LinearRegression object: this object belongs to a broader class of
# estimator objects.
model = linear_model.LinearRegression()
model.fit(X_train, y_train)
# We can make predictions from our fitted model with the .predict() method.
new_value = np.array(4.09, ndmin = 2)
model.predict(new_value)
multiple_values = np.array([1, 2, 3], ndmin = 2).T
model.predict(multiple_values)
#Fitted values
#The fitted values of a model are the predicted ŷ for the observations X.
#To get the fitted values we can simply predict from the model using the
#values used to train it.
fitted = model.predict(X_train)
ax = sns.scatterplot(x = 'CRIM', y = 'MEDV', data = df)
sns.lineplot(x = df['CRIM'], y = fitted, ax = ax)  # keyword args; positional x/y are removed in newer seaborn
plt.show()
# Interpreting the coefficients
# The coefficients of the fitted model are kept in the model.coef_ attribute.
# This gives us the expected change in y for a unit change in X.
model.coef_
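# The intercept is stored separately in model.intercept_. Together they give
# the fitted line MEDV ≈ intercept + coef * CRIM (a sketch of how to read the
# fit, using the objects defined above):
print(model.intercept_, model.coef_[0])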
#2.3 Multiple linear regression
X_train = df.iloc[:,:3]
grid = sns.PairGrid(data=pd.concat([X_train,pd.Series(y_train,name="MEDV")],axis = 1))
grid.map_offdiag(sns.scatterplot)
grid.map_diag(sns.histplot)  # distplot is deprecated in recent seaborn
plt.show()
model.fit(X_train, y_train)
new_values = np.array(X_train.mean(), ndmin = 2)
model.predict(new_values)
#Residuals
#In classical statistics, one of our assumptions is that the residuals are
#normally distributed. A small RSS implies the fitted model is closer to
#the observations.
fitted = model.predict(X_train)
resid = y_train - fitted
# Standardise to remove effect of measurement scale
resid = (resid - np.mean(resid))/np.std(resid,ddof = 1)
plt.figure()
for i in range(3):
    xvar = X_train.iloc[:,i]
    ax = plt.subplot(3, 1, i + 1)
    ax.scatter(xvar, resid)
    ax.set_xlabel(boston.feature_names[i])
    ax.set_ylabel("Residuals")
    ax.hlines([-2, 0, 2], np.min(xvar), np.max(xvar))
plt.show()
plt.figure()
ax = plt.subplot(3, 1, 1)
ax.scatter(fitted,resid)
ax.set_xlabel('Fitted values')
ax.set_ylabel('Residuals')
ax = plt.subplot(3,1,2)
ax.scatter(fitted,y_train)
ax.set_xlabel('Fitted values')
ax.set_ylabel('Predicted values')
ax = plt.subplot(3, 1,3)
import scipy.stats as stats
stats.probplot(resid,dist = 'norm',plot = ax)
plt.show()
#Scaling data: many types available
# sklearn provides many preprocessing transformations in the sklearn.preprocessing module
#Scaling is crucial for many statistical and machine learning algorithms:
# • k-means and hierarchical clustering
#   Data units & variance play a crucial role in cluster selection
# • Gradient descent optimisation
#   Scaled data allows the weights to update at an equal speed
# • Scaled data allows the regression coefficients to be compared
#   (a quick illustration of this last point follows below)
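# A minimal sketch of the coefficient-comparability point, reusing the Boston
# features already in X_train (preprocessing.scale z-scores each column):
raw_fit = linear_model.LinearRegression().fit(X_train, y_train)
scaled_fit = linear_model.LinearRegression().fit(preprocessing.scale(X_train), y_train)
print(raw_fit.coef_)     # per-unit effects; depend on each feature's units
print(scaled_fit.coef_)  # per-standard-deviation effects; directly comparable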
#########################################################
# Min-max scaling
# DOESN'T change the shape
# DOES change the bounds, mean and sd
# NOT often used in LR
# used more in GDO (gradient Descent Optimisation)
# sklearn.preprocessing module has a MinMaxScaler() for this
##########################################################
np.random.seed(1)
x_n = np.random.normal(2, 5, 500)
x_t = np.random.standard_t(2, 500)
x_ln = np.random.lognormal(1, 1, 500)
df = pd.DataFrame({'Normal': x_n, 'T': x_t, 'Lognormal': x_ln})
df_long = df.melt(var_name='Distribution')
g = sns.FacetGrid(df_long, col='Distribution',sharex=False)
g.map(plt.hist, 'value', bins = 50)
plt.show()
def min_max(x):
    x_min = np.min(x)
    s = (x - x_min)/(np.max(x) - x_min)
    return s
scaled = df.apply(min_max).melt(var_name='Distribution')
scaled['Scaled'] = True
df_long['Scaled'] = False
full_data = pd.concat([df_long, scaled], axis=0)
g = sns.FacetGrid(full_data, col='Distribution'
,row='Scaled'
, sharex=False
, sharey=False)
g.map(plt.hist, 'value', bins = 50)
plt.show()
df.apply([np.mean,np.std])
df.apply(min_max).apply([np.mean,np.std])
# sklearn: MinMaxScaler()
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled[:1]
##########################################################
# z-score standardisation
# DOESN'T change the shape
# popular in linear models
# DOESN'T affect the predictions,
#   but makes the size of the coeffs directly comparable
# sklearn.preprocessing module has a StandardScaler() for this
##########################################################
def z_score(x):
    mean = np.mean(x)
    std = np.std(x, ddof=1)
    return (x - mean)/std
scaled = df.apply(z_score).melt(var_name='Distribution')
scaled['Scaled'] = True
full_data = pd.concat([df_long, scaled], axis=0)
g = sns.FacetGrid(full_data, col='Distribution'
, row ='Scaled'
, sharex=False
,sharey=False)
g.map(plt.hist, 'value', bins=50)
plt.show()
###############################################
# Dividing by two standard deviations
# http://www.stat.columbia.edu/~gelman/research/published/standardizing7.pdf
# One of the downsides of scaling data by z-scoring is that it is not obvious
# how this should be handled in the case of categorical variables.
# Gelman suggests a rescaling that divides numeric variables by two standard
# deviations, while leaving binary-encoded categorical variables untransformed.
# There is nothing built into sklearn for this, so we write our own transformer.
###############################################
from sklearn.base import BaseEstimator, TransformerMixin
class two_sd_scaler(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # learn twice the (sample) standard deviation of each column
        self.stds = 2*np.std(X, axis=0, ddof=1)
        return self
    def transform(self, X, y=None):
        return X/self.stds
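# Example usage of the custom transformer (a sketch; fit_transform is supplied
# by TransformerMixin, which chains fit and transform):
X_train_2sd = two_sd_scaler().fit_transform(X_train)
X_train_2sd[:1]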
# Having preprocessed the data this way, we can now fit a model to it in the
# same way as before.
model2 = linear_model.LinearRegression()
model2.fit(X_train_scaled, y_train)
#When making predictions on new values we also need to make sure to pass
#the new values through the same preprocessing step.
new_value = np.array(X_train.mean(), ndmin = 2)
new_scaled = scaler.transform(new_value)
pred = model2.predict(new_scaled)
pred
##########################
# 2.5 Creating a pipeline
##########################
# For any training data set, and any data for prediction, we will want to apply
# the same scaling transformation and use the same model. We can create a
# sklearn.pipeline.Pipeline() to organise the steps of creating the estimator.
from sklearn.pipeline import Pipeline
model = Pipeline(steps = [('preprocess', preprocessing.StandardScaler())
,('regression', linear_model.LinearRegression())
])
# Having created the Pipeline object we can now fit as before. Calling
# .fit(), however, will first fit the 'preprocess' step and then the
# 'regression' step. When we predict, the new values will also pass through
# both stages of our pipeline.
model.fit(X_train,y_train)
new_values = np.array(X_train.mean(), ndmin = 2)
model.predict(new_values)
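# The fitted steps can be inspected through named_steps (a sketch; the names
# are those given when building the pipeline above):
model.named_steps['preprocess'].mean_  # per-feature means learned by the scaler
model.named_steps['regression'].coef_  # coefficients on the scaled features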
#2.6 Preprocessing categorical variables
# One-hot encoding takes a categorical feature with K categories and creates
# a one-of-K encoding scheme, i.e. a set of binary variables, one per
# category. Consider the toy data:
toy = pd.DataFrame({
    'category': ['a', 'a', 'b', 'c', 'b']
})
enc = preprocessing.OneHotEncoder()
enc.fit(toy)
enc.transform(toy).toarray()
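# The categories learned from the data, one array per input column:
enc.categories_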
#Combining preprocessing steps:
# we can combine the preprocessing steps into a single operation for our
# Pipeline using a sklearn.compose.ColumnTransformer
toy = pd.DataFrame({
    'numeric': [1., 2., 3., 4., 5.],
    'category': ['a', 'a', 'b', 'c', 'b']
})
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
numeric_features = ['numeric']
categorical_features = ['category']
preprocessor = ColumnTransformer(transformers=[('num', StandardScaler()
, numeric_features)
,('cat', OneHotEncoder(), categorical_features)])
# fit the preprocessor to the data
preprocessor.fit(toy)
# the transformer now applies the appropriate preprocessing to each type of variable
preprocessor.transform(toy)
# This preprocessing step could then be a step in the pipeline for a regression
model = Pipeline(steps = [('preprocess', preprocessor)
,('regression', linear_model.LinearRegression())])
#Model Assessment and Feature Selection
#%%#####################################################################
# Accuracy score is only for classification problems.
# For regression problems you can use: R2 Score, MSE (Mean Squared Error), RMSE (Root Mean Squared Error).
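# A minimal illustration of these metrics (made-up demo values, not from the
# datasets above):
from sklearn.metrics import r2_score, mean_squared_error
y_true_demo = np.array([3.0, 2.5, 4.0, 5.1])
y_pred_demo = np.array([2.8, 2.7, 3.6, 5.0])
print(r2_score(y_true_demo, y_pred_demo))                    # R2
print(mean_squared_error(y_true_demo, y_pred_demo))          # MSE
print(np.sqrt(mean_squared_error(y_true_demo, y_pred_demo))) # RMSE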
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
# read data
iris = datasets.load_iris()
# assign X and y
X = iris.data
y = iris.target
# split the data into training and test parts (25% of the data is held out as the test set)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
# preprocess the data
# scaling
scaler = preprocessing.MinMaxScaler()
# fit the scaler to X_train
scaler.fit(X_train)
# apply the scaling/transformation to the data
X_train_scaled = scaler.transform(X_train)
# Choose the required model(s)
#model2 = linear_model.LinearRegression() # would fail below: classification metrics can't handle a mix of multiclass and continuous targets
model2 = DecisionTreeClassifier()
# fit the model to the data for predictions
model2.fit(X_train_scaled, y_train)
# check model performance: the test data must pass through the same scaling
from sklearn.metrics import accuracy_score
X_test_scaled = scaler.transform(X_test)
print(accuracy_score(y_test, model2.predict(X_test_scaled)))
#When making predictions on new values we also need to make sure to pass
#the new values through the same preprocessing step.
new_value = np.array(X_train.mean(axis = 0), ndmin = 2)  # column means, shape (1, 4)
new_scaled = scaler.transform(new_value)
pred = model2.predict(new_scaled)
pred
# or create a pipeline that preprocesses the data and then fits a model
# pipeline 1: PCA (reduce to two components) -> scaling -> DecisionTreeClassifier
# (note: scaling is more commonly applied *before* PCA)
# https://www.geeksforgeeks.org/pipelines-python-and-scikit-learn/
pipe1 = Pipeline([('pca', PCA(n_components = 2))
, ('std', StandardScaler())
, ('decision_tree', DecisionTreeClassifier())]
, verbose = True)
pipe2 = Pipeline(steps = [('preprocess', preprocessing.StandardScaler())
#,('regression', linear_model.LinearRegression())
,('rf', RandomForestClassifier())
])
# fit pipeline to TRAINING data [X_train and y_train]
pipe1.fit(X_train, y_train)
pipe2.fit(X_train, y_train)
# model prediction on TEST data [X_test and y_test]
print(accuracy_score(y_test, pipe1.predict(X_test)))
print(accuracy_score(y_test, pipe2.predict(X_test)))
from sklearn.metrics import classification_report
print(classification_report(y_test, pipe2.predict(X_test)))
# (OneHotEncoder is intended for categorical features; applying it to the
# continuous iris measurements here just demonstrates the API call)
enc = preprocessing.OneHotEncoder()
enc.fit(X_train)
enc.transform(X_train).toarray()
#%%
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
boston = load_boston()
X_train, y_train = pd.DataFrame(boston.data, columns = boston.feature_names), boston.target
model1 = Pipeline(steps = [
('pre', MinMaxScaler()),
('reg', LinearRegression())])
# NOTE: make_scorer defaults to greater_is_better=True; fine for reporting raw
# MSE values, but flip the sign if these scores are used to rank models
score_fn = make_scorer(mean_squared_error)
scores = cross_validate(model1, X_train, y_train
, scoring = score_fn
, cv = 10)
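# Summarise the 10 per-fold mean squared errors (a sketch; cross_validate
# returns a dict of arrays keyed by 'test_score' etc.):
print(scores['test_score'].mean(), scores['test_score'].std())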
from itertools import combinations
def train(X):
    # cross-validate model1 on the given feature subset (default 5-fold CV)
    return cross_validate(model1, X, y_train
                          , scoring = score_fn
                          , return_estimator = True)['test_score']
scores = [train(X_train.loc[:,vars]) for vars in combinations(X_train.columns, 12)]
means = [score.mean() for score in scores]
means
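# Pick the best 12-feature subset: pair each subset with its mean CV score and
# take the one with the smallest mean squared error (a sketch; score_fn above
# is raw MSE, so smaller is better):
subsets = list(combinations(X_train.columns, 12))
best = int(np.argmin(means))
print(subsets[best], means[best])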