renamed hyperparams to gscv
This commit is contained in:
parent
a82358dbb4
commit
ad5ebad7f8
31 changed files with 4433 additions and 0 deletions
405 earlier_versions/p_jr_d1.py Normal file
@@ -0,0 +1,405 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 17 14:52:55 2022

@author: tanu
"""
from sklearn.datasets import load_boston
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

print(np.__version__)
print(pd.__version__)

boston = load_boston()
dir(boston)
# ['DESCR', 'data', 'data_module', 'feature_names', 'filename', 'target']
X, y = boston.data, boston.target
df = pd.DataFrame(X, columns = boston.feature_names)
df['MEDV'] = y

sns.scatterplot(x = 'CRIM', y = 'MEDV', data = df)
plt.show()

# Model fitting
# To fit a model using just a single predictor we first extract the training variables.
X_train = df['CRIM']
y_train = y
# Unfortunately, sklearn's various model fitting functions typically expect a
# two-dimensional array for the covariates. Since we have extracted only
# a single feature here it is only one-dimensional. We need to reshape the
# X_train values to the appropriate shape.
# This is not necessary if using more than a single feature.

if len(X_train.values.shape) == 1:
    X_train = X_train.values.reshape(-1, 1)

# Create a LinearRegression object: this object belongs to a broader class of
# estimator objects.
model = linear_model.LinearRegression()
model.fit(X_train, y_train)

# We can make predictions from our fitted model with the .predict() method.
new_value = np.array(4.09, ndmin = 2)
model.predict(new_value)
multiple_values = np.array([1, 2, 3], ndmin = 2).T
model.predict(multiple_values)

# Fitted values
# The fitted values of a model are the predicted ŷ for the observations X.
# To get the model fitted values we can simply predict from the model using
# the values used to train it.
fitted = model.predict(X_train)
ax = sns.scatterplot(x = 'CRIM', y = 'MEDV', data = df)
sns.lineplot(x = df['CRIM'], y = fitted, ax = ax)  # keyword args: newer seaborn no longer accepts positional x/y
plt.show()

# Interpreting the coefficients
# The coefficients of the fitted model are kept in the model.coef_ attribute.
# This gives us the expected change in y for a unit change in X.
model.coef_


# 2.3 Multiple linear regression
X_train = df.iloc[:, :3]
grid = sns.PairGrid(data = pd.concat([X_train, pd.Series(y_train, name = "MEDV")], axis = 1))
grid.map_offdiag(sns.scatterplot)
grid.map_diag(sns.histplot)  # distplot is deprecated; histplot is the current equivalent
plt.show()
model.fit(X_train, y_train)
new_values = np.array(X_train.mean(), ndmin = 2)
model.predict(new_values)
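
# A minimal sketch (an addition to the notes above) that makes "interpreting
# the coefficients" concrete for this multiple-regression fit: pair each
# coefficient with its feature name. It assumes model has just been fit on the
# first three Boston columns, as above.
for name, coef in zip(boston.feature_names[:3], model.coef_):
    print(f"{name}: expected change in MEDV per unit change = {coef:.3f}")
print("intercept:", model.intercept_)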
# Residuals
# In classical statistics, one of our assumptions is that the residuals are
# normally distributed. A small RSS implies the fitted model is closer to
# the observations.

fitted = model.predict(X_train)
resid = y_train - fitted
# Standardise to remove the effect of measurement scale
resid = (resid - np.mean(resid))/np.std(resid, ddof = 1)
plt.figure()
for i in range(3):
    xvar = X_train.iloc[:, i]
    ax = plt.subplot(3, 1, i + 1)
    ax.scatter(xvar, resid)
    ax.set_xlabel(boston.feature_names[i])
    ax.set_ylabel("Residuals")
    ax.hlines([-2, 0, 2], np.min(xvar), np.max(xvar))

plt.show()
plt.figure()
ax = plt.subplot(3, 1, 1)
ax.scatter(fitted, resid)
ax.set_xlabel('Fitted values')
ax.set_ylabel('Residuals')

ax = plt.subplot(3, 1, 2)
ax.scatter(fitted, y_train)
ax.set_xlabel('Fitted values')
ax.set_ylabel('Observed values')

ax = plt.subplot(3, 1, 3)
import scipy.stats as stats
stats.probplot(resid, dist = 'norm', plot = ax)
plt.show()

# Scaling data: many types available
# sklearn comes with many preprocessing transformations in the sklearn.preprocessing module
# Scaling is crucial for many statistical and machine learning algorithms
# • k-means and hierarchical clustering
#   – data units & variance play a crucial role in cluster selection
# • gradient descent optimisation
#   – scaled data allows the weights to update at an equal speed
# • scaled data allows the regression coefficients to be compared (see the short sketch below)
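
# A small sketch of that last point (an addition, reusing the three Boston
# columns already in X_train): standardising the features puts the fitted
# coefficients on a comparable scale.
from sklearn.preprocessing import StandardScaler
lr_raw = linear_model.LinearRegression().fit(X_train, y_train)
lr_std = linear_model.LinearRegression().fit(StandardScaler().fit_transform(X_train), y_train)
print("raw coefficients   :", lr_raw.coef_)
print("scaled coefficients:", lr_std.coef_)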

#########################################################
# Min-max scaling
# DOESN'T change the shape
# DOES change the bounds, mean and sd
# NOT often used in LR
# used more in GDO (gradient descent optimisation)

# the sklearn.preprocessing module has a MinMaxScaler() for this
##########################################################

np.random.seed(1)
x_n = np.random.normal(2, 5, 500)
x_t = np.random.standard_t(2, 500)
x_ln = np.random.lognormal(1, 1, 500)
df = pd.DataFrame({'Normal': x_n, 'T': x_t, 'Lognormal': x_ln})

df_long = df.melt(var_name = 'Distribution')
g = sns.FacetGrid(df_long, col = 'Distribution', sharex = False)
g.map(plt.hist, 'value', bins = 50)
plt.show()

def min_max(x):
    x_min = np.min(x)  # avoid shadowing the built-in min()
    s = (x - x_min)/(np.max(x) - x_min)
    return (s)

scaled = df.apply(min_max).melt(var_name = 'Distribution')

scaled['Scaled'] = True
df_long['Scaled'] = False
full_data = pd.concat([df_long, scaled], axis = 0)

g = sns.FacetGrid(full_data, col = 'Distribution'
                  , row = 'Scaled'
                  , sharex = False
                  , sharey = False)

g.map(plt.hist, 'value', bins = 50)

plt.show()

df.apply([np.mean, np.std])
df.apply(min_max).apply([np.mean, np.std])

# sklearn: MinMaxScaler()

scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_train_scaled[:1]

##########################################################
# z-score standardisation
# DOESN'T change the shape
# popular in linear models
# DOESN'T affect the predictions
# but makes the size of the coefficients directly comparable

# the sklearn.preprocessing module has a StandardScaler() for this
##########################################################

def z_score(x):
    mean = np.mean(x)
    std = np.std(x, ddof = 1)
    return (x - mean)/std

scaled = df.apply(z_score).melt(var_name = 'Distribution')
scaled['Scaled'] = True
full_data = pd.concat([df_long, scaled], axis = 0)
g = sns.FacetGrid(full_data, col = 'Distribution'
                  , row = 'Scaled'
                  , sharex = False
                  , sharey = False)
g.map(plt.hist, 'value', bins = 50)

###############################################
# Dividing by two standard deviations
# http://www.stat.columbia.edu/~gelman/research/published/standardizing7.pdf
# One downside of scaling data by z-scoring is that it is not obvious
# how this should be handled in the case of categorical variables.

# Gelman suggests a rescaling that divides numeric variables by two standard
# deviations, whilst leaving binary encoded categorical variables untransformed.
# There is nothing built into sklearn for this.
###############################################
from sklearn.base import BaseEstimator, TransformerMixin

class two_sd_scaler(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.stds = 2*np.std(X, axis = 0, ddof = 1)
        return self
    def transform(self, X, y=None):
        return X/self.stds
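
# A minimal usage sketch (an addition), assuming X_train still holds the three
# numeric Boston columns from above. Note the class scales every column it is
# given; with a mixed numeric/binary frame it could be routed to the numeric
# columns only (e.g. via a ColumnTransformer) so that binary columns stay
# untransformed, as the Gelman recommendation suggests.
scaler2sd = two_sd_scaler()
X_train_2sd = scaler2sd.fit_transform(X_train)
X_train_2sd[:1]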

# Having preprocessed the data this way we can now fit a model to it in the
# same way as before.
model2 = linear_model.LinearRegression()
model2.fit(X_train_scaled, y_train)
# When making predictions on new values we also need to make sure to pass
# the new values through the same preprocessing step.

new_value = np.array(X_train.mean(), ndmin = 2)
new_scaled = scaler.transform(new_value)
pred = model2.predict(new_scaled)
pred

##########################
# 2.5 Creating a pipeline
##########################
# For any training data set and any data for prediction we will want to apply
# the same scaling transformation and use the same model. We can create a
# sklearn.pipeline.Pipeline() to organise the steps that build the estimator.

from sklearn.pipeline import Pipeline
model = Pipeline(steps = [('preprocess', preprocessing.StandardScaler())
                          , ('regression', linear_model.LinearRegression())
                          ])

# Having created the Pipeline object we can now fit as before. Calling
# .fit() now, however, will first fit the 'preprocess' step and then the
# 'regression' step. When we predict, the new values will also pass through
# both stages of our pipeline.
model.fit(X_train, y_train)
new_values = np.array(X_train.mean(), ndmin = 2)
model.predict(new_values)
# from sklearn.metrics import accuracy_score
# print(accuracy_score(y_test, model.predict(X_test)))

# 2.6 Preprocessing categorical variables
# One hot encoding: takes a categorical feature with K categories and
# creates a 'one of K' encoding scheme, i.e. a set of binary variables, one
# per category. Consider the toy data
toy = pd.DataFrame({
    'category': ['a', 'a', 'b', 'c', 'b']
})
enc = preprocessing.OneHotEncoder()
enc.fit(toy)
enc.transform(toy).toarray()
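
# A small follow-up sketch (an addition), assuming the fitted encoder above:
# the learned categories, and hence the column order of the one-hot output,
# can be inspected directly, which helps when reading the transformed array.
enc.categories_               # the categories found per input column
enc.get_feature_names_out()   # the corresponding output column names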

# Combining preprocessing steps: combine the preprocessing steps into a single
# operation for our Pipeline using a sklearn.compose.ColumnTransformer
toy = pd.DataFrame({
    'numeric': [1., 2., 3., 4., 5.],
    'category': ['a', 'a', 'b', 'c', 'b']
})

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
numeric_features = ['numeric']
categorical_features = ['category']
preprocessor = ColumnTransformer(transformers = [('num', StandardScaler(), numeric_features)
                                                 , ('cat', OneHotEncoder(), categorical_features)])

# fit the preprocessor to the data
preprocessor.fit(toy)

# the transformer will now give the appropriate pre-processing for the different types of variables.
preprocessor.transform(toy)

# This preprocessing step could then be a step in the pipeline for a regression
model = Pipeline(steps = [('preprocess', preprocessor)
                          , ('regression', linear_model.LinearRegression())])

# Model Assessment and Feature Selection

#%%#####################################################################

# Accuracy score is only for classification problems.
# For regression problems you can use: R2 Score, MSE (Mean Squared Error), RMSE (Root Mean Squared Error).
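
# A minimal sketch of those regression metrics (an addition), reusing the
# Boston linear model (model2) and the min-max scaled training data fitted
# earlier in this script; it assumes the cells above have been run in order.
from sklearn.metrics import r2_score, mean_squared_error
y_pred_boston = model2.predict(X_train_scaled)
print("R2  :", r2_score(y_train, y_pred_boston))
print("MSE :", mean_squared_error(y_train, y_pred_boston))
print("RMSE:", np.sqrt(mean_squared_error(y_train, y_pred_boston)))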

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn import preprocessing

# read data
iris = datasets.load_iris()

# assign X and y
X = iris.data
y = iris.target

# split the data into training and test parts (25 % of the data is held out as the test set)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

# preprocess the data

# scaling
scaler = preprocessing.MinMaxScaler()
# fit the scaler on X_train
scaler.fit(X_train)
# apply the scaling/transformation to the data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)   # the test data must go through the same transformation

# Choose the required model/s
model2 = linear_model.LinearRegression()   # would fail below: classification metrics can't handle a mix of multiclass and continuous targets
model2 = DecisionTreeClassifier()

# fit the model to the data for predictions
model2.fit(X_train_scaled, y_train)
# check model performance
print(accuracy_score(y_test, model2.predict(X_test_scaled)))


# When making predictions on new values we also need to make sure to pass
# the new values through the same preprocessing step.
new_value = np.array(X_train.mean(axis = 0), ndmin = 2)   # axis = 0: per-feature means, so the shape matches the scaler
new_scaled = scaler.transform(new_value)
pred = model2.predict(new_scaled)
pred

# or create a pipeline that standardises the data and then fits a model

# make a pipeline
# PCA (dimension reduction to two) -> scaling the data -> DecisionTreeClassifier
# https://www.geeksforgeeks.org/pipelines-python-and-scikit-learn/
pipe1 = Pipeline([('pca', PCA(n_components = 2))
                  , ('std', StandardScaler())
                  , ('decision_tree', DecisionTreeClassifier())]
                 , verbose = True)

pipe2 = Pipeline(steps = [('preprocess', preprocessing.StandardScaler())
                          #, ('regression', linear_model.LinearRegression())
                          , ('rf', RandomForestClassifier())
                          ])

# fit the pipelines to the TRAINING data [X_train and y_train]
pipe1.fit(X_train, y_train)
pipe2.fit(X_train, y_train)

# model prediction on the TEST data [X_test and y_test]
print(accuracy_score(y_test, pipe1.predict(X_test)))
print(accuracy_score(y_test, pipe2.predict(X_test)))
print(classification_report(y_test, pipe2.predict(X_test)))

enc = preprocessing.OneHotEncoder()
enc.fit(X_train)
enc.transform(X_train).toarray()
#%%
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

boston = load_boston()

X_train, y_train = pd.DataFrame(boston.data, columns = boston.feature_names), boston.target

model1 = Pipeline(steps = [
    ('pre', MinMaxScaler()),
    ('reg', LinearRegression())])

score_fn = make_scorer(mean_squared_error)
scores = cross_validate(model1, X_train, y_train
                        , scoring = score_fn
                        , cv = 10)
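
# Side note (a small added sketch): make_scorer(mean_squared_error) reports raw
# MSE, so lower test_score values are better here. sklearn's built-in scoring
# string 'neg_mean_squared_error' gives the same information with the sign
# flipped, keeping the usual "greater is better" convention.
scores_neg = cross_validate(model1, X_train, y_train
                            , scoring = 'neg_mean_squared_error'
                            , cv = 10)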

from itertools import combinations

def train(X):
    return cross_validate(model1, X, y_train
                          , scoring = score_fn
                          #, return_train_score = False
                          , return_estimator = True)['test_score']

# every combination of 12 of the 13 features, i.e. dropping one feature at a time
scores = [train(X_train.loc[:, cols]) for cols in combinations(X_train.columns, 12)]
means = [score.mean() for score in scores]
means
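
# A short follow-up sketch (an addition), assuming the cells above: since the
# scorer is raw MSE, the most promising 12-feature subset is the one with the
# smallest mean cross-validated score.
best_idx = int(np.argmin(means))
best_subset = list(combinations(X_train.columns, 12))[best_idx]
print(best_idx, best_subset)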