renamed hyperparams to gscv

This commit is contained in:
Tanushree Tunstall 2022-03-22 11:08:20 +00:00
parent a82358dbb4
commit ad5ebad7f8
31 changed files with 4433 additions and 0 deletions

earlier_versions/p_jr_d1.py Normal file

@@ -0,0 +1,405 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 17 14:52:55 2022
@author: tanu
"""
from sklearn.datasets import load_boston
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
print(np.__version__)
print(pd.__version__)
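# note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this script assumes an older scikit-learn release where it is available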
boston = load_boston()
dir(boston)
#['DESCR', 'data', 'data_module', 'feature_names', 'filename', 'target']
X, y = boston.data, boston.target
df = pd.DataFrame(X, columns = boston.feature_names)
df['MEDV'] = y
sns.scatterplot(x = 'CRIM', y = 'MEDV', data = df)
plt.show()
#Model fitting
#To fit a model using just a single predictor we first extract the training variables.
X_train = df['CRIM']
y_train = y
# Unfortunately, sklearn's various model fitting functions typically expect a
# two dimensional array for the covariates. Since we have extracted only
# a single feature here it is only one dimensional. We need to reshape the
# X_train values to be the appropriate shape.
# This is not necessary if using more than a single feature.
if len(X_train.values.shape) == 1:
    X_train = X_train.values.reshape(-1, 1)
# Create a LinearRegression object: this object is one of sklearn's broader
# class of estimator objects.
model = linear_model.LinearRegression()
model.fit(X_train, y_train)
# We can make predictions from our fitted model with the .predict() method.
new_value = np.array(4.09, ndmin = 2)
model.predict(new_value)
multiple_values = np.array([1, 2, 3], ndmin = 2).T
model.predict(multiple_values)
#Fitted values
#The fitted values of a model are the predicted ŷ for the observations X.
#To get the model fitted values we could just predict from the model using
#the values used to train it.
fitted = model.predict(X_train)
ax = sns.scatterplot(x = 'CRIM', y = 'MEDV', data = df)
sns.lineplot(x = df['CRIM'], y = fitted, ax = ax)
plt.show()
# Interpreting the coefficients
# The coefficients of the fitted model are kept in the model.coef_ attribute.
# This gives us the expected change in y for a unit change in X .
model.coef_
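# For readability we can pair the coefficient with its feature name and also
# inspect the intercept (a small illustrative addition to the original script):
print('intercept:', model.intercept_)
print('coefficients:', dict(zip(['CRIM'], model.coef_)))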
#2.3 Multiple linear regression
X_train = df.iloc[:,:3]
grid = sns.PairGrid(data=pd.concat([X_train,pd.Series(y_train,name="MEDV")],axis = 1))
grid.map_offdiag(sns.scatterplot)
grid.map_diag(sns.histplot) # sns.distplot is deprecated; histplot is the current equivalent
plt.show()
model.fit(X_train, y_train)
new_values = np.array(X_train.mean(), ndmin = 2)
model.predict(new_values)
#Residuals
#In classical statistics, one of our assumptions is that the residuals are
#normally distributed. A small RSS implies the fitted model is closer to
#the observations.
fitted = model.predict(X_train)
resid = y_train - fitted
# Standardise to remove effect of measurement scale
resid = (resid - np.mean(resid))/np.std(resid,ddof = 1)
plt.figure()
for i in range(3):
    xvar = X_train.iloc[:,i]
    ax = plt.subplot(3, 1, i + 1)
    ax.scatter(xvar, resid)
    ax.set_xlabel(boston.feature_names[i])
    ax.set_ylabel("Residuals")
    ax.hlines([-2, 0, 2], np.min(xvar), np.max(xvar))
plt.show()
plt.figure()
ax = plt.subplot(3, 1, 1)
ax.scatter(fitted,resid)
ax.set_xlabel('Fitted values')
ax.set_ylabel('Residuals')
ax = plt.subplot(3,1,2)
ax.scatter(fitted,y_train)
ax.set_xlabel('Fitted values')
ax.set_ylabel('Predicted values')
ax = plt.subplot(3, 1,3)
import scipy.stats as stats
stats.probplot(resid,dist = 'norm',plot = ax)
plt.show()
#Scaling data: many types available
# sklearn comes with many preprocessing transformations in the sklearn.preprocessing module
#Scaling is crucial for many statistical and machine learning algorithms
# • k-means and hierarchical clustering
# Data units & variance play crucial role in cluster selection
# • Using gradient descent optimization
# Scaled data allows the weights to update at an equal speed
# • Scaled data allows the regression coefficients to be compared
#########################################################
# Min-max scaling
# DOESN'T change the shape
# DOES change the bounds, mean and sd
# NOT often used in LR
# used more in GDO (gradient Descent Optimisation)
# sklearn.preprocessing module has a MinMaxScaler() for this
##########################################################
np.random.seed(1)
x_n = np.random.normal(2, 5, 500)
x_t = np.random.standard_t(2, 500)
x_ln = np.random.lognormal(1, 1, 500)
df = pd.DataFrame({ 'Normal': x_n, 'T': x_t, 'Lognormal': x_ln
})
df_long = df.melt(var_name='Distribution')
g = sns.FacetGrid(df_long, col='Distribution',sharex=False)
g.map(plt.hist, 'value', bins = 50)
plt.show()
def min_max(x):
    x_min = np.min(x)
    s = (x - x_min)/(np.max(x) - x_min)
    return s
scaled = df.apply(min_max).melt(var_name='Distribution')
scaled['Scaled'] = True
df_long['Scaled'] = False
full_data = pd.concat([df_long, scaled], axis=0)
g = sns.FacetGrid(full_data, col='Distribution'
,row='Scaled'
, sharex=False
, sharey=False)
g.map(plt.hist, 'value', bins = 50)
plt.show()
df.apply([np.mean,np.std])
df.apply(min_max).apply([np.mean,np.std])
# sklearn: MinMaxScaler()
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled[:1]
##########################################################
# z-score standardisation
# DOESN'T change the shape
# popular in linear models
# DOESN'T affect the predictions
# but makes the sizes of the coefficients directly comparable
# sklearn.preprocessing module has a StandardScaler() for this
##########################################################
def z_score(x):
    mean = np.mean(x)
    std = np.std(x, ddof=1)
    return (x - mean)/std
scaled = df.apply(z_score).melt(var_name='Distribution')
scaled['Scaled'] = True
full_data = pd.concat([df_long, scaled], axis=0)
g = sns.FacetGrid(full_data, col='Distribution'
, row ='Scaled'
, sharex=False
,sharey=False)
g.map(plt.hist, 'value', bins=50)
plt.show()
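# For comparison, sklearn provides StandardScaler() for z-score scaling.
# A minimal sketch (note: StandardScaler uses the population standard
# deviation, ddof = 0, so results differ very slightly from z_score above;
# the variable names below are illustrative, not part of the original script):
std_scaler = preprocessing.StandardScaler()
std_scaler.fit(X_train)
X_train_std = std_scaler.transform(X_train)
X_train_std[:1]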
###############################################
# Dividing by two standard deviations
# http://www.stat.columbia.edu/~gelman/research/published/standardizing7.pdf
# One of the downsides of scaling data by z-scoring is that it is not obvious
# how this should be handled in the case of categorical variables.
# Gelman (link above) suggests a rescaling that divides numeric variables by
# two standard deviations, whilst leaving binary encoded categorical
# variables untransformed.
# There is nothing in sklearn for this, so we write a small transformer below.
###############################################
from sklearn.base import BaseEstimator, TransformerMixin
class two_sd_scaler(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.stds = 2*np.std(X, axis=0, ddof=1)
        return self
    def transform(self, X, y=None):
        return X/self.stds
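# A minimal usage sketch of the custom transformer above (variable names are
# illustrative): it follows the usual sklearn fit/transform convention, so it
# could also be used as a step inside a Pipeline.
two_sd = two_sd_scaler()
two_sd.fit(X_train)
X_train_2sd = two_sd.transform(X_train)
X_train_2sd[:1]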
# Having preprocessed the data this way we can now fit a model to it in the
# same way as before.
model2 = linear_model.LinearRegression()
model2.fit(X_train_scaled, y_train)
#When making predictions on new values we also need to make sure to pass
#the new values through the same preprocessing step.
new_value = np.array(X_train.mean(), ndmin = 2)
new_scaled = scaler.transform(new_value)
pred = model2.predict(new_scaled)
pred
##########################
# 2.5 Creating a pipeline
##########################
# For any training data set and any data for prediction we will want to apply
# the same scaling transformation and use the same model. We could create
# a sklearn.pipeline.Pipeline() to organise the steps for creating the
# estimator.
from sklearn.pipeline import Pipeline
model = Pipeline(steps = [('preprocess', preprocessing.StandardScaler())
,('regression', linear_model.LinearRegression())
])
# Having created the Pipeline object we can now fit as before. Calling
# .fit() now however, will first fit the 'preprocess' step and then the
# 'regression' step. When we predict, the new values will also pass through
# both stages of our pipeline.
model.fit(X_train,y_train)
new_values = np.array(X_train.mean(), ndmin = 2)
model.predict(new_values)
#from sklearn.metrics import accuracy_score
#print(accuracy_score(y_test, model.predict(X_test)))
#2.6 Preprocessing categorical variables
# One hot encoding: will take a categorical feature with K categories and
# create a one-of-K encoding scheme, i.e. a set of binary variables, one for
# each category. Consider the toy data:
toy = pd.DataFrame({
'category':['a', 'a', 'b', 'c', 'b']
})
enc = preprocessing.OneHotEncoder()
enc.fit(toy)
enc.transform(toy).toarray()
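# The fitted encoder keeps the mapping from categories to output columns in
# its categories_ attribute, which helps interpret the encoded array:
enc.categories_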
# Combining preprocessing steps: we can combine the preprocessing steps into
# a single operation for our Pipeline using a sklearn.compose.ColumnTransformer
toy = pd.DataFrame({
'numeric': [1., 2., 3., 4., 5.],
'category': ['a', 'a', 'b', 'c', 'b']
})
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
numeric_features = ['numeric']
categorical_features = ['category']
preprocessor = ColumnTransformer(transformers=[('num', StandardScaler()
, numeric_features)
,('cat', OneHotEncoder(), categorical_features)])
# fit the preprocessor to the toy data; the fitted transformer will now give
# the appropriate pre-processing for each type of variable
preprocessor.fit(toy)
preprocessor.transform(toy)
# This preprocessing step could then be a step in the pipeline for a regression
model = Pipeline(steps = [('preprocess', preprocessor)
                          ,('regression', linear_model.LinearRegression())])
#Model Assessment and Feature Selection
#%%#####################################################################
# Accuracy score is only for classification problems.
# For regression problems you can use: R2 Score, MSE (Mean Squared Error), RMSE (Root Mean Squared Error).
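# A minimal sketch of those regression metrics, reusing the Boston X_train and
# y_train from above (train-set scores only; reg_pipe and y_fitted are
# illustrative names, not part of the original script):
from sklearn.metrics import r2_score, mean_squared_error
reg_pipe = Pipeline(steps = [('preprocess', preprocessing.StandardScaler())
                             ,('regression', linear_model.LinearRegression())])
reg_pipe.fit(X_train, y_train)
y_fitted = reg_pipe.predict(X_train)
print('R2  :', r2_score(y_train, y_fitted))
print('MSE :', mean_squared_error(y_train, y_fitted))
print('RMSE:', np.sqrt(mean_squared_error(y_train, y_fitted)))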
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
# read data
iris = datasets.load_iris()
# assign X and y
X = iris.data
y = iris.target
# split the data into training and test parts (25% of the data is held out as the test set)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
# preprocess the data
# scaling
scaler = preprocessing.MinMaxScaler()
# fit X_train to scaling
scaler.fit(X_train)
# Apply the scaling/transformation to the data
X_train_scaled = scaler.transform(X_train)
# Choose the required model/s
model2 = linear_model.LinearRegression() # Classification metrics can't handle a mix of multiclass and continuous targets
model2 = DecisionTreeClassifier()
# fit the model to the data for predictions
model2.fit(X_train_scaled, y_train)
# check model performance (apply the same scaling to the test data first)
from sklearn.metrics import accuracy_score
X_test_scaled = scaler.transform(X_test)
print(accuracy_score(y_test, model2.predict(X_test_scaled)))
#When making predictions on new values we also need to make sure to pass
#the new values through the same preprocessing step.
new_value = np.array(X_train.mean(axis = 0), ndmin = 2)
new_scaled = scaler.transform(new_value)
pred = model2.predict(new_scaled)
pred
# or Create a pipeline that standardizes the data then creates a model
# make a pipeline
# PCA(Dimension reduction to two) -> Scaling the data -> DecisionTreeClassification
#https://www.geeksforgeeks.org/pipelines-python-and-scikit-learn/
pipe1 = Pipeline([('pca', PCA(n_components = 2))
, ('std', StandardScaler())
, ('decision_tree', DecisionTreeClassifier())]
, verbose = True)
pipe2 = Pipeline(steps = [('preprocess', preprocessing.StandardScaler())
#,('regression', linear_model.LinearRegression())
,('rf', RandomForestClassifier())
])
# fit pipeline to TRAINING data [X_train and y_train]
pipe1.fit(X_train, y_train)
pipe2.fit(X_train, y_train)
# model prediction on TEST data [X_test and y_test]
print(accuracy_score(y_test, pipe1.predict(X_test)))
print(accuracy_score(y_test, pipe2.predict(X_test)))
from sklearn.metrics import classification_report
print(classification_report(y_test, pipe2.predict(X_test)))
# (illustration of the OneHotEncoder API only: applied to continuous features
# it treats every distinct value as its own category)
enc = preprocessing.OneHotEncoder()
enc.fit(X_train)
enc.transform(X_train).toarray()
#%%
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
boston = load_boston()
X_train, y_train = pd.DataFrame(boston.data, columns = boston.feature_names), boston.target
model1 = Pipeline(steps = [
('pre', MinMaxScaler()),
('reg', LinearRegression())])
score_fn = make_scorer(mean_squared_error)
scores = cross_validate(model1, X_train, y_train
, scoring = score_fn
, cv = 10)
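# cross_validate returns a dict; the per-fold MSE values are under 'test_score'
# (with this scorer lower is better). A quick summary:
print(scores['test_score'].mean(), scores['test_score'].std())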
from itertools import combinations
def train(X):
    return cross_validate(model1, X, y_train
                          , scoring = score_fn
                          #, return_train_score = False
                          , return_estimator = True)['test_score']
scores = [train(X_train.loc[:,vars]) for vars in combinations(X_train.columns, 12)]
means = [score.mean() for score in scores]
means
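# A possible follow-up (illustrative, not in the original script): since the
# scorer reports raw MSE, the best 12-feature subset is the one with the
# smallest mean cross-validated error.
subsets = list(combinations(X_train.columns, 12))
best = int(np.argmin(means))
print(subsets[best], means[best])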