renamed hyperparams to gscv

This commit is contained in:
Tanushree Tunstall 2022-03-22 11:08:20 +00:00
parent a82358dbb4
commit ad5ebad7f8
31 changed files with 4433 additions and 0 deletions

earlier_versions/p_jr_d1.py Normal file

@@ -0,0 +1,405 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 17 14:52:55 2022
@author: tanu
"""
from sklearn.datasets import load_boston
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
print(np.__version__)
print(pd.__version__)
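# note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this script assumes an older scikit-learn release where it is available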
boston = load_boston()
dir(boston)
#['DESCR', 'data', 'data_module', 'feature_names', 'filename', 'target']
X, y = boston.data, boston.target
df = pd.DataFrame(X, columns = boston.feature_names)
df['MEDV'] = y
sns.scatterplot(x = 'CRIM', y = 'MEDV', data = df)
plt.show()
#Model fitting
#To fit a model using just a single predictor we first extract the training variables.
X_train = df['CRIM']
y_train = y
# Unfortunately, sklearn's various model fitting functions typically expect a
# two dimensional array for the covariates. Since we have extracted only
# a single feature here it is only one dimensional. We need to reshape the
# X_train values to be the appropriate shape.
# This is not necessary if using more than a single feature.
if len(X_train.values.shape) == 1:
    X_train = X_train.values.reshape(-1, 1)
# Create a LinearRegression object: this object is one of sklearn's broader
# class of estimator objects.
model = linear_model.LinearRegression()
model.fit(X_train, y_train)
# We can make predictions from our fitted model with the .predict() method.
new_value = np.array(4.09, ndmin = 2)
model.predict(new_value)
multiple_values = np.array([1, 2, 3], ndmin = 2).T
model.predict(multiple_values)
#Fitted values
#The fitted values of a model are the predicted ŷ for the observations X.
#To get the model fitted values we could just predict from the model using
#the values used to train it.
fitted = model.predict(X_train)
ax = sns.scatterplot(x = 'CRIM', y = 'MEDV', data = df)
sns.lineplot(x = df['CRIM'], y = fitted, ax = ax)
plt.show()
# Interpreting the coefficients
# The coefficients of the fitted model are kept in the model.coef_ attribute.
# This gives us the expected change in y for a unit change in X .
model.coef_
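# For readability we can pair the coefficient with its feature name and also
# inspect the intercept (a small illustrative addition to the original script):
print('intercept:', model.intercept_)
print('coefficients:', dict(zip(['CRIM'], model.coef_)))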
#2.3 Multiple linear regression
X_train = df.iloc[:,:3]
grid = sns.PairGrid(data=pd.concat([X_train,pd.Series(y_train,name="MEDV")],axis = 1))
grid.map_offdiag(sns.scatterplot)
grid.map_diag(sns.histplot) # sns.distplot is deprecated; histplot is the current equivalent
plt.show()
model.fit(X_train, y_train)
new_values = np.array(X_train.mean(), ndmin = 2)
model.predict(new_values)
#Residuals
#In classical statistics, one of our assumptions is that the residuals are
#normally distributed. A small RSS implies the fitted model is closer to
#the observations.
fitted = model.predict(X_train)
resid = y_train - fitted
# Standardise to remove effect of measurement scale
resid = (resid - np.mean(resid))/np.std(resid,ddof = 1)
plt.figure()
for i in range(3):
    xvar = X_train.iloc[:,i]
    ax = plt.subplot(3, 1, i + 1)
    ax.scatter(xvar, resid)
    ax.set_xlabel(boston.feature_names[i])
    ax.set_ylabel("Residuals")
    ax.hlines([-2, 0, 2], np.min(xvar), np.max(xvar))
plt.show()
plt.figure()
ax = plt.subplot(3, 1, 1)
ax.scatter(fitted,resid)
ax.set_xlabel('Fitted values')
ax.set_ylabel('Residuals')
ax = plt.subplot(3,1,2)
ax.scatter(fitted,y_train)
ax.set_xlabel('Fitted values')
ax.set_ylabel('Predicted values')
ax = plt.subplot(3, 1,3)
import scipy.stats as stats
stats.probplot(resid,dist = 'norm',plot = ax)
plt.show()
#Scaling data: many types available
# sklearn comes with many preprocessing transformations in the sklearn.preprocessing module
#Scaling is crucial for many statistical and machine learning algorithms
# • k-means and hierarchical clustering
# Data units & variance play crucial role in cluster selection
# • Using gradient descent optimization
# Scaled data allows the weights to update at an equal speed
# • Scaled data allows the regression coefficients to be compared
#########################################################
# Min-max scaling
# DOESN'T change the shape
# DOES change the bounds, mean and sd
# NOT often used in LR
# used more in GDO (gradient Descent Optimisation)
# sklearn.preprocessing module has a MinMaxScaler() for this
##########################################################
np.random.seed(1)
x_n = np.random.normal(2, 5, 500)
x_t = np.random.standard_t(2, 500)
x_ln = np.random.lognormal(1, 1, 500)
df = pd.DataFrame({ 'Normal': x_n, 'T': x_t, 'Lognormal': x_ln
})
df_long = df.melt(var_name='Distribution')
g = sns.FacetGrid(df_long, col='Distribution',sharex=False)
g.map(plt.hist, 'value', bins = 50)
plt.show()
def min_max(x):
    x_min = np.min(x)
    s = (x - x_min)/(np.max(x) - x_min)
    return s
scaled = df.apply(min_max).melt(var_name='Distribution')
scaled['Scaled'] = True
df_long['Scaled'] = False
full_data = pd.concat([df_long, scaled], axis=0)
g = sns.FacetGrid(full_data, col='Distribution'
,row='Scaled'
, sharex=False
, sharey=False)
g.map(plt.hist, 'value', bins = 50)
plt.show()
df.apply([np.mean,np.std])
df.apply(min_max).apply([np.mean,np.std])
# sklearn: MinMaxScaler()
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled[:1]
##########################################################
# z-score standardisation
# DOESN'T change the shape
# popular in linear models
# DOESN'T affect the predictions
# but makes the sizes of the coefficients directly comparable
# sklearn.preprocessing module has a StandardScaler() for this
##########################################################
def z_score(x):
    mean = np.mean(x)
    std = np.std(x, ddof=1)
    return (x - mean)/std
scaled = df.apply(z_score).melt(var_name='Distribution')
scaled['Scaled'] = True
full_data = pd.concat([df_long, scaled], axis=0)
g = sns.FacetGrid(full_data, col='Distribution'
, row ='Scaled'
, sharex=False
,sharey=False)
g.map(plt.hist, 'value', bins=50)
plt.show()
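# For comparison, sklearn provides StandardScaler() for z-score scaling.
# A minimal sketch (note: StandardScaler uses the population standard
# deviation, ddof = 0, so results differ very slightly from z_score above;
# the variable names below are illustrative, not part of the original script):
std_scaler = preprocessing.StandardScaler()
std_scaler.fit(X_train)
X_train_std = std_scaler.transform(X_train)
X_train_std[:1]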
###############################################
# Dividing by two standard deviations
# http://www.stat.columbia.edu/~gelman/research/published/standardizing7.pdf
# One of the downsides of scaling data by z-scoring is that it is not obvious
# how this should be handled in the case of categorical variables.
# Gelman (link above) suggests a rescaling that divides numeric variables by
# two standard deviations, whilst leaving binary encoded categorical
# variables untransformed.
# There is nothing in sklearn for this, so we write a small transformer below.
###############################################
from sklearn.base import BaseEstimator, TransformerMixin
class two_sd_scaler(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.stds = 2*np.std(X, axis=0, ddof=1)
        return self
    def transform(self, X, y=None):
        return X/self.stds
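# A minimal usage sketch of the custom transformer above (variable names are
# illustrative): it follows the usual sklearn fit/transform convention, so it
# could also be used as a step inside a Pipeline.
two_sd = two_sd_scaler()
two_sd.fit(X_train)
X_train_2sd = two_sd.transform(X_train)
X_train_2sd[:1]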
# Having preprocessed the data this way we can now fit a model to it in the
# same way as before.
model2 = linear_model.LinearRegression()
model2.fit(X_train_scaled, y_train)
#When making predictions on new values we also need to make sure to pass
#the new values through the same preprocessing step.
new_value = np.array(X_train.mean(), ndmin = 2)
new_scaled = scaler.transform(new_value)
pred = model2.predict(new_scaled)
pred
##########################
# 2.5 Creating a pipeline
##########################
# For any training data set and any data for prediction we will want to apply
# the same scaling transformation and use the same model. We could create
# a sklearn.pipeline.Pipeline() to organise the steps for creating the
# estimator.
from sklearn.pipeline import Pipeline
model = Pipeline(steps = [('preprocess', preprocessing.StandardScaler())
,('regression', linear_model.LinearRegression())
])
# Having created the Pipeline object we can now fit as before. Calling
# .fit() now however, will first fit the 'preprocess' step and then the
# 'regression' step. When we predict, the new values will also pass through
# both stages of our pipeline.
model.fit(X_train,y_train)
new_values = np.array(X_train.mean(), ndmin = 2)
model.predict(new_values)
#from sklearn.metrics import accuracy_score
#print(accuracy_score(y_test, model.predict(X_test)))
#2.6 Preprocessing categorical variables
# One hot encoding: will take a categorical feature with K categories and
# create a one-of-K encoding scheme, i.e. a set of binary variables, one for
# each category. Consider the toy data:
toy = pd.DataFrame({
'category':['a', 'a', 'b', 'c', 'b']
})
enc = preprocessing.OneHotEncoder()
enc.fit(toy)
enc.transform(toy).toarray()
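# The fitted encoder keeps the mapping from categories to output columns in
# its categories_ attribute, which helps interpret the encoded array:
enc.categories_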
# Combining preprocessing steps: we can combine the preprocessing steps into
# a single operation for our Pipeline using a sklearn.compose.ColumnTransformer
toy = pd.DataFrame({
'numeric': [1., 2., 3., 4., 5.],
'category': ['a', 'a', 'b', 'c', 'b']
})
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
numeric_features = ['numeric']
categorical_features = ['category']
preprocessor = ColumnTransformer(transformers=[('num', StandardScaler()
, numeric_features)
,('cat', OneHotEncoder(), categorical_features)])
# fit the preprocessor to the toy data; the fitted transformer will now give
# the appropriate pre-processing for each type of variable
preprocessor.fit(toy)
preprocessor.transform(toy)
# This preprocessing step could then be a step in the pipeline for a regression
model = Pipeline(steps = [('preprocess', preprocessor)
                          ,('regression', linear_model.LinearRegression())])
#Model Assessment and Feature Selection
#%%#####################################################################
# Accuracy score is only for classification problems.
# For regression problems you can use: R2 Score, MSE (Mean Squared Error), RMSE (Root Mean Squared Error).
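# A minimal sketch of those regression metrics, reusing the Boston X_train and
# y_train from above (train-set scores only; reg_pipe and y_fitted are
# illustrative names, not part of the original script):
from sklearn.metrics import r2_score, mean_squared_error
reg_pipe = Pipeline(steps = [('preprocess', preprocessing.StandardScaler())
                             ,('regression', linear_model.LinearRegression())])
reg_pipe.fit(X_train, y_train)
y_fitted = reg_pipe.predict(X_train)
print('R2  :', r2_score(y_train, y_fitted))
print('MSE :', mean_squared_error(y_train, y_fitted))
print('RMSE:', np.sqrt(mean_squared_error(y_train, y_fitted)))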
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
# read data
iris = datasets.load_iris()
# assign X and y
X = iris.data
y = iris.target
# split the data into training and test parts (25% of the data is held out as the test set)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
# preprocess the data
# scaling
scaler = preprocessing.MinMaxScaler()
# fit X_train to scaling
scaler.fit(X_train)
# Apply the scaling/transformation to the data
X_train_scaled = scaler.transform(X_train)
# Choose the required model/s
model2 = linear_model.LinearRegression() # Classification metrics can't handle a mix of multiclass and continuous targets
model2 = DecisionTreeClassifier()
# fit the model to the data for predictions
model2.fit(X_train_scaled, y_train)
# check model performance (apply the same scaling to the test data first)
from sklearn.metrics import accuracy_score
X_test_scaled = scaler.transform(X_test)
print(accuracy_score(y_test, model2.predict(X_test_scaled)))
#When making predictions on new values we also need to make sure to pass
#the new values through the same preprocessing step.
new_value = np.array(X_train.mean(axis = 0), ndmin = 2)
new_scaled = scaler.transform(new_value)
pred = model2.predict(new_scaled)
pred
# or Create a pipeline that standardizes the data then creates a model
# make a pipeline
# PCA(Dimension reduction to two) -> Scaling the data -> DecisionTreeClassification
#https://www.geeksforgeeks.org/pipelines-python-and-scikit-learn/
pipe1 = Pipeline([('pca', PCA(n_components = 2))
, ('std', StandardScaler())
, ('decision_tree', DecisionTreeClassifier())]
, verbose = True)
pipe2 = Pipeline(steps = [('preprocess', preprocessing.StandardScaler())
#,('regression', linear_model.LinearRegression())
,('rf', RandomForestClassifier())
])
# fit pipeline to TRAINING data [X_train and y_train]
pipe1.fit(X_train, y_train)
pipe2.fit(X_train, y_train)
# model prediction on TEST data [X_test and y_test]
print(accuracy_score(y_test, pipe1.predict(X_test)))
print(accuracy_score(y_test, pipe2.predict(X_test)))
from sklearn.metrics import classification_report
print(classification_report(y_test, pipe2.predict(X_test)))
# (illustration of the OneHotEncoder API only: applied to continuous features
# it treats every distinct value as its own category)
enc = preprocessing.OneHotEncoder()
enc.fit(X_train)
enc.transform(X_train).toarray()
#%%
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
boston = load_boston()
X_train, y_train = pd.DataFrame(boston.data, columns = boston.feature_names), boston.target
model1 = Pipeline(steps = [
('pre', MinMaxScaler()),
('reg', LinearRegression())])
score_fn = make_scorer(mean_squared_error)
scores = cross_validate(model1, X_train, y_train
, scoring = score_fn
, cv = 10)
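# cross_validate returns a dict; the per-fold MSE values are under 'test_score'
# (with this scorer lower is better). A quick summary:
print(scores['test_score'].mean(), scores['test_score'].std())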
from itertools import combinations
def train(X):
    return cross_validate(model1, X, y_train
                          , scoring = score_fn
                          #, return_train_score = False
                          , return_estimator = True)['test_score']
scores = [train(X_train.loc[:,vars]) for vars in combinations(X_train.columns, 12)]
means = [score.mean() for score in scores]
means
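# A possible follow-up (illustrative, not in the original script): since the
# scorer reports raw MSE, the best 12-feature subset is the one with the
# smallest mean cross-validated error.
subsets = list(combinations(X_train.columns, 12))
best = int(np.argmin(means))
print(subsets[best], means[best])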