#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 17 14:52:55 2022

@author: tanu
"""
from sklearn.datasets import load_boston  # NOTE: load_boston was removed in scikit-learn >= 1.2
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

print(np.__version__)
print(pd.__version__)

boston = load_boston()
dir(boston)
# ['DESCR', 'data', 'data_module', 'feature_names', 'filename', 'target']

X, y = boston.data, boston.target
df = pd.DataFrame(X, columns = boston.feature_names)
df['MEDV'] = y

sns.scatterplot(x = 'CRIM', y = 'MEDV', data = df)
plt.show()

# Model fitting
# To fit a model using just a single predictor we first extract the training variables.
X_train = df['CRIM']
y_train = y

# Unfortunately, sklearn's model fitting functions typically expect a
# two-dimensional array for the covariates. Since we have extracted only a
# single feature here, it is one-dimensional, so we need to reshape the
# X_train values to the appropriate shape.
# This is not necessary when using more than a single feature.
if len(X_train.values.shape) == 1:
    X_train = X_train.values.reshape(-1, 1)

# Create a LinearRegression object: this belongs to the broader class of
# estimator objects.
model = linear_model.LinearRegression()
model.fit(X_train, y_train)

# We can make predictions from our fitted model with the .predict() method.
new_value = np.array(4.09, ndmin = 2)
model.predict(new_value)

multiple_values = np.array([1, 2, 3], ndmin = 2).T
model.predict(multiple_values)

# Fitted values
# The fitted values of a model are the predicted ŷ for the observations X.
# To get them we can simply predict from the model using the values that were
# used to train it.
fitted = model.predict(X_train)

ax = sns.scatterplot(x = 'CRIM', y = 'MEDV', data = df)
sns.lineplot(x = df['CRIM'], y = fitted, ax = ax)
plt.show()

# Interpreting the coefficients
# The coefficients of the fitted model are kept in the model.coef_ attribute.
# They give us the expected change in y for a unit change in X.
model.coef_

# 2.3 Multiple linear regression
X_train = df.iloc[:, :3]
grid = sns.PairGrid(data = pd.concat([X_train, pd.Series(y_train, name = "MEDV")], axis = 1))
grid.map_offdiag(sns.scatterplot)
grid.map_diag(sns.histplot)  # sns.distplot is deprecated; histplot is its replacement
plt.show()

model.fit(X_train, y_train)
new_values = np.array(X_train.mean(), ndmin = 2)
model.predict(new_values)

# Residuals
# In classical statistics, one of our assumptions is that the residuals are
# normally distributed. A small residual sum of squares (RSS) implies the
# fitted model is closer to the observations.
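# As a quick illustrative sketch, the RSS mentioned above can be computed
# directly from the current multiple-regression fit.
rss = np.sum((y_train - model.predict(X_train)) ** 2)
print(rss)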
fitted = model.predict(X_train)
resid = y_train - fitted
# Standardise to remove the effect of measurement scale
resid = (resid - np.mean(resid)) / np.std(resid, ddof = 1)

plt.figure()
for i in range(3):
    xvar = X_train.iloc[:, i]
    ax = plt.subplot(3, 1, i + 1)
    ax.scatter(xvar, resid)
    ax.set_xlabel(boston.feature_names[i])
    ax.set_ylabel("Residuals")
    ax.hlines([-2, 0, 2], np.min(xvar), np.max(xvar))
plt.show()

plt.figure()
ax = plt.subplot(3, 1, 1)
ax.scatter(fitted, resid)
ax.set_xlabel('Fitted values')
ax.set_ylabel('Residuals')

ax = plt.subplot(3, 1, 2)
ax.scatter(fitted, y_train)
ax.set_xlabel('Fitted values')
ax.set_ylabel('Predicted values')

ax = plt.subplot(3, 1, 3)
import scipy.stats as stats
stats.probplot(resid, dist = 'norm', plot = ax)
plt.show()

# Scaling data: many types available
# sklearn comes with many preprocessing transformations in the
# sklearn.preprocessing module.
# Scaling is crucial for many statistical and machine learning algorithms:
# • k-means and hierarchical clustering
#   - data units & variance play a crucial role in cluster selection
# • gradient descent optimisation
#   - scaled data allows the weights to update at an equal speed
# • scaled data allows the regression coefficients to be compared

#########################################################
# Min-max scaling
# DOESN'T change the shape
# DOES change the bounds, mean and sd
# NOT often used in LR
# used more in GDO (gradient descent optimisation)
# sklearn.preprocessing module has a MinMaxScaler() for this
##########################################################
np.random.seed(1)
x_n = np.random.normal(2, 5, 500)
x_t = np.random.standard_t(2, 500)
x_ln = np.random.lognormal(1, 1, 500)

df = pd.DataFrame({
    'Normal': x_n,
    'T': x_t,
    'Lognormal': x_ln
})
df_long = df.melt(var_name = 'Distribution')

g = sns.FacetGrid(df_long, col = 'Distribution', sharex = False)
g.map(plt.hist, 'value', bins = 50)
plt.show()

def min_max(x):
    x_min = np.min(x)
    return (x - x_min) / (np.max(x) - x_min)

scaled = df.apply(min_max).melt(var_name = 'Distribution')
scaled['Scaled'] = True
df_long['Scaled'] = False
full_data = pd.concat([df_long, scaled], axis = 0)

g = sns.FacetGrid(full_data, col = 'Distribution'
                  , row = 'Scaled'
                  , sharex = False
                  , sharey = False)
g.map(plt.hist, 'value', bins = 50)
plt.show()

df.apply([np.mean, np.std])
df.apply(min_max).apply([np.mean, np.std])

# sklearn: MinMaxScaler()
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled[:1]

##########################################################
# z-score standardisation
# DOESN'T change the shape
# popular in linear models
# DOESN'T affect the predictions
# but makes the size of the coefficients directly comparable
# (illustrated in the short sketch after this section)
# sklearn.preprocessing module has a StandardScaler() for this
##########################################################
def z_score(x):
    mean = np.mean(x)
    std = np.std(x, ddof = 1)
    return (x - mean) / std

scaled = df.apply(z_score).melt(var_name = 'Distribution')
scaled['Scaled'] = True
full_data = pd.concat([df_long, scaled], axis = 0)

g = sns.FacetGrid(full_data, col = 'Distribution'
                  , row = 'Scaled'
                  , sharex = False
                  , sharey = False)
g.map(plt.hist, 'value', bins = 50)
plt.show()
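# A quick illustrative sketch of the point above, using the Boston X_train and
# y_train still in scope: z-scoring leaves the predictions unchanged but
# rescales the coefficients so that their sizes can be compared directly.
m_raw = linear_model.LinearRegression().fit(X_train, y_train)
m_std = linear_model.LinearRegression().fit(X_train.apply(z_score), y_train)
print(m_raw.coef_)   # effect of a one-unit change in each feature
print(m_std.coef_)   # effect of a one-standard-deviation change, directly comparable
print(np.allclose(m_raw.predict(X_train),
                  m_std.predict(X_train.apply(z_score))))  # True: predictions unchanged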
###############################################
# Dividing by two standard deviations
# http://www.stat.columbia.edu/~gelman/research/published/standardizing7.pdf
# One of the downsides of scaling data by z-scoring is that it is not obvious
# how categorical variables should be handled. Gelman suggests a rescaling that
# divides numeric variables by two standard deviations, whilst leaving binary
# encoded categorical variables untransformed.
# There is nothing in sklearn for this, so we write a small custom transformer.
###############################################
from sklearn.base import BaseEstimator, TransformerMixin

class two_sd_scaler(BaseEstimator, TransformerMixin):
    def fit(self, X, y = None):
        self.stds = 2 * np.std(X, axis = 0, ddof = 1)
        return self

    def transform(self, X, y = None):
        return X / self.stds

# Having preprocessed the data this way we can now fit a model to it in the
# same way as before.
model2 = linear_model.LinearRegression()
model2.fit(X_train_scaled, y_train)

# When making predictions on new values we also need to make sure to pass
# the new values through the same preprocessing step.
new_value = np.array(X_train.mean(), ndmin = 2)
new_scaled = scaler.transform(new_value)
pred = model2.predict(new_scaled)
pred

##########################
# 2.5 Creating a pipeline
##########################
# For any training data set and any data for prediction we will want to apply
# the same scaling transformation and use the same model. We can create a
# sklearn.pipeline.Pipeline() to organise the steps that build the estimator.
from sklearn.pipeline import Pipeline

model = Pipeline(steps = [('preprocess', preprocessing.StandardScaler())
                          , ('regression', linear_model.LinearRegression())
                          ])

# Having created the Pipeline object we can fit it as before. Calling .fit()
# now, however, will first fit the 'preprocess' step and then the 'regression'
# step. When we predict, the new values also pass through both stages of the
# pipeline.
model.fit(X_train, y_train)
new_values = np.array(X_train.mean(), ndmin = 2)
model.predict(new_values)

#from sklearn.metrics import accuracy_score
#print(accuracy_score(y_test, model.predict(X_test)))

# 2.6 Preprocessing categorical variables
# One-hot encoding takes a categorical feature with K categories and creates a
# 'one of K' encoding scheme, i.e. a set of binary variables, one per category.
# Consider the toy data:
toy = pd.DataFrame({
    'category': ['a', 'a', 'b', 'c', 'b']
})
enc = preprocessing.OneHotEncoder()
enc.fit(toy)
enc.transform(toy).toarray()

# Combining preprocessing steps:
# we can combine the preprocessing steps into a single operation for our
# Pipeline using a sklearn.compose.ColumnTransformer
toy = pd.DataFrame({
    'numeric': [1., 2., 3., 4., 5.],
    'category': ['a', 'a', 'b', 'c', 'b']
})

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

numeric_features = ['numeric']
categorical_features = ['category']
preprocessor = ColumnTransformer(transformers = [('num', StandardScaler(), numeric_features)
                                                 , ('cat', OneHotEncoder(), categorical_features)])

# fit the preprocessor to the data; it will now apply the appropriate
# pre-processing to each type of variable.
preprocessor.fit(toy)
preprocessor.transform(toy)

# This preprocessing step can then be a step in the pipeline for a regression.
model = Pipeline(steps = [('preprocess', preprocessor)
                          , ('regression', linear_model.LinearRegression())])
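# A quick illustrative sketch of fitting this combined pipeline end to end;
# the target values here are made up purely for demonstration.
toy_target = [10., 20., 30., 40., 50.]   # hypothetical target values
model.fit(toy, toy_target)               # fits the preprocessor, then the regression
model.predict(toy)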
# Model Assessment and Feature Selection
#%%#####################################################################
# Accuracy score is only for classification problems.
# For regression problems you can use: R2 score, MSE (mean squared error) and
# RMSE (root mean squared error); a short sketch of these appears at the end of
# this script.
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn import preprocessing

# read data
iris = datasets.load_iris()

# assign X and y
X = iris.data
y = iris.target

# split the data into training and test parts (25% of the data is held out as the test set)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

# preprocess the data
# scaling
scaler = preprocessing.MinMaxScaler()

# fit the scaler to X_train
scaler.fit(X_train)

# apply the scaling/transformation to the data
X_train_scaled = scaler.transform(X_train)

# Choose the required model/s
# model2 = linear_model.LinearRegression()
# A regression model makes accuracy_score fail here:
# "Classification metrics can't handle a mix of multiclass and continuous targets"
model2 = DecisionTreeClassifier()

# fit the model to the data for predictions
model2.fit(X_train_scaled, y_train)

# check model performance; the test data must go through the same scaling step
print(accuracy_score(y_test, model2.predict(scaler.transform(X_test))))

# When making predictions on new values we also need to make sure to pass
# the new values through the same preprocessing step.
new_value = np.array(X_train.mean(axis = 0), ndmin = 2)
new_scaled = scaler.transform(new_value)
pred = model2.predict(new_scaled)
pred

# Or create a pipeline that standardises the data and then fits a model.
# make a pipeline:
# PCA (dimension reduction to two components) -> scaling -> DecisionTreeClassifier
pipe1 = Pipeline([('pca', PCA(n_components = 2))
                  , ('std', StandardScaler())
                  , ('decision_tree', DecisionTreeClassifier())]
                 , verbose = True)

pipe2 = Pipeline(steps = [('preprocess', preprocessing.StandardScaler())
                          #, ('regression', linear_model.LinearRegression())
                          , ('rf', RandomForestClassifier())
                          ])

# fit the pipelines to the TRAINING data [X_train and y_train]
pipe1.fit(X_train, y_train)
pipe2.fit(X_train, y_train)

# model prediction on the TEST data [X_test and y_test]
print(accuracy_score(y_test, pipe1.predict(X_test)))
print(accuracy_score(y_test, pipe2.predict(X_test)))
print(classification_report(y_test, pipe2.predict(X_test)))

# One-hot encoding the (numeric) iris features treats every distinct value as
# its own category, so this is for illustration only.
enc = preprocessing.OneHotEncoder()
enc.fit(X_train)
enc.transform(X_train).toarray()
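# A minimal, self-contained sketch of the regression metrics mentioned above
# (R2, MSE and RMSE); y_true and y_pred are illustrative values only.
from sklearn.metrics import r2_score, mean_squared_error
y_true = [3.0, -0.5, 2.0, 7.0]
y_pred = [2.5, 0.0, 2.0, 8.0]
print(r2_score(y_true, y_pred))                      # R2 score
print(mean_squared_error(y_true, y_pred))            # MSE
print(np.sqrt(mean_squared_error(y_true, y_pred)))   # RMSE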