#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 17 14:52:55 2022

@author: tanu
"""
from sklearn.datasets import load_boston  # NOTE: load_boston was removed in scikit-learn >= 1.2
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

print(np.__version__)
print(pd.__version__)

boston = load_boston()
dir(boston)
# ['DESCR', 'data', 'data_module', 'feature_names', 'filename', 'target']

X, y = boston.data, boston.target
df = pd.DataFrame(X, columns = boston.feature_names)
df['MEDV'] = y

sns.scatterplot(x = 'CRIM', y = 'MEDV', data = df)
plt.show()

# Model fitting
# To fit a model using just a single predictor we first extract the training variables.
X_train = df['CRIM']
y_train = y

# Unfortunately, sklearn's model fitting functions typically expect a
# two-dimensional array for the covariates. Since we have extracted only a
# single feature here, it is one-dimensional, so we need to reshape the
# X_train values to the appropriate shape.
# This is not necessary when using more than a single feature.
if len(X_train.values.shape) == 1:
    X_train = X_train.values.reshape(-1, 1)

# Create a LinearRegression object: this belongs to the broader class of
# estimator objects.
model = linear_model.LinearRegression()
model.fit(X_train, y_train)

# We can make predictions from our fitted model with the .predict() method.
new_value = np.array(4.09, ndmin = 2)
model.predict(new_value)

multiple_values = np.array([1, 2, 3], ndmin = 2).T
model.predict(multiple_values)

# Fitted values
# The fitted values of a model are the predicted ŷ for the observations X.
# To get them we can simply predict from the model using the values that were
# used to train it.
fitted = model.predict(X_train)

ax = sns.scatterplot(x = 'CRIM', y = 'MEDV', data = df)
sns.lineplot(x = df['CRIM'], y = fitted, ax = ax)
plt.show()

# Interpreting the coefficients
# The coefficients of the fitted model are kept in the model.coef_ attribute.
# They give us the expected change in y for a unit change in X.
model.coef_

# 2.3 Multiple linear regression
X_train = df.iloc[:, :3]
grid = sns.PairGrid(data = pd.concat([X_train, pd.Series(y_train, name = "MEDV")], axis = 1))
grid.map_offdiag(sns.scatterplot)
grid.map_diag(sns.histplot)  # sns.distplot is deprecated; histplot is its replacement
plt.show()

model.fit(X_train, y_train)
new_values = np.array(X_train.mean(), ndmin = 2)
model.predict(new_values)

# Residuals
# In classical statistics, one of our assumptions is that the residuals are
# normally distributed. A small residual sum of squares (RSS) implies the
# fitted model is closer to the observations.
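# As a quick illustrative sketch, the RSS mentioned above can be computed
# directly from the current multiple-regression fit.
rss = np.sum((y_train - model.predict(X_train)) ** 2)
print(rss)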
fitted = model.predict(X_train)
resid = y_train - fitted
# Standardise to remove the effect of measurement scale
resid = (resid - np.mean(resid)) / np.std(resid, ddof = 1)

plt.figure()
for i in range(3):
    xvar = X_train.iloc[:, i]
    ax = plt.subplot(3, 1, i + 1)
    ax.scatter(xvar, resid)
    ax.set_xlabel(boston.feature_names[i])
    ax.set_ylabel("Residuals")
    ax.hlines([-2, 0, 2], np.min(xvar), np.max(xvar))
plt.show()

plt.figure()
ax = plt.subplot(3, 1, 1)
ax.scatter(fitted, resid)
ax.set_xlabel('Fitted values')
ax.set_ylabel('Residuals')

ax = plt.subplot(3, 1, 2)
ax.scatter(fitted, y_train)
ax.set_xlabel('Fitted values')
ax.set_ylabel('Predicted values')

ax = plt.subplot(3, 1, 3)
import scipy.stats as stats
stats.probplot(resid, dist = 'norm', plot = ax)
plt.show()

# Scaling data: many types available
# sklearn comes with many preprocessing transformations in the
# sklearn.preprocessing module.
# Scaling is crucial for many statistical and machine learning algorithms:
# • k-means and hierarchical clustering
#   - data units & variance play a crucial role in cluster selection
# • gradient descent optimisation
#   - scaled data allows the weights to update at an equal speed
# • scaled data allows the regression coefficients to be compared

#########################################################
# Min-max scaling
# DOESN'T change the shape
# DOES change the bounds, mean and sd
# NOT often used in LR
# used more in GDO (gradient descent optimisation)
# sklearn.preprocessing module has a MinMaxScaler() for this
##########################################################
np.random.seed(1)
x_n = np.random.normal(2, 5, 500)
x_t = np.random.standard_t(2, 500)
x_ln = np.random.lognormal(1, 1, 500)

df = pd.DataFrame({
    'Normal': x_n,
    'T': x_t,
    'Lognormal': x_ln
})
df_long = df.melt(var_name = 'Distribution')

g = sns.FacetGrid(df_long, col = 'Distribution', sharex = False)
g.map(plt.hist, 'value', bins = 50)
plt.show()

def min_max(x):
    x_min = np.min(x)
    return (x - x_min) / (np.max(x) - x_min)

scaled = df.apply(min_max).melt(var_name = 'Distribution')
scaled['Scaled'] = True
df_long['Scaled'] = False
full_data = pd.concat([df_long, scaled], axis = 0)

g = sns.FacetGrid(full_data, col = 'Distribution'
                  , row = 'Scaled'
                  , sharex = False
                  , sharey = False)
g.map(plt.hist, 'value', bins = 50)
plt.show()

df.apply([np.mean, np.std])
df.apply(min_max).apply([np.mean, np.std])

# sklearn: MinMaxScaler()
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled[:1]

##########################################################
# z-score standardisation
# DOESN'T change the shape
# popular in linear models
# DOESN'T affect the predictions
# but makes the size of the coefficients directly comparable
# (illustrated in the short sketch after this section)
# sklearn.preprocessing module has a StandardScaler() for this
##########################################################
def z_score(x):
    mean = np.mean(x)
    std = np.std(x, ddof = 1)
    return (x - mean) / std

scaled = df.apply(z_score).melt(var_name = 'Distribution')
scaled['Scaled'] = True
full_data = pd.concat([df_long, scaled], axis = 0)

g = sns.FacetGrid(full_data, col = 'Distribution'
                  , row = 'Scaled'
                  , sharex = False
                  , sharey = False)
g.map(plt.hist, 'value', bins = 50)
plt.show()
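# A quick illustrative sketch of the point above, using the Boston X_train and
# y_train still in scope: z-scoring leaves the predictions unchanged but
# rescales the coefficients so that their sizes can be compared directly.
m_raw = linear_model.LinearRegression().fit(X_train, y_train)
m_std = linear_model.LinearRegression().fit(X_train.apply(z_score), y_train)
print(m_raw.coef_)   # effect of a one-unit change in each feature
print(m_std.coef_)   # effect of a one-standard-deviation change, directly comparable
print(np.allclose(m_raw.predict(X_train),
                  m_std.predict(X_train.apply(z_score))))  # True: predictions unchanged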
###############################################
# Dividing by two standard deviations
# http://www.stat.columbia.edu/~gelman/research/published/standardizing7.pdf
# One of the downsides of scaling data by z-scoring is that it is not obvious
# how categorical variables should be handled. Gelman suggests a rescaling that
# divides numeric variables by two standard deviations, whilst leaving binary
# encoded categorical variables untransformed.
# There is nothing in sklearn for this, so we write a small custom transformer.
###############################################
from sklearn.base import BaseEstimator, TransformerMixin

class two_sd_scaler(BaseEstimator, TransformerMixin):
    def fit(self, X, y = None):
        self.stds = 2 * np.std(X, axis = 0, ddof = 1)
        return self

    def transform(self, X, y = None):
        return X / self.stds

# Having preprocessed the data this way we can now fit a model to it in the
# same way as before.
model2 = linear_model.LinearRegression()
model2.fit(X_train_scaled, y_train)

# When making predictions on new values we also need to make sure to pass
# the new values through the same preprocessing step.
new_value = np.array(X_train.mean(), ndmin = 2)
new_scaled = scaler.transform(new_value)
pred = model2.predict(new_scaled)
pred

##########################
# 2.5 Creating a pipeline
##########################
# For any training data set and any data for prediction we will want to apply
# the same scaling transformation and use the same model. We can create a
# sklearn.pipeline.Pipeline() to organise the steps that build the estimator.
from sklearn.pipeline import Pipeline

model = Pipeline(steps = [('preprocess', preprocessing.StandardScaler())
                          , ('regression', linear_model.LinearRegression())
                          ])

# Having created the Pipeline object we can fit it as before. Calling .fit()
# now, however, will first fit the 'preprocess' step and then the 'regression'
# step. When we predict, the new values also pass through both stages of the
# pipeline.
model.fit(X_train, y_train)
new_values = np.array(X_train.mean(), ndmin = 2)
model.predict(new_values)

#from sklearn.metrics import accuracy_score
#print(accuracy_score(y_test, model.predict(X_test)))

# 2.6 Preprocessing categorical variables
# One-hot encoding takes a categorical feature with K categories and creates a
# 'one of K' encoding scheme, i.e. a set of binary variables, one per category.
# Consider the toy data:
toy = pd.DataFrame({
    'category': ['a', 'a', 'b', 'c', 'b']
})
enc = preprocessing.OneHotEncoder()
enc.fit(toy)
enc.transform(toy).toarray()

# Combining preprocessing steps:
# we can combine the preprocessing steps into a single operation for our
# Pipeline using a sklearn.compose.ColumnTransformer
toy = pd.DataFrame({
    'numeric': [1., 2., 3., 4., 5.],
    'category': ['a', 'a', 'b', 'c', 'b']
})

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

numeric_features = ['numeric']
categorical_features = ['category']
preprocessor = ColumnTransformer(transformers = [('num', StandardScaler(), numeric_features)
                                                 , ('cat', OneHotEncoder(), categorical_features)])

# fit the preprocessor to the data; it will now apply the appropriate
# pre-processing to each type of variable.
preprocessor.fit(toy)
preprocessor.transform(toy)

# This preprocessing step can then be a step in the pipeline for a regression.
model = Pipeline(steps = [('preprocess', preprocessor)
                          , ('regression', linear_model.LinearRegression())])
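# A quick illustrative sketch of fitting this combined pipeline end to end;
# the target values here are made up purely for demonstration.
toy_target = [10., 20., 30., 40., 50.]   # hypothetical target values
model.fit(toy, toy_target)               # fits the preprocessor, then the regression
model.predict(toy)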
# Model Assessment and Feature Selection
#%%#####################################################################
# Accuracy score is only for classification problems.
# For regression problems you can use: R2 score, MSE (mean squared error) and
# RMSE (root mean squared error); a short sketch of these appears at the end of
# this script.
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn import preprocessing

# read data
iris = datasets.load_iris()

# assign X and y
X = iris.data
y = iris.target

# split the data into training and test parts (25% of the data is held out as the test set)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

# preprocess the data
# scaling
scaler = preprocessing.MinMaxScaler()

# fit the scaler to X_train
scaler.fit(X_train)

# apply the scaling/transformation to the data
X_train_scaled = scaler.transform(X_train)

# Choose the required model/s
# model2 = linear_model.LinearRegression()
# A regression model makes accuracy_score fail here:
# "Classification metrics can't handle a mix of multiclass and continuous targets"
model2 = DecisionTreeClassifier()

# fit the model to the data for predictions
model2.fit(X_train_scaled, y_train)

# check model performance; the test data must go through the same scaling step
print(accuracy_score(y_test, model2.predict(scaler.transform(X_test))))

# When making predictions on new values we also need to make sure to pass
# the new values through the same preprocessing step.
new_value = np.array(X_train.mean(axis = 0), ndmin = 2)
new_scaled = scaler.transform(new_value)
pred = model2.predict(new_scaled)
pred

# Or create a pipeline that standardises the data and then fits a model.
# make a pipeline:
# PCA (dimension reduction to two components) -> scaling -> DecisionTreeClassifier
pipe1 = Pipeline([('pca', PCA(n_components = 2))
                  , ('std', StandardScaler())
                  , ('decision_tree', DecisionTreeClassifier())]
                 , verbose = True)

pipe2 = Pipeline(steps = [('preprocess', preprocessing.StandardScaler())
                          #, ('regression', linear_model.LinearRegression())
                          , ('rf', RandomForestClassifier())
                          ])

# fit the pipelines to the TRAINING data [X_train and y_train]
pipe1.fit(X_train, y_train)
pipe2.fit(X_train, y_train)

# model prediction on the TEST data [X_test and y_test]
print(accuracy_score(y_test, pipe1.predict(X_test)))
print(accuracy_score(y_test, pipe2.predict(X_test)))
print(classification_report(y_test, pipe2.predict(X_test)))

# One-hot encoding the (numeric) iris features treats every distinct value as
# its own category, so this is for illustration only.
enc = preprocessing.OneHotEncoder()
enc.fit(X_train)
enc.transform(X_train).toarray()
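# A minimal, self-contained sketch of the regression metrics mentioned above
# (R2, MSE and RMSE); y_true and y_pred are illustrative values only.
from sklearn.metrics import r2_score, mean_squared_error
y_true = [3.0, -0.5, 2.0, 7.0]
y_pred = [2.5, 0.0, 2.0, 8.0]
print(r2_score(y_true, y_pred))                      # R2 score
print(mean_squared_error(y_true, y_pred))            # MSE
print(np.sqrt(mean_squared_error(y_true, y_pred)))   # RMSE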