ML_AI_training/unsup_v1.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 16 16:55:06 2022
@author: tanu
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import cluster, datasets
import warnings
import sys
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, adjusted_rand_score
from sklearn.metrics import v_measure_score
# Algos: https://machinelearningmastery.com/clustering-algorithms-with-python/
# K-means
# HC (hierarchical clustering)
# Gaussian mixture model
# FP growth
# PCA
# Meanshift
# DBScan
# Model assessment:
# Mutual information
# Silhouette score
# v_measure_score
# itertools.combinations()
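# A minimal sketch of the label-based metrics listed above, on toy label vectors
# (illustrative values only); both scores are invariant to label permutation.
from sklearn.metrics import adjusted_mutual_info_score
toy_true = [0, 0, 1, 1]
toy_pred = [1, 1, 0, 0]
print('AMI       :', adjusted_mutual_info_score(toy_true, toy_pred))  # 1.0: same partition, labels swapped
print('V-measure :', v_measure_score(toy_true, toy_pred))             # 1.0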
#%% Example 1: Kmeans, https://coderzcolumn.com/tutorials/machine-learning/unsupervised-learning-clustering-kmeans-using-scikit-learn-sklearn
# For K-means clustering, the model is that all clusters
# have equal, spherical variance.
samples, clusters = datasets.make_blobs(n_samples=250, n_features=2, centers=5, cluster_std=0.7, random_state=12345)
print('Dataset size : ', samples.shape, clusters.shape)
print('Cluster names : ',set(clusters))
with plt.style.context(('ggplot', 'seaborn')):
    plt.figure(figsize=(8, 6))
    for i, c, m in zip(range(5), ['red', 'green', 'blue', 'orange', 'purple'], ['s', '+', '^', 'o', 'x']):
        plt.scatter(samples[clusters == i, 0], samples[clusters == i, 1], color=c, marker=m, s=80, alpha=0.8, label='Cluster %d' % i)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('Visualizing Dataset')
    plt.legend(loc='best')
# The blobs above were generated with 5 centers, and the plots below assume 5 clusters
kmeans = cluster.KMeans(n_clusters=5)
#kmeans = cluster.KMeans(n_clusters=2)
kmeans.fit(samples)
preds = kmeans.predict(samples)
print('Accuracy : %.3f'%accuracy_score(y_true = clusters, y_pred=preds))
print('Confusion Matrix : \n', confusion_matrix(y_true=clusters, y_pred=preds))
print('Adjusted Accuracy : %.3f'%adjusted_rand_score(labels_true=clusters, labels_pred=preds))
print('Cluster Centers : \n', str(kmeans.cluster_centers_))
print('Sum of squared distances of samples to their closest cluster center : %.2f'%kmeans.inertia_,)
with plt.style.context(('ggplot', 'seaborn')):
    plt.figure(figsize=(10, 6))
    plt.scatter(samples[preds == 0, 0], samples[preds == 0, 1], color='red', marker='s', s=80, alpha=0.8, label='Cluster 0')
    plt.scatter(samples[preds == 1, 0], samples[preds == 1, 1], color='green', marker='^', s=80, alpha=0.8, label='Cluster 1')
    plt.scatter(samples[preds == 2, 0], samples[preds == 2, 1], color='blue', marker='*', s=80, alpha=0.8, label='Cluster 2')
    plt.scatter(samples[preds == 3, 0], samples[preds == 3, 1], color='orange', marker='o', s=80, alpha=0.8, label='Cluster 3')
    plt.scatter(samples[preds == 4, 0], samples[preds == 4, 1], color='purple', marker='+', s=80, alpha=0.8, label='Cluster 4')
    for x, y in zip(samples[preds == 0, 0], samples[preds == 0, 1]):
        plt.plot([kmeans.cluster_centers_[0][0], x], [kmeans.cluster_centers_[0][1], y], color='red')
    for x, y in zip(samples[preds == 1, 0], samples[preds == 1, 1]):
        plt.plot([kmeans.cluster_centers_[1][0], x], [kmeans.cluster_centers_[1][1], y], color='green')
    for x, y in zip(samples[preds == 2, 0], samples[preds == 2, 1]):
        plt.plot([kmeans.cluster_centers_[2][0], x], [kmeans.cluster_centers_[2][1], y], color='blue')
    for x, y in zip(samples[preds == 3, 0], samples[preds == 3, 1]):
        plt.plot([kmeans.cluster_centers_[3][0], x], [kmeans.cluster_centers_[3][1], y], color='orange')
    for x, y in zip(samples[preds == 4, 0], samples[preds == 4, 1]):
        plt.plot([kmeans.cluster_centers_[4][0], x], [kmeans.cluster_centers_[4][1], y], color='purple')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('Visualizing Predictions & Cluster Centers')
    plt.legend(loc='best')
# The Elbow Method: to decide the number of clusters, i.e. 'k'
plt.figure(figsize=(8, 5))
distortions = []
for i in range(1, 11):
    kmeans = cluster.KMeans(n_clusters=i)
    kmeans.fit(samples)
    distortions.append(kmeans.inertia_)
print('Distortions (Sum Of Squared Distance of Samples from Closest Cluster Center) : ', distortions)
with plt.style.context(('ggplot', 'seaborn')):
    plt.plot(range(1, 11), distortions)
    plt.scatter(range(1, 11), distortions, color='red', marker='o', s=80)
    plt.xlabel('Number Of Clusters')
    plt.ylabel('Distortions')
    plt.title('The Elbow Method (Num of Clusters vs Distortions)')
    plt.xticks(range(1, 11));
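# A programmatic alternative to eyeballing the elbow: the kneed package
# (imported further below, but commented out) can locate the knee point.
# A minimal sketch, assuming kneed is installed; curve/direction match a
# decreasing, convex inertia curve.
try:
    from kneed import KneeLocator
    kl = KneeLocator(range(1, 11), distortions, curve='convex', direction='decreasing')
    print('Elbow located at k =', kl.elbow)
except ImportError:
    print('kneed not installed; skipping automatic elbow detection')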
#%% Example 1: My data
# NOTE: num_df_wtgt is assumed to be defined already (loaded elsewhere)
X_unsup = num_df_wtgt[['ligand_affinity_change'
                       , 'duet_stability_change']]
kmeans = Pipeline([
    #('pca', PCA())
    ('pre', MinMaxScaler())
    , ('clf', KMeans(n_clusters = 2))
])
#kmeans = KMeans(n_clusters = 2)
#kmeans.fit(X_unsup)
kmeans.fit(X_unsup)
y_kmeans = kmeans.predict(X_unsup)
plt.scatter(X_unsup.loc[:, 'ligand_affinity_change']
            , X_unsup.loc[:, 'duet_stability_change']
            , c = y_kmeans
            , s = 50
            , cmap = 'viridis')
# cluster_centers_ lives on the KMeans step; map the centres back to the
# original feature space since KMeans was fitted on min-max scaled data
centers = kmeans.named_steps['pre'].inverse_transform(kmeans.named_steps['clf'].cluster_centers_)
plt.scatter(centers[:, 0], centers[:, 1]
            , c = 'black'
            , s = 200
            , alpha = 0.5);
plt.show()
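# PCA is left commented out in the pipelines throughout this file; a minimal
# sketch of the same pipeline with the PCA step switched on (n_components=2 is
# an illustrative choice), reusing X_unsup as defined above:
from sklearn.decomposition import PCA
kmeans_pca = Pipeline([
    ('pre', MinMaxScaler())
    , ('pca', PCA(n_components=2))
    , ('clf', KMeans(n_clusters=2))
])
y_kmeans_pca = kmeans_pca.fit_predict(X_unsup)
print('Cluster sizes with PCA step:', pd.Series(y_kmeans_pca).value_counts().to_dict())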
#%% Example 2: https://builtin.com/data-science/unsupervised-learning-python
# Loading dataset
iris_df = datasets.load_iris()
# Available methods on dataset
print(dir(iris_df))
# Features
print(iris_df.feature_names)
# Targets
print(iris_df.target)
# Target Names
print(iris_df.target_names)
label = {0: 'red', 1: 'blue', 2: 'green'}
# Dataset Slicing
x_axis = iris_df.data[:, 0] # Sepal Length
y_axis = iris_df.data[:, 2] # Petal Length
# Plotting
plt.scatter(x_axis, y_axis, c=iris_df.target)
plt.show()
# Use Kmeans: Declaring Model
model = KMeans(n_clusters = 3)
# Fitting Model
model.fit(iris_df.data)
# Predicting a single input
predicted_label = model.predict([[7.2, 3.5, 0.8, 1.6]])
# Prediction on the entire data
all_predictions = model.predict(iris_df.data)
# Printing Predictions
print(predicted_label)
print(all_predictions)
#%% Example 2: My data
X_unsup = num_df_wtgt[[#'ligand_affinity_change'
                       #, 'duet_stability_change'
                       'ddg_foldx'
                       , 'deepddg']]
y_unsup = num_df_wtgt[['mutation_class']]
# X_train, X_test, y_train, y_test = train_test_split(X_unsup
#                                                     , y_unsup
#                                                     , test_size = 0.33
#                                                     , **rs
#                                                     , shuffle = True
#                                                     , stratify = y_unsup)
#model = KMeans(n_clusters=2)
model = Pipeline([
    #('pca', PCA())
    ('pre', MinMaxScaler())
    , ('clf', KMeans(n_clusters = 2))
])
label = {0: 'blue', 1: 'red'}
model.fit(X_unsup)
# Predict on a held-out set only if the train_test_split above is uncommented;
# predict on a few rows here so the script runs as-is
predicted_label = model.predict(X_unsup.head(5))
all_predictions = model.predict(X_unsup)
print(predicted_label)
print(all_predictions)
plt.scatter(X_unsup.loc[:, 'ddg_foldx']
            , X_unsup.loc[:, 'deepddg']
            , c = y_unsup.loc[:, 'mutation_class'])
from yellowbrick.cluster import KElbowVisualizer
model = KMeans()
visualizer = KElbowVisualizer(model, k=(1, 12)).fit(X_unsup)
visualizer.show()
#%% Example 3: My data Vscores, https://www.geeksforgeeks.org/ml-v-measure-for-evaluating-clustering-performance/
# List of V-Measure Scores for different models
v_scores = []
# Numbers of clusters to compare
N_Clusters = [2, 3]
# Building the clustering model
kmeans2 = Pipeline([
    #('pca', PCA())
    ('pre', MinMaxScaler())
    #('pre', StandardScaler())
    , ('clf', KMeans(n_clusters = 2))
])
# Training the clustering model
kmeans2.fit(X_unsup)
# Storing the predicted clustering labels
labels2 = kmeans2.predict(X_unsup)
# Evaluating the performance
v_scores.append(v_measure_score(y_unsup.loc[:, 'mutation_class'], labels2))
# Building the clustering model
kmeans3 = Pipeline([
    #('pca', PCA())
    ('pre', MinMaxScaler())
    #('pre', StandardScaler())
    , ('clf', KMeans(n_clusters = 3))
])
# Training the clustering model
kmeans3.fit(X_unsup)
# Storing the predicted clustering labels
labels3 = kmeans3.predict(X_unsup)
# Evaluating the performance
v_scores.append(v_measure_score(y_unsup.loc[:, 'mutation_class'], labels3))
# Plotting a Bar Graph to compare the models
plt.bar(N_Clusters, v_scores)
plt.xlabel('Number of Clusters')
plt.ylabel('V-Measure Score')
plt.title('Comparison of different Clustering Models')
plt.show()
# Score: silhouette
kmeans_kwargs = {
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 42}
from sklearn.metrics import silhouette_score
#https://realpython.com/k-means-clustering-python/
#import kneed
#from kneed import KneeLocator
scaler = StandardScaler()
X_unsup_scaled = scaler.fit_transform(X_unsup)
silhouette_coefficients = []
# Notice you start at 2 clusters for silhouette coefficient
for k in range(2, 5):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    # kmeans = Pipeline([
    #     # ('pre', MinMaxScaler())
    #     ('pre', StandardScaler())
    #     , ('clf', KMeans(n_clusters=k, **kmeans_kwargs))
    # ])
    kmeans.fit(X_unsup_scaled)
    score = silhouette_score(X_unsup_scaled, kmeans.labels_)
    silhouette_coefficients.append(score)
plt.style.use("fivethirtyeight")
plt.plot(range(2, 5), silhouette_coefficients)
plt.xticks(range(2, 5))
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Coefficient")
plt.show()
plt.bar(range(2, 5), silhouette_coefficients)
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons
from sklearn.metrics import adjusted_rand_score
# Instantiate k-means and dbscan algorithms
kmeans = KMeans(n_clusters=2)
dbscan = DBSCAN(eps=0.3)
# Fit the algorithms to the features
kmeans.fit(X_unsup_scaled)
dbscan.fit(X_unsup_scaled)
# Compute the silhouette scores for each algorithm
kmeans_silhouette = silhouette_score(
    X_unsup_scaled, kmeans.labels_).round(2)
dbscan_silhouette = silhouette_score(
    X_unsup_scaled, dbscan.labels_).round(2)
kmeans_silhouette
dbscan_silhouette
ari_kmeans = adjusted_rand_score(y_unsup.iloc[:,0], kmeans.labels_)
ari_dbscan = adjusted_rand_score(y_unsup.iloc[:,0], dbscan.labels_)
round(ari_kmeans, 2)
round(ari_dbscan, 2)
# Side-by-side comparison of k-means and DBSCAN cluster assignments
fig, (ax1, ax2) = plt.subplots(
    1, 2, figsize=(8, 6), sharex=True, sharey=True
)
fig.suptitle("Clustering Algorithm Comparison", fontsize=16)
fte_colors = {
    -1: "#777777",  # DBSCAN noise points
    0: "#008fd5",
    1: "#fc4f30",
}
# The k-means plot
km_colors = [fte_colors.get(label, "#999999") for label in kmeans.labels_]
ax1.scatter(X_unsup_scaled[:, 0], X_unsup_scaled[:, 1], c=km_colors)
ax1.set_title(
f"k-means\nSilhouette: {kmeans_silhouette}", fontdict={"fontsize": 12}
)
# The dbscan plot
db_colors = [fte_colors.get(label, "#999999") for label in dbscan.labels_]
ax2.scatter(X_unsup_scaled[:, 0], X_unsup_scaled[:, 1], c=db_colors)
ax2.set_title(
f"DBSCAN\nSilhouette: {dbscan_silhouette}", fontdict={"fontsize": 12}
)
plt.show()
#%% Example 4: Machinelearning mastery, https://machinelearningmastery.com/clustering-algorithms-with-python/
from sklearn.cluster import AffinityPropagation
from matplotlib import pyplot
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import Birch
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import OPTICS
from sklearn.cluster import SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
# Affinity propagation
# Takes as input measures of similarity between pairs of data points; real-valued
# messages are exchanged between data points until a high-quality set of exemplars
# and corresponding clusters gradually emerges.
#=============
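# The pipeline below leaves AffinityPropagation commented out; a minimal sketch
# of running it on its own (damping value copied from the commented line,
# min-max scaling to mirror the pipeline's 'pre' step):
ap = AffinityPropagation(damping=0.9)
ap_labels = ap.fit_predict(MinMaxScaler().fit_transform(X_unsup))
print('Affinity propagation found %d clusters' % len(unique(ap_labels)))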
XA = np.array(X_unsup)
# define the model
#model = AffinityPropagation(damping=0.9)
model = Pipeline([
    #('pca', PCA(n_components = 2))
    ('pre', MinMaxScaler())
    #('pre', StandardScaler())
    #, ('clf', AffinityPropagation(damping=0.9))
    , ('clf', AgglomerativeClustering(n_clusters=2)) #y
    #, ('clf', Birch(threshold=0.01, n_clusters=2)) #y
    #, ('clf', DBSCAN(eps=0.30, min_samples=9)) #n
    #, ('clf', KMeans(n_clusters=2)) #y
    #, ('clf', MiniBatchKMeans(n_clusters=2))
    #, ('clf', OPTICS(eps=0.8, min_samples=10))
    #, ('clf', SpectralClustering(n_clusters=2))
    #, ('clf', GaussianMixture(n_components=2))
])
model
# fit the model
#model.fit(X_unsup)
#yhat = model.predict(X_unsup)
yhat = model.fit_predict(X_unsup)
# retrieve unique clusters
clusters = unique(yhat)
# create scatter plot for samples from each cluster
for cluster in clusters:
    print(cluster)
    # get row indexes for samples with this cluster
    row_ix = where(yhat == cluster)
    print(row_ix)
    # create scatter of these samples
    #pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
    pyplot.scatter(XA[row_ix, 0], XA[row_ix, 1])
# show the plot
pyplot.show()
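# Gaussian mixture model (listed in the header, but only commented out in the
# pipeline above). A minimal sketch run directly on X_unsup; n_components=2
# mirrors the two-cluster setting used above.
gmm = GaussianMixture(n_components=2, random_state=42)
gmm_labels = gmm.fit_predict(np.array(X_unsup))
print('GMM cluster sizes:', pd.Series(gmm_labels).value_counts().to_dict())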
#%%Example 5:https://github.com/AntonsRuberts/datascience_marketing/blob/master/KMeans_vs_KPrototypes.ipynb
import os
import json
import numpy as np
import pandas as pd
#from pandas.io.json import json_normalize  # unused; removed in newer pandas (use pd.json_normalize)
from datetime import datetime
from tqdm import tqdm
from sklearn.preprocessing import PowerTransformer
import umap
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score
from kmodes.kprototypes import KPrototypes
import lightgbm
from lightgbm import LGBMClassifier
import shap
from IPython.display import display  # for the display() calls below when run as a plain script
full_data = pd.read_csv('/home/tanu/git/ML_AI_training/ml_data/ga_customers.csv')
full_data.head()
#Preprocessing numerical
numerical = full_data.select_dtypes(exclude='object')
for c in numerical.columns:
    print(c)
    pt = PowerTransformer()
    numerical.loc[:, c] = pt.fit_transform(np.array(numerical[c]).reshape(-1, 1))
##preprocessing categorical
categorical = full_data.select_dtypes(include='object')
categorical.head()
for col in categorical.columns:
    #print('-' * 40 + col + '-' * 40 , end=' - ')
    display(categorical[col].value_counts().head(10))
categorical = pd.get_dummies(categorical)
categorical_weight = len(full_data.select_dtypes(include='object').columns) / full_data.shape[1]
#Embedding numerical & categorical
# fit1 = PCA(n_components=2).fit(numerical)
# fit2 = PCA(n_components=2).fit(categorical)
fit1 = umap.UMAP(metric='l2').fit(numerical)
fit2 = umap.UMAP(metric='dice').fit(categorical)
intersection = umap.umap_.general_simplicial_set_intersection(fit1.graph_, fit2.graph_, weight=categorical_weight)
intersection = umap.umap_.reset_local_connectivity(intersection)
#https://github.com/lmcinnes/umap/issues/561
embedding = umap.umap_.simplicial_set_embedding(fit1._raw_data
, intersection
, fit1.n_components
, fit1._initial_alpha
, fit1._a
, fit1._b
, fit1.repulsion_strength
, fit1.negative_sample_rate
, 200
, 'random'
, np.random
, fit1.metric
, fit1._metric_kwds
, False
, densmap_kwds = {}
, output_dens = False)
plt.figure(figsize=(20, 10))
plt.scatter(*np.array(embedding)[0].T, s=2, cmap='Spectral', alpha=1.0)
plt.show()
#One-Hot-Encoding
data = pd.get_dummies(full_data)
#Pre-processing
for c in data.columns:
    pt = PowerTransformer()
    data.loc[:, c] = pt.fit_transform(np.array(data[c]).reshape(-1, 1))
#Actual Clustering
kmeans = KMeans(n_clusters=15).fit(data)
kmeans_labels = kmeans.labels_
#OPTIONAL: Elbow plot with inertia
#Elbow method to choose the optimal number of clusters
# sse = {}
# for k in tqdm(range(2, 50)):
# kmeans = KMeans(n_clusters=k, max_iter=1000).fit(data)
# sse[k] = kmeans.inertia_ # Inertia: Sum of distances of samples to their closest cluster center
# fig = go.Figure(data=go.Scatter(x=list(sse.keys()), y=list(sse.values())))
# fig.show()
# K-Prototypes
kprot_data = full_data.copy()
#Pre-processing
for c in full_data.select_dtypes(exclude='object').columns:
    pt = PowerTransformer()
    kprot_data[c] = pt.fit_transform(np.array(kprot_data[c]).reshape(-1, 1))
#categorical_columns = [0, 4, 5, 7, 11] #make sure to specify correct indices
categorical_columns = [1, 5, 6, 8, 12] #make sure to specify correct indices
#Actual clustering
kproto = KPrototypes(n_clusters= 15, init='Cao', n_jobs = 4)
clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)
#Prints the count of each cluster group
pd.Series(clusters).value_counts()
#OPTIONAL: Elbow plot with cost (will take a LONG time)
costs = []
n_clusters = []
clusters_assigned = []
for i in tqdm(range(2, 25)):
#for i in tqdm(range(2, 10)):
    try:
        kproto = KPrototypes(n_clusters=i, init='Cao', verbose=2, n_jobs=10)
        clusters_i = kproto.fit_predict(kprot_data, categorical=categorical_columns)  # loop-local so the 15-cluster labels above are kept
        costs.append(kproto.cost_)
        n_clusters.append(i)
        clusters_assigned.append(clusters_i)
    except Exception:
        print(f"Can't cluster with {i} clusters")
fig = go.Figure(data=go.Scatter(x=n_clusters, y=costs ))
fig.show()
# Visual Evaluation: Kmeans
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
#scatter = ax.scatter(embedding[:, 0], embedding[:, 1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)
scatter = ax.scatter(embedding[0][:,0], embedding[0][:,1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)
# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=15),
loc="lower left", title="Classes")
ax.add_artist(legend1)
# Visual Evaluation: K-Prototypes
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
scatter = ax.scatter(embedding[0][:,0], embedding[0][:,1], s=2, c=clusters, cmap='tab20b', alpha=1.0)
# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=15),
loc="lower left", title="Classes")
ax.add_artist(legend1)
# Evaluation by Classification
#Setting the objects to category
lgbm_data = full_data.copy()
for c in lgbm_data.select_dtypes(include='object'):
    lgbm_data[c] = lgbm_data[c].astype('category')
#KMeans clusters
clf_km = LGBMClassifier(colsample_bytree=0.8)
cv_scores_km = cross_val_score(clf_km, lgbm_data, kmeans_labels, scoring='f1_weighted')
print(f'CV F1 score for K-Means clusters is {np.mean(cv_scores_km)}')
#Fit the model
clf_km.fit(lgbm_data, kmeans_labels)
#SHAP values
explainer_km = shap.TreeExplainer(clf_km)
shap_values_km = explainer_km.shap_values(lgbm_data)
shap.summary_plot(shap_values_km, lgbm_data, plot_type="bar", plot_size=(15, 10))
#K-Prototypes
clf_kp = LGBMClassifier(colsample_bytree=0.8)
#cv_scores_kp = cross_val_score(clf_kp, lgbm_data, proto_clusters, scoring='f1_weighted')
cv_scores_kp = cross_val_score(clf_kp, lgbm_data, clusters, scoring='f1_weighted')
print(f'CV F1 score for K-Prototypes clusters is {np.mean(cv_scores_kp)}')
#clf_kp.fit(lgbm_data, proto_clusters)
clf_kp.fit(lgbm_data, clusters)
explainer_kp = shap.TreeExplainer(clf_kp)
shap_values_kp = explainer_kp.shap_values(lgbm_data)
shap.summary_plot(shap_values_kp, lgbm_data, plot_type="bar", plot_size=(15, 10))
#%%Example 5: My data
# NOTE: all_df and cat_df are assumed to be defined already (loaded elsewhere)
# clusters / proto_clusters below both hold the same K-Prototypes labels
full_data = all_df.copy()
full_data.head()
full_data.shape
#Preprocessing numerical
numerical = full_data.select_dtypes(exclude='object')
numerical.shape
for c in numerical.columns:
    print(c)
    pt = PowerTransformer()
    numerical.loc[:, c] = pt.fit_transform(np.array(numerical[c]).reshape(-1, 1))
##preprocessing categorical
categorical = full_data.select_dtypes(include='object')
categorical.head()
for col in categorical.columns:
    #print('-' * 40 + col + '-' * 40 , end=' - ')
    display(categorical[col].value_counts().head(10))
categorical = pd.get_dummies(categorical)
categorical_weight = len(full_data.select_dtypes(include='object').columns) / full_data.shape[1]
#Embedding numerical & categorical
# fit1 = PCA(n_components=2).fit(numerical)
# fit2 = PCA(n_components=2).fit(categorical)
fit1 = umap.UMAP(metric='l2').fit(numerical)
fit2 = umap.UMAP(metric='dice').fit(categorical)
intersection = umap.umap_.general_simplicial_set_intersection(fit1.graph_, fit2.graph_, weight=categorical_weight)
intersection = umap.umap_.reset_local_connectivity(intersection)
#https://github.com/lmcinnes/umap/issues/561
embedding = umap.umap_.simplicial_set_embedding(fit1._raw_data
, intersection
, fit1.n_components
, fit1._initial_alpha
, fit1._a
, fit1._b
, fit1.repulsion_strength
, fit1.negative_sample_rate
, 200
, 'random'
, np.random
, fit1.metric
, fit1._metric_kwds
, False
, densmap_kwds = {}
, output_dens = False)
plt.figure(figsize=(20, 10))
plt.scatter(*np.array(embedding)[0].T, s=2, cmap='Spectral', alpha=1.0)
plt.show()
#One-Hot-Encoding
data = pd.get_dummies(full_data)
#Pre-processing
for c in data.columns:
    pt = PowerTransformer()
    data.loc[:, c] = pt.fit_transform(np.array(data[c]).reshape(-1, 1))
#Actual Clustering
kmeans = KMeans(n_clusters=2).fit(data)
kmeans_labels = kmeans.labels_
#OPTIONAL: Elbow plot with inertia
#Elbow method to choose the optimal number of clusters
# sse = {}
# for k in tqdm(range(2, 50)):
# kmeans = KMeans(n_clusters=k, max_iter=1000).fit(data)
# sse[k] = kmeans.inertia_ # Inertia: Sum of distances of samples to their closest cluster center
# fig = go.Figure(data=go.Scatter(x=list(sse.keys()), y=list(sse.values())))
# fig.show()
# K-Prototypes
kprot_data = full_data.copy()
#Pre-processing
for c in full_data.select_dtypes(exclude='object').columns:
    pt = PowerTransformer()
    kprot_data[c] = pt.fit_transform(np.array(kprot_data[c]).reshape(-1, 1))
#categorical_columns = [0, 4, 5, 7, 11] #make sure to specify correct indices
#categorical_columns = [1, 5, 6, 8, 12] #make sure to specify correct indices
categorical_columns = []
for col in cat_df.columns:
    print(col)
    #categ_i += all_df.columns.get_loc(col)
    categorical_columns.append(full_data.columns.get_loc(col))
print(categorical_columns)
print(len(categorical_columns))
#Actual clustering
kproto = KPrototypes(n_clusters=2, init='Cao', n_jobs=10)
clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)
proto_clusters = clusters  # same labels; avoids a second, possibly different, fit
#Prints the count of each cluster group
pd.Series(clusters).value_counts()
#OPTIONAL: Elbow plot with cost (will take a LONG time)
costs = []
n_clusters = []
clusters_assigned = []
for i in tqdm(range(2, 25)):
#for i in tqdm(range(2, 10)):
    #print(i)
    try:
        kproto = KPrototypes(n_clusters=i, init='Cao', verbose=2, n_jobs=10)
        clusters_i = kproto.fit_predict(kprot_data, categorical=categorical_columns)  # loop-local so the 2-cluster labels above are kept
        costs.append(kproto.cost_)
        n_clusters.append(i)
        clusters_assigned.append(clusters_i)
    except Exception:
        print(f"Can't cluster with {i} clusters")
fig = go.Figure(data=go.Scatter(x=n_clusters, y=costs ))
fig.show()
# Visual Evaluation: Kmeans
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
#scatter = ax.scatter(embedding[:, 0], embedding[:, 1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)
scatter = ax.scatter(embedding[0][:,0], embedding[0][:,1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)
# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=15),
loc="lower left", title="Classes")
ax.add_artist(legend1)
# Visual Evaluation: K-Prototypes
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
scatter = ax.scatter(embedding[0][:,0], embedding[0][:,1], s=2, c=proto_clusters, cmap='tab20b', alpha=1.0)
# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=2), #15
loc="lower left", title="Classes")
ax.add_artist(legend1)
# Evaluation by Classification
#Setting the objects to category
lgbm_data = full_data.copy()
for c in lgbm_data.select_dtypes(include='object'):
    lgbm_data[c] = lgbm_data[c].astype('category')
#KMeans clusters
clf_km = LGBMClassifier(colsample_bytree=0.8)
cv_scores_km = cross_val_score(clf_km, lgbm_data, kmeans_labels, scoring='f1_weighted')
print(f'CV F1 score for K-Means clusters is {np.mean(cv_scores_km)}')
#Fit the model
clf_km.fit(lgbm_data, kmeans_labels)
#SHAP values
explainer_km = shap.TreeExplainer(clf_km)
shap_values_km = explainer_km.shap_values(lgbm_data)
shap.summary_plot(shap_values_km, lgbm_data, plot_type="bar", plot_size=(15, 10))
#K-Prototypes
clf_kp = LGBMClassifier(colsample_bytree=0.8)
cv_scores_kp = cross_val_score(clf_kp, lgbm_data, proto_clusters, scoring='f1_weighted')
print(f'CV F1 score for K-Prototypes clusters is {np.mean(cv_scores_kp)}')
clf_kp.fit(lgbm_data, proto_clusters)
explainer_kp = shap.TreeExplainer(clf_kp)
shap_values_kp = explainer_kp.shap_values(lgbm_data)
shap.summary_plot(shap_values_kp, lgbm_data, plot_type="bar", plot_size=(15, 10))