#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 16 16:55:06 2022

@author: tanu
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import cluster, datasets
import warnings
import sys
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, confusion_matrix, adjusted_rand_score
from sklearn.metrics import v_measure_score
# needed by the pipeline-based examples further down
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Algos: https://machinelearningmastery.com/clustering-algorithms-with-python/
# K-means
# Hierarchical clustering (HC)
# Gaussian mixture
# FP-growth
# PCA
# Mean shift
# DBSCAN

# Model assessment:
# Mutual information
# Silhouette score
# v_measure_score

# itertools.combinations()
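
# The metrics listed above (silhouette, mutual information, V-measure) are never
# combined with itertools.combinations() anywhere below, so here is a minimal
# sketch of the idea: cluster every pair of feature columns and score each pair.
# `df_num` (a numeric DataFrame) and `y_true` (true labels) are placeholders,
# not variables defined in this script.
# from itertools import combinations
# from sklearn.metrics import silhouette_score, adjusted_mutual_info_score
# for f1, f2 in combinations(df_num.columns, 2):
#     X_pair = df_num[[f1, f2]].values
#     labels = KMeans(n_clusters=2).fit_predict(X_pair)
#     print(f1, f2,
#           'silhouette: %.3f' % silhouette_score(X_pair, labels),
#           'AMI: %.3f' % adjusted_mutual_info_score(y_true, labels))
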
#%% Example 1: Kmeans, https://coderzcolumn.com/tutorials/machine-learning/unsupervised-learning-clustering-kmeans-using-scikit-learn-sklearn

# For K-means clustering, the model is that all clusters
# have equal, spherical variance.

samples, clusters = datasets.make_blobs(n_samples=250, n_features=2, centers=5, cluster_std=0.7, random_state=12345)
print('Dataset size : ', samples.shape, clusters.shape)
print('Cluster names : ', set(clusters))

with plt.style.context(('ggplot', 'seaborn')):
    plt.figure(figsize=(8,6))
    for i, c, m in zip(range(5), ['red','green','blue','orange','purple'], ['s','+','^','o','x']):
        plt.scatter(samples[clusters == i,0], samples[clusters == i,1], color=c, marker=m, s=80, alpha = 0.8, label= 'Cluster %d'%i)

    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('Visualizing Dataset')
    plt.legend(loc='best')

#kmeans = cluster.KMeans(n_clusters=2)
kmeans = cluster.KMeans(n_clusters=5)   # the data were generated with 5 centres

kmeans.fit(samples)
preds = kmeans.predict(samples)

# NB: cluster labels are arbitrary, so raw accuracy and the confusion matrix only
# make sense if the predicted labels happen to line up with the true ones;
# adjusted_rand_score is label-invariant.
print('Accuracy : %.3f'%accuracy_score(y_true = clusters, y_pred=preds))
print('Confusion Matrix : \n', confusion_matrix(y_true=clusters, y_pred=preds))
print('Adjusted Accuracy : %.3f'%adjusted_rand_score(labels_true=clusters, labels_pred=preds))
print('Cluster Centers : \n', str(kmeans.cluster_centers_))

print('Sum of squared distances of samples to their closest cluster center : %.2f'%kmeans.inertia_)
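
# A hedged sketch (not from the original tutorial): if a meaningful accuracy_score
# is wanted, map each predicted cluster to the majority true label first.
# Uses `clusters` (true labels) and `preds` (k-means labels) defined above.
mapped = np.empty_like(preds)
for k in np.unique(preds):
    mask = (preds == k)
    # majority vote of the true labels inside this predicted cluster
    mapped[mask] = np.bincount(clusters[mask]).argmax()
print('Accuracy after label alignment : %.3f' % accuracy_score(clusters, mapped))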

with plt.style.context(('ggplot', 'seaborn')):
    plt.figure(figsize=(10,6))

    plt.scatter(samples[preds == 0,0], samples[preds == 0,1], color='red', marker='s', s=80, alpha = 0.8, label= 'Cluster 0')
    plt.scatter(samples[preds == 1,0], samples[preds == 1,1], color='green', marker='^', s=80, alpha = 0.8, label= 'Cluster 1')
    plt.scatter(samples[preds == 2,0], samples[preds == 2,1], color='blue', marker='*', s=80, alpha = 0.8, label= 'Cluster 2')
    plt.scatter(samples[preds == 3,0], samples[preds == 3,1], color='orange', marker='o', s=80, alpha = 0.8, label= 'Cluster 3')
    plt.scatter(samples[preds == 4,0], samples[preds == 4,1], color='purple', marker='+', s=80, alpha = 0.8, label= 'Cluster 4')

    # join every sample to its assigned cluster centre
    for x,y in zip(samples[preds == 0,0], samples[preds == 0,1]):
        plt.plot([kmeans.cluster_centers_[0][0],x], [kmeans.cluster_centers_[0][1],y], color='red')
    for x,y in zip(samples[preds == 1,0], samples[preds == 1,1]):
        plt.plot([kmeans.cluster_centers_[1][0],x], [kmeans.cluster_centers_[1][1],y], color='green')
    for x,y in zip(samples[preds == 2,0], samples[preds == 2,1]):
        plt.plot([kmeans.cluster_centers_[2][0],x], [kmeans.cluster_centers_[2][1],y], color='blue')
    for x,y in zip(samples[preds == 3,0], samples[preds == 3,1]):
        plt.plot([kmeans.cluster_centers_[3][0],x], [kmeans.cluster_centers_[3][1],y], color='orange')
    for x,y in zip(samples[preds == 4,0], samples[preds == 4,1]):
        plt.plot([kmeans.cluster_centers_[4][0],x], [kmeans.cluster_centers_[4][1],y], color='purple')

    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('Visualizing Predictions & Cluster Centers')
    plt.legend(loc='best')

# The Elbow Method: to decide the number of clusters, i.e. 'k'
plt.figure(figsize=(8,5))
distortions = []
for i in range(1,11):
    kmeans = cluster.KMeans(n_clusters=i)
    kmeans.fit(samples)
    distortions.append(kmeans.inertia_)

print('Distortions (Sum Of Squared Distance of Samples from Closest Cluster Center) : ', distortions)

with plt.style.context(('ggplot', 'seaborn')):
    plt.plot(range(1,11), distortions)
    plt.scatter(range(1,11), distortions, color='red', marker='o', s=80)
    plt.xlabel('Number Of Clusters')
    plt.ylabel('Distortions')
    plt.title('The Elbow Method (Num of Clusters vs Distortions)')
    plt.xticks(range(1,11));

#%% Example 1: My data
X_unsup = num_df_wtgt[['ligand_affinity_change'
                       , 'duet_stability_change']]

kmeans = Pipeline([
    #('pca', PCA())
    ('pre', MinMaxScaler())
    , ('clf', KMeans(n_clusters = 2))
])

#kmeans = KMeans(n_clusters = 2)
#kmeans.fit(X_unsup)
kmeans.fit(X_unsup)
y_kmeans = kmeans.predict(X_unsup)

plt.scatter(X_unsup.loc[:, 'ligand_affinity_change']
            , X_unsup.loc[:, 'duet_stability_change']
            , c = y_kmeans
            , s = 50
            , cmap = 'viridis')

# cluster_centers_ lives on the KMeans step, not on the Pipeline, and is in the
# MinMaxScaler space; map it back to the original feature space before plotting.
centers_scaled = kmeans.named_steps['clf'].cluster_centers_
centers = kmeans.named_steps['pre'].inverse_transform(centers_scaled)
plt.scatter(centers[:, 0], centers[:, 1]
            , c = 'black'
            , s = 200
            , alpha = 0.5);
plt.show()

#%% Example 2: https://builtin.com/data-science/unsupervised-learning-python

# Loading dataset
iris_df = datasets.load_iris()

# Attributes available on the dataset
print(dir(iris_df))

# Features
print(iris_df.feature_names)

# Targets
print(iris_df.target)

# Target Names
print(iris_df.target_names)
label = {0: 'red', 1: 'blue', 2: 'green'}

# Dataset Slicing
x_axis = iris_df.data[:, 0]  # Sepal Length
y_axis = iris_df.data[:, 2]  # Petal Length

# Plotting
plt.scatter(x_axis, y_axis, c=iris_df.target)
plt.show()

# Use Kmeans: Declaring Model
model = KMeans(n_clusters = 3)

# Fitting Model
model.fit(iris_df.data)

# Predicting a single input
predicted_label = model.predict([[7.2, 3.5, 0.8, 1.6]])

# Prediction on the entire data
all_predictions = model.predict(iris_df.data)

# Printing Predictions
print(predicted_label)
print(all_predictions)

#%% Example 2: My data
X_unsup = num_df_wtgt[[#'ligand_affinity_change'
                       #, 'duet_stability_change'
                       'ddg_foldx'
                       ,'deepddg']]
y_unsup = num_df_wtgt[['mutation_class']]

# X_train, X_test, y_train, y_test = train_test_split(X_unsup
#                                                     , y_unsup
#                                                     , test_size = 0.33
#                                                     , **rs
#                                                     , shuffle = True
#                                                     , stratify = y_unsup)

#model = KMeans(n_clusters=2)
model = Pipeline([
    #('pca', PCA())
    ('pre', MinMaxScaler())
    , ('clf', KMeans(n_clusters = 2))
])

label = {0: 'blue', 1: 'red'}

model.fit(X_unsup)
#predicted_label = model.predict(X_test)   # only valid if the train_test_split above is run
all_predictions = model.predict(X_unsup)
print(all_predictions)

# Scatter of the two features actually used here, coloured by the true class
plt.scatter(X_unsup.loc[:, 'ddg_foldx']
            , X_unsup.loc[:, 'deepddg']
            , c = y_unsup.loc[:, 'mutation_class'])

from yellowbrick.cluster import KElbowVisualizer
model = KMeans()
visualizer = KElbowVisualizer(model, k=(1,12)).fit(X_unsup)
visualizer.show()

#%% Example 3: My data Vscores, https://www.geeksforgeeks.org/ml-v-measure-for-evaluating-clustering-performance/

# List of V-Measure Scores for different models
v_scores = []

# List of cluster counts to compare
N_Clusters = [2, 3]

# Building the clustering model
kmeans2 = Pipeline([
    #('pca', PCA())
    ('pre', MinMaxScaler())
    #('pre', StandardScaler())
    , ('clf', KMeans(n_clusters = 2))
])

# Training the clustering model
kmeans2.fit(X_unsup)

# Storing the predicted Clustering labels
labels2 = kmeans2.predict(X_unsup)

# Evaluating the performance
v_scores.append(v_measure_score(y_unsup.loc[:,'mutation_class'], labels2))

# Building the clustering model
kmeans3 = Pipeline([
    #('pca', PCA())
    ('pre', MinMaxScaler())
    #('pre', StandardScaler())
    , ('clf', KMeans(n_clusters = 3))
])

# Training the clustering model
kmeans3.fit(X_unsup)

# Storing the predicted Clustering labels
labels3 = kmeans3.predict(X_unsup)

# Evaluating the performance
v_scores.append(v_measure_score(y_unsup.loc[:,'mutation_class'], labels3))

# Plotting a Bar Graph to compare the models
plt.bar(N_Clusters, v_scores)
plt.xlabel('Number of Clusters')
plt.ylabel('V-Measure Score')
plt.title('Comparison of different Clustering Models')
plt.show()
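
# The two pipelines above differ only in n_clusters; a minimal loop-based sketch
# of the same comparison (my refactor, not from the GeeksforGeeks page), reusing
# X_unsup, y_unsup and N_Clusters defined above:
v_scores_loop = []
for k in N_Clusters:
    pipe = Pipeline([('pre', MinMaxScaler()), ('clf', KMeans(n_clusters=k))])
    labels_k = pipe.fit_predict(X_unsup)
    v_scores_loop.append(v_measure_score(y_unsup.loc[:, 'mutation_class'], labels_k))
print(dict(zip(N_Clusters, v_scores_loop)))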

# Score: silhouette
kmeans_kwargs = {
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 42}

from sklearn.metrics import silhouette_score
#https://realpython.com/k-means-clustering-python/
#import kneed
#from kneed import KneeLocator
scaler = StandardScaler()
X_unsup_scaled = scaler.fit_transform(X_unsup)

silhouette_coefficients = []
# Notice you start at 2 clusters for the silhouette coefficient
for k in range(2, 5):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    # kmeans = Pipeline([
    #     # ('pre', MinMaxScaler())
    #     ('pre', StandardScaler())
    #     , ('clf', KMeans(n_clusters=k, **kmeans_kwargs))
    # ])
    kmeans.fit(X_unsup_scaled)
    score = silhouette_score(X_unsup_scaled, kmeans.labels_)
    silhouette_coefficients.append(score)

plt.style.use("fivethirtyeight")
plt.plot(range(2, 5), silhouette_coefficients)
plt.xticks(range(2, 5))
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Coefficient")
plt.show()
plt.bar(range(2, 5), silhouette_coefficients)

from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons
from sklearn.metrics import adjusted_rand_score

# Instantiate k-means and dbscan algorithms
kmeans = KMeans(n_clusters=2)
dbscan = DBSCAN(eps=0.3)

# Fit the algorithms to the features
kmeans.fit(X_unsup_scaled)
dbscan.fit(X_unsup_scaled)

# Compute the silhouette scores for each algorithm
kmeans_silhouette = silhouette_score(
    X_unsup_scaled, kmeans.labels_).round(2)
dbscan_silhouette = silhouette_score(
    X_unsup_scaled, dbscan.labels_).round(2)

print(kmeans_silhouette)
print(dbscan_silhouette)

ari_kmeans = adjusted_rand_score(y_unsup.iloc[:,0], kmeans.labels_)
ari_dbscan = adjusted_rand_score(y_unsup.iloc[:,0], dbscan.labels_)

print(round(ari_kmeans, 2))
print(round(ari_dbscan, 2))

# Crescent plot (side-by-side comparison; here applied to X_unsup_scaled rather
# than to the make_moons crescents the original tutorial used)
fig, (ax1, ax2) = plt.subplots(
    1, 2, figsize=(8, 6), sharex=True, sharey=True
)

fig.suptitle("Clustering Algorithm Comparison: Crescents", fontsize=16)
fte_colors = {
    0: "#008fd5",
    1: "#fc4f30",
}

# The k-means plot
km_colors = [fte_colors[label] for label in kmeans.labels_]
ax1.scatter(X_unsup_scaled[:, 0], X_unsup_scaled[:, 1], c=km_colors)
ax1.set_title(
    f"k-means\nSilhouette: {kmeans_silhouette}", fontdict={"fontsize": 12}
)

# The dbscan plot (DBSCAN can label points as noise (-1), so fall back to grey)
db_colors = [fte_colors.get(label, "#999999") for label in dbscan.labels_]
ax2.scatter(X_unsup_scaled[:, 0], X_unsup_scaled[:, 1], c=db_colors)
ax2.set_title(
    f"DBSCAN\nSilhouette: {dbscan_silhouette}", fontdict={"fontsize": 12}
)
plt.show()
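
# make_moons is imported above but never used; a minimal sketch of the same
# k-means vs DBSCAN comparison on actual crescent-shaped data (the case the
# realpython tutorial uses to show where k-means breaks down). My adaptation,
# not verbatim from the tutorial.
X_moons, y_moons = make_moons(n_samples=250, noise=0.05, random_state=42)
X_moons_scaled = StandardScaler().fit_transform(X_moons)
km_moons = KMeans(n_clusters=2).fit(X_moons_scaled)
db_moons = DBSCAN(eps=0.3).fit(X_moons_scaled)
print('ARI on moons, k-means : %.2f' % adjusted_rand_score(y_moons, km_moons.labels_))
print('ARI on moons, DBSCAN  : %.2f' % adjusted_rand_score(y_moons, db_moons.labels_))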

#%% Example 4: Machinelearning mastery, https://machinelearningmastery.com/clustering-algorithms-with-python/
from sklearn.cluster import AffinityPropagation
from matplotlib import pyplot
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import Birch
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import OPTICS
from sklearn.cluster import SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA

# Affinity propagation takes as input measures of similarity between pairs of
# data points. Real-valued messages are exchanged between data points until a
# high-quality set of exemplars and corresponding clusters gradually emerges.
#=============
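
# AffinityPropagation is commented out of the pipeline below; a minimal
# standalone sketch on synthetic blobs (reusing make_blobs via sklearn.datasets,
# already imported at the top) to show the API. damping=0.9 is just the value
# the tutorial snippet uses, not a tuned choice.
X_ap, _ = datasets.make_blobs(n_samples=200, centers=3, random_state=1)
ap = AffinityPropagation(damping=0.9)
ap_labels = ap.fit_predict(X_ap)
print('AffinityPropagation found %d clusters' % len(unique(ap_labels)))
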
XA = np.array(X_unsup)

# define the model
#model = AffinityPropagation(damping=0.9)
model = Pipeline([
    #('pca', PCA(n_components = 2))
    ('pre', MinMaxScaler())
    #('pre', StandardScaler())
    #, ('clf', AffinityPropagation(damping=0.9))
    , ('clf', AgglomerativeClustering(n_clusters=2)) #y
    #, ('clf', Birch(threshold=0.01, n_clusters=2)) #y
    #, ('clf', DBSCAN(eps=0.30, min_samples=9)) #n
    #, ('clf', KMeans(n_clusters=2)) #y
    #, ('clf', MiniBatchKMeans(n_clusters=2))
    #, ('clf', OPTICS(eps=0.8, min_samples=10))
    #, ('clf', SpectralClustering(n_clusters=2))
    #, ('clf', GaussianMixture(n_components=2))
])
print(model)

# fit the model
#model.fit(X_unsup)
#yhat = model.predict(X_unsup)
yhat = model.fit_predict(X_unsup)

# retrieve unique clusters
clusters = unique(yhat)

# create scatter plot for samples from each cluster
# (loop variable renamed so it does not shadow sklearn's `cluster` module)
for cluster_label in clusters:
    print(cluster_label)
    # get row indexes for samples with this cluster
    row_ix = where(yhat == cluster_label)
    print(row_ix)

    # create scatter of these samples
    #pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
    pyplot.scatter(XA[row_ix, 0], XA[row_ix, 1])

# show the plot
pyplot.show()
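
# Gaussian mixture is listed in the algorithm notes at the top but never run;
# a minimal standalone sketch on the same X_unsup (my addition, assuming X_unsup
# is the all-numeric DataFrame used above):
gmm = GaussianMixture(n_components=2, random_state=0)
gmm_labels = gmm.fit_predict(MinMaxScaler().fit_transform(X_unsup))
print('GaussianMixture cluster sizes:', np.bincount(gmm_labels))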

#%% Example 5: https://github.com/AntonsRuberts/datascience_marketing/blob/master/KMeans_vs_KPrototypes.ipynb
import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from datetime import datetime
from tqdm import tqdm
from sklearn.preprocessing import PowerTransformer
import umap
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from scipy import stats
from sklearn.cluster import KMeans
from kmodes.kprototypes import KPrototypes
import lightgbm
from lightgbm import LGBMClassifier
import shap
from sklearn.model_selection import cross_val_score
from IPython.display import display   # display() is used in the loops below

full_data = pd.read_csv('/home/tanu/git/ML_AI_training/ml_data/ga_customers.csv')
full_data.head()

#Preprocessing numerical
numerical = full_data.select_dtypes(exclude='object').copy()

for c in numerical.columns:
    print(c)
    pt = PowerTransformer()
    numerical.loc[:, c] = pt.fit_transform(np.array(numerical[c]).reshape(-1, 1))

##preprocessing categorical
categorical = full_data.select_dtypes(include='object')
categorical.head()

for col in categorical.columns:
    #print('-' * 40 + col + '-' * 40 , end=' - ')
    display(categorical[col].value_counts().head(10))

categorical = pd.get_dummies(categorical)
categorical_weight = len(full_data.select_dtypes(include='object').columns) / full_data.shape[1]

#Embedding numerical & categorical
# fit1 = PCA(n_components=2).fit(numerical)
# fit2 = PCA(n_components=2).fit(categorical)
fit1 = umap.UMAP(metric='l2').fit(numerical)
fit2 = umap.UMAP(metric='dice').fit(categorical)

intersection = umap.umap_.general_simplicial_set_intersection(fit1.graph_, fit2.graph_, weight=categorical_weight)
intersection = umap.umap_.reset_local_connectivity(intersection)
# positional arguments follow https://github.com/lmcinnes/umap/issues/561
embedding = umap.umap_.simplicial_set_embedding(fit1._raw_data
                                                , intersection
                                                , fit1.n_components
                                                , fit1._initial_alpha
                                                , fit1._a
                                                , fit1._b
                                                , fit1.repulsion_strength
                                                , fit1.negative_sample_rate
                                                , 200
                                                , 'random'
                                                , np.random
                                                , fit1.metric
                                                , fit1._metric_kwds
                                                , False
                                                , densmap_kwds = {}
                                                , output_dens = False)
plt.figure(figsize=(20, 10))
plt.scatter(*np.array(embedding)[0].T, s=2, cmap='Spectral', alpha=1.0)
plt.show()

#One-Hot-Encoding
data = pd.get_dummies(full_data)

#Pre-processing
for c in data.columns:
    pt = PowerTransformer()
    data.loc[:, c] = pt.fit_transform(np.array(data[c]).reshape(-1, 1))

#Actual Clustering
kmeans = KMeans(n_clusters=15).fit(data)
kmeans_labels = kmeans.labels_

#OPTIONAL: Elbow plot with inertia
#Elbow method to choose the optimal number of clusters
# sse = {}
# for k in tqdm(range(2, 50)):
#     kmeans = KMeans(n_clusters=k, max_iter=1000).fit(data)
#     sse[k] = kmeans.inertia_ # Inertia: Sum of distances of samples to their closest cluster center

# fig = go.Figure(data=go.Scatter(x=list(sse.keys()), y=list(sse.values())))
# fig.show()

# K-Prototypes
kprot_data = full_data.copy()
#Pre-processing
for c in full_data.select_dtypes(exclude='object').columns:
    pt = PowerTransformer()
    kprot_data[c] = pt.fit_transform(np.array(kprot_data[c]).reshape(-1, 1))

#categorical_columns = [0, 4, 5, 7, 11] #make sure to specify correct indices
categorical_columns = [1, 5, 6, 8, 12] #make sure to specify correct indices
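# Hard-coded indices are fragile; the "My data" version of this example further
# down derives them with columns.get_loc(). A minimal sketch of the same idea
# here (my addition), taking every object-dtype column of full_data:
# categorical_columns = [full_data.columns.get_loc(col)
#                        for col in full_data.select_dtypes(include='object').columns]
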
#Actual clustering
kproto = KPrototypes(n_clusters= 15, init='Cao', n_jobs = 4)
clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)

#Prints the count of each cluster group
pd.Series(clusters).value_counts()

#OPTIONAL: Elbow plot with cost (will take a LONG time)
costs = []
n_clusters = []
clusters_assigned = []
for i in tqdm(range(2, 25)):
#for i in tqdm(range(2, 10)):
    try:
        # fit into loop-local names so the 15-cluster model above is not overwritten
        kproto_i = KPrototypes(n_clusters= i, init='Cao', verbose=2, n_jobs = 10)
        labels_i = kproto_i.fit_predict(kprot_data, categorical=categorical_columns)
        costs.append(kproto_i.cost_)
        n_clusters.append(i)
        clusters_assigned.append(labels_i)
    except Exception:
        print(f"Can't cluster with {i} clusters")

fig = go.Figure(data=go.Scatter(x=n_clusters, y=costs))
fig.show()

# Visual Evaluation: Kmeans
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
#scatter = ax.scatter(embedding[:, 0], embedding[:, 1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)
scatter = ax.scatter(embedding[0][:,0], embedding[0][:,1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)

# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=15),
                    loc="lower left", title="Classes")
ax.add_artist(legend1)

# Visual Evaluation: K-Prototypes
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
# colour by the K-Prototypes labels, not the K-Means ones
scatter = ax.scatter(embedding[0][:,0], embedding[0][:,1], s=2, c=clusters, cmap='tab20b', alpha=1.0)

# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=15),
                    loc="lower left", title="Classes")
ax.add_artist(legend1)

# Evaluation by Classification
#Setting the objects to category
lgbm_data = full_data.copy()
for c in lgbm_data.select_dtypes(include='object'):
    lgbm_data[c] = lgbm_data[c].astype('category')

#KMeans clusters
clf_km = LGBMClassifier(colsample_bytree=0.8)
cv_scores_km = cross_val_score(clf_km, lgbm_data, kmeans_labels, scoring='f1_weighted')
print(f'CV F1 score for K-Means clusters is {np.mean(cv_scores_km)}')

#Fit the model
clf_km.fit(lgbm_data, kmeans_labels)
#SHAP values
explainer_km = shap.TreeExplainer(clf_km)
shap_values_km = explainer_km.shap_values(lgbm_data)
shap.summary_plot(shap_values_km, lgbm_data, plot_type="bar", plot_size=(15, 10))

#K-Prototypes
clf_kp = LGBMClassifier(colsample_bytree=0.8)
#cv_scores_kp = cross_val_score(clf_kp, lgbm_data, proto_clusters, scoring='f1_weighted')
cv_scores_kp = cross_val_score(clf_kp, lgbm_data, clusters, scoring='f1_weighted')
print(f'CV F1 score for K-Prototypes clusters is {np.mean(cv_scores_kp)}')

#clf_kp.fit(lgbm_data, proto_clusters)
clf_kp.fit(lgbm_data, clusters)

explainer_kp = shap.TreeExplainer(clf_kp)
shap_values_kp = explainer_kp.shap_values(lgbm_data)
shap.summary_plot(shap_values_kp, lgbm_data, plot_type="bar", plot_size=(15, 10))

#%% Example 5: My data
# Note: `clusters` and `proto_clusters` below hold the same K-Prototypes labels
# (the earlier FIXME about which one to use is resolved by fitting once and
# keeping both names).

full_data = all_df.copy()
full_data.head()
full_data.shape

#Preprocessing numerical
numerical = full_data.select_dtypes(exclude='object').copy()
numerical.shape

for c in numerical.columns:
    print(c)
    pt = PowerTransformer()
    numerical.loc[:, c] = pt.fit_transform(np.array(numerical[c]).reshape(-1, 1))

##preprocessing categorical
categorical = full_data.select_dtypes(include='object')
categorical.head()

for col in categorical.columns:
    #print('-' * 40 + col + '-' * 40 , end=' - ')
    display(categorical[col].value_counts().head(10))

categorical = pd.get_dummies(categorical)
categorical_weight = len(full_data.select_dtypes(include='object').columns) / full_data.shape[1]

#Embedding numerical & categorical
# fit1 = PCA(n_components=2).fit(numerical)
# fit2 = PCA(n_components=2).fit(categorical)
fit1 = umap.UMAP(metric='l2').fit(numerical)
fit2 = umap.UMAP(metric='dice').fit(categorical)

intersection = umap.umap_.general_simplicial_set_intersection(fit1.graph_, fit2.graph_, weight=categorical_weight)
intersection = umap.umap_.reset_local_connectivity(intersection)
# positional arguments follow https://github.com/lmcinnes/umap/issues/561
embedding = umap.umap_.simplicial_set_embedding(fit1._raw_data
                                                , intersection
                                                , fit1.n_components
                                                , fit1._initial_alpha
                                                , fit1._a
                                                , fit1._b
                                                , fit1.repulsion_strength
                                                , fit1.negative_sample_rate
                                                , 200
                                                , 'random'
                                                , np.random
                                                , fit1.metric
                                                , fit1._metric_kwds
                                                , False
                                                , densmap_kwds = {}
                                                , output_dens = False)
plt.figure(figsize=(20, 10))
plt.scatter(*np.array(embedding)[0].T, s=2, cmap='Spectral', alpha=1.0)
plt.show()

#One-Hot-Encoding
data = pd.get_dummies(full_data)

#Pre-processing
for c in data.columns:
    pt = PowerTransformer()
    data.loc[:, c] = pt.fit_transform(np.array(data[c]).reshape(-1, 1))

#Actual Clustering
kmeans = KMeans(n_clusters=2).fit(data)
kmeans_labels = kmeans.labels_

#OPTIONAL: Elbow plot with inertia
#Elbow method to choose the optimal number of clusters
# sse = {}
# for k in tqdm(range(2, 50)):
#     kmeans = KMeans(n_clusters=k, max_iter=1000).fit(data)
#     sse[k] = kmeans.inertia_ # Inertia: Sum of distances of samples to their closest cluster center

# fig = go.Figure(data=go.Scatter(x=list(sse.keys()), y=list(sse.values())))
# fig.show()

# K-Prototypes
kprot_data = full_data.copy()
#Pre-processing
for c in full_data.select_dtypes(exclude='object').columns:
    pt = PowerTransformer()
    kprot_data[c] = pt.fit_transform(np.array(kprot_data[c]).reshape(-1, 1))

#categorical_columns = [0, 4, 5, 7, 11] #make sure to specify correct indices
#categorical_columns = [1, 5, 6, 8, 12] #make sure to specify correct indices
# derive the categorical column indices from cat_df instead of hard-coding them
categorical_columns = []
for col in cat_df.columns:
    print(col)
    #categ_i += all_df.columns.get_loc(col)
    categorical_columns.append(full_data.columns.get_loc(col))

print(categorical_columns)
print(len(categorical_columns))

#Actual clustering
kproto = KPrototypes(n_clusters= 2, init='Cao', n_jobs = 10)
# fit once; keep both names since both are used below (resolves the FIXME above)
proto_clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)
clusters = proto_clusters

#Prints the count of each cluster group
pd.Series(clusters).value_counts()

#OPTIONAL: Elbow plot with cost (will take a LONG time)
costs = []
n_clusters = []
clusters_assigned = []
for i in tqdm(range(2, 25)):
#for i in tqdm(range(2, 10)):
    #print(i)
    try:
        # fit into loop-local names so the 2-cluster model above is not overwritten
        kproto_i = KPrototypes(n_clusters= i, init='Cao', verbose=2, n_jobs = 10)
        labels_i = kproto_i.fit_predict(kprot_data, categorical=categorical_columns)
        costs.append(kproto_i.cost_)
        n_clusters.append(i)
        clusters_assigned.append(labels_i)
    except Exception:
        print(f"Can't cluster with {i} clusters")

fig = go.Figure(data=go.Scatter(x=n_clusters, y=costs))
fig.show()

# Visual Evaluation: Kmeans
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
#scatter = ax.scatter(embedding[:, 0], embedding[:, 1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)
scatter = ax.scatter(embedding[0][:,0], embedding[0][:,1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)

# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=2), #15
                    loc="lower left", title="Classes")
ax.add_artist(legend1)

# Visual Evaluation: K-Prototypes
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
# colour by the K-Prototypes labels, not the K-Means ones
scatter = ax.scatter(embedding[0][:,0], embedding[0][:,1], s=2, c=proto_clusters, cmap='tab20b', alpha=1.0)

# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=2), #15
                    loc="lower left", title="Classes")
ax.add_artist(legend1)

# Evaluation by Classification
#Setting the objects to category
lgbm_data = full_data.copy()
for c in lgbm_data.select_dtypes(include='object'):
    lgbm_data[c] = lgbm_data[c].astype('category')

#KMeans clusters
clf_km = LGBMClassifier(colsample_bytree=0.8)
cv_scores_km = cross_val_score(clf_km, lgbm_data, kmeans_labels, scoring='f1_weighted')
print(f'CV F1 score for K-Means clusters is {np.mean(cv_scores_km)}')

#Fit the model
clf_km.fit(lgbm_data, kmeans_labels)
#SHAP values
explainer_km = shap.TreeExplainer(clf_km)
shap_values_km = explainer_km.shap_values(lgbm_data)
shap.summary_plot(shap_values_km, lgbm_data, plot_type="bar", plot_size=(15, 10))

#K-Prototypes (proto_clusters and clusters hold the same labels, see above)
clf_kp = LGBMClassifier(colsample_bytree=0.8)
cv_scores_kp = cross_val_score(clf_kp, lgbm_data, proto_clusters, scoring='f1_weighted')
print(f'CV F1 score for K-Prototypes clusters is {np.mean(cv_scores_kp)}')

clf_kp.fit(lgbm_data, proto_clusters)

explainer_kp = shap.TreeExplainer(clf_kp)
shap_values_kp = explainer_kp.shap_values(lgbm_data)
shap.summary_plot(shap_values_kp, lgbm_data, plot_type="bar", plot_size=(15, 10))