added tutorial examples and my-data walkthrough examples in unsup_v1.py
This commit is contained in:
parent
ad5ebad7f8
commit
89a0c3a58a
4 changed files with 1123 additions and 0 deletions
778 unsup_v1.py Normal file
@@ -0,0 +1,778 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 16 16:55:06 2022

@author: tanu
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import cluster, datasets
import warnings
import sys
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix, adjusted_rand_score
from sklearn.metrics import v_measure_score

# Algorithms: https://machinelearningmastery.com/clustering-algorithms-with-python/
# K-means
# Hierarchical clustering (HC)
# Gaussian mixture model
# FP-growth
# PCA
# Mean shift
# DBSCAN

# Model assessment:
# Mutual information
# Silhouette score
# v_measure_score

# itertools.combinations()

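# A minimal sketch of how the assessment metrics listed above are called in
# scikit-learn; the labels and features below are made up purely to show the
# call signatures, they are not part of the analysis.
from sklearn.metrics import adjusted_mutual_info_score, silhouette_score

_true = [0, 0, 1, 1, 2, 2]
_pred = [0, 0, 1, 2, 2, 2]
_X    = [[0, 1], [0, 2], [5, 5], [5, 6], [9, 0], [9, 1]]
print('AMI        :', adjusted_mutual_info_score(_true, _pred))
print('V-measure  :', v_measure_score(_true, _pred))
print('Silhouette :', silhouette_score(_X, _pred))  # needs features + predicted labels, not true labels
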
#%% Example 1: KMeans, https://coderzcolumn.com/tutorials/machine-learning/unsupervised-learning-clustering-kmeans-using-scikit-learn-sklearn

# For K-means clustering, the model assumes that all clusters
# have equal, spherical variance.

samples, clusters = datasets.make_blobs(n_samples=250, n_features=2, centers=5, cluster_std=0.7, random_state=12345)
print('Dataset size : ', samples.shape, clusters.shape)
print('Cluster names : ', set(clusters))

with plt.style.context(('ggplot', 'seaborn')):
    plt.figure(figsize=(8, 6))
    for i, c, m in zip(range(5), ['red', 'green', 'blue', 'orange', 'purple'], ['s', '+', '^', 'o', 'x']):
        plt.scatter(samples[clusters == i, 0], samples[clusters == i, 1], color=c, marker=m, s=80, alpha=0.8, label='Cluster %d' % i)

    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('Visualizing Dataset')
    plt.legend(loc='best')

# n_clusters=5 matches the generated blobs and the 5-cluster plot below;
# with n_clusters=2 the cluster_centers_[2..4] lookups further down would fail
kmeans = cluster.KMeans(n_clusters=5)
#kmeans = cluster.KMeans(n_clusters=2)

kmeans.fit(samples)
preds = kmeans.predict(samples)

# Note: raw accuracy is only meaningful if the arbitrary cluster ids happen to
# match the true ids; the adjusted Rand index below is label-invariant.
print('Accuracy : %.3f' % accuracy_score(y_true=clusters, y_pred=preds))
print('Confusion Matrix : \n', confusion_matrix(y_true=clusters, y_pred=preds))
print('Adjusted Rand Index : %.3f' % adjusted_rand_score(labels_true=clusters, labels_pred=preds))
print('Cluster Centers : \n', str(kmeans.cluster_centers_))

print('Sum of squared distances of samples to their closest cluster center : %.2f' % kmeans.inertia_)

with plt.style.context(('ggplot', 'seaborn')):
    plt.figure(figsize=(10, 6))

    plt.scatter(samples[preds == 0, 0], samples[preds == 0, 1], color='red', marker='s', s=80, alpha=0.8, label='Cluster 0')
    plt.scatter(samples[preds == 1, 0], samples[preds == 1, 1], color='green', marker='^', s=80, alpha=0.8, label='Cluster 1')
    plt.scatter(samples[preds == 2, 0], samples[preds == 2, 1], color='blue', marker='*', s=80, alpha=0.8, label='Cluster 2')
    plt.scatter(samples[preds == 3, 0], samples[preds == 3, 1], color='orange', marker='o', s=80, alpha=0.8, label='Cluster 3')
    plt.scatter(samples[preds == 4, 0], samples[preds == 4, 1], color='purple', marker='+', s=80, alpha=0.8, label='Cluster 4')

    # Draw a line from each sample to its cluster centre
    for x, y in zip(samples[preds == 0, 0], samples[preds == 0, 1]):
        plt.plot([kmeans.cluster_centers_[0][0], x], [kmeans.cluster_centers_[0][1], y], color='red')
    for x, y in zip(samples[preds == 1, 0], samples[preds == 1, 1]):
        plt.plot([kmeans.cluster_centers_[1][0], x], [kmeans.cluster_centers_[1][1], y], color='green')
    for x, y in zip(samples[preds == 2, 0], samples[preds == 2, 1]):
        plt.plot([kmeans.cluster_centers_[2][0], x], [kmeans.cluster_centers_[2][1], y], color='blue')
    for x, y in zip(samples[preds == 3, 0], samples[preds == 3, 1]):
        plt.plot([kmeans.cluster_centers_[3][0], x], [kmeans.cluster_centers_[3][1], y], color='orange')
    for x, y in zip(samples[preds == 4, 0], samples[preds == 4, 1]):
        plt.plot([kmeans.cluster_centers_[4][0], x], [kmeans.cluster_centers_[4][1], y], color='purple')

    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('Visualizing Predictions & Cluster Centers')
    plt.legend(loc='best')

# The elbow method: to decide the number of clusters, i.e. 'k'
plt.figure(figsize=(8, 5))
distortions = []
for i in range(1, 11):
    kmeans = cluster.KMeans(n_clusters=i)
    kmeans.fit(samples)
    distortions.append(kmeans.inertia_)

print('Distortions (sum of squared distances of samples to their closest cluster center) : ', distortions)

with plt.style.context(('ggplot', 'seaborn')):
    plt.plot(range(1, 11), distortions)
    plt.scatter(range(1, 11), distortions, color='red', marker='o', s=80)
    plt.xlabel('Number Of Clusters')
    plt.ylabel('Distortions')
    plt.title('The Elbow Method (Num of Clusters vs Distortions)')
    plt.xticks(range(1, 11));

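# The elbow above is read off the plot by eye. A minimal sketch, assuming the
# optional kneed package (referenced further down in this file) is installed,
# that picks the elbow programmatically from the distortions computed above:
from kneed import KneeLocator
kl = KneeLocator(list(range(1, 11)), distortions, curve='convex', direction='decreasing')
print('Elbow at k =', kl.elbow)
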
#%% Example 1: My data
# num_df_wtgt is the numeric feature dataframe (with target) prepared elsewhere
X_unsup = num_df_wtgt[['ligand_affinity_change'
                       , 'duet_stability_change']]

kmeans = Pipeline([
    #('pca', PCA())
    ('pre', MinMaxScaler())
    , ('clf', KMeans(n_clusters = 2))
])

#kmeans = KMeans(n_clusters = 2)
#kmeans.fit(X_unsup)
kmeans.fit(X_unsup)
y_kmeans = kmeans.predict(X_unsup)

plt.scatter(X_unsup.loc[:, 'ligand_affinity_change']
            , X_unsup.loc[:, 'duet_stability_change']
            , c = y_kmeans
            , s = 50
            , cmap = 'viridis')

# cluster_centers_ lives on the KMeans step of the pipeline and is in the
# scaled space, so map it back to the original feature space before plotting
centers_scaled = kmeans.named_steps['clf'].cluster_centers_
centers = kmeans.named_steps['pre'].inverse_transform(centers_scaled)
plt.scatter(centers[:, 0], centers[:, 1]
            , c = 'black'
            , s = 200
            , alpha = 0.5);
plt.show()

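# A small sketch evaluating the 2-cluster assignment against the known
# 'mutation_class' column that num_df_wtgt is assumed to carry (it is used
# this way in the later examples in this file); ARI is label-invariant.
print('ARI vs mutation_class : %.3f'
      % adjusted_rand_score(num_df_wtgt['mutation_class'], y_kmeans))
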
#%% Example 2: https://builtin.com/data-science/unsupervised-learning-python

# Loading dataset
iris_df = datasets.load_iris()

# Available methods on dataset
print(dir(iris_df))

# Features
print(iris_df.feature_names)

# Targets
print(iris_df.target)

# Target Names
print(iris_df.target_names)
label = {0: 'red', 1: 'blue', 2: 'green'}

# Dataset Slicing
x_axis = iris_df.data[:, 0]  # Sepal Length
y_axis = iris_df.data[:, 2]  # Petal Length (column 2 of the iris data is petal length)

# Plotting
plt.scatter(x_axis, y_axis, c=iris_df.target)
plt.show()

# Use KMeans: Declaring Model
model = KMeans(n_clusters = 3)

# Fitting Model
model.fit(iris_df.data)

# Predicting a single input
predicted_label = model.predict([[7.2, 3.5, 0.8, 1.6]])

# Prediction on the entire data
all_predictions = model.predict(iris_df.data)

# Printing Predictions
print(predicted_label)
print(all_predictions)

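# KMeans cluster ids are arbitrary, so they cannot be compared to the iris
# targets directly. A minimal sketch that maps each cluster to its most
# frequent true class before scoring:
mapped = np.zeros_like(all_predictions)
for cl in np.unique(all_predictions):
    mask = all_predictions == cl
    mapped[mask] = np.bincount(iris_df.target[mask]).argmax()
print('Accuracy after label mapping : %.3f' % accuracy_score(iris_df.target, mapped))
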
#%% Example 2: My data
X_unsup = num_df_wtgt[[#'ligand_affinity_change'
                       #, 'duet_stability_change'
                       'ddg_foldx'
                       , 'deepddg']]
y_unsup = num_df_wtgt[['mutation_class']]

# X_train, X_test, y_train, y_test = train_test_split(X_unsup
#                                                     , y_unsup
#                                                     , test_size = 0.33
#                                                     , **rs
#                                                     , shuffle = True
#                                                     , stratify = y_unsup)

#model = KMeans(n_clusters=2)
model = Pipeline([
    #('pca', PCA())
    ('pre', MinMaxScaler())
    , ('clf', KMeans(n_clusters = 2))
])

label = {0: 'blue', 1: 'red'}

model.fit(X_unsup)
# predicted_label = model.predict(X_test)  # X_test is undefined while the split above is commented out
all_predictions = model.predict(X_unsup)
# print(predicted_label)
print(all_predictions)

# plot the active feature pair (ddg_foldx / deepddg); the earlier
# ligand/duet pair is commented out in X_unsup above
plt.scatter(X_unsup.loc[:, 'ddg_foldx']
            , X_unsup.loc[:, 'deepddg']
            , c = y_unsup.loc[:, 'mutation_class'])

from yellowbrick.cluster import KElbowVisualizer
model = KMeans()
visualizer = KElbowVisualizer(model, k=(1, 12)).fit(X_unsup)
visualizer.show()

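# KElbowVisualizer defaults to the distortion score; a minimal variant sketch
# using the silhouette metric instead (k must start at 2 for silhouette):
visualizer_sil = KElbowVisualizer(KMeans(), k=(2, 12), metric='silhouette').fit(X_unsup)
visualizer_sil.show()
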
#%% Example 3: My data Vscores, https://www.geeksforgeeks.org/ml-v-measure-for-evaluating-clustering-performance/

# List of V-Measure Scores for different models
v_scores = []

# Numbers of clusters to compare
N_Clusters = [2, 3]

# Building the clustering model
kmeans2 = Pipeline([
    #('pca', PCA())
    ('pre', MinMaxScaler())
    #, ('pre', StandardScaler())
    , ('clf', KMeans(n_clusters = 2))
])

# Training the clustering model
kmeans2.fit(X_unsup)

# Storing the predicted Clustering labels
labels2 = kmeans2.predict(X_unsup)

# Evaluating the performance
v_scores.append(v_measure_score(y_unsup.loc[:, 'mutation_class'], labels2))

# Building the clustering model
kmeans3 = Pipeline([
    #('pca', PCA())
    ('pre', MinMaxScaler())
    #, ('pre', StandardScaler())
    , ('clf', KMeans(n_clusters = 3))
])

# Training the clustering model
kmeans3.fit(X_unsup)

# Storing the predicted Clustering labels
labels3 = kmeans3.predict(X_unsup)

# Evaluating the performance
v_scores.append(v_measure_score(y_unsup.loc[:, 'mutation_class'], labels3))

# Plotting a Bar Graph to compare the models
plt.bar(N_Clusters, v_scores)
plt.xlabel('Number of Clusters')
plt.ylabel('V-Measure Score')
plt.title('Comparison of different Clustering Models')
plt.show()

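# The two blocks above differ only in n_clusters; a compact sketch of the same
# comparison written as a loop over N_Clusters (same X_unsup / y_unsup as above):
v_scores_loop = []
for k in N_Clusters:
    pipe_k = Pipeline([('pre', MinMaxScaler()), ('clf', KMeans(n_clusters=k))])
    labels_k = pipe_k.fit(X_unsup).predict(X_unsup)
    v_scores_loop.append(v_measure_score(y_unsup.loc[:, 'mutation_class'], labels_k))
print(dict(zip(N_Clusters, v_scores_loop)))
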
# Score: silhouette
kmeans_kwargs = {
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 42}

from sklearn.metrics import silhouette_score
# https://realpython.com/k-means-clustering-python/
#import kneed
#from kneed import KneeLocator
scaler = StandardScaler()
X_unsup_scaled = scaler.fit_transform(X_unsup)

silhouette_coefficients = []
# Notice you start at 2 clusters for the silhouette coefficient
for k in range(2, 5):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    # kmeans = Pipeline([
    #     # ('pre', MinMaxScaler())
    #     ('pre', StandardScaler())
    #     , ('clf', KMeans(n_clusters=k, **kmeans_kwargs))
    # ])
    kmeans.fit(X_unsup_scaled)
    score = silhouette_score(X_unsup_scaled, kmeans.labels_)
    silhouette_coefficients.append(score)

plt.style.use("fivethirtyeight")
plt.plot(range(2, 5), silhouette_coefficients)
plt.xticks(range(2, 5))
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Coefficient")
plt.show()
plt.bar(range(2, 5), silhouette_coefficients)

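# A sketch of the commented-out Pipeline variant above: with a Pipeline the
# fitted KMeans sits under named_steps, so its labels_ are read from there and
# the silhouette is computed on the same scaled features the pipeline produced.
for k in range(2, 5):
    pipe = Pipeline([('pre', StandardScaler())
                     , ('clf', KMeans(n_clusters=k, **kmeans_kwargs))])
    pipe.fit(X_unsup)
    X_scaled_k = pipe.named_steps['pre'].transform(X_unsup)
    print(k, silhouette_score(X_scaled_k, pipe.named_steps['clf'].labels_))
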
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons
from sklearn.metrics import adjusted_rand_score

# Instantiate k-means and dbscan algorithms
kmeans = KMeans(n_clusters=2)
dbscan = DBSCAN(eps=0.3)

# Fit the algorithms to the features
kmeans.fit(X_unsup_scaled)
dbscan.fit(X_unsup_scaled)

# Compute the silhouette scores for each algorithm
kmeans_silhouette = silhouette_score(
    X_unsup_scaled, kmeans.labels_).round(2)
dbscan_silhouette = silhouette_score(
    X_unsup_scaled, dbscan.labels_).round(2)

print(kmeans_silhouette)
print(dbscan_silhouette)

ari_kmeans = adjusted_rand_score(y_unsup.iloc[:, 0], kmeans.labels_)
ari_dbscan = adjusted_rand_score(y_unsup.iloc[:, 0], dbscan.labels_)

print(round(ari_kmeans, 2))
print(round(ari_dbscan, 2))

# Side-by-side comparison plot (from the realpython 'crescents' example,
# applied here to X_unsup_scaled)
fig, (ax1, ax2) = plt.subplots(
    1, 2, figsize=(8, 6), sharex=True, sharey=True
)

fig.suptitle("Clustering Algorithm Comparison: k-means vs DBSCAN", fontsize=16)
fte_colors = {
    0: "#008fd5",
    1: "#fc4f30",
    -1: "#6d904f",  # DBSCAN noise points (label -1) get their own colour
}

# The k-means plot
km_colors = [fte_colors[label] for label in kmeans.labels_]
ax1.scatter(X_unsup_scaled[:, 0], X_unsup_scaled[:, 1], c=km_colors)
ax1.set_title(
    f"k-means\nSilhouette: {kmeans_silhouette}", fontdict={"fontsize": 12}
)

# The dbscan plot
db_colors = [fte_colors[label] for label in dbscan.labels_]
ax2.scatter(X_unsup_scaled[:, 0], X_unsup_scaled[:, 1], c=db_colors)
ax2.set_title(
    f"DBSCAN\nSilhouette: {dbscan_silhouette}", fontdict={"fontsize": 12}
)
plt.show()

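# eps=0.3 above is arbitrary. A common heuristic sketch for choosing eps: plot
# the sorted distance to each point's k-th nearest neighbour (k ~ min_samples)
# and look for the knee; uses sklearn's NearestNeighbors.
from sklearn.neighbors import NearestNeighbors

nn = NearestNeighbors(n_neighbors=5).fit(X_unsup_scaled)
dists, _ = nn.kneighbors(X_unsup_scaled)
plt.figure(figsize=(8, 5))
plt.plot(np.sort(dists[:, -1]))
plt.xlabel('Points sorted by distance')
plt.ylabel('5th nearest neighbour distance')
plt.title('k-distance plot for choosing DBSCAN eps')
plt.show()
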
#%% Example 4: Machine Learning Mastery, https://machinelearningmastery.com/clustering-algorithms-with-python/
from sklearn.cluster import AffinityPropagation
from matplotlib import pyplot
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import Birch
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import OPTICS
from sklearn.cluster import SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA

# Affinity propagation takes as input measures of similarity between pairs of
# data points; real-valued messages are exchanged between data points until a
# high-quality set of exemplars and corresponding clusters gradually emerges.
#=============
XA = np.array(X_unsup)

# define the model (uncomment one 'clf' entry at a time;
# #y / #n marks whether it worked on this data)
#model = AffinityPropagation(damping=0.9)
model = Pipeline([
    #('pca', PCA(n_components = 2))
    ('pre', MinMaxScaler())
    #('pre', StandardScaler())
    #, ('clf', AffinityPropagation(damping=0.9))
    , ('clf', AgglomerativeClustering(n_clusters=2)) #y
    #, ('clf', Birch(threshold=0.01, n_clusters=2)) #y
    #, ('clf', DBSCAN(eps=0.30, min_samples=9)) #n
    #, ('clf', KMeans(n_clusters=2)) #y
    #, ('clf', MiniBatchKMeans(n_clusters=2))
    #, ('clf', OPTICS(eps=0.8, min_samples=10))
    #, ('clf', SpectralClustering(n_clusters=2))
    #, ('clf', GaussianMixture(n_components=2))
])
model

# fit the model; AgglomerativeClustering has no separate predict(), so use fit_predict
#model.fit(X_unsup)
#yhat = model.predict(X_unsup)
yhat = model.fit_predict(X_unsup)

# retrieve unique clusters
clusters = unique(yhat)

# create scatter plot for samples from each cluster
for cluster in clusters:
    print(cluster)
    # get row indexes for samples with this cluster
    row_ix = where(yhat == cluster)
    print(row_ix)

    # create scatter of these samples
    #pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
    pyplot.scatter(XA[row_ix, 0], XA[row_ix, 1])

# show the plot
pyplot.show()

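# A compact sketch that runs several of the algorithms commented out above in
# one loop and reports a silhouette score for each (same X_unsup as above;
# algorithms that put everything into a single cluster are skipped because the
# silhouette score is undefined there).
candidate_models = {
    'KMeans': KMeans(n_clusters=2),
    'Agglomerative': AgglomerativeClustering(n_clusters=2),
    'Birch': Birch(threshold=0.01, n_clusters=2),
    'MiniBatchKMeans': MiniBatchKMeans(n_clusters=2),
}
X_mm = MinMaxScaler().fit_transform(X_unsup)
for name, algo in candidate_models.items():
    labels = algo.fit_predict(X_mm)
    if len(set(labels)) > 1:
        print(name, ':', round(silhouette_score(X_mm, labels), 3))
    else:
        print(name, ': single cluster, silhouette undefined')
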
#%% Example 5: https://github.com/AntonsRuberts/datascience_marketing/blob/master/KMeans_vs_KPrototypes.ipynb
import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from datetime import datetime
from tqdm import tqdm
from sklearn.preprocessing import PowerTransformer
import umap
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from scipy import stats
from sklearn.cluster import KMeans
from kmodes.kprototypes import KPrototypes
import lightgbm
from lightgbm import LGBMClassifier
import shap
from sklearn.model_selection import cross_val_score

full_data = pd.read_csv('/home/tanu/git/ML_AI_training/ml_data/ga_customers.csv')
full_data.head()

# Preprocessing numerical
numerical = full_data.select_dtypes(exclude='object')

for c in numerical.columns:
    print(c)
    pt = PowerTransformer()
    numerical.loc[:, c] = pt.fit_transform(np.array(numerical[c]).reshape(-1, 1))

## Preprocessing categorical
categorical = full_data.select_dtypes(include='object')
categorical.head()

for col in categorical.columns:
    #print('-' * 40 + col + '-' * 40 , end=' - ')
    display(categorical[col].value_counts().head(10))

categorical = pd.get_dummies(categorical)
categorical_weight = len(full_data.select_dtypes(include='object').columns) / full_data.shape[1]

# Embedding numerical & categorical
# fit1 = PCA(n_components=2).fit(numerical)
# fit2 = PCA(n_components=2).fit(categorical)
fit1 = umap.UMAP(metric='l2').fit(numerical)
fit2 = umap.UMAP(metric='dice').fit(categorical)

intersection = umap.umap_.general_simplicial_set_intersection(fit1.graph_, fit2.graph_, weight=categorical_weight)
intersection = umap.umap_.reset_local_connectivity(intersection)
# https://github.com/lmcinnes/umap/issues/561
embedding = umap.umap_.simplicial_set_embedding(fit1._raw_data
                                                , intersection
                                                , fit1.n_components
                                                , fit1._initial_alpha
                                                , fit1._a
                                                , fit1._b
                                                , fit1.repulsion_strength
                                                , fit1.negative_sample_rate
                                                , 200
                                                , 'random'
                                                , np.random
                                                , fit1.metric
                                                , fit1._metric_kwds
                                                , False
                                                , densmap_kwds = {}
                                                , output_dens = False)
plt.figure(figsize=(20, 10))
plt.scatter(*np.array(embedding)[0].T, s=2, cmap='Spectral', alpha=1.0)
plt.show()

# One-Hot-Encoding
data = pd.get_dummies(full_data)

# Pre-processing
for c in data.columns:
    pt = PowerTransformer()
    data.loc[:, c] = pt.fit_transform(np.array(data[c]).reshape(-1, 1))

# Actual Clustering
kmeans = KMeans(n_clusters=15).fit(data)
kmeans_labels = kmeans.labels_

# OPTIONAL: Elbow plot with inertia
# Elbow method to choose the optimal number of clusters
# sse = {}
# for k in tqdm(range(2, 50)):
#     kmeans = KMeans(n_clusters=k, max_iter=1000).fit(data)
#     sse[k] = kmeans.inertia_  # Inertia: Sum of distances of samples to their closest cluster center

# fig = go.Figure(data=go.Scatter(x=list(sse.keys()), y=list(sse.values())))
# fig.show()

# K-Prototypes
kprot_data = full_data.copy()
# Pre-processing
for c in full_data.select_dtypes(exclude='object').columns:
    pt = PowerTransformer()
    kprot_data[c] = pt.fit_transform(np.array(kprot_data[c]).reshape(-1, 1))

#categorical_columns = [0, 4, 5, 7, 11] #make sure to specify correct indices
categorical_columns = [1, 5, 6, 8, 12] #make sure to specify correct indices

# Actual clustering
kproto = KPrototypes(n_clusters=15, init='Cao', n_jobs=4)
clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)

# Prints the count of each cluster group
pd.Series(clusters).value_counts()

# OPTIONAL: Elbow plot with cost (will take a LONG time)
# use separate names in this loop so the 15-cluster labels above are not overwritten
costs = []
n_clusters = []
clusters_assigned = []
for i in tqdm(range(2, 25)):
#for i in tqdm(range(2, 10)):
    try:
        kproto_i = KPrototypes(n_clusters=i, init='Cao', verbose=2, n_jobs=10)
        clusters_i = kproto_i.fit_predict(kprot_data, categorical=categorical_columns)
        costs.append(kproto_i.cost_)
        n_clusters.append(i)
        clusters_assigned.append(clusters_i)
    except Exception:
        print(f"Can't cluster with {i} clusters")

fig = go.Figure(data=go.Scatter(x=n_clusters, y=costs))
fig.show()

# Visual Evaluation: Kmeans
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
#scatter = ax.scatter(embedding[:, 0], embedding[:, 1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)
scatter = ax.scatter(embedding[0][:, 0], embedding[0][:, 1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)

# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=15),
                    loc="lower left", title="Classes")
ax.add_artist(legend1)

# Visual Evaluation: K-Prototypes
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
# colour by the K-Prototypes cluster labels (not kmeans_labels) for this plot
scatter = ax.scatter(embedding[0][:, 0], embedding[0][:, 1], s=2, c=clusters, cmap='tab20b', alpha=1.0)

# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=15),
                    loc="lower left", title="Classes")
ax.add_artist(legend1)

# Evaluation by Classification
# Setting the objects to category
lgbm_data = full_data.copy()
for c in lgbm_data.select_dtypes(include='object'):
    lgbm_data[c] = lgbm_data[c].astype('category')

# KMeans clusters
clf_km = LGBMClassifier(colsample_bytree=0.8)
cv_scores_km = cross_val_score(clf_km, lgbm_data, kmeans_labels, scoring='f1_weighted')
print(f'CV F1 score for K-Means clusters is {np.mean(cv_scores_km)}')

# Fit the model
clf_km.fit(lgbm_data, kmeans_labels)
# SHAP values
explainer_km = shap.TreeExplainer(clf_km)
shap_values_km = explainer_km.shap_values(lgbm_data)
shap.summary_plot(shap_values_km, lgbm_data, plot_type="bar", plot_size=(15, 10))

# K-Prototypes
clf_kp = LGBMClassifier(colsample_bytree=0.8)
#cv_scores_kp = cross_val_score(clf_kp, lgbm_data, proto_clusters, scoring='f1_weighted')
cv_scores_kp = cross_val_score(clf_kp, lgbm_data, clusters, scoring='f1_weighted')
print(f'CV F1 score for K-Prototypes clusters is {np.mean(cv_scores_kp)}')

#clf_kp.fit(lgbm_data, proto_clusters)
clf_kp.fit(lgbm_data, clusters)

explainer_kp = shap.TreeExplainer(clf_kp)
shap_values_kp = explainer_kp.shap_values(lgbm_data)
shap.summary_plot(shap_values_kp, lgbm_data, plot_type="bar", plot_size=(15, 10))

#%% Example 5: My data
# clusters vs proto_clusters (the original FIXME): fit K-Prototypes once and
# keep both names pointing at the same labels

full_data = all_df.copy()
full_data.head()
full_data.shape

# Preprocessing numerical
numerical = full_data.select_dtypes(exclude='object')
numerical.shape

for c in numerical.columns:
    print(c)
    pt = PowerTransformer()
    numerical.loc[:, c] = pt.fit_transform(np.array(numerical[c]).reshape(-1, 1))

## Preprocessing categorical
categorical = full_data.select_dtypes(include='object')
categorical.head()

for col in categorical.columns:
    #print('-' * 40 + col + '-' * 40 , end=' - ')
    display(categorical[col].value_counts().head(10))

categorical = pd.get_dummies(categorical)
categorical_weight = len(full_data.select_dtypes(include='object').columns) / full_data.shape[1]

# Embedding numerical & categorical
# fit1 = PCA(n_components=2).fit(numerical)
# fit2 = PCA(n_components=2).fit(categorical)
fit1 = umap.UMAP(metric='l2').fit(numerical)
fit2 = umap.UMAP(metric='dice').fit(categorical)

intersection = umap.umap_.general_simplicial_set_intersection(fit1.graph_, fit2.graph_, weight=categorical_weight)
intersection = umap.umap_.reset_local_connectivity(intersection)
# https://github.com/lmcinnes/umap/issues/561
embedding = umap.umap_.simplicial_set_embedding(fit1._raw_data
                                                , intersection
                                                , fit1.n_components
                                                , fit1._initial_alpha
                                                , fit1._a
                                                , fit1._b
                                                , fit1.repulsion_strength
                                                , fit1.negative_sample_rate
                                                , 200
                                                , 'random'
                                                , np.random
                                                , fit1.metric
                                                , fit1._metric_kwds
                                                , False
                                                , densmap_kwds = {}
                                                , output_dens = False)
plt.figure(figsize=(20, 10))
plt.scatter(*np.array(embedding)[0].T, s=2, cmap='Spectral', alpha=1.0)
plt.show()

# One-Hot-Encoding
data = pd.get_dummies(full_data)

# Pre-processing
for c in data.columns:
    pt = PowerTransformer()
    data.loc[:, c] = pt.fit_transform(np.array(data[c]).reshape(-1, 1))

# Actual Clustering
kmeans = KMeans(n_clusters=2).fit(data)
kmeans_labels = kmeans.labels_

# OPTIONAL: Elbow plot with inertia
# Elbow method to choose the optimal number of clusters
# sse = {}
# for k in tqdm(range(2, 50)):
#     kmeans = KMeans(n_clusters=k, max_iter=1000).fit(data)
#     sse[k] = kmeans.inertia_  # Inertia: Sum of distances of samples to their closest cluster center

# fig = go.Figure(data=go.Scatter(x=list(sse.keys()), y=list(sse.values())))
# fig.show()

# K-Prototypes
kprot_data = full_data.copy()
# Pre-processing
for c in full_data.select_dtypes(exclude='object').columns:
    pt = PowerTransformer()
    kprot_data[c] = pt.fit_transform(np.array(kprot_data[c]).reshape(-1, 1))

#categorical_columns = [0, 4, 5, 7, 11] #make sure to specify correct indices
#categorical_columns = [1, 5, 6, 8, 12] #make sure to specify correct indices
# derive the categorical column positions from cat_df instead of hard-coding them
categorical_columns = []
for col in cat_df.columns:
    print(col)
    #categ_i += all_df.columns.get_loc(col)
    categorical_columns.append(full_data.columns.get_loc(col))

print(categorical_columns)
print(len(categorical_columns))

# Actual clustering: fit once, keep one set of labels under both names
kproto = KPrototypes(n_clusters=2, init='Cao', n_jobs=10)
clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)
proto_clusters = clusters

# Prints the count of each cluster group
pd.Series(clusters).value_counts()

# OPTIONAL: Elbow plot with cost (will take a LONG time)
# use separate names in this loop so the 2-cluster labels above are not overwritten
costs = []
n_clusters = []
clusters_assigned = []
for i in tqdm(range(2, 25)):
#for i in tqdm(range(2, 10)):
    #print(i)
    try:
        kproto_i = KPrototypes(n_clusters=i, init='Cao', verbose=2, n_jobs=10)
        clusters_i = kproto_i.fit_predict(kprot_data, categorical=categorical_columns)
        costs.append(kproto_i.cost_)
        n_clusters.append(i)
        clusters_assigned.append(clusters_i)
    except Exception:
        print(f"Can't cluster with {i} clusters")

fig = go.Figure(data=go.Scatter(x=n_clusters, y=costs))
fig.show()

# Visual Evaluation: Kmeans
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
#scatter = ax.scatter(embedding[:, 0], embedding[:, 1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)
scatter = ax.scatter(embedding[0][:, 0], embedding[0][:, 1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)

# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=2), #15
                    loc="lower left", title="Classes")
ax.add_artist(legend1)

# Visual Evaluation: K-Prototypes
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
# colour by the K-Prototypes labels (not kmeans_labels) for this plot
scatter = ax.scatter(embedding[0][:, 0], embedding[0][:, 1], s=2, c=proto_clusters, cmap='tab20b', alpha=1.0)

# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=2), #15
                    loc="lower left", title="Classes")
ax.add_artist(legend1)

# Evaluation by Classification
# Setting the objects to category
lgbm_data = full_data.copy()
for c in lgbm_data.select_dtypes(include='object'):
    lgbm_data[c] = lgbm_data[c].astype('category')

# KMeans clusters
clf_km = LGBMClassifier(colsample_bytree=0.8)
cv_scores_km = cross_val_score(clf_km, lgbm_data, kmeans_labels, scoring='f1_weighted')
print(f'CV F1 score for K-Means clusters is {np.mean(cv_scores_km)}')

# Fit the model
clf_km.fit(lgbm_data, kmeans_labels)
# SHAP values
explainer_km = shap.TreeExplainer(clf_km)
shap_values_km = explainer_km.shap_values(lgbm_data)
shap.summary_plot(shap_values_km, lgbm_data, plot_type="bar", plot_size=(15, 10))

# K-Prototypes
clf_kp = LGBMClassifier(colsample_bytree=0.8)
cv_scores_kp = cross_val_score(clf_kp, lgbm_data, proto_clusters, scoring='f1_weighted')
print(f'CV F1 score for K-Prototypes clusters is {np.mean(cv_scores_kp)}')

clf_kp.fit(lgbm_data, proto_clusters)

explainer_kp = shap.TreeExplainer(clf_kp)
shap_values_kp = explainer_kp.shap_values(lgbm_data)
shap.summary_plot(shap_values_kp, lgbm_data, plot_type="bar", plot_size=(15, 10))