added tutorial examples and my-data walkthrough examples in unsup_v1.py
This commit is contained in:
parent
ad5ebad7f8
commit
89a0c3a58a
4 changed files with 1123 additions and 0 deletions
778 unsup_v1.py Normal file
@@ -0,0 +1,778 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 16 16:55:06 2022

@author: tanu
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import cluster, datasets
import warnings
import sys
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix, adjusted_rand_score
from sklearn.metrics import v_measure_score

# Algorithms: https://machinelearningmastery.com/clustering-algorithms-with-python/
# K-means
# Hierarchical clustering (HC)
# Gaussian mixture model
# FP-growth
# PCA
# Mean shift
# DBSCAN

# Model assessment:
# Mutual information
# Silhouette score
# v_measure_score

# itertools.combinations()

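# A minimal sketch of how the assessment metrics listed above are called in
# scikit-learn; the labels and features below are made up purely to show the
# call signatures, they are not part of the analysis.
from sklearn.metrics import adjusted_mutual_info_score, silhouette_score

_true = [0, 0, 1, 1, 2, 2]
_pred = [0, 0, 1, 2, 2, 2]
_X    = [[0, 1], [0, 2], [5, 5], [5, 6], [9, 0], [9, 1]]
print('AMI        :', adjusted_mutual_info_score(_true, _pred))
print('V-measure  :', v_measure_score(_true, _pred))
print('Silhouette :', silhouette_score(_X, _pred))  # needs features + predicted labels, not true labels
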
#%% Example 1: KMeans, https://coderzcolumn.com/tutorials/machine-learning/unsupervised-learning-clustering-kmeans-using-scikit-learn-sklearn

# For K-means clustering, the model assumes that all clusters
# have equal, spherical variance.

samples, clusters = datasets.make_blobs(n_samples=250, n_features=2, centers=5, cluster_std=0.7, random_state=12345)
print('Dataset size : ', samples.shape, clusters.shape)
print('Cluster names : ', set(clusters))

with plt.style.context(('ggplot', 'seaborn')):
    plt.figure(figsize=(8, 6))
    for i, c, m in zip(range(5), ['red', 'green', 'blue', 'orange', 'purple'], ['s', '+', '^', 'o', 'x']):
        plt.scatter(samples[clusters == i, 0], samples[clusters == i, 1], color=c, marker=m, s=80, alpha=0.8, label='Cluster %d' % i)

    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('Visualizing Dataset')
    plt.legend(loc='best')

# n_clusters=5 matches the generated blobs and the 5-cluster plot below;
# with n_clusters=2 the cluster_centers_[2..4] lookups further down would fail
kmeans = cluster.KMeans(n_clusters=5)
#kmeans = cluster.KMeans(n_clusters=2)

kmeans.fit(samples)
preds = kmeans.predict(samples)

# Note: raw accuracy is only meaningful if the arbitrary cluster ids happen to
# match the true ids; the adjusted Rand index below is label-invariant.
print('Accuracy : %.3f' % accuracy_score(y_true=clusters, y_pred=preds))
print('Confusion Matrix : \n', confusion_matrix(y_true=clusters, y_pred=preds))
print('Adjusted Rand Index : %.3f' % adjusted_rand_score(labels_true=clusters, labels_pred=preds))
print('Cluster Centers : \n', str(kmeans.cluster_centers_))

print('Sum of squared distances of samples to their closest cluster center : %.2f' % kmeans.inertia_)

with plt.style.context(('ggplot', 'seaborn')):
    plt.figure(figsize=(10, 6))

    plt.scatter(samples[preds == 0, 0], samples[preds == 0, 1], color='red', marker='s', s=80, alpha=0.8, label='Cluster 0')
    plt.scatter(samples[preds == 1, 0], samples[preds == 1, 1], color='green', marker='^', s=80, alpha=0.8, label='Cluster 1')
    plt.scatter(samples[preds == 2, 0], samples[preds == 2, 1], color='blue', marker='*', s=80, alpha=0.8, label='Cluster 2')
    plt.scatter(samples[preds == 3, 0], samples[preds == 3, 1], color='orange', marker='o', s=80, alpha=0.8, label='Cluster 3')
    plt.scatter(samples[preds == 4, 0], samples[preds == 4, 1], color='purple', marker='+', s=80, alpha=0.8, label='Cluster 4')

    # Draw a line from each sample to its cluster centre
    for x, y in zip(samples[preds == 0, 0], samples[preds == 0, 1]):
        plt.plot([kmeans.cluster_centers_[0][0], x], [kmeans.cluster_centers_[0][1], y], color='red')
    for x, y in zip(samples[preds == 1, 0], samples[preds == 1, 1]):
        plt.plot([kmeans.cluster_centers_[1][0], x], [kmeans.cluster_centers_[1][1], y], color='green')
    for x, y in zip(samples[preds == 2, 0], samples[preds == 2, 1]):
        plt.plot([kmeans.cluster_centers_[2][0], x], [kmeans.cluster_centers_[2][1], y], color='blue')
    for x, y in zip(samples[preds == 3, 0], samples[preds == 3, 1]):
        plt.plot([kmeans.cluster_centers_[3][0], x], [kmeans.cluster_centers_[3][1], y], color='orange')
    for x, y in zip(samples[preds == 4, 0], samples[preds == 4, 1]):
        plt.plot([kmeans.cluster_centers_[4][0], x], [kmeans.cluster_centers_[4][1], y], color='purple')

    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('Visualizing Predictions & Cluster Centers')
    plt.legend(loc='best')

# The elbow method: to decide the number of clusters, i.e. 'k'
plt.figure(figsize=(8, 5))
distortions = []
for i in range(1, 11):
    kmeans = cluster.KMeans(n_clusters=i)
    kmeans.fit(samples)
    distortions.append(kmeans.inertia_)

print('Distortions (sum of squared distances of samples to their closest cluster center) : ', distortions)

with plt.style.context(('ggplot', 'seaborn')):
    plt.plot(range(1, 11), distortions)
    plt.scatter(range(1, 11), distortions, color='red', marker='o', s=80)
    plt.xlabel('Number Of Clusters')
    plt.ylabel('Distortions')
    plt.title('The Elbow Method (Num of Clusters vs Distortions)')
    plt.xticks(range(1, 11));

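# The elbow above is read off the plot by eye. A minimal sketch, assuming the
# optional kneed package (referenced further down in this file) is installed,
# that picks the elbow programmatically from the distortions computed above:
from kneed import KneeLocator
kl = KneeLocator(list(range(1, 11)), distortions, curve='convex', direction='decreasing')
print('Elbow at k =', kl.elbow)
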
#%% Example 1: My data
# num_df_wtgt is the numeric feature dataframe (with target) prepared elsewhere
X_unsup = num_df_wtgt[['ligand_affinity_change'
                       , 'duet_stability_change']]

kmeans = Pipeline([
    #('pca', PCA())
    ('pre', MinMaxScaler())
    , ('clf', KMeans(n_clusters = 2))
])

#kmeans = KMeans(n_clusters = 2)
#kmeans.fit(X_unsup)
kmeans.fit(X_unsup)
y_kmeans = kmeans.predict(X_unsup)

plt.scatter(X_unsup.loc[:, 'ligand_affinity_change']
            , X_unsup.loc[:, 'duet_stability_change']
            , c = y_kmeans
            , s = 50
            , cmap = 'viridis')

# cluster_centers_ lives on the KMeans step of the pipeline and is in the
# scaled space, so map it back to the original feature space before plotting
centers_scaled = kmeans.named_steps['clf'].cluster_centers_
centers = kmeans.named_steps['pre'].inverse_transform(centers_scaled)
plt.scatter(centers[:, 0], centers[:, 1]
            , c = 'black'
            , s = 200
            , alpha = 0.5);
plt.show()

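# A small sketch evaluating the 2-cluster assignment against the known
# 'mutation_class' column that num_df_wtgt is assumed to carry (it is used
# this way in the later examples in this file); ARI is label-invariant.
print('ARI vs mutation_class : %.3f'
      % adjusted_rand_score(num_df_wtgt['mutation_class'], y_kmeans))
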
#%% Example 2: https://builtin.com/data-science/unsupervised-learning-python

# Loading dataset
iris_df = datasets.load_iris()

# Available methods on dataset
print(dir(iris_df))

# Features
print(iris_df.feature_names)

# Targets
print(iris_df.target)

# Target Names
print(iris_df.target_names)
label = {0: 'red', 1: 'blue', 2: 'green'}

# Dataset Slicing
x_axis = iris_df.data[:, 0]  # Sepal Length
y_axis = iris_df.data[:, 2]  # Petal Length (column 2 of the iris data is petal length)

# Plotting
plt.scatter(x_axis, y_axis, c=iris_df.target)
plt.show()

# Use KMeans: Declaring Model
model = KMeans(n_clusters = 3)

# Fitting Model
model.fit(iris_df.data)

# Predicting a single input
predicted_label = model.predict([[7.2, 3.5, 0.8, 1.6]])

# Prediction on the entire data
all_predictions = model.predict(iris_df.data)

# Printing Predictions
print(predicted_label)
print(all_predictions)

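# KMeans cluster ids are arbitrary, so they cannot be compared to the iris
# targets directly. A minimal sketch that maps each cluster to its most
# frequent true class before scoring:
mapped = np.zeros_like(all_predictions)
for cl in np.unique(all_predictions):
    mask = all_predictions == cl
    mapped[mask] = np.bincount(iris_df.target[mask]).argmax()
print('Accuracy after label mapping : %.3f' % accuracy_score(iris_df.target, mapped))
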
#%% Example 2: My data
X_unsup = num_df_wtgt[[#'ligand_affinity_change'
                       #, 'duet_stability_change'
                       'ddg_foldx'
                       , 'deepddg']]
y_unsup = num_df_wtgt[['mutation_class']]

# X_train, X_test, y_train, y_test = train_test_split(X_unsup
#                                                     , y_unsup
#                                                     , test_size = 0.33
#                                                     , **rs
#                                                     , shuffle = True
#                                                     , stratify = y_unsup)

#model = KMeans(n_clusters=2)
model = Pipeline([
    #('pca', PCA())
    ('pre', MinMaxScaler())
    , ('clf', KMeans(n_clusters = 2))
])

label = {0: 'blue', 1: 'red'}

model.fit(X_unsup)
# predicted_label = model.predict(X_test)  # X_test is undefined while the split above is commented out
all_predictions = model.predict(X_unsup)
# print(predicted_label)
print(all_predictions)

# plot the active feature pair (ddg_foldx / deepddg); the earlier
# ligand/duet pair is commented out in X_unsup above
plt.scatter(X_unsup.loc[:, 'ddg_foldx']
            , X_unsup.loc[:, 'deepddg']
            , c = y_unsup.loc[:, 'mutation_class'])

from yellowbrick.cluster import KElbowVisualizer
model = KMeans()
visualizer = KElbowVisualizer(model, k=(1, 12)).fit(X_unsup)
visualizer.show()

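# KElbowVisualizer defaults to the distortion score; a minimal variant sketch
# using the silhouette metric instead (k must start at 2 for silhouette):
visualizer_sil = KElbowVisualizer(KMeans(), k=(2, 12), metric='silhouette').fit(X_unsup)
visualizer_sil.show()
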
#%% Example 3: My data Vscores, https://www.geeksforgeeks.org/ml-v-measure-for-evaluating-clustering-performance/

# List of V-Measure Scores for different models
v_scores = []

# Numbers of clusters to compare
N_Clusters = [2, 3]

# Building the clustering model
kmeans2 = Pipeline([
    #('pca', PCA())
    ('pre', MinMaxScaler())
    #, ('pre', StandardScaler())
    , ('clf', KMeans(n_clusters = 2))
])

# Training the clustering model
kmeans2.fit(X_unsup)

# Storing the predicted Clustering labels
labels2 = kmeans2.predict(X_unsup)

# Evaluating the performance
v_scores.append(v_measure_score(y_unsup.loc[:, 'mutation_class'], labels2))

# Building the clustering model
kmeans3 = Pipeline([
    #('pca', PCA())
    ('pre', MinMaxScaler())
    #, ('pre', StandardScaler())
    , ('clf', KMeans(n_clusters = 3))
])

# Training the clustering model
kmeans3.fit(X_unsup)

# Storing the predicted Clustering labels
labels3 = kmeans3.predict(X_unsup)

# Evaluating the performance
v_scores.append(v_measure_score(y_unsup.loc[:, 'mutation_class'], labels3))

# Plotting a Bar Graph to compare the models
plt.bar(N_Clusters, v_scores)
plt.xlabel('Number of Clusters')
plt.ylabel('V-Measure Score')
plt.title('Comparison of different Clustering Models')
plt.show()

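# The two blocks above differ only in n_clusters; a compact sketch of the same
# comparison written as a loop over N_Clusters (same X_unsup / y_unsup as above):
v_scores_loop = []
for k in N_Clusters:
    pipe_k = Pipeline([('pre', MinMaxScaler()), ('clf', KMeans(n_clusters=k))])
    labels_k = pipe_k.fit(X_unsup).predict(X_unsup)
    v_scores_loop.append(v_measure_score(y_unsup.loc[:, 'mutation_class'], labels_k))
print(dict(zip(N_Clusters, v_scores_loop)))
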
# Score: silhouette
kmeans_kwargs = {
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 42}

from sklearn.metrics import silhouette_score
# https://realpython.com/k-means-clustering-python/
#import kneed
#from kneed import KneeLocator
scaler = StandardScaler()
X_unsup_scaled = scaler.fit_transform(X_unsup)

silhouette_coefficients = []
# Notice you start at 2 clusters for the silhouette coefficient
for k in range(2, 5):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    # kmeans = Pipeline([
    #     # ('pre', MinMaxScaler())
    #     ('pre', StandardScaler())
    #     , ('clf', KMeans(n_clusters=k, **kmeans_kwargs))
    # ])
    kmeans.fit(X_unsup_scaled)
    score = silhouette_score(X_unsup_scaled, kmeans.labels_)
    silhouette_coefficients.append(score)

plt.style.use("fivethirtyeight")
plt.plot(range(2, 5), silhouette_coefficients)
plt.xticks(range(2, 5))
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Coefficient")
plt.show()
plt.bar(range(2, 5), silhouette_coefficients)

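# A sketch of the commented-out Pipeline variant above: with a Pipeline the
# fitted KMeans sits under named_steps, so its labels_ are read from there and
# the silhouette is computed on the same scaled features the pipeline produced.
for k in range(2, 5):
    pipe = Pipeline([('pre', StandardScaler())
                     , ('clf', KMeans(n_clusters=k, **kmeans_kwargs))])
    pipe.fit(X_unsup)
    X_scaled_k = pipe.named_steps['pre'].transform(X_unsup)
    print(k, silhouette_score(X_scaled_k, pipe.named_steps['clf'].labels_))
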
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons
from sklearn.metrics import adjusted_rand_score

# Instantiate k-means and dbscan algorithms
kmeans = KMeans(n_clusters=2)
dbscan = DBSCAN(eps=0.3)

# Fit the algorithms to the features
kmeans.fit(X_unsup_scaled)
dbscan.fit(X_unsup_scaled)

# Compute the silhouette scores for each algorithm
kmeans_silhouette = silhouette_score(
    X_unsup_scaled, kmeans.labels_).round(2)
dbscan_silhouette = silhouette_score(
    X_unsup_scaled, dbscan.labels_).round(2)

print(kmeans_silhouette)
print(dbscan_silhouette)

ari_kmeans = adjusted_rand_score(y_unsup.iloc[:, 0], kmeans.labels_)
ari_dbscan = adjusted_rand_score(y_unsup.iloc[:, 0], dbscan.labels_)

print(round(ari_kmeans, 2))
print(round(ari_dbscan, 2))

# Side-by-side comparison plot (from the realpython 'crescents' example,
# applied here to X_unsup_scaled)
fig, (ax1, ax2) = plt.subplots(
    1, 2, figsize=(8, 6), sharex=True, sharey=True
)

fig.suptitle("Clustering Algorithm Comparison: k-means vs DBSCAN", fontsize=16)
fte_colors = {
    0: "#008fd5",
    1: "#fc4f30",
    -1: "#6d904f",  # DBSCAN noise points (label -1) get their own colour
}

# The k-means plot
km_colors = [fte_colors[label] for label in kmeans.labels_]
ax1.scatter(X_unsup_scaled[:, 0], X_unsup_scaled[:, 1], c=km_colors)
ax1.set_title(
    f"k-means\nSilhouette: {kmeans_silhouette}", fontdict={"fontsize": 12}
)

# The dbscan plot
db_colors = [fte_colors[label] for label in dbscan.labels_]
ax2.scatter(X_unsup_scaled[:, 0], X_unsup_scaled[:, 1], c=db_colors)
ax2.set_title(
    f"DBSCAN\nSilhouette: {dbscan_silhouette}", fontdict={"fontsize": 12}
)
plt.show()

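# eps=0.3 above is arbitrary. A common heuristic sketch for choosing eps: plot
# the sorted distance to each point's k-th nearest neighbour (k ~ min_samples)
# and look for the knee; uses sklearn's NearestNeighbors.
from sklearn.neighbors import NearestNeighbors

nn = NearestNeighbors(n_neighbors=5).fit(X_unsup_scaled)
dists, _ = nn.kneighbors(X_unsup_scaled)
plt.figure(figsize=(8, 5))
plt.plot(np.sort(dists[:, -1]))
plt.xlabel('Points sorted by distance')
plt.ylabel('5th nearest neighbour distance')
plt.title('k-distance plot for choosing DBSCAN eps')
plt.show()
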
#%% Example 4: Machine Learning Mastery, https://machinelearningmastery.com/clustering-algorithms-with-python/
from sklearn.cluster import AffinityPropagation
from matplotlib import pyplot
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import Birch
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import OPTICS
from sklearn.cluster import SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA

# Affinity propagation takes as input measures of similarity between pairs of
# data points; real-valued messages are exchanged between data points until a
# high-quality set of exemplars and corresponding clusters gradually emerges.
#=============
XA = np.array(X_unsup)

# define the model (uncomment one 'clf' entry at a time;
# #y / #n marks whether it worked on this data)
#model = AffinityPropagation(damping=0.9)
model = Pipeline([
    #('pca', PCA(n_components = 2))
    ('pre', MinMaxScaler())
    #('pre', StandardScaler())
    #, ('clf', AffinityPropagation(damping=0.9))
    , ('clf', AgglomerativeClustering(n_clusters=2)) #y
    #, ('clf', Birch(threshold=0.01, n_clusters=2)) #y
    #, ('clf', DBSCAN(eps=0.30, min_samples=9)) #n
    #, ('clf', KMeans(n_clusters=2)) #y
    #, ('clf', MiniBatchKMeans(n_clusters=2))
    #, ('clf', OPTICS(eps=0.8, min_samples=10))
    #, ('clf', SpectralClustering(n_clusters=2))
    #, ('clf', GaussianMixture(n_components=2))
])
model

# fit the model; AgglomerativeClustering has no separate predict(), so use fit_predict
#model.fit(X_unsup)
#yhat = model.predict(X_unsup)
yhat = model.fit_predict(X_unsup)

# retrieve unique clusters
clusters = unique(yhat)

# create scatter plot for samples from each cluster
for cluster in clusters:
    print(cluster)
    # get row indexes for samples with this cluster
    row_ix = where(yhat == cluster)
    print(row_ix)

    # create scatter of these samples
    #pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
    pyplot.scatter(XA[row_ix, 0], XA[row_ix, 1])

# show the plot
pyplot.show()

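# A compact sketch that runs several of the algorithms commented out above in
# one loop and reports a silhouette score for each (same X_unsup as above;
# algorithms that put everything into a single cluster are skipped because the
# silhouette score is undefined there).
candidate_models = {
    'KMeans': KMeans(n_clusters=2),
    'Agglomerative': AgglomerativeClustering(n_clusters=2),
    'Birch': Birch(threshold=0.01, n_clusters=2),
    'MiniBatchKMeans': MiniBatchKMeans(n_clusters=2),
}
X_mm = MinMaxScaler().fit_transform(X_unsup)
for name, algo in candidate_models.items():
    labels = algo.fit_predict(X_mm)
    if len(set(labels)) > 1:
        print(name, ':', round(silhouette_score(X_mm, labels), 3))
    else:
        print(name, ': single cluster, silhouette undefined')
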
#%% Example 5: https://github.com/AntonsRuberts/datascience_marketing/blob/master/KMeans_vs_KPrototypes.ipynb
import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from datetime import datetime
from tqdm import tqdm
from sklearn.preprocessing import PowerTransformer
import umap
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from scipy import stats
from sklearn.cluster import KMeans
from kmodes.kprototypes import KPrototypes
import lightgbm
from lightgbm import LGBMClassifier
import shap
from sklearn.model_selection import cross_val_score

full_data = pd.read_csv('/home/tanu/git/ML_AI_training/ml_data/ga_customers.csv')
full_data.head()

# Preprocessing numerical
numerical = full_data.select_dtypes(exclude='object')

for c in numerical.columns:
    print(c)
    pt = PowerTransformer()
    numerical.loc[:, c] = pt.fit_transform(np.array(numerical[c]).reshape(-1, 1))

## Preprocessing categorical
categorical = full_data.select_dtypes(include='object')
categorical.head()

for col in categorical.columns:
    #print('-' * 40 + col + '-' * 40 , end=' - ')
    display(categorical[col].value_counts().head(10))

categorical = pd.get_dummies(categorical)
categorical_weight = len(full_data.select_dtypes(include='object').columns) / full_data.shape[1]

# Embedding numerical & categorical
# fit1 = PCA(n_components=2).fit(numerical)
# fit2 = PCA(n_components=2).fit(categorical)
fit1 = umap.UMAP(metric='l2').fit(numerical)
fit2 = umap.UMAP(metric='dice').fit(categorical)

intersection = umap.umap_.general_simplicial_set_intersection(fit1.graph_, fit2.graph_, weight=categorical_weight)
intersection = umap.umap_.reset_local_connectivity(intersection)
# https://github.com/lmcinnes/umap/issues/561
embedding = umap.umap_.simplicial_set_embedding(fit1._raw_data
                                                , intersection
                                                , fit1.n_components
                                                , fit1._initial_alpha
                                                , fit1._a
                                                , fit1._b
                                                , fit1.repulsion_strength
                                                , fit1.negative_sample_rate
                                                , 200
                                                , 'random'
                                                , np.random
                                                , fit1.metric
                                                , fit1._metric_kwds
                                                , False
                                                , densmap_kwds = {}
                                                , output_dens = False)
plt.figure(figsize=(20, 10))
plt.scatter(*np.array(embedding)[0].T, s=2, cmap='Spectral', alpha=1.0)
plt.show()

# One-Hot-Encoding
data = pd.get_dummies(full_data)

# Pre-processing
for c in data.columns:
    pt = PowerTransformer()
    data.loc[:, c] = pt.fit_transform(np.array(data[c]).reshape(-1, 1))

# Actual Clustering
kmeans = KMeans(n_clusters=15).fit(data)
kmeans_labels = kmeans.labels_

# OPTIONAL: Elbow plot with inertia
# Elbow method to choose the optimal number of clusters
# sse = {}
# for k in tqdm(range(2, 50)):
#     kmeans = KMeans(n_clusters=k, max_iter=1000).fit(data)
#     sse[k] = kmeans.inertia_  # Inertia: Sum of distances of samples to their closest cluster center

# fig = go.Figure(data=go.Scatter(x=list(sse.keys()), y=list(sse.values())))
# fig.show()

# K-Prototypes
kprot_data = full_data.copy()
# Pre-processing
for c in full_data.select_dtypes(exclude='object').columns:
    pt = PowerTransformer()
    kprot_data[c] = pt.fit_transform(np.array(kprot_data[c]).reshape(-1, 1))

#categorical_columns = [0, 4, 5, 7, 11] #make sure to specify correct indices
categorical_columns = [1, 5, 6, 8, 12] #make sure to specify correct indices

# Actual clustering
kproto = KPrototypes(n_clusters=15, init='Cao', n_jobs=4)
clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)

# Prints the count of each cluster group
pd.Series(clusters).value_counts()

# OPTIONAL: Elbow plot with cost (will take a LONG time)
# use separate names in this loop so the 15-cluster labels above are not overwritten
costs = []
n_clusters = []
clusters_assigned = []
for i in tqdm(range(2, 25)):
#for i in tqdm(range(2, 10)):
    try:
        kproto_i = KPrototypes(n_clusters=i, init='Cao', verbose=2, n_jobs=10)
        clusters_i = kproto_i.fit_predict(kprot_data, categorical=categorical_columns)
        costs.append(kproto_i.cost_)
        n_clusters.append(i)
        clusters_assigned.append(clusters_i)
    except Exception:
        print(f"Can't cluster with {i} clusters")

fig = go.Figure(data=go.Scatter(x=n_clusters, y=costs))
fig.show()

# Visual Evaluation: Kmeans
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
#scatter = ax.scatter(embedding[:, 0], embedding[:, 1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)
scatter = ax.scatter(embedding[0][:, 0], embedding[0][:, 1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)

# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=15),
                    loc="lower left", title="Classes")
ax.add_artist(legend1)

# Visual Evaluation: K-Prototypes
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
# colour by the K-Prototypes cluster labels (not kmeans_labels) for this plot
scatter = ax.scatter(embedding[0][:, 0], embedding[0][:, 1], s=2, c=clusters, cmap='tab20b', alpha=1.0)

# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=15),
                    loc="lower left", title="Classes")
ax.add_artist(legend1)

# Evaluation by Classification
# Setting the objects to category
lgbm_data = full_data.copy()
for c in lgbm_data.select_dtypes(include='object'):
    lgbm_data[c] = lgbm_data[c].astype('category')

# KMeans clusters
clf_km = LGBMClassifier(colsample_bytree=0.8)
cv_scores_km = cross_val_score(clf_km, lgbm_data, kmeans_labels, scoring='f1_weighted')
print(f'CV F1 score for K-Means clusters is {np.mean(cv_scores_km)}')

# Fit the model
clf_km.fit(lgbm_data, kmeans_labels)
# SHAP values
explainer_km = shap.TreeExplainer(clf_km)
shap_values_km = explainer_km.shap_values(lgbm_data)
shap.summary_plot(shap_values_km, lgbm_data, plot_type="bar", plot_size=(15, 10))

# K-Prototypes
clf_kp = LGBMClassifier(colsample_bytree=0.8)
#cv_scores_kp = cross_val_score(clf_kp, lgbm_data, proto_clusters, scoring='f1_weighted')
cv_scores_kp = cross_val_score(clf_kp, lgbm_data, clusters, scoring='f1_weighted')
print(f'CV F1 score for K-Prototypes clusters is {np.mean(cv_scores_kp)}')

#clf_kp.fit(lgbm_data, proto_clusters)
clf_kp.fit(lgbm_data, clusters)

explainer_kp = shap.TreeExplainer(clf_kp)
shap_values_kp = explainer_kp.shap_values(lgbm_data)
shap.summary_plot(shap_values_kp, lgbm_data, plot_type="bar", plot_size=(15, 10))

#%% Example 5: My data
# clusters vs proto_clusters (the original FIXME): fit K-Prototypes once and
# keep both names pointing at the same labels

full_data = all_df.copy()
full_data.head()
full_data.shape

# Preprocessing numerical
numerical = full_data.select_dtypes(exclude='object')
numerical.shape

for c in numerical.columns:
    print(c)
    pt = PowerTransformer()
    numerical.loc[:, c] = pt.fit_transform(np.array(numerical[c]).reshape(-1, 1))

## Preprocessing categorical
categorical = full_data.select_dtypes(include='object')
categorical.head()

for col in categorical.columns:
    #print('-' * 40 + col + '-' * 40 , end=' - ')
    display(categorical[col].value_counts().head(10))

categorical = pd.get_dummies(categorical)
categorical_weight = len(full_data.select_dtypes(include='object').columns) / full_data.shape[1]

# Embedding numerical & categorical
# fit1 = PCA(n_components=2).fit(numerical)
# fit2 = PCA(n_components=2).fit(categorical)
fit1 = umap.UMAP(metric='l2').fit(numerical)
fit2 = umap.UMAP(metric='dice').fit(categorical)

intersection = umap.umap_.general_simplicial_set_intersection(fit1.graph_, fit2.graph_, weight=categorical_weight)
intersection = umap.umap_.reset_local_connectivity(intersection)
# https://github.com/lmcinnes/umap/issues/561
embedding = umap.umap_.simplicial_set_embedding(fit1._raw_data
                                                , intersection
                                                , fit1.n_components
                                                , fit1._initial_alpha
                                                , fit1._a
                                                , fit1._b
                                                , fit1.repulsion_strength
                                                , fit1.negative_sample_rate
                                                , 200
                                                , 'random'
                                                , np.random
                                                , fit1.metric
                                                , fit1._metric_kwds
                                                , False
                                                , densmap_kwds = {}
                                                , output_dens = False)
plt.figure(figsize=(20, 10))
plt.scatter(*np.array(embedding)[0].T, s=2, cmap='Spectral', alpha=1.0)
plt.show()

# One-Hot-Encoding
data = pd.get_dummies(full_data)

# Pre-processing
for c in data.columns:
    pt = PowerTransformer()
    data.loc[:, c] = pt.fit_transform(np.array(data[c]).reshape(-1, 1))

# Actual Clustering
kmeans = KMeans(n_clusters=2).fit(data)
kmeans_labels = kmeans.labels_

# OPTIONAL: Elbow plot with inertia
# Elbow method to choose the optimal number of clusters
# sse = {}
# for k in tqdm(range(2, 50)):
#     kmeans = KMeans(n_clusters=k, max_iter=1000).fit(data)
#     sse[k] = kmeans.inertia_  # Inertia: Sum of distances of samples to their closest cluster center

# fig = go.Figure(data=go.Scatter(x=list(sse.keys()), y=list(sse.values())))
# fig.show()

# K-Prototypes
kprot_data = full_data.copy()
# Pre-processing
for c in full_data.select_dtypes(exclude='object').columns:
    pt = PowerTransformer()
    kprot_data[c] = pt.fit_transform(np.array(kprot_data[c]).reshape(-1, 1))

#categorical_columns = [0, 4, 5, 7, 11] #make sure to specify correct indices
#categorical_columns = [1, 5, 6, 8, 12] #make sure to specify correct indices
# derive the categorical column positions from cat_df instead of hard-coding them
categorical_columns = []
for col in cat_df.columns:
    print(col)
    #categ_i += all_df.columns.get_loc(col)
    categorical_columns.append(full_data.columns.get_loc(col))

print(categorical_columns)
print(len(categorical_columns))

# Actual clustering: fit once, keep one set of labels under both names
kproto = KPrototypes(n_clusters=2, init='Cao', n_jobs=10)
clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)
proto_clusters = clusters

# Prints the count of each cluster group
pd.Series(clusters).value_counts()

# OPTIONAL: Elbow plot with cost (will take a LONG time)
# use separate names in this loop so the 2-cluster labels above are not overwritten
costs = []
n_clusters = []
clusters_assigned = []
for i in tqdm(range(2, 25)):
#for i in tqdm(range(2, 10)):
    #print(i)
    try:
        kproto_i = KPrototypes(n_clusters=i, init='Cao', verbose=2, n_jobs=10)
        clusters_i = kproto_i.fit_predict(kprot_data, categorical=categorical_columns)
        costs.append(kproto_i.cost_)
        n_clusters.append(i)
        clusters_assigned.append(clusters_i)
    except Exception:
        print(f"Can't cluster with {i} clusters")

fig = go.Figure(data=go.Scatter(x=n_clusters, y=costs))
fig.show()

# Visual Evaluation: Kmeans
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
#scatter = ax.scatter(embedding[:, 0], embedding[:, 1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)
scatter = ax.scatter(embedding[0][:, 0], embedding[0][:, 1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)

# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=2), #15
                    loc="lower left", title="Classes")
ax.add_artist(legend1)

# Visual Evaluation: K-Prototypes
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
# colour by the K-Prototypes labels (not kmeans_labels) for this plot
scatter = ax.scatter(embedding[0][:, 0], embedding[0][:, 1], s=2, c=proto_clusters, cmap='tab20b', alpha=1.0)

# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=2), #15
                    loc="lower left", title="Classes")
ax.add_artist(legend1)

# Evaluation by Classification
# Setting the objects to category
lgbm_data = full_data.copy()
for c in lgbm_data.select_dtypes(include='object'):
    lgbm_data[c] = lgbm_data[c].astype('category')

# KMeans clusters
clf_km = LGBMClassifier(colsample_bytree=0.8)
cv_scores_km = cross_val_score(clf_km, lgbm_data, kmeans_labels, scoring='f1_weighted')
print(f'CV F1 score for K-Means clusters is {np.mean(cv_scores_km)}')

# Fit the model
clf_km.fit(lgbm_data, kmeans_labels)
# SHAP values
explainer_km = shap.TreeExplainer(clf_km)
shap_values_km = explainer_km.shap_values(lgbm_data)
shap.summary_plot(shap_values_km, lgbm_data, plot_type="bar", plot_size=(15, 10))

# K-Prototypes
clf_kp = LGBMClassifier(colsample_bytree=0.8)
cv_scores_kp = cross_val_score(clf_kp, lgbm_data, proto_clusters, scoring='f1_weighted')
print(f'CV F1 score for K-Prototypes clusters is {np.mean(cv_scores_kp)}')

clf_kp.fit(lgbm_data, proto_clusters)

explainer_kp = shap.TreeExplainer(clf_kp)
shap_values_kp = explainer_kp.shap_values(lgbm_data)
shap.summary_plot(shap_values_kp, lgbm_data, plot_type="bar", plot_size=(15, 10))