#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 16 16:55:06 2022

@author: tanu
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import cluster, datasets
import warnings
import sys
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, confusion_matrix, adjusted_rand_score
from sklearn.metrics import v_measure_score
# Imports used by the pipeline examples further down
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA

# Algorithms: https://machinelearningmastery.com/clustering-algorithms-with-python/
# K-means
# Hierarchical clustering
# Gaussian mixture models
# FP-growth
# PCA
# Mean shift
# DBSCAN

# Model assessment:
# Mutual information
# Silhouette score
# v_measure_score
# itertools.combinations()

#%% Example 1: K-means, https://coderzcolumn.com/tutorials/machine-learning/unsupervised-learning-clustering-kmeans-using-scikit-learn-sklearn
# K-means assumes all clusters have equal, spherical variance.
samples, clusters = datasets.make_blobs(n_samples=250,
                                        n_features=2,
                                        centers=5,
                                        cluster_std=0.7,
                                        random_state=12345)

print('Dataset size : ', samples.shape, clusters.shape)
print('Cluster names : ', set(clusters))

# Visualise the 5 generated blobs
with plt.style.context(('ggplot', 'seaborn')):
    plt.figure(figsize=(8, 6))
    for i, c, m in zip(range(5),
                       ['red', 'green', 'blue', 'orange', 'purple'],
                       ['s', '+', '^', 'o', 'x']):
        plt.scatter(samples[clusters == i, 0], samples[clusters == i, 1],
                    color=c, marker=m, s=80, alpha=0.8, label='Cluster %d' % i)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('Visualizing Dataset')
    plt.legend(loc='best')

# NOTE: the data have 5 blobs; the prediction plot below draws up to 5 clusters
#kmeans = cluster.KMeans(n_clusters=5)
kmeans = cluster.KMeans(n_clusters=2)
kmeans.fit(samples)
preds = kmeans.predict(samples)

# NOTE: cluster IDs are arbitrary, so raw accuracy against the true labels can be
# misleading (see the label-remapping sketch below)
print('Accuracy : %.3f' % accuracy_score(y_true=clusters, y_pred=preds))
print('Confusion Matrix : \n', confusion_matrix(y_true=clusters, y_pred=preds))
print('Adjusted Accuracy : %.3f' % adjusted_rand_score(labels_true=clusters, labels_pred=preds))
print('Cluster Centers : \n', str(kmeans.cluster_centers_))
print('Sum of squared distances of samples to their closest cluster center : %.2f' % kmeans.inertia_)
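# --- Hedged aside (not part of the original tutorial): KMeans cluster IDs are
# arbitrary, so accuracy_score against the true blob labels is only meaningful
# after remapping predicted IDs to their best-matching true labels. A minimal
# sketch of that remapping, assuming `clusters` and `preds` from the cell above:
from scipy.optimize import linear_sum_assignment

cm = confusion_matrix(y_true=clusters, y_pred=preds)
row_ind, col_ind = linear_sum_assignment(-cm)  # assignment that maximises matched counts
mapping = {pred_label: true_label for true_label, pred_label in zip(row_ind, col_ind)}
remapped_preds = np.array([mapping.get(p, p) for p in preds])
print('Accuracy after label alignment : %.3f' % accuracy_score(clusters, remapped_preds))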
with plt.style.context(('ggplot', 'seaborn')):
    plt.figure(figsize=(10, 6))
    plt.scatter(samples[preds == 0, 0], samples[preds == 0, 1], color='red',    marker='s', s=80, alpha=0.8, label='Cluster 0')
    plt.scatter(samples[preds == 1, 0], samples[preds == 1, 1], color='green',  marker='^', s=80, alpha=0.8, label='Cluster 1')
    plt.scatter(samples[preds == 2, 0], samples[preds == 2, 1], color='blue',   marker='*', s=80, alpha=0.8, label='Cluster 2')
    plt.scatter(samples[preds == 3, 0], samples[preds == 3, 1], color='orange', marker='o', s=80, alpha=0.8, label='Cluster 3')
    plt.scatter(samples[preds == 4, 0], samples[preds == 4, 1], color='purple', marker='+', s=80, alpha=0.8, label='Cluster 4')

    # Draw a line from every sample to its cluster centre
    for x, y in zip(samples[preds == 0, 0], samples[preds == 0, 1]):
        plt.plot([kmeans.cluster_centers_[0][0], x], [kmeans.cluster_centers_[0][1], y], color='red')
    for x, y in zip(samples[preds == 1, 0], samples[preds == 1, 1]):
        plt.plot([kmeans.cluster_centers_[1][0], x], [kmeans.cluster_centers_[1][1], y], color='green')
    for x, y in zip(samples[preds == 2, 0], samples[preds == 2, 1]):
        plt.plot([kmeans.cluster_centers_[2][0], x], [kmeans.cluster_centers_[2][1], y], color='blue')
    for x, y in zip(samples[preds == 3, 0], samples[preds == 3, 1]):
        plt.plot([kmeans.cluster_centers_[3][0], x], [kmeans.cluster_centers_[3][1], y], color='orange')
    for x, y in zip(samples[preds == 4, 0], samples[preds == 4, 1]):
        plt.plot([kmeans.cluster_centers_[4][0], x], [kmeans.cluster_centers_[4][1], y], color='purple')

    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('Visualizing Predictions & Cluster Centers')
    plt.legend(loc='best')

# The Elbow Method: to decide the number of clusters, i.e. 'k'
plt.figure(figsize=(8, 5))
distortions = []
for i in range(1, 11):
    kmeans = cluster.KMeans(n_clusters=i)
    kmeans.fit(samples)
    distortions.append(kmeans.inertia_)

print('Distortions (Sum Of Squared Distance of Samples from Closest Cluster Center) : ', distortions)

with plt.style.context(('ggplot', 'seaborn')):
    plt.plot(range(1, 11), distortions)
    plt.scatter(range(1, 11), distortions, color='red', marker='o', s=80)
    plt.xlabel('Number Of Clusters')
    plt.ylabel('Distortions')
    plt.title('The Elbow Method (Num of Clusters vs Distortions)')
    plt.xticks(range(1, 11))

#%% Example 1: My data
# num_df_wtgt is expected from an earlier script/session (numerical features + target)
X_unsup = num_df_wtgt[['ligand_affinity_change'
                       , 'duet_stability_change']]

kmeans = Pipeline([
    #('pca', PCA())
    ('pre', MinMaxScaler())
    , ('clf', KMeans(n_clusters=2))
])
#kmeans = KMeans(n_clusters = 2)
kmeans.fit(X_unsup)
y_kmeans = kmeans.predict(X_unsup)

plt.scatter(X_unsup.loc[:, 'ligand_affinity_change']
            , X_unsup.loc[:, 'duet_stability_change']
            , c=y_kmeans
            , s=50
            , cmap='viridis')

# The KMeans step sits inside a Pipeline, so take the centres from the 'clf' step
# and map them back to the original feature scale via the scaler
centers = kmeans.named_steps['pre'].inverse_transform(
    kmeans.named_steps['clf'].cluster_centers_)
plt.scatter(centers[:, 0], centers[:, 1]
            , c='black'
            , s=200
            , alpha=0.5)
plt.show()

#%% Example 2: https://builtin.com/data-science/unsupervised-learning-python
# Loading dataset
iris_df = datasets.load_iris()

# Available methods on dataset
print(dir(iris_df))
# Features
print(iris_df.feature_names)
# Targets
print(iris_df.target)
# Target Names
print(iris_df.target_names)
label = {0: 'red', 1: 'blue', 2: 'green'}

# Dataset Slicing
x_axis = iris_df.data[:, 0]  # Sepal Length
y_axis = iris_df.data[:, 2]  # Petal Length (column 2 of iris.data)

# Plotting
plt.scatter(x_axis, y_axis, c=iris_df.target)
plt.show()

# Use KMeans: Declaring Model
model = KMeans(n_clusters=3)
# Fitting Model
model.fit(iris_df.data)

# Predicting a single input
predicted_label = model.predict([[7.2, 3.5, 0.8, 1.6]])
# Prediction on the entire data
all_predictions = model.predict(iris_df.data)

# Printing Predictions
print(predicted_label)
print(all_predictions)

#%% Example 2: My data
X_unsup = num_df_wtgt[[#'ligand_affinity_change'
                       #, 'duet_stability_change'
                       'ddg_foldx'
                       , 'deepddg']]
y_unsup = num_df_wtgt[['mutation_class']]

# X_train, X_test, y_train, y_test = train_test_split(X_unsup
#                                                     , y_unsup
#                                                     , test_size = 0.33
#                                                     , **rs
#                                                     , shuffle = True
#                                                     , stratify = y_unsup)

#model = KMeans(n_clusters=2)
model = Pipeline([
    #('pca', PCA())
    ('pre', MinMaxScaler())
    , ('clf', KMeans(n_clusters=2))
])
label = {0: 'blue', 1: 'red'}

model.fit(X_unsup)
# Predict on the full data (X_test only exists if the commented-out split above is run)
predicted_label = model.predict(X_unsup)
all_predictions = model.predict(X_unsup)
print(predicted_label)
print(all_predictions)

# Scatter of the two selected features, coloured by the known mutation class
plt.scatter(X_unsup.loc[:, 'ddg_foldx']
            , X_unsup.loc[:, 'deepddg']
            , c=y_unsup.loc[:, 'mutation_class'])

# Elbow plot via yellowbrick
from yellowbrick.cluster import KElbowVisualizer
model = KMeans()
visualizer = KElbowVisualizer(model, k=(1, 12)).fit(X_unsup)
visualizer.show()
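# --- Hedged aside: the commented-out `kneed` import further down suggests an
# automated elbow pick was intended. A minimal sketch with KneeLocator, assuming
# X_unsup from the cell above and that the kneed package is installed:
from kneed import KneeLocator

inertias = []
k_range = range(1, 11)
for k in k_range:
    km = Pipeline([('pre', MinMaxScaler()), ('clf', KMeans(n_clusters=k))])
    km.fit(X_unsup)
    inertias.append(km.named_steps['clf'].inertia_)

kl = KneeLocator(list(k_range), inertias, curve='convex', direction='decreasing')
print('Elbow suggested by KneeLocator:', kl.elbow)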
#%% Example 3: My data V-scores, https://www.geeksforgeeks.org/ml-v-measure-for-evaluating-clustering-performance/
# List of V-Measure Scores for different models
v_scores = []

# Numbers of clusters to compare
N_Clusters = [2, 3]

# Building the clustering model
kmeans2 = Pipeline([
    #('pca', PCA())
    ('pre', MinMaxScaler())
    #('pre', StandardScaler())
    , ('clf', KMeans(n_clusters=2))
])

# Training the clustering model
kmeans2.fit(X_unsup)

# Storing the predicted clustering labels
labels2 = kmeans2.predict(X_unsup)

# Evaluating the performance
v_scores.append(v_measure_score(y_unsup.loc[:, 'mutation_class'], labels2))

# Building the clustering model
kmeans3 = Pipeline([
    #('pca', PCA())
    ('pre', MinMaxScaler())
    #('pre', StandardScaler())
    , ('clf', KMeans(n_clusters=3))
])

# Training the clustering model
kmeans3.fit(X_unsup)

# Storing the predicted clustering labels
labels3 = kmeans3.predict(X_unsup)

# Evaluating the performance
v_scores.append(v_measure_score(y_unsup.loc[:, 'mutation_class'], labels3))

# Plotting a bar graph to compare the models
plt.bar(N_Clusters, v_scores)
plt.xlabel('Number of Clusters')
plt.ylabel('V-Measure Score')
plt.title('Comparison of different Clustering Models')
plt.show()

# Score: silhouette, https://realpython.com/k-means-clustering-python/
kmeans_kwargs = {"init": "random",
                 "n_init": 10,
                 "max_iter": 300,
                 "random_state": 42}

from sklearn.metrics import silhouette_score
#import kneed
#from kneed import KneeLocator

scaler = StandardScaler()
X_unsup_scaled = scaler.fit_transform(X_unsup)

silhouette_coefficients = []
# Notice you start at 2 clusters for the silhouette coefficient
for k in range(2, 5):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    # kmeans = Pipeline([
    #     # ('pre', MinMaxScaler())
    #     ('pre', StandardScaler())
    #     , ('clf', KMeans(n_clusters=k, **kmeans_kwargs))
    # ])
    kmeans.fit(X_unsup_scaled)
    score = silhouette_score(X_unsup_scaled, kmeans.labels_)
    silhouette_coefficients.append(score)

plt.style.use("fivethirtyeight")
plt.plot(range(2, 5), silhouette_coefficients)
plt.xticks(range(2, 5))
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Coefficient")
plt.show()

plt.bar(range(2, 5), silhouette_coefficients)

from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons
from sklearn.metrics import adjusted_rand_score

# Instantiate k-means and dbscan algorithms
kmeans = KMeans(n_clusters=2)
dbscan = DBSCAN(eps=0.3)

# Fit the algorithms to the features
kmeans.fit(X_unsup_scaled)
dbscan.fit(X_unsup_scaled)

# Compute the silhouette scores for each algorithm
kmeans_silhouette = silhouette_score(X_unsup_scaled, kmeans.labels_).round(2)
dbscan_silhouette = silhouette_score(X_unsup_scaled, dbscan.labels_).round(2)
kmeans_silhouette
dbscan_silhouette

ari_kmeans = adjusted_rand_score(y_unsup.iloc[:, 0], kmeans.labels_)
ari_dbscan = adjusted_rand_score(y_unsup.iloc[:, 0], dbscan.labels_)
round(ari_kmeans, 2)
round(ari_dbscan, 2)

# Crescent plot (layout follows the RealPython make_moons example)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 6), sharex=True, sharey=True)
fig.suptitle("Clustering Algorithm Comparison: Crescents", fontsize=16)
fte_colors = {0: "#008fd5",
              1: "#fc4f30"}

# The k-means plot
km_colors = [fte_colors[label] for label in kmeans.labels_]
ax1.scatter(X_unsup_scaled[:, 0], X_unsup_scaled[:, 1], c=km_colors)
ax1.set_title(f"k-means\nSilhouette: {kmeans_silhouette}", fontdict={"fontsize": 12})

# The DBSCAN plot (DBSCAN noise points are labelled -1, so fall back to grey)
db_colors = [fte_colors.get(label, "#999999") for label in dbscan.labels_]
ax2.scatter(X_unsup_scaled[:, 0], X_unsup_scaled[:, 1], c=db_colors)
ax2.set_title(f"DBSCAN\nSilhouette: {dbscan_silhouette}", fontdict={"fontsize": 12})
plt.show()
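# --- Hedged aside: make_moons is imported above but never used. The original
# RealPython comparison runs on crescent-shaped data, where DBSCAN recovers the
# two moons and k-means does not. A minimal, self-contained reproduction sketch:
moons, moon_labels = make_moons(n_samples=250, noise=0.05, random_state=42)
moons_scaled = StandardScaler().fit_transform(moons)

km_moons = KMeans(n_clusters=2).fit(moons_scaled)
db_moons = DBSCAN(eps=0.3).fit(moons_scaled)

print('ARI on moons, k-means :', round(adjusted_rand_score(moon_labels, km_moons.labels_), 2))
print('ARI on moons, DBSCAN  :', round(adjusted_rand_score(moon_labels, db_moons.labels_), 2))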
#%% Example 4: Machine Learning Mastery, https://machinelearningmastery.com/clustering-algorithms-with-python/
from sklearn.cluster import AffinityPropagation
from matplotlib import pyplot
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import Birch
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import OPTICS
from sklearn.cluster import SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA

# Affinity propagation takes as input measures of similarity between pairs of
# data points. Real-valued messages are exchanged between data points until a
# high-quality set of exemplars and corresponding clusters gradually emerges.
#=============
XA = np.array(X_unsup)

# define the model (uncomment one 'clf' step at a time)
#model = AffinityPropagation(damping=0.9)
model = Pipeline([
    #('pca', PCA(n_components = 2))
    ('pre', MinMaxScaler())
    #('pre', StandardScaler())
    #, ('clf', AffinityPropagation(damping=0.9))
    , ('clf', AgglomerativeClustering(n_clusters=2))  #y
    #, ('clf', Birch(threshold=0.01, n_clusters=2))   #y
    #, ('clf', DBSCAN(eps=0.30, min_samples=9))       #n
    #, ('clf', KMeans(n_clusters=2))                  #y
    #, ('clf', MiniBatchKMeans(n_clusters=2))
    #, ('clf', OPTICS(eps=0.8, min_samples=10))
    #, ('clf', SpectralClustering(n_clusters=2))
    #, ('clf', GaussianMixture(n_components=2))
])
model

# fit the model
#model.fit(X_unsup)
#yhat = model.predict(X_unsup)
yhat = model.fit_predict(X_unsup)

# retrieve unique clusters
clusters = unique(yhat)

# create scatter plot for samples from each cluster
for cluster in clusters:
    print(cluster)
    # get row indexes for samples with this cluster
    row_ix = where(yhat == cluster)
    print(row_ix)
    # create scatter of these samples
    #pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
    pyplot.scatter(XA[row_ix, 0], XA[row_ix, 1])

# show the plot
pyplot.show()
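# --- Hedged aside: rather than toggling the commented-out 'clf' lines by hand,
# the candidate algorithms can be compared in one loop. A minimal sketch,
# assuming X_unsup and y_unsup (mutation_class) from the earlier cells; the
# parameter values are illustrative, not tuned:
candidate_models = {
    'KMeans'          : KMeans(n_clusters=2),
    'Agglomerative'   : AgglomerativeClustering(n_clusters=2),
    'Birch'           : Birch(threshold=0.01, n_clusters=2),
    'MiniBatchKMeans' : MiniBatchKMeans(n_clusters=2),
}
for name, clf in candidate_models.items():
    pipe = Pipeline([('pre', MinMaxScaler()), ('clf', clf)])
    labels = pipe.fit_predict(X_unsup)
    print('%-16s ARI=%.3f  v-measure=%.3f' % (
        name,
        adjusted_rand_score(y_unsup.iloc[:, 0], labels),
        v_measure_score(y_unsup.iloc[:, 0], labels)))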
#%% Example 5: https://github.com/AntonsRuberts/datascience_marketing/blob/master/KMeans_vs_KPrototypes.ipynb
import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from datetime import datetime
from tqdm import tqdm
from sklearn.preprocessing import PowerTransformer
import umap
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from scipy import stats
from sklearn.cluster import KMeans
from kmodes.kprototypes import KPrototypes
from lightgbm import LGBMClassifier
import shap
from sklearn.model_selection import cross_val_score

full_data = pd.read_csv('/home/tanu/git/ML_AI_training/ml_data/ga_customers.csv')
full_data.head()

# Preprocessing: numerical
numerical = full_data.select_dtypes(exclude='object')
for c in numerical.columns:
    print(c)
    pt = PowerTransformer()
    numerical.loc[:, c] = pt.fit_transform(np.array(numerical[c]).reshape(-1, 1))

# Preprocessing: categorical
categorical = full_data.select_dtypes(include='object')
categorical.head()
for col in categorical.columns:
    #print('-' * 40 + col + '-' * 40, end=' - ')
    display(categorical[col].value_counts().head(10))

categorical = pd.get_dummies(categorical)
categorical_weight = len(full_data.select_dtypes(include='object').columns) / full_data.shape[1]

# Embedding numerical & categorical
# fit1 = PCA(n_components=2).fit(numerical)
# fit2 = PCA(n_components=2).fit(categorical)
fit1 = umap.UMAP(metric='l2').fit(numerical)
fit2 = umap.UMAP(metric='dice').fit(categorical)

intersection = umap.umap_.general_simplicial_set_intersection(fit1.graph_, fit2.graph_,
                                                              weight=categorical_weight)
intersection = umap.umap_.reset_local_connectivity(intersection)
# https://github.com/lmcinnes/umap/issues/561
embedding = umap.umap_.simplicial_set_embedding(fit1._raw_data
                                                , intersection
                                                , fit1.n_components
                                                , fit1._initial_alpha
                                                , fit1._a
                                                , fit1._b
                                                , fit1.repulsion_strength
                                                , fit1.negative_sample_rate
                                                , 200
                                                , 'random'
                                                , np.random
                                                , fit1.metric
                                                , fit1._metric_kwds
                                                , False
                                                , densmap_kwds={}
                                                , output_dens=False)

plt.figure(figsize=(20, 10))
plt.scatter(*np.array(embedding)[0].T, s=2, cmap='Spectral', alpha=1.0)
plt.show()

# One-Hot-Encoding
data = pd.get_dummies(full_data)

# Pre-processing
for c in data.columns:
    pt = PowerTransformer()
    data.loc[:, c] = pt.fit_transform(np.array(data[c]).reshape(-1, 1))

# Actual Clustering
kmeans = KMeans(n_clusters=15).fit(data)
kmeans_labels = kmeans.labels_

# OPTIONAL: Elbow plot with inertia
# Elbow method to choose the optimal number of clusters
# sse = {}
# for k in tqdm(range(2, 50)):
#     kmeans = KMeans(n_clusters=k, max_iter=1000).fit(data)
#     sse[k] = kmeans.inertia_  # Inertia: sum of distances of samples to their closest cluster center
# fig = go.Figure(data=go.Scatter(x=list(sse.keys()), y=list(sse.values())))
# fig.show()

# K-Prototypes
kprot_data = full_data.copy()
# Pre-processing (numerical columns only)
for c in full_data.select_dtypes(exclude='object').columns:
    pt = PowerTransformer()
    kprot_data[c] = pt.fit_transform(np.array(kprot_data[c]).reshape(-1, 1))

#categorical_columns = [0, 4, 5, 7, 11] #make sure to specify correct indices
categorical_columns = [1, 5, 6, 8, 12]  #make sure to specify correct indices

# Actual clustering
kproto = KPrototypes(n_clusters=15, init='Cao', n_jobs=4)
clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)

# Prints the count of each cluster group
pd.Series(clusters).value_counts()

# OPTIONAL: Elbow plot with cost (will take a LONG time)
costs = []
n_clusters = []
clusters_assigned = []
for i in tqdm(range(2, 25)):
#for i in tqdm(range(2, 10)):
    try:
        kproto = KPrototypes(n_clusters=i, init='Cao', verbose=2, n_jobs=10)
        clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)
        costs.append(kproto.cost_)
        n_clusters.append(i)
        clusters_assigned.append(clusters)
    except:
        print(f"Can't cluster with {i} clusters")

fig = go.Figure(data=go.Scatter(x=n_clusters, y=costs))
fig.show()

# Visual Evaluation: KMeans
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
#scatter = ax.scatter(embedding[:, 0], embedding[:, 1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)
scatter = ax.scatter(embedding[0][:, 0], embedding[0][:, 1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)

# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=15),
                    loc="lower left", title="Classes")
ax.add_artist(legend1)

# Visual Evaluation: K-Prototypes
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
# colour by the K-Prototypes assignments (the original cell reused kmeans_labels here)
scatter = ax.scatter(embedding[0][:, 0], embedding[0][:, 1], s=2, c=clusters, cmap='tab20b', alpha=1.0)

# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=15),
                    loc="lower left", title="Classes")
ax.add_artist(legend1)
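# --- Hedged aside: since both algorithms have now labelled the same customers,
# their agreement can be quantified directly. A minimal sketch, assuming
# kmeans_labels and clusters from the cells above (note the optional elbow loop
# overwrites `clusters` if it was run):
agreement = adjusted_rand_score(kmeans_labels, clusters)
print(f'ARI between K-Means and K-Prototypes labelings: {agreement:.3f}')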
# Evaluation by Classification
# Setting the object columns to category
lgbm_data = full_data.copy()
for c in lgbm_data.select_dtypes(include='object'):
    lgbm_data[c] = lgbm_data[c].astype('category')

# KMeans clusters
clf_km = LGBMClassifier(colsample_bytree=0.8)
cv_scores_km = cross_val_score(clf_km, lgbm_data, kmeans_labels, scoring='f1_weighted')
print(f'CV F1 score for K-Means clusters is {np.mean(cv_scores_km)}')

# Fit the model
clf_km.fit(lgbm_data, kmeans_labels)

# SHAP values
explainer_km = shap.TreeExplainer(clf_km)
shap_values_km = explainer_km.shap_values(lgbm_data)
shap.summary_plot(shap_values_km, lgbm_data, plot_type="bar", plot_size=(15, 10))

# K-Prototypes
clf_kp = LGBMClassifier(colsample_bytree=0.8)
#cv_scores_kp = cross_val_score(clf_kp, lgbm_data, proto_clusters, scoring='f1_weighted')
cv_scores_kp = cross_val_score(clf_kp, lgbm_data, clusters, scoring='f1_weighted')
print(f'CV F1 score for K-Prototypes clusters is {np.mean(cv_scores_kp)}')

#clf_kp.fit(lgbm_data, proto_clusters)
clf_kp.fit(lgbm_data, clusters)
explainer_kp = shap.TreeExplainer(clf_kp)
shap_values_kp = explainer_kp.shap_values(lgbm_data)
shap.summary_plot(shap_values_kp, lgbm_data, plot_type="bar", plot_size=(15, 10))

#%% Example 5: My data
# FIXME: clusters and proto_clusters?
# all_df and cat_df are expected from an earlier script/session
full_data = all_df.copy()
full_data.head()
full_data.shape

# Preprocessing: numerical
numerical = full_data.select_dtypes(exclude='object')
numerical.shape
for c in numerical.columns:
    print(c)
    pt = PowerTransformer()
    numerical.loc[:, c] = pt.fit_transform(np.array(numerical[c]).reshape(-1, 1))

# Preprocessing: categorical
categorical = full_data.select_dtypes(include='object')
categorical.head()
for col in categorical.columns:
    #print('-' * 40 + col + '-' * 40, end=' - ')
    display(categorical[col].value_counts().head(10))

categorical = pd.get_dummies(categorical)
categorical_weight = len(full_data.select_dtypes(include='object').columns) / full_data.shape[1]

# Embedding numerical & categorical
# fit1 = PCA(n_components=2).fit(numerical)
# fit2 = PCA(n_components=2).fit(categorical)
fit1 = umap.UMAP(metric='l2').fit(numerical)
fit2 = umap.UMAP(metric='dice').fit(categorical)

intersection = umap.umap_.general_simplicial_set_intersection(fit1.graph_, fit2.graph_,
                                                              weight=categorical_weight)
intersection = umap.umap_.reset_local_connectivity(intersection)
# https://github.com/lmcinnes/umap/issues/561
embedding = umap.umap_.simplicial_set_embedding(fit1._raw_data
                                                , intersection
                                                , fit1.n_components
                                                , fit1._initial_alpha
                                                , fit1._a
                                                , fit1._b
                                                , fit1.repulsion_strength
                                                , fit1.negative_sample_rate
                                                , 200
                                                , 'random'
                                                , np.random
                                                , fit1.metric
                                                , fit1._metric_kwds
                                                , False
                                                , densmap_kwds={}
                                                , output_dens=False)

plt.figure(figsize=(20, 10))
plt.scatter(*np.array(embedding)[0].T, s=2, cmap='Spectral', alpha=1.0)
plt.show()

# One-Hot-Encoding
data = pd.get_dummies(full_data)

# Pre-processing
for c in data.columns:
    pt = PowerTransformer()
    data.loc[:, c] = pt.fit_transform(np.array(data[c]).reshape(-1, 1))

# Actual Clustering
kmeans = KMeans(n_clusters=2).fit(data)
kmeans_labels = kmeans.labels_

# OPTIONAL: Elbow plot with inertia
# Elbow method to choose the optimal number of clusters
# sse = {}
# for k in tqdm(range(2, 50)):
#     kmeans = KMeans(n_clusters=k, max_iter=1000).fit(data)
#     sse[k] = kmeans.inertia_  # Inertia: sum of distances of samples to their closest cluster center
# fig = go.Figure(data=go.Scatter(x=list(sse.keys()), y=list(sse.values())))
# fig.show()

# K-Prototypes
kprot_data = full_data.copy()
# Pre-processing (numerical columns only)
for c in full_data.select_dtypes(exclude='object').columns:
    pt = PowerTransformer()
    kprot_data[c] = pt.fit_transform(np.array(kprot_data[c]).reshape(-1, 1))

#categorical_columns = [0, 4, 5, 7, 11] #make sure to specify correct indices
#categorical_columns = [1, 5, 6, 8, 12] #make sure to specify correct indices
# Derive the positional indices of the categorical columns from cat_df
categorical_columns = []
for col in cat_df.columns:
    print(col)
    #categ_i += all_df.columns.get_loc(col)
    categorical_columns.append(full_data.columns.get_loc(col))
print(categorical_columns)
print(len(categorical_columns))
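# --- Hedged aside: KPrototypes expects positional indices of the categorical
# columns. Assuming the categorical columns are exactly the object-dtype columns
# of full_data (which may or may not match cat_df), the loop above could be
# written as a single comprehension; kept under a hypothetical alternative name
# so the original variable is not overwritten:
categorical_columns_alt = [full_data.columns.get_loc(c)
                           for c in full_data.select_dtypes(include='object').columns]
print(categorical_columns_alt)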
# Actual clustering
kproto = KPrototypes(n_clusters=2, init='Cao', n_jobs=10)
clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)
# Keep a copy under the name used in the evaluation below (avoids fitting twice)
proto_clusters = clusters

# Prints the count of each cluster group
pd.Series(clusters).value_counts()

# OPTIONAL: Elbow plot with cost (will take a LONG time)
# NOTE: this loop overwrites proto_clusters with the labels from the last k tried
costs = []
n_clusters = []
clusters_assigned = []
for i in tqdm(range(2, 25)):
#for i in tqdm(range(2, 10)):
    #print(i)
    try:
        kproto = KPrototypes(n_clusters=i, init='Cao', verbose=2, n_jobs=10)
        #clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)
        proto_clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)
        costs.append(kproto.cost_)
        n_clusters.append(i)
        clusters_assigned.append(clusters)
    except:
        print(f"Can't cluster with {i} clusters")

fig = go.Figure(data=go.Scatter(x=n_clusters, y=costs))
fig.show()

# Visual Evaluation: KMeans
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
#scatter = ax.scatter(embedding[:, 0], embedding[:, 1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)
scatter = ax.scatter(embedding[0][:, 0], embedding[0][:, 1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)

# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=2),
                    loc="lower left", title="Classes")
ax.add_artist(legend1)

# Visual Evaluation: K-Prototypes
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
# colour by the K-Prototypes assignments (the original cell reused kmeans_labels here)
scatter = ax.scatter(embedding[0][:, 0], embedding[0][:, 1], s=2, c=clusters, cmap='tab20b', alpha=1.0)

# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=2),  #15
                    loc="lower left", title="Classes")
ax.add_artist(legend1)

# Evaluation by Classification
# Setting the object columns to category
lgbm_data = full_data.copy()
for c in lgbm_data.select_dtypes(include='object'):
    lgbm_data[c] = lgbm_data[c].astype('category')

# KMeans clusters
clf_km = LGBMClassifier(colsample_bytree=0.8)
cv_scores_km = cross_val_score(clf_km, lgbm_data, kmeans_labels, scoring='f1_weighted')
print(f'CV F1 score for K-Means clusters is {np.mean(cv_scores_km)}')

# Fit the model
clf_km.fit(lgbm_data, kmeans_labels)

# SHAP values
explainer_km = shap.TreeExplainer(clf_km)
shap_values_km = explainer_km.shap_values(lgbm_data)
shap.summary_plot(shap_values_km, lgbm_data, plot_type="bar", plot_size=(15, 10))

# K-Prototypes
# FIXME (see note at the top of this cell): `clusters` holds the 2-cluster
# K-Prototypes solution; `proto_clusters` may have been overwritten by the
# optional elbow loop, so the 2-cluster labels are used here.
clf_kp = LGBMClassifier(colsample_bytree=0.8)
#cv_scores_kp = cross_val_score(clf_kp, lgbm_data, proto_clusters, scoring='f1_weighted')
cv_scores_kp = cross_val_score(clf_kp, lgbm_data, clusters, scoring='f1_weighted')
print(f'CV F1 score for K-Prototypes clusters is {np.mean(cv_scores_kp)}')

#clf_kp.fit(lgbm_data, proto_clusters)
clf_kp.fit(lgbm_data, clusters)
explainer_kp = shap.TreeExplainer(clf_kp)
shap_values_kp = explainer_kp.shap_values(lgbm_data)
shap.summary_plot(shap_values_kp, lgbm_data, plot_type="bar", plot_size=(15, 10))
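# --- Hedged aside: the "Model assessment" list at the top mentions mutual
# information, silhouette and v-measure. A minimal closing sketch comparing both
# labelings against the known mutation_class, assuming all_df/num_df_wtgt share
# the same row order and y_unsup is still in scope from the earlier cells:
from sklearn.metrics import adjusted_mutual_info_score

true_labels = y_unsup.iloc[:, 0]
for name, labels in [('K-Means (one-hot)', kmeans_labels),
                     ('K-Prototypes', clusters)]:
    print(f'{name:20s} '
          f'ARI={adjusted_rand_score(true_labels, labels):.3f}  '
          f'AMI={adjusted_mutual_info_score(true_labels, labels):.3f}  '
          f'v-measure={v_measure_score(true_labels, labels):.3f}')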