added tutorial examples and my-data walkthrough examples in unsup_v1.py

Tanushree Tunstall 2022-03-23 16:23:18 +00:00
parent ad5ebad7f8
commit 89a0c3a58a
4 changed files with 1123 additions and 0 deletions

imports_unsup.py (new file, 261 lines)
@@ -0,0 +1,261 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 6 13:41:54 2022
@author: tanu
"""
import os, sys
import pandas as pd
import numpy as np
import pprint as pp
from copy import deepcopy
import sklearn
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
from sklearn.metrics import jaccard_score
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
from sklearn.metrics import average_precision_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
print(np.__version__)
print(pd.__version__)
from statistics import mean, stdev, median, mode
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline # NOTE: shadows sklearn.pipeline.Pipeline imported above
#from sklearn.datasets import make_classification
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator
from sklearn import cluster, datasets
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, confusion_matrix, adjusted_rand_score
print("Python Version : ",sys.version)
print("Python Version : ",sys.version)
print("Scikit-Learn Version : ",sklearn.__version__)
#warnings.filterwarnings('ignore') ## We'll silent future warnings using this command.
np.set_printoptions(precision=3)
#fits plot inside of current notebook.
#%matplotlib inline
#%%
scoring_fn = ({'accuracy' : make_scorer(accuracy_score)
, 'fscore' : make_scorer(f1_score)
, 'mcc' : make_scorer(matthews_corrcoef)
, 'precision' : make_scorer(precision_score)
, 'recall' : make_scorer(recall_score)
, 'roc_auc' : make_scorer(roc_auc_score, needs_proba = True) # AUC needs probability scores, not hard labels
, 'jcc' : make_scorer(jaccard_score)
})
rs = {'random_state': 42}
njobs = {'n_jobs': 10}
skf_cv = StratifiedKFold(n_splits = 10
#, shuffle = False, random_state= None)
, shuffle = True,**rs)
rskf_cv = RepeatedStratifiedKFold(n_splits = 10
, n_repeats=3
#, shuffle = False, random_state= None)
#, shuffle = True
,**rs)
#my_mcc = make_scorer({'mcc':make_scorer(matthews_corrcoef})
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
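# Hedged sketch (not in the original script): how scoring_fn and skf_cv are meant to
# plug into cross_validate. LogisticRegression is a placeholder assumption, and X/y
# would come from the (currently commented-out) feature/target setup further down,
# so the call itself is left commented out.
# example_cv = cross_validate(LogisticRegression(**rs)
#                             , X, y
#                             , cv = skf_cv
#                             , scoring = scoring_fn
#                             , **njobs)
# pp.pprint({k: np.mean(v) for k, v in example_cv.items()})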
#%%
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/")
# my function
#from MultClassPipe import MultClassPipeline
from MultClassPipe2 import MultClassPipeline2
from loopity_loop import MultClassPipeSKFLoop
from MultClassPipe3 import MultClassPipeSKFCV
gene = 'pncA'
drug = 'pyrazinamide'
#==============
# directories
#==============
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'
#=======
# input
#=======
infile_ml1 = outdir + gene.lower() + '_merged_df3.csv'
#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
my_df = pd.read_csv(infile_ml1)
my_df.dtypes
my_df_cols = my_df.columns
geneL_basic = ['pnca']
geneL_na = ['gid']
geneL_na_ppi2 = ['rpob']
geneL_ppi2 = ['alr', 'embb', 'katg']
#%% get cols
mycols = my_df.columns
# change active_aa_pos from numeric to categorical (object)
num_type = ['int64', 'float64']
cat_type = ['object', 'bool']
if my_df['active_aa_pos'].dtype in num_type:
my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object)
my_df['active_aa_pos'].dtype
# FIXME: if this is not structural, remove from source..
# Drop NA where numerical cols have them
if gene.lower() in geneL_na_ppi2:
#D1148 get rid of
na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
my_df = my_df.drop(index=na_index)
# FIXME: either impute or remove!
# for embb (L114M, F115L, V123L, V125I, V131M) delete for now
if gene.lower() in ['embb']:
na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)]
my_df = my_df.drop(index=na_index)
#%%============================================================================
# Target1: mutation_info_labels, convert to binary numeric labels
dm_om_map = {'DM': 1, 'OM': 0} # pnca, OM is minority, other genes: DM is minority
my_df['mutation_class'] = my_df['mutation_info_labels'].map(dm_om_map)
my_df['mutation_class'].value_counts()
my_df['mutation_info_labels'].value_counts()
#%%
# GET X
common_cols_stabiltyN = ['ligand_distance'
, 'ligand_affinity_change'
, 'duet_stability_change'
, 'ddg_foldx'
, 'deepddg'
, 'ddg_dynamut2']
# Build stability columns ~ gene
if gene.lower() in geneL_basic:
x_stabilityN = common_cols_stabiltyN
if gene.lower() in geneL_ppi2:
x_stabilityN = common_cols_stabiltyN + ['mcsm_ppi2_affinity'
, 'interface_dist']
if gene.lower() in geneL_na:
x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity']
if gene.lower() in geneL_na_ppi2:
x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
X_strFN = ['asa'
, 'rsa'
, 'kd_values'
, 'rd_values']
X_evolFN = ['consurf_score'
, 'snap2_score'
, 'snap2_accuracy_pc']
# X_genomicFN = ['af'
# , 'or_mychisq'
# , 'or_logistic'
# , 'or_fisher'
# , 'pval_fisher']
#%% Construct numerical and categorical column names
numerical_FN = x_stabilityN + X_strFN + X_evolFN
# separate ones for foldx?
categorical_FN = ['ss_class'
, 'wt_prop_water'
# , 'lineage_labels' # misleading if using merged_df3
, 'mut_prop_water'
, 'wt_prop_polarity'
, 'mut_prop_polarity'
, 'wt_calcprop'
, 'mut_calcprop'
, 'active_aa_pos']
#%% extracting dfs based on numerical, categorical column names
#----------------------------------
# WITHOUT the target var included
#----------------------------------
num_df = my_df[numerical_FN]
num_df.shape
cat_df = my_df[categorical_FN]
cat_df.shape
all_df = my_df[numerical_FN + categorical_FN]
all_df.shape
#------------------------------
# WITH the target var included:
#'wtgt': with target
#------------------------------
num_df_wtgt = my_df[numerical_FN + ['mutation_class']]
num_df_wtgt.shape
cat_df_wtgt = my_df[categorical_FN + ['mutation_class']]
cat_df_wtgt.shape
all_df_wtgt = my_df[numerical_FN + categorical_FN + ['mutation_class']]
all_df_wtgt.shape
#%%
#%% Get train-test split and scoring functions
# X = num_df_wtgt[numerical_FN]
# y = num_df_wtgt['mutation_class']
# X_train, X_test, y_train, y_test = train_test_split(X
# , y
# , test_size = 0.33
# , **rs
# , shuffle = True
# , stratify = y)

itertools.py (new file, 27 lines)
@@ -0,0 +1,27 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 22 17:12:12 2022
@author: tanu
"""
# itertools
# https://datagy.io/python-combinations-of-a-list/
from itertools import combinations
sample_list = ['a', 'b', 'c']
list_combinations = list()
for n in range(len(sample_list) + 1):
list_combinations += list(combinations(sample_list, n))
print(list_combinations)
#%%
col_list = num_df_wtgt.columns
col_list = col_list[1:4] # subset of columns to combine
len(col_list)
col_L_combinations = list()
for n in range(1, len(col_list) + 1):
col_L_combinations += list(combinations(col_list, n))
print(col_L_combinations)
print(len(col_L_combinations))
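# Hedged sketch (an assumption, not in the original script): one way these column
# combinations could drive a brute-force feature-subset search, scoring each subset
# with MCC via cross-validation. Assumes num_df_wtgt (with a 'mutation_class' target)
# from imports_unsup.py is in scope; the classifier and fold count are illustrative.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, matthews_corrcoef

subset_scores = {}
for cols in col_L_combinations:
    X_sub = num_df_wtgt[list(cols)]
    y_sub = num_df_wtgt['mutation_class']
    cv_scores = cross_val_score(LogisticRegression(max_iter = 1000)
                                , X_sub, y_sub
                                , cv = 10
                                , scoring = make_scorer(matthews_corrcoef))
    subset_scores[cols] = cv_scores.mean()
# top 5 feature subsets by mean CV MCC
print(sorted(subset_scores.items(), key = lambda kv: kv[1], reverse = True)[:5])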

umap_fs.py (new file, 57 lines)
@@ -0,0 +1,57 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 23 13:36:46 2022
@author: tanu
"""
#https://umap-learn.readthedocs.io/en/latest/auto_examples/plot_feature_extraction_classification.html
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from umap import UMAP
# Make a toy dataset
X, y = make_classification(
n_samples=1000,
n_features=300,
n_informative=250,
n_redundant=0,
n_repeated=0,
n_classes=2,
random_state=1212,
)
# Split the dataset into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
# Classification with a linear SVM
svc = LinearSVC(dual=False, random_state=123)
params_grid = {"C": [10 ** k for k in range(-3, 4)]}
clf = GridSearchCV(svc, params_grid)
clf.fit(X_train, y_train)
print(
"Accuracy on the test set with raw data: {:.3f}".format(clf.score(X_test, y_test))
)
# Transformation with UMAP followed by classification with a linear SVM
umap = UMAP(random_state=456)
pipeline = Pipeline([("umap", umap), ("svc", svc)])
params_grid_pipeline = {
"umap__n_neighbors": [5, 20],
"umap__n_components": [15, 25, 50],
"svc__C": [10 ** k for k in range(-3, 4)],
}
clf_pipeline = GridSearchCV(pipeline, params_grid_pipeline)
clf_pipeline.fit(X_train, y_train)
print(
"Accuracy on the test set with UMAP transformation: {:.3f}".format(
clf_pipeline.score(X_test, y_test)
)
)

unsup_v1.py (new file, 778 lines)
@@ -0,0 +1,778 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 16 16:55:06 2022
@author: tanu
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import cluster, datasets
import warnings
import sys
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, confusion_matrix, adjusted_rand_score
from sklearn.metrics import v_measure_score
# Algo:https://machinelearningmastery.com/clustering-algorithms-with-python/
# K-means
# Hierarchical clustering (HC)
# Gaussian mixture models
# FP-growth
# PCA
# Mean shift
# DBSCAN
# Model assessment (see the hedged metric sketch below):
# Mutual information
# Silhouette score
# v_measure_score
# itertools.combinations()
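# Hedged sketch (an assumption, not in the original script): the assessment metrics
# listed above applied to a toy KMeans clustering; the blob parameters are illustrative.
from sklearn.metrics import adjusted_mutual_info_score, silhouette_score

X_toy, y_toy = datasets.make_blobs(n_samples = 200, centers = 3, random_state = 42)
toy_labels = cluster.KMeans(n_clusters = 3, random_state = 42).fit_predict(X_toy)
print('Adjusted mutual information : %.3f' % adjusted_mutual_info_score(y_toy, toy_labels))
print('Silhouette score            : %.3f' % silhouette_score(X_toy, toy_labels))
print('V-measure                   : %.3f' % v_measure_score(y_toy, toy_labels))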
#%% Example 1: Kmeans, https://coderzcolumn.com/tutorials/machine-learning/unsupervised-learning-clustering-kmeans-using-scikit-learn-sklearn
# For K-means clustering, the model is that all clusters
# have equal, spherical variance.
samples, clusters = datasets.make_blobs(n_samples=250, n_features=2, centers=5, cluster_std=0.7, random_state=12345)
print('Dataset size : ', samples.shape, clusters.shape)
print('Cluster names : ',set(clusters))
with plt.style.context(('ggplot', 'seaborn')):
plt.figure(figsize=(8,6))
for i, c, m in zip(range(5),['red','green','blue','orange','purple'], ['s','+','^','o', 'x']):
plt.scatter(samples[clusters == i,0],samples[clusters == i,1], color=c, marker=m, s=80, alpha = 0.8, label= 'Cluster %d'%i)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Visualizing Dataset')
plt.legend(loc='best')
kmeans = cluster.KMeans(n_clusters=5) # the blobs above have 5 centres and the plots below assume 5 clusters
#kmeans = cluster.KMeans(n_clusters=2)
kmeans.fit(samples)
preds = kmeans.predict(samples)
print('Accuracy : %.3f'%accuracy_score(y_true = clusters, y_pred=preds))
print('Confusion Matrix : \n', confusion_matrix(y_true=clusters, y_pred=preds))
print('Adjusted Accuracy : %.3f'%adjusted_rand_score(labels_true=clusters, labels_pred=preds))
print('Cluster Centers : \n', str(kmeans.cluster_centers_))
print('Sum of squared distances of samples to their closest cluster center : %.2f'%kmeans.inertia_,)
with plt.style.context(('ggplot', 'seaborn')):
plt.figure(figsize=(10,6))
plt.scatter(samples[preds == 0,0],samples[preds == 0,1], color='red', marker='s', s=80, alpha = 0.8, label= 'Cluster 0')
plt.scatter(samples[preds == 1,0],samples[preds == 1,1], color='green', marker='^', s=80, alpha = 0.8, label= 'Cluster 1')
plt.scatter(samples[preds == 2,0],samples[preds == 2,1], color='blue', marker='*', s=80, alpha = 0.8, label= 'Cluster 2')
plt.scatter(samples[preds == 3,0],samples[preds == 3,1], color='orange', marker='o', s=80, alpha = 0.8, label= 'Cluster 3')
plt.scatter(samples[preds == 4,0],samples[preds == 4,1], color='purple', marker='+', s=80, alpha = 0.8, label= 'Cluster 4')
for x,y in zip(samples[preds == 0,0],samples[preds == 0,1]):
plt.plot([kmeans.cluster_centers_[0][0],x],[kmeans.cluster_centers_[0][1],y], color='red')
for x,y in zip(samples[preds == 1,0],samples[preds == 1,1]):
plt.plot([kmeans.cluster_centers_[1][0],x],[kmeans.cluster_centers_[1][1],y], color='green')
for x,y in zip(samples[preds == 2,0],samples[preds == 2,1]):
plt.plot([kmeans.cluster_centers_[2][0],x],[kmeans.cluster_centers_[2][1],y], color='blue')
for x,y in zip(samples[preds == 3,0],samples[preds == 3,1]):
plt.plot([kmeans.cluster_centers_[3][0],x],[kmeans.cluster_centers_[3][1],y], color='orange')
for x,y in zip(samples[preds == 4,0],samples[preds == 4,1]):
plt.plot([kmeans.cluster_centers_[4][0],x],[kmeans.cluster_centers_[4][1],y], color='purple')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Visualizing Predictions & Cluster Centers')
plt.legend(loc='best')
# The Elbow Method: To decide the numbers of cluster i.e. 'k'
plt.figure(figsize=(8,5))
distortions = []
for i in range(1,11):
kmeans = cluster.KMeans(n_clusters=i)
kmeans.fit(samples)
distortions.append(kmeans.inertia_)
print('Distortions (Sum Of Squared Distance of Samples from Closest Cluster Center) : ',distortions)
with plt.style.context(('ggplot', 'seaborn')):
plt.plot(range(1,11), distortions, )
plt.scatter(range(1,11), distortions, color='red', marker='o', s=80)
plt.xlabel('Number Of Clusters')
plt.ylabel('Distortions')
plt.title('The Elbow Method (Num of Clusters vs Distortions)')
plt.xticks(range(1,11));
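# Hedged sketch (an assumption, not in the original script): locating the elbow
# programmatically with the kneed package (imported, commented out, further below);
# requires `pip install kneed`.
from kneed import KneeLocator
kl = KneeLocator(range(1, 11), distortions, curve = 'convex', direction = 'decreasing')
print('Elbow located at k =', kl.elbow)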
#%% Example 1: My data
X_unsup = num_df_wtgt[['ligand_affinity_change'
, 'duet_stability_change']]
kmeans = Pipeline([
#('pca', PCA())
('pre', MinMaxScaler())
, ('clf', KMeans(n_clusters = 2))
])
#kmeans = KMeans(n_clusters = 2)
#kmeans.fit(X_unsup)
kmeans.fit(X_unsup)
y_kmeans = kmeans.predict(X_unsup)
plt.scatter(X_unsup.loc [:, 'ligand_affinity_change']
, X_unsup.loc[:, 'duet_stability_change']
, c = y_kmeans
, s = 50
, cmap = 'viridis')
# cluster centres live on the 'clf' step and are in MinMax-scaled space;
# map them back to the original units before overplotting
centers = kmeans.named_steps['pre'].inverse_transform(kmeans.named_steps['clf'].cluster_centers_)
plt.scatter(centers[:, 0], centers[:, 1]
, c = 'black'
, s = 200
, alpha = 0.5);
plt.show()
#%% Example 2: https://builtin.com/data-science/unsupervised-learning-python
# Loading dataset
iris_df = datasets.load_iris()
# Available methods on dataset
print(dir(iris_df))
# Features
print(iris_df.feature_names)
# Targets
print(iris_df.target)
# Target Names
print(iris_df.target_names)
label = {0: 'red', 1: 'blue', 2: 'green'}
# Dataset Slicing
x_axis = iris_df.data[:, 0] # Sepal Length
y_axis = iris_df.data[:, 2] # Petal Length
# Plotting
plt.scatter(x_axis, y_axis, c=iris_df.target)
plt.show()
# Use Kmeans: Declaring Model
model = KMeans(n_clusters = 3)
# Fitting Model
model.fit(iris_df.data)
# Predicting a single input
predicted_label = model.predict([[7.2, 3.5, 0.8, 1.6]])
# Prediction on the entire data
all_predictions = model.predict(iris_df.data)
# Printing Predictions
print(predicted_label)
print(all_predictions)
#%% Example 2: My data
X_unsup = num_df_wtgt[[#'ligand_affinity_change'
#, 'duet_stability_change'
'ddg_foldx'
,'deepddg']]
y_unsup = num_df_wtgt[['mutation_class']]
# X_train, X_test, y_train, y_test = train_test_split(X_unsup
# , y_unsup
# , test_size = 0.33
# , **rs
# , shuffle = True
# , stratify = y_unsup)
#model = KMeans(n_clusters=2)
model = Pipeline([
#('pca', PCA())
('pre', MinMaxScaler())
, ('clf', KMeans(n_clusters = 2))
])
label = {0: 'blue', 1: 'red'}
model.fit(X_unsup)
#predicted_label = model.predict(X_test) # X_test only exists if the commented-out split above is run
all_predictions = model.predict(X_unsup)
print(all_predictions)
# plot the two features actually used here, coloured by the true class
plt.scatter(X_unsup.loc[:, 'ddg_foldx']
, X_unsup.loc[:, 'deepddg']
, c = y_unsup.loc[:, 'mutation_class'])
from yellowbrick.cluster import KElbowVisualizer
model = KMeans()
visualizer = KElbowVisualizer(model, k=(1,12)).fit(X_unsup)
visualizer.show()
#%% Example 3: My data Vscores, https://www.geeksforgeeks.org/ml-v-measure-for-evaluating-clustering-performance/
# List of V-Measure Scores for different models
v_scores = []
# List of different types of covariance parameters
N_Clusters = [2, 3]
# Building the clustering model
kmeans2 = Pipeline([
#('pca', PCA())
('pre', MinMaxScaler())
# ('pre'), StandardScaler())
, ('clf', KMeans(n_clusters = 2))
])
# Training the clustering model
kmeans2.fit(X_unsup)
# Storing the predicted Clustering labels
labels2 = kmeans2.predict(X_unsup)
# Evaluating the performance
v_scores.append(v_measure_score(y_unsup.loc[:,'mutation_class'], labels2))
# Building the clustering model
kmeans3 = Pipeline([
#('pca', PCA())
('pre', MinMaxScaler())
# ('pre'), StandardScaler())
, ('clf', KMeans(n_clusters = 3))
])
# Training the clustering model
kmeans3.fit(X_unsup)
# Storing the predicted Clustering labels
labels3 = kmeans3.predict(X_unsup)
# Evaluating the performance
v_scores.append(v_measure_score(y_unsup.loc[:,'mutation_class'], labels3))
# Plotting a Bar Graph to compare the models
plt.bar(N_Clusters, v_scores)
plt.xlabel('Number of Clusters')
plt.ylabel('V-Measure Score')
plt.title('Comparison of different Clustering Models')
plt.show()
# Score: silhouette
kmeans_kwargs = {
"init": "random",
"n_init": 10,
"max_iter": 300,
"random_state": 42}
from sklearn.metrics import silhouette_score
#https://realpython.com/k-means-clustering-python/
#import kneed
#from kneed import KneeLocator
scaler = StandardScaler()
X_unsup_scaled = scaler.fit_transform(X_unsup)
silhouette_coefficients = []
# Notice you start at 2 clusters for silhouette coefficient
for k in range(2, 5):
kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
# kmeans = Pipeline([
# # ('pre', MinMaxScaler())
# ('pre', StandardScaler())
# , ('clf', KMeans(n_clusters=k, **kmeans_kwargs))
# ])
kmeans.fit(X_unsup_scaled)
score = silhouette_score(X_unsup_scaled, kmeans.labels_)
silhouette_coefficients.append(score)
plt.style.use("fivethirtyeight")
plt.plot(range(2, 5), silhouette_coefficients)
plt.xticks(range(2, 5))
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Coefficient")
plt.show()
plt.bar(range(2, 5), silhouette_coefficients)
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons
from sklearn.metrics import adjusted_rand_score
# Instantiate k-means and dbscan algorithms
kmeans = KMeans(n_clusters=2)
dbscan = DBSCAN(eps=0.3)
# Fit the algorithms to the features
kmeans.fit(X_unsup_scaled)
dbscan.fit(X_unsup_scaled)
# Compute the silhouette scores for each algorithm
kmeans_silhouette = silhouette_score(
X_unsup_scaled, kmeans.labels_).round(2)
dbscan_silhouette = silhouette_score(
X_unsup_scaled, dbscan.labels_).round(2)
kmeans_silhouette
dbscan_silhouette
ari_kmeans = adjusted_rand_score(y_unsup.iloc[:,0], kmeans.labels_)
ari_dbscan = adjusted_rand_score(y_unsup.iloc[:,0], dbscan.labels_)
round(ari_kmeans, 2)
round(ari_dbscan, 2)
# Crescent plot
fig, (ax1, ax2) = plt.subplots(
1, 2, figsize=(8, 6), sharex=True, sharey=True
)
fig.suptitle(f"Clustering Algorithm Comparison: Crescents", fontsize=16)
fte_colors = {
0: "#008fd5",
1: "#fc4f30",
}
# The k-means plot
km_colors = [fte_colors[label] for label in kmeans.labels_]
ax1.scatter(X_unsup_scaled[:, 0], X_unsup_scaled[:, 1], c=km_colors)
ax1.set_title(
f"k-means\nSilhouette: {kmeans_silhouette}", fontdict={"fontsize": 12}
)
# The dbscan plot
db_colors = [fte_colors.get(label, "#999999") for label in dbscan.labels_] # grey for DBSCAN noise points (label -1)
ax2.scatter(X_unsup_scaled[:, 0], X_unsup_scaled[:, 1], c=db_colors)
ax2.set_title(
f"DBSCAN\nSilhouette: {dbscan_silhouette}", fontdict={"fontsize": 12}
)
plt.show()
#%% Example 4: Machinelearning mastery, https://machinelearningmastery.com/clustering-algorithms-with-python/
from sklearn.cluster import AffinityPropagation
from matplotlib import pyplot
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import Birch
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import OPTICS
from sklearn.cluster import SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
# Affinity propagation
# takes as input measures of similarity between pairs of data points. Real-valued
# messages are exchanged between data points until a high-quality set of exemplars
# and corresponding clusters gradually emerges.
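# Hedged sketch (an assumption, not in the original script): a minimal standalone
# AffinityPropagation run on the toy blob data from Example 1, just to show the
# interface; the damping value is illustrative.
ap_model = AffinityPropagation(damping=0.9)
ap_model.fit(samples) # `samples` were generated in Example 1 above
print('Exemplars found :', len(ap_model.cluster_centers_indices_))
print('First 10 labels :', ap_model.labels_[:10])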
#=============
XA = np.array(X_unsup)
# define the model
#model = AffinityPropagation(damping=0.9)
model = Pipeline([
#('pca', PCA(n_components = 2))
('pre', MinMaxScaler())
#('pre', StandardScaler())
#, ('clf', AffinityPropagation(damping=0.9))
, ('clf', AgglomerativeClustering(n_clusters=2)) #y
#, ('clf', Birch(threshold=0.01, n_clusters=2)) #y
#, ('clf', DBSCAN(eps=0.30, min_samples=9) ) #n
#, ('clf', KMeans(n_clusters=2)) #y
#, ('clf', MiniBatchKMeans(n_clusters=2))
#, ('clf', OPTICS(eps=0.8, min_samples=10))
#, ('clf', SpectralClustering(n_clusters=2))
# , ('clf', GaussianMixture(n_components=2))
])
model
# fit the model
#model.fit(X_unsup)
#yhat = model.predict(X_unsup)
yhat = model.fit_predict(X_unsup)
# retrieve unique clusters
clusters = unique(yhat)
# create scatter plot for samples from each cluster
for cluster in clusters:
print(cluster)
# get row indexes for samples with this cluster
row_ix = where(yhat == cluster)
print(row_ix)
# create scatter of these samples
#pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
pyplot.scatter(XA[row_ix, 0], XA[row_ix, 1])
# show the plot
pyplot.show()
#%%Example 5:https://github.com/AntonsRuberts/datascience_marketing/blob/master/KMeans_vs_KPrototypes.ipynb
import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from datetime import datetime
from tqdm import tqdm
from sklearn.preprocessing import PowerTransformer
import umap
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from scipy import stats
from sklearn.cluster import KMeans
from kmodes.kprototypes import KPrototypes
from lightgbm import LGBMClassifier
import shap
from sklearn.model_selection import cross_val_score
import lightgbm
from IPython.display import display # display() is used below; needed when running outside a notebook
full_data = pd.read_csv('/home/tanu/git/ML_AI_training/ml_data/ga_customers.csv')
full_data.head()
#Preprocessing numerical
numerical = full_data.select_dtypes(exclude='object')
for c in numerical.columns:
print(c)
pt = PowerTransformer()
numerical.loc[:, c] = pt.fit_transform(np.array(numerical[c]).reshape(-1, 1))
##preprocessing categorical
categorical = full_data.select_dtypes(include='object')
categorical.head()
for col in categorical.columns:
#print('-' * 40 + col + '-' * 40 , end=' - ')
display(categorical[col].value_counts().head(10))
categorical = pd.get_dummies(categorical)
categorical_weight = len(full_data.select_dtypes(include='object').columns) / full_data.shape[1]
#Embedding numerical & categorical
# fit1 = PCA(n_components=2).fit(numerical)
# fit2 = PCA(n_components=2).fit(categorical)
fit1 = umap.UMAP(metric='l2').fit(numerical)
fit2 = umap.UMAP(metric='dice').fit(categorical)
intersection = umap.umap_.general_simplicial_set_intersection(fit1.graph_, fit2.graph_, weight=categorical_weight)
intersection = umap.umap_.reset_local_connectivity(intersection)
#https://github.com/lmcinnes/umap/issues/561
embedding = umap.umap_.simplicial_set_embedding(fit1._raw_data
, intersection
, fit1.n_components
, fit1._initial_alpha
, fit1._a
, fit1._b
, fit1.repulsion_strength
, fit1.negative_sample_rate
, 200 # n_epochs
, 'random' # init
, np.random # random_state
, fit1.metric
, fit1._metric_kwds
, False
, densmap_kwds = {}
, output_dens = False)
plt.figure(figsize=(20, 10))
plt.scatter(*np.array(embedding)[0].T, s=2, cmap='Spectral', alpha=1.0)
plt.show()
#One-Hot-Encoding
data = pd.get_dummies(full_data)
#Pre-processing
for c in data.columns:
pt = PowerTransformer()
data.loc[:, c] = pt.fit_transform(np.array(data[c]).reshape(-1, 1))
#Actual Clustering
kmeans = KMeans(n_clusters=15).fit(data)
kmeans_labels = kmeans.labels_
#OPTIONAL: Elbow plot with inertia
#Elbow method to choose the optimal number of clusters
# sse = {}
# for k in tqdm(range(2, 50)):
# kmeans = KMeans(n_clusters=k, max_iter=1000).fit(data)
# sse[k] = kmeans.inertia_ # Inertia: Sum of distances of samples to their closest cluster center
# fig = go.Figure(data=go.Scatter(x=list(sse.keys()), y=list(sse.values())))
# fig.show()
# K-Prototypes
kprot_data = full_data.copy()
#Pre-processing
for c in full_data.select_dtypes(exclude='object').columns:
pt = PowerTransformer()
kprot_data[c] = pt.fit_transform(np.array(kprot_data[c]).reshape(-1, 1))
#categorical_columns = [0, 4, 5, 7, 11] #make sure to specify correct indices
categorical_columns = [1, 5, 6, 8, 12] #make sure to specify correct indices
#Actual clustering
kproto = KPrototypes(n_clusters= 15, init='Cao', n_jobs = 4)
clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)
#Prints the count of each cluster group
pd.Series(clusters).value_counts()
#OPTIONAL: Elbow plot with cost (will take a LONG time)
costs = []
n_clusters = []
clusters_assigned = []
for i in tqdm(range(2, 25)):
#for i in tqdm(range(2, 10)):
try:
kproto = KPrototypes(n_clusters= i, init='Cao', verbose=2, n_jobs = 10)
clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)
costs.append(kproto.cost_)
n_clusters.append(i)
clusters_assigned.append(clusters)
except:
print(f"Can't cluster with {i} clusters")
fig = go.Figure(data=go.Scatter(x=n_clusters, y=costs ))
fig.show()
# Visual Evaluation: Kmeans
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
#scatter = ax.scatter(embedding[:, 0], embedding[:, 1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)
scatter = ax.scatter(embedding[0][:,0], embedding[0][:,1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)
# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=15),
loc="lower left", title="Classes")
ax.add_artist(legend1)
# Visual Evaluation: K-Prototypes
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
scatter = ax.scatter(embedding[0][:,0], embedding[0][:,1], s=2, c=clusters, cmap='tab20b', alpha=1.0) # colour by the K-Prototypes labels, not the KMeans ones
# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=15),
loc="lower left", title="Classes")
ax.add_artist(legend1)
# Evaluation by Classification
#Setting the objects to category
lgbm_data = full_data.copy()
for c in lgbm_data.select_dtypes(include='object'):
lgbm_data[c] = lgbm_data[c].astype('category')
#KMeans clusters
clf_km = LGBMClassifier(colsample_bytree=0.8)
cv_scores_km = cross_val_score(clf_km, lgbm_data, kmeans_labels, scoring='f1_weighted')
print(f'CV F1 score for K-Means clusters is {np.mean(cv_scores_km)}')
#Fit the model
clf_km.fit(lgbm_data, kmeans_labels)
#SHAP values
explainer_km = shap.TreeExplainer(clf_km)
shap_values_km = explainer_km.shap_values(lgbm_data)
shap.summary_plot(shap_values_km, lgbm_data, plot_type="bar", plot_size=(15, 10))
#K-Prototypes
clf_kp = LGBMClassifier(colsample_bytree=0.8)
#cv_scores_kp = cross_val_score(clf_kp, lgbm_data, proto_clusters, scoring='f1_weighted')
cv_scores_kp = cross_val_score(clf_kp, lgbm_data, clusters, scoring='f1_weighted')
print(f'CV F1 score for K-Prototypes clusters is {np.mean(cv_scores_kp)}')
#clf_kp.fit(lgbm_data, proto_clusters)
clf_kp.fit(lgbm_data, clusters)
explainer_kp = shap.TreeExplainer(clf_kp)
shap_values_kp = explainer_kp.shap_values(lgbm_data)
shap.summary_plot(shap_values_kp, lgbm_data, plot_type="bar", plot_size=(15, 10))
#%%Example 5: My data
# FIXME: clusters and proto_clusters?
full_data = all_df.copy()
full_data.head()
full_data.shape
#Preprocessing numerical
numerical = full_data.select_dtypes(exclude='object')
numerical.shape
for c in numerical.columns:
print(c)
pt = PowerTransformer()
numerical.loc[:, c] = pt.fit_transform(np.array(numerical[c]).reshape(-1, 1))
##preprocessing categorical
categorical = full_data.select_dtypes(include='object')
categorical.head()
for col in categorical.columns:
#print('-' * 40 + col + '-' * 40 , end=' - ')
display(categorical[col].value_counts().head(10))
categorical = pd.get_dummies(categorical)
categorical_weight = len(full_data.select_dtypes(include='object').columns) / full_data.shape[1]
#Embedding numerical & categorical
# fit1 = PCA(n_components=2).fit(numerical)
# fit2 = PCA(n_components=2).fit(categorical)
fit1 = umap.UMAP(metric='l2').fit(numerical)
fit2 = umap.UMAP(metric='dice').fit(categorical)
intersection = umap.umap_.general_simplicial_set_intersection(fit1.graph_, fit2.graph_, weight=categorical_weight)
intersection = umap.umap_.reset_local_connectivity(intersection)
#https://github.com/lmcinnes/umap/issues/561
embedding = umap.umap_.simplicial_set_embedding(fit1._raw_data
, intersection
, fit1.n_components
, fit1._initial_alpha
, fit1._a
, fit1._b
, fit1.repulsion_strength
, fit1.negative_sample_rate
, 200
, 'random'
, np.random
, fit1.metric
, fit1._metric_kwds
, False
, densmap_kwds = {}
, output_dens = False)
plt.figure(figsize=(20, 10))
plt.scatter(*np.array(embedding)[0].T, s=2, cmap='Spectral', alpha=1.0)
plt.show()
#One-Hot-Encoding
data = pd.get_dummies(full_data)
#Pre-processing
for c in data.columns:
pt = PowerTransformer()
data.loc[:, c] = pt.fit_transform(np.array(data[c]).reshape(-1, 1))
#Actual Clustering
kmeans = KMeans(n_clusters=2).fit(data)
kmeans_labels = kmeans.labels_
#OPTIONAL: Elbow plot with inertia
#Elbow method to choose the optimal number of clusters
# sse = {}
# for k in tqdm(range(2, 50)):
# kmeans = KMeans(n_clusters=k, max_iter=1000).fit(data)
# sse[k] = kmeans.inertia_ # Inertia: Sum of distances of samples to their closest cluster center
# fig = go.Figure(data=go.Scatter(x=list(sse.keys()), y=list(sse.values())))
# fig.show()
# K-Prototypes
kprot_data = full_data.copy()
#Pre-processing
for c in full_data.select_dtypes(exclude='object').columns:
pt = PowerTransformer()
kprot_data[c] = pt.fit_transform(np.array(kprot_data[c]).reshape(-1, 1))
#categorical_columns = [0, 4, 5, 7, 11] #make sure to specify correct indices
#categorical_columns = [1, 5, 6, 8, 12] #make sure to specify correct indices
categorical_columns = []
for col in cat_df.columns:
print(col)
#categ_i += all_df.columns.get_loc(col)
categorical_columns.append(full_data.columns.get_loc(col))
print(categorical_columns)
print(len(categorical_columns))
#Actual clustering
kproto = KPrototypes(n_clusters= 2, init='Cao', n_jobs = 10)
proto_clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)
clusters = proto_clusters # same labels under both names; downstream code uses either
#Prints the count of each cluster group
pd.Series(clusters).value_counts()
#OPTIONAL: Elbow plot with cost (will take a LONG time)
costs = []
n_clusters = []
clusters_assigned = []
for i in tqdm(range(2, 25)):
#for i in tqdm(range(2, 10)):
#print(i)
try:
kproto = KPrototypes(n_clusters= i, init='Cao', verbose=2, n_jobs = 10)
#clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)
proto_clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)
costs.append(kproto.cost_)
n_clusters.append(i)
clusters_assigned.append(proto_clusters)
except Exception:
print(f"Can't cluster with {i} clusters")
fig = go.Figure(data=go.Scatter(x=n_clusters, y=costs ))
fig.show()
# Visual Evaluation: Kmeans
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
#scatter = ax.scatter(embedding[:, 0], embedding[:, 1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)
scatter = ax.scatter(embedding[0][:,0], embedding[0][:,1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)
# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=15),
loc="lower left", title="Classes")
ax.add_artist(legend1)
# Visual Evaluation: K-Prototypes
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
scatter = ax.scatter(embedding[0][:,0], embedding[0][:,1], s=2, c=proto_clusters, cmap='tab20b', alpha=1.0) # colour by the K-Prototypes labels
# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=2), #15
loc="lower left", title="Classes")
ax.add_artist(legend1)
# Evaluation by Classification
#Setting the objects to category
lgbm_data = full_data.copy()
for c in lgbm_data.select_dtypes(include='object'):
lgbm_data[c] = lgbm_data[c].astype('category')
#KMeans clusters
clf_km = LGBMClassifier(colsample_bytree=0.8)
cv_scores_km = cross_val_score(clf_km, lgbm_data, kmeans_labels, scoring='f1_weighted')
print(f'CV F1 score for K-Means clusters is {np.mean(cv_scores_km)}')
#Fit the model
clf_km.fit(lgbm_data, kmeans_labels)
#SHAP values
explainer_km = shap.TreeExplainer(clf_km)
shap_values_km = explainer_km.shap_values(lgbm_data)
shap.summary_plot(shap_values_km, lgbm_data, plot_type="bar", plot_size=(15, 10))
#K-Prototypes
clf_kp = LGBMClassifier(colsample_bytree=0.8)
cv_scores_kp = cross_val_score(clf_kp, lgbm_data, proto_clusters, scoring='f1_weighted')
print(f'CV F1 score for K-Prototypes clusters is {np.mean(cv_scores_kp)}')
clf_kp.fit(lgbm_data, proto_clusters)
explainer_kp = shap.TreeExplainer(clf_kp)
shap_values_kp = explainer_kp.shap_values(lgbm_data)
shap.summary_plot(shap_values_kp, lgbm_data, plot_type="bar", plot_size=(15, 10))