diff --git a/imports_unsup.py b/imports_unsup.py new file mode 100644 index 0000000..5670ede --- /dev/null +++ b/imports_unsup.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Sun Mar 6 13:41:54 2022 + +@author: tanu +""" +import os, sys +import pandas as pd +import numpy as np +import pprint as pp +from copy import deepcopy +import sklearn +from sklearn import linear_model +from sklearn.linear_model import LogisticRegression, LinearRegression +from sklearn.naive_bayes import BernoulliNB +from sklearn.neighbors import KNeighborsClassifier +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier +from sklearn.ensemble import AdaBoostClassifier +from sklearn.ensemble import GradientBoostingClassifier +from sklearn.neural_network import MLPClassifier +from xgboost import XGBClassifier +from sklearn.naive_bayes import MultinomialNB +from sklearn.linear_model import SGDClassifier +from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder + +from sklearn.compose import ColumnTransformer +from sklearn.compose import make_column_transformer + +from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score +from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef +from sklearn.metrics import jaccard_score + +from sklearn.metrics import make_scorer +from sklearn.metrics import classification_report + +from sklearn.metrics import average_precision_score + +from sklearn.model_selection import cross_validate +from sklearn.model_selection import train_test_split +from sklearn.model_selection import StratifiedKFold + +from sklearn.pipeline import Pipeline +from sklearn.pipeline import make_pipeline + +from sklearn.feature_selection import RFE +from sklearn.feature_selection import RFECV +import itertools +import seaborn as sns +import matplotlib.pyplot as plt +import numpy as np + +print(np.__version__) +print(pd.__version__) +from statistics import mean, stdev, median, mode + +from imblearn.over_sampling import RandomOverSampler +from imblearn.over_sampling import SMOTE +from imblearn.pipeline import Pipeline +#from sklearn.datasets import make_classification +from sklearn.model_selection import cross_validate, cross_val_score +from sklearn.model_selection import RepeatedStratifiedKFold +from sklearn.ensemble import AdaBoostClassifier +from imblearn.combine import SMOTEENN +from imblearn.under_sampling import EditedNearestNeighbours + +from sklearn.model_selection import GridSearchCV +from sklearn.base import BaseEstimator + +from sklearn import cluster, datasets +from sklearn.cluster import KMeans +from sklearn.metrics import accuracy_score, confusion_matrix, adjusted_rand_score + +print("Python Version : ",sys.version) +print("Python Version : ",sys.version) +print("Scikit-Learn Version : ",sklearn.__version__) +#warnings.filterwarnings('ignore') ## We'll silent future warnings using this command. +np.set_printoptions(precision=3) +#fits plot inside of current notebook. 
+#%matplotlib inline + +#%% +scoring_fn = ({'accuracy' : make_scorer(accuracy_score) + , 'fscore' : make_scorer(f1_score) + , 'mcc' : make_scorer(matthews_corrcoef) + , 'precision' : make_scorer(precision_score) + , 'recall' : make_scorer(recall_score) + , 'roc_auc' : make_scorer(roc_auc_score) + , 'jcc' : make_scorer(jaccard_score) + }) + +rs = {'random_state': 42} +njobs = {'n_jobs': 10} +skf_cv = StratifiedKFold(n_splits = 10 + #, shuffle = False, random_state= None) + , shuffle = True,**rs) +rskf_cv = RepeatedStratifiedKFold(n_splits = 10 + , n_repeats=3 + #, shuffle = False, random_state= None) + #, shuffle = True + ,**rs) +#my_mcc = make_scorer({'mcc':make_scorer(matthews_corrcoef}) +mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)} + +#%% +homedir = os.path.expanduser("~") +os.chdir(homedir + "/git/ML_AI_training/") + +# my function +#from MultClassPipe import MultClassPipeline +from MultClassPipe2 import MultClassPipeline2 +from loopity_loop import MultClassPipeSKFLoop +from MultClassPipe3 import MultClassPipeSKFCV + + +gene = 'pncA' +drug = 'pyrazinamide' + +#============== +# directories +#============== +datadir = homedir + '/git/Data/' +indir = datadir + drug + '/input/' +outdir = datadir + drug + '/output/' + +#======= +# input +#======= +infile_ml1 = outdir + gene.lower() + '_merged_df3.csv' +#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv' + +my_df = pd.read_csv(infile_ml1) +my_df.dtypes +my_df_cols = my_df.columns + +geneL_basic = ['pnca'] +geneL_na = ['gid'] +geneL_na_ppi2 = ['rpob'] +geneL_ppi2 = ['alr', 'embb', 'katg'] +#%% get cols +mycols = my_df.columns + +# change from numberic to +num_type = ['int64', 'float64'] +cat_type = ['object', 'bool'] + +if my_df['active_aa_pos'].dtype in num_type: + my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object) + my_df['active_aa_pos'].dtype + +# FIXME: if this is not structural, remove from source.. +# Drop NA where numerical cols have them +if gene.lower() in geneL_na_ppi2: + #D1148 get rid of + na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)] + my_df = my_df.drop(index=na_index) + +# FIXME: either impute or remove! +# for embb (L114M, F115L, V123L, V125I, V131M) delete for now +if gene.lower() in ['embb']: + na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)] + my_df = my_df.drop(index=na_index) +#%%============================================================================ + +# Target1: mutation_info_labels, convert to +dm_om_map = {'DM': 1, 'OM': 0} # pnca, OM is minority, other genes: DM is minority +my_df['mutation_class'] = my_df['mutation_info_labels'].map(dm_om_map) +my_df['mutation_class'].value_counts() +my_df['mutation_info_labels']. 
value_counts() + +#%% +# GET X +common_cols_stabiltyN = ['ligand_distance' + , 'ligand_affinity_change' + , 'duet_stability_change' + , 'ddg_foldx' + , 'deepddg' + , 'ddg_dynamut2'] + +# Build stability columns ~ gene +if gene.lower() in geneL_basic: + x_stabilityN = common_cols_stabiltyN + +if gene.lower() in geneL_ppi2: + x_stabilityN = common_cols_stabiltyN + ['mcsm_ppi2_affinity' + , 'interface_dist'] +if gene.lower() in geneL_na: + x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + +if gene.lower() in geneL_na_ppi2: + x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist'] + +X_strFN = ['asa' + , 'rsa' + , 'kd_values' + , 'rd_values'] + +X_evolFN = ['consurf_score' + , 'snap2_score' + , 'snap2_accuracy_pc'] + +# X_genomicFN = ['af' +# , 'or_mychisq' +# , 'or_logistic' +# , 'or_fisher' +# , 'pval_fisher'] + +#%% Construct numerical and categorical column names +numerical_FN = x_stabilityN + X_strFN + X_evolFN + +# separate ones for foldx? +categorical_FN = ['ss_class' + , 'wt_prop_water' + # , 'lineage_labels' # misleading if using merged_df3 + , 'mut_prop_water' + , 'wt_prop_polarity' + , 'mut_prop_polarity' + , 'wt_calcprop' + , 'mut_calcprop' + , 'active_aa_pos'] + +#%% extracting dfs based on numerical, categorical column names +#---------------------------------- +# WITHOUT the target var included +#---------------------------------- +num_df = my_df[numerical_FN] +num_df.shape + +cat_df = my_df[categorical_FN] +cat_df.shape + +all_df = my_df[numerical_FN + categorical_FN] +all_df.shape + +#------------------------------ +# WITH the target var included: + #'wtgt': with target +#------------------------------ +num_df_wtgt = my_df[numerical_FN + ['mutation_class']] +num_df_wtgt.shape + +cat_df_wtgt = my_df[categorical_FN + ['mutation_class']] +cat_df_wtgt.shape + +all_df_wtgt = my_df[numerical_FN + categorical_FN + ['mutation_class']] +all_df_wtgt.shape + +#%% +#%% Get train-test split and scoring functions +# X = num_df_wtgt[numerical_FN] +# y = num_df_wtgt['mutation_class'] + +# X_train, X_test, y_train, y_test = train_test_split(X +# , y +# , test_size = 0.33 +# , **rs +# , shuffle = True +# , stratify = y) + diff --git a/itertools.py b/itertools.py new file mode 100644 index 0000000..b86e2b8 --- /dev/null +++ b/itertools.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Mar 22 17:12:12 2022 + +@author: tanu +""" +# itertools +# https://datagy.io/python-combinations-of-a-list/ +from itertools import combinations +sample_list = ['a', 'b', 'c'] +list_combinations = list() +for n in range(len(sample_list) + 1): + list_combinations += list(combinations(sample_list, n)) +print(list_combinations) + + +#%% +col_list = num_df_wtgt.columns +col_list = [1:4] +len(col_list) + +col_L_combinations = list() +for n in range(1, len(col_list) + 1): + col_L_combinations += list(combinations(col_list, n)) +print(col_L_combinations) +print(len(col_L_combinations)) diff --git a/umap_fs.py b/umap_fs.py new file mode 100644 index 0000000..0bf2a51 --- /dev/null +++ b/umap_fs.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Mar 23 13:36:46 2022 + +@author: tanu +""" +#https://umap-learn.readthedocs.io/en/latest/auto_examples/plot_feature_extraction_classification.html +from sklearn.datasets import make_classification +from sklearn.model_selection import train_test_split, GridSearchCV +from sklearn.pipeline import Pipeline +from sklearn.svm import LinearSVC + +from umap 
import UMAP + +# Make a toy dataset +X, y = make_classification( + n_samples=1000, + n_features=300, + n_informative=250, + n_redundant=0, + n_repeated=0, + n_classes=2, + random_state=1212, +) + +# Split the dataset into a training set and a test set +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=42 +) + +# Classification with a linear SVM +svc = LinearSVC(dual=False, random_state=123) +params_grid = {"C": [10 ** k for k in range(-3, 4)]} +clf = GridSearchCV(svc, params_grid) +clf.fit(X_train, y_train) +print( + "Accuracy on the test set with raw data: {:.3f}".format(clf.score(X_test, y_test)) +) + +# Transformation with UMAP followed by classification with a linear SVM +umap = UMAP(random_state=456) +pipeline = Pipeline([("umap", umap), ("svc", svc)]) +params_grid_pipeline = { + "umap__n_neighbors": [5, 20], + "umap__n_components": [15, 25, 50], + "svc__C": [10 ** k for k in range(-3, 4)], +} + + +clf_pipeline = GridSearchCV(pipeline, params_grid_pipeline) +clf_pipeline.fit(X_train, y_train) +print( + "Accuracy on the test set with UMAP transformation: {:.3f}".format( + clf_pipeline.score(X_test, y_test) + ) +) diff --git a/unsup_v1.py b/unsup_v1.py new file mode 100644 index 0000000..4110ce0 --- /dev/null +++ b/unsup_v1.py @@ -0,0 +1,778 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Mar 16 16:55:06 2022 + +@author: tanu +""" +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import sklearn +from sklearn import cluster, datasets +import warnings +import sys +from sklearn.cluster import KMeans +from sklearn.metrics import accuracy_score, confusion_matrix, adjusted_rand_score +from sklearn.metrics import v_measure_score +# Algo:https://machinelearningmastery.com/clustering-algorithms-with-python/ +# K-means +# HC +# Gaussian mixed mixture +# FP growth +# PCA +# Meanshift +# DBScan + +# Model assessment: +# Mututal information +# Silloute score +# v_measure_score + +# Itertools.combinations() + +#%% Example 1: Kmeans, https://coderzcolumn.com/tutorials/machine-learning/unsupervised-learning-clustering-kmeans-using-scikit-learn-sklearn + +# For K-means clustering, the model is that all clusters +# have equal, spherical variance. 
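+# A minimal sketch of that assumption (values loosely follow the scikit-learn
+# "k-means assumptions" example and are illustrative, not tuned): shear isotropic
+# blobs with a linear transform and K-means, which prefers roughly spherical,
+# equal-variance clusters, typically recovers the stretched groups less cleanly.
+from sklearn.datasets import make_blobs
+
+X_iso, y_iso = make_blobs(n_samples=1500, random_state=170)
+X_aniso = X_iso @ np.array([[0.6, -0.64], [-0.41, 0.85]])  # shear the blobs
+
+for name, X_demo in [('isotropic', X_iso), ('anisotropic', X_aniso)]:
+    demo_labels = KMeans(n_clusters=3, random_state=0).fit_predict(X_demo)
+    print('%s blobs, ARI vs true labels: %.2f' % (name, adjusted_rand_score(y_iso, demo_labels)))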
+ +samples, clusters = datasets.make_blobs(n_samples=250, n_features=2, centers=5, cluster_std=0.7, random_state=12345) +print('Dataset size : ', samples.shape, clusters.shape) +print('Cluster names : ',set(clusters)) +with plt.style.context(('ggplot', 'seaborn')): + plt.figure(figsize=(8,6)) + for i, c, m in zip(range(5),['red','green','blue','orange','purple'], ['s','+','^','o', 'x']): + plt.scatter(samples[clusters == i,0],samples[clusters == i,1], color=c, marker=m, s=80, alpha = 0.8, label= 'Cluster %d'%i) + + plt.xlabel('Feature 1') + plt.ylabel('Feature 2') + plt.title('Visualizing Dataset') + plt.legend(loc='best') + +#kmeans = cluster.KMeans(n_clusters=5) +kmeans = cluster.KMeans(n_clusters=2) + +kmeans.fit(samples) +preds = kmeans.predict(samples) + +print('Accuracy : %.3f'%accuracy_score(y_true = clusters, y_pred=preds)) +print('Confusion Matrix : \n', confusion_matrix(y_true=clusters, y_pred=preds)) +print('Adjusted Accuracy : %.3f'%adjusted_rand_score(labels_true=clusters, labels_pred=preds)) +print('Cluster Centers : \n', str(kmeans.cluster_centers_)) + +print('Sum of squared distances of samples to their closest cluster center : %.2f'%kmeans.inertia_,) +with plt.style.context(('ggplot', 'seaborn')): + plt.figure(figsize=(10,6)) + + plt.scatter(samples[preds == 0,0],samples[preds == 0,1], color='red', marker='s', s=80, alpha = 0.8, label= 'Cluster 0') + plt.scatter(samples[preds == 1,0],samples[preds == 1,1], color='green', marker='^', s=80, alpha = 0.8, label= 'Cluster 1') + plt.scatter(samples[preds == 2,0],samples[preds == 2,1], color='blue', marker='*', s=80, alpha = 0.8, label= 'Cluster 2') + plt.scatter(samples[preds == 3,0],samples[preds == 3,1], color='orange', marker='o', s=80, alpha = 0.8, label= 'Cluster 3') + plt.scatter(samples[preds == 4,0],samples[preds == 4,1], color='purple', marker='+', s=80, alpha = 0.8, label= 'Cluster 4') + + for x,y in zip(samples[preds == 0,0],samples[preds == 0,1]): + plt.plot([kmeans.cluster_centers_[0][0],x],[kmeans.cluster_centers_[0][1],y], color='red') + for x,y in zip(samples[preds == 1,0],samples[preds == 1,1]): + plt.plot([kmeans.cluster_centers_[1][0],x],[kmeans.cluster_centers_[1][1],y], color='green') + for x,y in zip(samples[preds == 2,0],samples[preds == 2,1]): + plt.plot([kmeans.cluster_centers_[2][0],x],[kmeans.cluster_centers_[2][1],y], color='blue') + for x,y in zip(samples[preds == 3,0],samples[preds == 3,1]): + plt.plot([kmeans.cluster_centers_[3][0],x],[kmeans.cluster_centers_[3][1],y], color='orange') + for x,y in zip(samples[preds == 4,0],samples[preds == 4,1]): + plt.plot([kmeans.cluster_centers_[4][0],x],[kmeans.cluster_centers_[4][1],y], color='purple') + + plt.xlabel('Feature 1') + plt.ylabel('Feature 2') + plt.title('Visualizing Predictions & Cluster Centers') + plt.legend(loc='best') + +# The Elbow Method: To decide the numbers of cluster i.e. 
'k' +plt.figure(figsize=(8,5)) +distortions = [] +for i in range(1,11): + kmeans = cluster.KMeans(n_clusters=i) + kmeans.fit(samples) + distortions.append(kmeans.inertia_) + +print('Distortions (Sum Of Squared Distance of Samples from Closest Cluster Center) : ',distortions) + +with plt.style.context(('ggplot', 'seaborn')): + plt.plot(range(1,11), distortions, ) + plt.scatter(range(1,11), distortions, color='red', marker='o', s=80) + plt.xlabel('Number Of Clusters') + plt.ylabel('Distortions') + plt.title('The Elbow Method (Num of Clusters vs Distortions)') + plt.xticks(range(1,11)); + +#%% Example 1: My data +X_unsup = num_df_wtgt[['ligand_affinity_change' + , 'duet_stability_change']] + +kmeans = Pipeline([ + #('pca', PCA()) + ('pre', MinMaxScaler()) + , ('clf', KMeans(n_clusters = 2)) +]) + +#kmeans = KMeans(n_clusters = 2) +#kmeans.fit(X_unsup) +kmeans.fit(X_unsup) +y_kmeans = kmeans.predict(X_unsup) + +plt.scatter(X_unsup.loc [:, 'ligand_affinity_change'] + , X_unsup.loc[:, 'duet_stability_change'] + , c = y_kmeans + , s = 50 + , cmap = 'viridis') + +centers = kmeans.cluster_centers_ +plt.scatter(centers[:, 0], centers[:, 1] + , c = 'black' + , s = 200 + , alpha = 0.5); +plt.show() +#%% Example 2: https://builtin.com/data-science/unsupervised-learning-python + +# Loading dataset +iris_df = datasets.load_iris() + +# Available methods on dataset +print(dir(iris_df)) + +# Features +print(iris_df.feature_names) + +# Targets +print(iris_df.target) + +# Target Names +print(iris_df.target_names) +label = {0: 'red', 1: 'blue', 2: 'green'} + +# Dataset Slicing +x_axis = iris_df.data[:, 0] # Sepal Length +y_axis = iris_df.data[:, 2] # Sepal Width + +# Plotting +plt.scatter(x_axis, y_axis, c=iris_df.target) +plt.show() + +# Use Kmeans: Declaring Model +model = KMeans(n_clusters = 3) + +# Fitting Model +model.fit(iris_df.data) + +# Predicitng a single input +predicted_label = model.predict([[7.2, 3.5, 0.8, 1.6]]) + +# Prediction on the entire data +all_predictions = model.predict(iris_df.data) + +# Printing Predictions +print(predicted_label) +print(all_predictions) + +#%% Example 2: My data +X_unsup = num_df_wtgt[[#'ligand_affinity_change' + #, 'duet_stability_change' + 'ddg_foldx' + ,'deepddg']] +y_unsup = num_df_wtgt[['mutation_class']] + + +# X_train, X_test, y_train, y_test = train_test_split(X_unsup +# , y_unsup +# , test_size = 0.33 +# , **rs +# , shuffle = True +# , stratify = y_unsup) + +#model = KMeans(n_clusters=2) +model = Pipeline([ + #('pca', PCA()) + ('pre', MinMaxScaler()) + , ('clf', KMeans(n_clusters = 2)) +]) + +label = {0: 'blue', 1: 'red'} + +model.fit(X_unsup) +predicted_label = model.predict(X_test) +all_predictions = model.predict(X_unsup) +print(predicted_label) +print(all_predictions) + +plt.scatter(X_train.loc[:, 'ligand_affinity_change'] + , X_train.loc[:, 'duet_stability_change'] + , c = y_train.loc[:, 'mutation_class']) + +#from yellowbrick.cluster import KElbowVisualizer +model = KMeans() +visualizer = KElbowVisualizer(model, k=(1,12)).fit(df) +visualizer.show() + +#%% Example 3: My data Vscores, https://www.geeksforgeeks.org/ml-v-measure-for-evaluating-clustering-performance/ + +# List of V-Measure Scores for different models +v_scores = [] + +# List of different types of covariance parameters +N_Clusters = [2, 3] + +# Building the clustering model +kmeans2 = Pipeline([ + #('pca', PCA()) + ('pre', MinMaxScaler()) + # ('pre'), StandardScaler()) + , ('clf', KMeans(n_clusters = 2)) +]) + +# Training the clustering model +kmeans2.fit(X_unsup) + +# Storing the predicted 
Clustering labels +labels2 = kmeans2.predict(X_unsup) + +# Evaluating the performance +v_scores.append(v_measure_score(y_unsup.loc[:,'mutation_class'], labels2)) + +# Building the clustering model + +kmeans3 = Pipeline([ + #('pca', PCA()) + ('pre', MinMaxScaler()) + # ('pre'), StandardScaler()) + , ('clf', KMeans(n_clusters = 3)) +]) + +# Training the clustering model +kmeans3.fit(X_unsup) + +# Storing the predicted Clustering labels +labels3 = kmeans3.predict(X_unsup) + +# Evaluating the performance +v_scores.append(v_measure_score(y_unsup.loc[:,'mutation_class'], labels3)) + +# Plotting a Bar Graph to compare the models +plt.bar(N_Clusters, v_scores) +plt.xlabel('Number of Clusters') +plt.ylabel('V-Measure Score') +plt.title('Comparison of different Clustering Models') +plt.show() + + +# Score: silhouette +kmeans_kwargs = { + "init": "random", + "n_init": 10, + "max_iter": 300, + "random_state": 42} + +from sklearn.metrics import silhouette_score +#https://realpython.com/k-means-clustering-python/ +#import kneed +#from kneed import KneeLocator +scaler = StandardScaler() +X_unsup_scaled = scaler.fit_transform(X_unsup) + +silhouette_coefficients = [] +# Notice you start at 2 clusters for silhouette coefficient +for k in range(2, 5): + kmeans = KMeans(n_clusters=k, **kmeans_kwargs) + # kmeans = Pipeline([ + # # ('pre', MinMaxScaler()) + # ('pre', StandardScaler()) + # , ('clf', KMeans(n_clusters=k, **kmeans_kwargs)) + # ]) + kmeans.fit(X_unsup_scaled) + score = silhouette_score(X_unsup_scaled, kmeans.labels_) + silhouette_coefficients.append(score) + +plt.style.use("fivethirtyeight") +plt.plot(range(2, 5), silhouette_coefficients) +plt.xticks(range(2, 5)) +plt.xlabel("Number of Clusters") +plt.ylabel("Silhouette Coefficient") +plt.show() +plt.bar(range(2, 5), silhouette_coefficients) + +from sklearn.cluster import DBSCAN +from sklearn.datasets import make_moons +from sklearn.metrics import adjusted_rand_score + +# Instantiate k-means and dbscan algorithms +kmeans = KMeans(n_clusters=2) +dbscan = DBSCAN(eps=0.3) + +# Fit the algorithms to the features +kmeans.fit(X_unsup_scaled) +dbscan.fit(X_unsup_scaled) + +# Compute the silhouette scores for each algorithm +kmeans_silhouette = silhouette_score( + X_unsup_scaled, kmeans.labels_ ).round(2) +dbscan_silhouette = silhouette_score( + X_unsup_scaled, dbscan.labels_).round (2) + +kmeans_silhouette +dbscan_silhouette + +ari_kmeans = adjusted_rand_score(y_unsup.iloc[:,0], kmeans.labels_) +ari_dbscan = adjusted_rand_score(y_unsup.iloc[:,0], dbscan.labels_) + +round(ari_kmeans, 2) +round(ari_dbscan, 2) + +# Crescent plot +fig, (ax1, ax2) = plt.subplots( + 1, 2, figsize=(8, 6), sharex=True, sharey=True + ) + +fig.suptitle(f"Clustering Algorithm Comparison: Crescents", fontsize=16) +fte_colors = { + 0: "#008fd5", + 1: "#fc4f30", + } + +# The k-means plot +km_colors = [fte_colors[label] for label in kmeans.labels_] +ax1.scatter(X_unsup_scaled[:, 0], X_unsup_scaled[:, 1], c=km_colors) +ax1.set_title( + f"k-means\nSilhouette: {kmeans_silhouette}", fontdict={"fontsize": 12} + ) + +# The dbscan plot +db_colors = [fte_colors[label] for label in dbscan.labels_] +ax2.scatter(X_unsup_scaled[:, 0], X_unsup_scaled[:, 1], c=db_colors) +ax2.set_title( + f"DBSCAN\nSilhouette: {dbscan_silhouette}", fontdict={"fontsize": 12} + ) +plt.show() + + +#%% Example 4: Machinelearning mastery, https://machinelearningmastery.com/clustering-algorithms-with-python/ +from sklearn.cluster import AffinityPropagation +from matplotlib import pyplot +from numpy import unique +from 
numpy import where +from sklearn.datasets import make_classification +from sklearn.cluster import AgglomerativeClustering +from sklearn.cluster import Birch +from sklearn.cluster import DBSCAN +from sklearn.cluster import KMeans +from sklearn.cluster import MiniBatchKMeans +from sklearn.cluster import OPTICS +from sklearn.cluster import SpectralClustering +from sklearn.mixture import GaussianMixture +from sklearn.decomposition import PCA +# affinity propagation +# +# which takes as input measures of similarity between pairs of data points. Real-valued messages are exchanged between data points until a high-quality set of exemplars and corresponding clusters gradually emerges. +#============= +XA = np.array(X_unsup) + +# define the model +#model = AffinityPropagation(damping=0.9) +model = Pipeline([ + #('pca', PCA(n_components = 2)) + ('pre', MinMaxScaler()) + #('pre', StandardScaler()) + #, ('clf', AffinityPropagation(damping=0.9)) + , ('clf', AgglomerativeClustering(n_clusters=2)) #y + #, ('clf', Birch(threshold=0.01, n_clusters=2)) #y + #, ('clf', DBSCAN(eps=0.30, min_samples=9) ) #n + #, ('clf', KMeans(n_clusters=2)) #y + #, ('clf', MiniBatchKMeans(n_clusters=2)) + #, ('clf', OPTICS(eps=0.8, min_samples=10)) + #, ('clf', SpectralClustering(n_clusters=2)) + # , ('clf', GaussianMixture(n_components=2)) + + ]) +model +# fit the model +#model.fit(X_unsup) +#yhat = model.predict(X_unsup) +yhat = model.fit_predict(X_unsup) +# retrieve unique clusters +clusters = unique(yhat) + +# create scatter plot for samples from each cluster +for cluster in clusters: + print(cluster) + # get row indexes for samples with this cluster + row_ix = where(yhat == cluster) + print(row_ix) + + # create scatter of these samples + #pyplot.scatter(X[row_ix, 0], X[row_ix, 1]) + pyplot.scatter(XA[row_ix, 0], XA[row_ix, 1]) + +# show the plot +pyplot.show() +#%%Example 5:https://github.com/AntonsRuberts/datascience_marketing/blob/master/KMeans_vs_KPrototypes.ipynb +import os +import json +import numpy as np +import pandas as pd +from pandas.io.json import json_normalize +from datetime import datetime +from tqdm import tqdm +from sklearn.preprocessing import PowerTransformer +import umap +import matplotlib.pyplot as plt +import plotly.graph_objects as go +from scipy import stats +from sklearn.cluster import KMeans +from kmodes.kprototypes import KPrototypes +from lightgbm import LGBMClassifier +import shap +from sklearn.model_selection import cross_val_score +from kmodes.kprototypes import KPrototypes +import lightgbm +from lightgbm import LGBMClassifier + +full_data = pd.read_csv('/home/tanu/git/ML_AI_training/ml_data/ga_customers.csv') +full_data.head() + +#Preprocessing numerical +numerical = full_data.select_dtypes(exclude='object') + +for c in numerical.columns: + print(c) + pt = PowerTransformer() + numerical.loc[:, c] = pt.fit_transform(np.array(numerical[c]).reshape(-1, 1)) + +##preprocessing categorical +categorical = full_data.select_dtypes(include='object') +categorical.head() + +for col in categorical.columns: + #print('-' * 40 + col + '-' * 40 , end=' - ') + display(categorical[col].value_counts().head(10)) + +categorical = pd.get_dummies(categorical) +categorical_weight = len(full_data.select_dtypes(include='object').columns) / full_data.shape[1] + +#Embedding numerical & categorical +# fit1 = PCA(n_components=2).fit(numerical) +# fit2 = PCA(n_components=2).fit(categorical) +fit1 = umap.UMAP(metric='l2').fit(numerical) +fit2 = umap.UMAP(metric='dice').fit(categorical) + +intersection = 
umap.umap_.general_simplicial_set_intersection(fit1.graph_, fit2.graph_, weight=categorical_weight) +intersection = umap.umap_.reset_local_connectivity(intersection) +#https://github.com/lmcinnes/umap/issues/561 +embedding = umap.umap_.simplicial_set_embedding(fit1._raw_data + , intersection + , fit1.n_components + , fit1._initial_alpha + , fit1._a + , fit1._b + , fit1.repulsion_strength + , fit1.negative_sample_rate + , 200 + , 'random' + , np.random + , fit1.metric + , fit1._metric_kwds + , False + , densmap_kwds = {} + , output_dens = False) +plt.figure(figsize=(20, 10)) +plt.scatter(*np.array(embedding)[0].T, s=2, cmap='Spectral', alpha=1.0) +plt.show() + +#One-Hot-Encoding +data = pd.get_dummies(full_data) + +#Pre-processing +for c in data.columns: + pt = PowerTransformer() + data.loc[:, c] = pt.fit_transform(np.array(data[c]).reshape(-1, 1)) +#Actual Clustering +kmeans = KMeans(n_clusters=15).fit(data) +kmeans_labels = kmeans.labels_ +#OPTIONAL: Elbow plot with inertia +#Elbow method to choose the optimal number of clusters +# sse = {} +# for k in tqdm(range(2, 50)): +# kmeans = KMeans(n_clusters=k, max_iter=1000).fit(data) +# sse[k] = kmeans.inertia_ # Inertia: Sum of distances of samples to their closest cluster center + +# fig = go.Figure(data=go.Scatter(x=list(sse.keys()), y=list(sse.values()))) +# fig.show() + +# K-Prototypes +kprot_data = full_data.copy() +#Pre-processing +for c in full_data.select_dtypes(exclude='object').columns: + pt = PowerTransformer() + kprot_data[c] = pt.fit_transform(np.array(kprot_data[c]).reshape(-1, 1)) + +#categorical_columns = [0, 4, 5, 7, 11] #make sure to specify correct indices +categorical_columns = [1, 5, 6, 8, 12] #make sure to specify correct indices + +#Actual clustering +kproto = KPrototypes(n_clusters= 15, init='Cao', n_jobs = 4) +clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns) + +#Prints the count of each cluster group +pd.Series(clusters).value_counts() + +#OPTIONAL: Elbow plot with cost (will take a LONG time) +costs = [] +n_clusters = [] +clusters_assigned = [] +for i in tqdm(range(2, 25)): +#for i in tqdm(range(2, 10)): + + try: + kproto = KPrototypes(n_clusters= i, init='Cao', verbose=2, n_jobs = 10) + clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns) + costs.append(kproto.cost_) + n_clusters.append(i) + clusters_assigned.append(clusters) + except: + print(f"Can't cluster with {i} clusters") + +fig = go.Figure(data=go.Scatter(x=n_clusters, y=costs )) +fig.show() + +# Visual Evaluation: Kmeans +fig, ax = plt.subplots() +fig.set_size_inches((20, 10)) +#scatter = ax.scatter(embedding[:, 0], embedding[:, 1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0) +scatter = ax.scatter(embedding[0][:,0], embedding[0][:,1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0) + +# produce a legend with the unique colors from the scatter +legend1 = ax.legend(*scatter.legend_elements(num=15), + loc="lower left", title="Classes") +ax.add_artist(legend1) + +# produce a legend with the unique colors from the scatter +legend1 = ax.legend(*scatter.legend_elements(num=15), + loc="lower left", title="Classes") +ax.add_artist(legend1) + +# Visual Evaluation: K-Prototypes +fig, ax = plt.subplots() +fig.set_size_inches((20, 10)) +scatter = ax.scatter(embedding[0][:,0], embedding[0][:,1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0) + +# produce a legend with the unique colors from the scatter +legend1 = ax.legend(*scatter.legend_elements(num=15), + loc="lower left", title="Classes") +ax.add_artist(legend1) + 
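+# The K-Prototypes scatter above is coloured by kmeans_labels, and the optional
+# elbow loop may have overwritten `clusters`; a minimal sketch that re-derives the
+# 15-cluster K-Prototypes assignments (same constructor arguments as above) and
+# colours the same embedding by them, so the two solutions can be compared directly:
+kproto15 = KPrototypes(n_clusters=15, init='Cao', n_jobs=4)
+kproto_labels = kproto15.fit_predict(kprot_data, categorical=categorical_columns)
+
+fig, ax = plt.subplots()
+fig.set_size_inches((20, 10))
+scatter = ax.scatter(embedding[0][:, 0], embedding[0][:, 1], s=2,
+                     c=kproto_labels, cmap='tab20b', alpha=1.0)
+legend1 = ax.legend(*scatter.legend_elements(num=15),
+                    loc="lower left", title="Classes")
+ax.add_artist(legend1)
+plt.show()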
+# Evaluation by Classification +#Setting the objects to category +lgbm_data = full_data.copy() +for c in lgbm_data.select_dtypes(include='object'): + lgbm_data[c] = lgbm_data[c].astype('category') + +#KMeans clusters +clf_km = LGBMClassifier(colsample_by_tree=0.8) +cv_scores_km = cross_val_score(clf_km, lgbm_data, kmeans_labels, scoring='f1_weighted') +print(f'CV F1 score for K-Means clusters is {np.mean(cv_scores_km)}') + +#Fit the model +clf_km.fit(lgbm_data, kmeans_labels) +#SHAP values +explainer_km = shap.TreeExplainer(clf_km) +shap_values_km = explainer_km.shap_values(lgbm_data) +shap.summary_plot(shap_values_km, lgbm_data, plot_type="bar", plot_size=(15, 10)) + +#K-Prototypes +clf_kp = LGBMClassifier(colsample_by_tree=0.8) +#cv_scores_kp = cross_val_score(clf_kp, lgbm_data, proto_clusters, scoring='f1_weighted') +cv_scores_kp = cross_val_score(clf_kp, lgbm_data, clusters, scoring='f1_weighted') +print(f'CV F1 score for K-Prototypes clusters is {np.mean(cv_scores_kp)}') + +#clf_kp.fit(lgbm_data, proto_clusters) +clf_kp.fit(lgbm_data, clusters) + +explainer_kp = shap.TreeExplainer(clf_kp) +shap_values_kp = explainer_kp.shap_values(lgbm_data) +shap.summary_plot(shap_values_kp, lgbm_data, plot_type="bar", plot_size=(15, 10)) +#%%Example 5: My data +# FIXME: clusters and proto_clusters? + +full_data = all_df.copy() +full_data.head() +full_data.shape + +#Preprocessing numerical +numerical = full_data.select_dtypes(exclude='object') +numerical.shape + +for c in numerical.columns: + print(c) + pt = PowerTransformer() + numerical.loc[:, c] = pt.fit_transform(np.array(numerical[c]).reshape(-1, 1)) + +##preprocessing categorical +categorical = full_data.select_dtypes(include='object') +categorical.head() + +for col in categorical.columns: + #print('-' * 40 + col + '-' * 40 , end=' - ') + display(categorical[col].value_counts().head(10)) + +categorical = pd.get_dummies(categorical) +categorical_weight = len(full_data.select_dtypes(include='object').columns) / full_data.shape[1] + +#Embedding numerical & categorical +# fit1 = PCA(n_components=2).fit(numerical) +# fit2 = PCA(n_components=2).fit(categorical) +fit1 = umap.UMAP(metric='l2').fit(numerical) +fit2 = umap.UMAP(metric='dice').fit(categorical) + +intersection = umap.umap_.general_simplicial_set_intersection(fit1.graph_, fit2.graph_, weight=categorical_weight) +intersection = umap.umap_.reset_local_connectivity(intersection) +#https://github.com/lmcinnes/umap/issues/561 +embedding = umap.umap_.simplicial_set_embedding(fit1._raw_data + , intersection + , fit1.n_components + , fit1._initial_alpha + , fit1._a + , fit1._b + , fit1.repulsion_strength + , fit1.negative_sample_rate + , 200 + , 'random' + , np.random + , fit1.metric + , fit1._metric_kwds + , False + , densmap_kwds = {} + , output_dens = False) +plt.figure(figsize=(20, 10)) +plt.scatter(*np.array(embedding)[0].T, s=2, cmap='Spectral', alpha=1.0) +plt.show() + +#One-Hot-Encoding +data = pd.get_dummies(full_data) + +#Pre-processing +for c in data.columns: + pt = PowerTransformer() + data.loc[:, c] = pt.fit_transform(np.array(data[c]).reshape(-1, 1)) +#Actual Clustering +kmeans = KMeans(n_clusters=2).fit(data) +kmeans_labels = kmeans.labels_ +#OPTIONAL: Elbow plot with inertia +#Elbow method to choose the optimal number of clusters +# sse = {} +# for k in tqdm(range(2, 50)): +# kmeans = KMeans(n_clusters=k, max_iter=1000).fit(data) +# sse[k] = kmeans.inertia_ # Inertia: Sum of distances of samples to their closest cluster center + +# fig = 
go.Figure(data=go.Scatter(x=list(sse.keys()), y=list(sse.values()))) +# fig.show() + +# K-Prototypes +kprot_data = full_data.copy() +#Pre-processing +for c in full_data.select_dtypes(exclude='object').columns: + pt = PowerTransformer() + kprot_data[c] = pt.fit_transform(np.array(kprot_data[c]).reshape(-1, 1)) + +#categorical_columns = [0, 4, 5, 7, 11] #make sure to specify correct indices +#categorical_columns = [1, 5, 6, 8, 12] #make sure to specify correct indices +categorical_columns = [] +for col in cat_df.columns: + print(col) + #categ_i += all_df.columns.get_loc(col) + categorical_columns.append(full_data.columns.get_loc(col)) + +print(categorical_columns) +print(len(categorical_columns)) + +#Actual clustering +kproto = KPrototypes(n_clusters= 2, init='Cao', n_jobs = 10) +clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns) +proto_clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns) + +#Prints the count of each cluster group +pd.Series(clusters).value_counts() + +#OPTIONAL: Elbow plot with cost (will take a LONG time) +costs = [] +n_clusters = [] +clusters_assigned = [] +for i in tqdm(range(2, 25)): +#for i in tqdm(range(2, 10)): + #print(i) + + try: + kproto = KPrototypes(n_clusters= i, init='Cao', verbose=2, n_jobs = 10) + #clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns) + proto_clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns) + + costs.append(kproto.cost_) + n_clusters.append(i) + clusters_assigned.append(clusters) + except: + print(f"Can't cluster with {i} clusters") + +fig = go.Figure(data=go.Scatter(x=n_clusters, y=costs )) +fig.show() + +# Visual Evaluation: Kmeans +fig, ax = plt.subplots() +fig.set_size_inches((20, 10)) +#scatter = ax.scatter(embedding[:, 0], embedding[:, 1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0) +scatter = ax.scatter(embedding[0][:,0], embedding[0][:,1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0) + +# produce a legend with the unique colors from the scatter +legend1 = ax.legend(*scatter.legend_elements(num=15), + loc="lower left", title="Classes") +ax.add_artist(legend1) + +# produce a legend with the unique colors from the scatter +legend1 = ax.legend(*scatter.legend_elements(num=15), + loc="lower left", title="Classes") +ax.add_artist(legend1) + +# Visual Evaluation: K-Prototypes +fig, ax = plt.subplots() +fig.set_size_inches((20, 10)) +scatter = ax.scatter(embedding[0][:,0], embedding[0][:,1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0) + +# produce a legend with the unique colors from the scatter +legend1 = ax.legend(*scatter.legend_elements(num=2), #15 + loc="lower left", title="Classes") +ax.add_artist(legend1) + +# Evaluation by Classification +#Setting the objects to category +lgbm_data = full_data.copy() +for c in lgbm_data.select_dtypes(include='object'): + lgbm_data[c] = lgbm_data[c].astype('category') + +#KMeans clusters +clf_km = LGBMClassifier(colsample_by_tree=0.8) +cv_scores_km = cross_val_score(clf_km, lgbm_data, kmeans_labels, scoring='f1_weighted') +print(f'CV F1 score for K-Means clusters is {np.mean(cv_scores_km)}') + +#Fit the model +clf_km.fit(lgbm_data, kmeans_labels) +#SHAP values +explainer_km = shap.TreeExplainer(clf_km) +shap_values_km = explainer_km.shap_values(lgbm_data) +shap.summary_plot(shap_values_km, lgbm_data, plot_type="bar", plot_size=(15, 10)) + +#K-Prototypes +clf_kp = LGBMClassifier(colsample_by_tree=0.8) +cv_scores_kp = cross_val_score(clf_kp, lgbm_data, proto_clusters, scoring='f1_weighted') 
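+# Hedged note on the FIXME above: `clusters` and `proto_clusters` start out as two
+# independent kproto.fit_predict() calls, and `proto_clusters` is reassigned again
+# inside the optional elbow loop, so the two label vectors are not guaranteed to
+# match. The line above scores against proto_clusters and the next line rescores
+# against clusters; a quick agreement check (adjusted_rand_score is already
+# imported at the top of this script) helps decide whether the distinction matters:
+print('Agreement between clusters and proto_clusters (ARI): %.2f'
+      % adjusted_rand_score(clusters, proto_clusters))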
+cv_scores_kp = cross_val_score(clf_kp, lgbm_data, clusters, scoring='f1_weighted')
+print(f'CV F1 score for K-Prototypes clusters is {np.mean(cv_scores_kp)}')
+
+# fit on the same label vector that was scored above; the proto_clusters fit
+# would be overwritten immediately, so it is left commented out
+#clf_kp.fit(lgbm_data, proto_clusters)
+clf_kp.fit(lgbm_data, clusters)
+
+explainer_kp = shap.TreeExplainer(clf_kp)
+shap_values_kp = explainer_kp.shap_values(lgbm_data)
+shap.summary_plot(shap_values_kp, lgbm_data, plot_type="bar", plot_size=(15, 10))
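+# Closing check, since a true binary target exists for this data. A minimal
+# sketch, assuming my_df with its mutation_class column (built in the imports
+# script) is still in the session and row-aligned with all_df: how well do the
+# unsupervised assignments line up with the DM/OM labels?
+y_true = my_df['mutation_class']
+print('ARI K-Means vs mutation_class       : %.2f' % adjusted_rand_score(y_true, kmeans_labels))
+print('ARI K-Prototypes vs mutation_class  : %.2f' % adjusted_rand_score(y_true, clusters))
+print('V-measure K-Means                   : %.2f' % v_measure_score(y_true, kmeans_labels))
+print('V-measure K-Prototypes              : %.2f' % v_measure_score(y_true, clusters))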