added tutorial examples and my-data walkthrough examples in unsup_v1.py

Tanushree Tunstall 2022-03-23 16:23:18 +00:00
parent ad5ebad7f8
commit 89a0c3a58a
4 changed files with 1123 additions and 0 deletions

imports_unsup.py (new file, 261 lines)
@@ -0,0 +1,261 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 6 13:41:54 2022
@author: tanu
"""
import os, sys
import pandas as pd
import numpy as np
import pprint as pp
from copy import deepcopy
import sklearn
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
from sklearn.metrics import jaccard_score
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
from sklearn.metrics import average_precision_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
print(np.__version__)
print(pd.__version__)
from statistics import mean, stdev, median, mode
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline # NOTE: shadows sklearn.pipeline.Pipeline imported above
#from sklearn.datasets import make_classification
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator
from sklearn import cluster, datasets
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, confusion_matrix, adjusted_rand_score
print("Python Version : ",sys.version)
print("Python Version : ",sys.version)
print("Scikit-Learn Version : ",sklearn.__version__)
#warnings.filterwarnings('ignore') ## We'll silent future warnings using this command.
np.set_printoptions(precision=3)
#fits plot inside of current notebook.
#%matplotlib inline
#%%
scoring_fn = ({'accuracy' : make_scorer(accuracy_score)
, 'fscore' : make_scorer(f1_score)
, 'mcc' : make_scorer(matthews_corrcoef)
, 'precision' : make_scorer(precision_score)
, 'recall' : make_scorer(recall_score)
, 'roc_auc' : make_scorer(roc_auc_score, needs_proba = True) # AUC needs probability scores, not hard labels
, 'jcc' : make_scorer(jaccard_score)
})
rs = {'random_state': 42}
njobs = {'n_jobs': 10}
skf_cv = StratifiedKFold(n_splits = 10
#, shuffle = False, random_state= None)
, shuffle = True,**rs)
rskf_cv = RepeatedStratifiedKFold(n_splits = 10
, n_repeats=3
#, shuffle = False, random_state= None)
#, shuffle = True
,**rs)
#my_mcc = make_scorer({'mcc':make_scorer(matthews_corrcoef})
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
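# Hedged sketch (not in the original script): how scoring_fn and skf_cv are meant to
# plug into cross_validate. LogisticRegression is a placeholder assumption, and X/y
# would come from the (currently commented-out) feature/target setup further down,
# so the call itself is left commented out.
# example_cv = cross_validate(LogisticRegression(**rs)
#                             , X, y
#                             , cv = skf_cv
#                             , scoring = scoring_fn
#                             , **njobs)
# pp.pprint({k: np.mean(v) for k, v in example_cv.items()})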
#%%
homedir = os.path.expanduser("~")
os.chdir(homedir + "/git/ML_AI_training/")
# my function
#from MultClassPipe import MultClassPipeline
from MultClassPipe2 import MultClassPipeline2
from loopity_loop import MultClassPipeSKFLoop
from MultClassPipe3 import MultClassPipeSKFCV
gene = 'pncA'
drug = 'pyrazinamide'
#==============
# directories
#==============
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'
#=======
# input
#=======
infile_ml1 = outdir + gene.lower() + '_merged_df3.csv'
#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
my_df = pd.read_csv(infile_ml1)
my_df.dtypes
my_df_cols = my_df.columns
geneL_basic = ['pnca']
geneL_na = ['gid']
geneL_na_ppi2 = ['rpob']
geneL_ppi2 = ['alr', 'embb', 'katg']
#%% get cols
mycols = my_df.columns
# change active_aa_pos from numeric to categorical (object)
num_type = ['int64', 'float64']
cat_type = ['object', 'bool']
if my_df['active_aa_pos'].dtype in num_type:
my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object)
my_df['active_aa_pos'].dtype
# FIXME: if this is not structural, remove from source..
# Drop NA where numerical cols have them
if gene.lower() in geneL_na_ppi2:
#D1148 get rid of
na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
my_df = my_df.drop(index=na_index)
# FIXME: either impute or remove!
# for embb (L114M, F115L, V123L, V125I, V131M) delete for now
if gene.lower() in ['embb']:
na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)]
my_df = my_df.drop(index=na_index)
#%%============================================================================
# Target1: mutation_info_labels, convert to binary numeric labels
dm_om_map = {'DM': 1, 'OM': 0} # pnca, OM is minority, other genes: DM is minority
my_df['mutation_class'] = my_df['mutation_info_labels'].map(dm_om_map)
my_df['mutation_class'].value_counts()
my_df['mutation_info_labels'].value_counts()
#%%
# GET X
common_cols_stabiltyN = ['ligand_distance'
, 'ligand_affinity_change'
, 'duet_stability_change'
, 'ddg_foldx'
, 'deepddg'
, 'ddg_dynamut2']
# Build stability columns ~ gene
if gene.lower() in geneL_basic:
x_stabilityN = common_cols_stabiltyN
if gene.lower() in geneL_ppi2:
x_stabilityN = common_cols_stabiltyN + ['mcsm_ppi2_affinity'
, 'interface_dist']
if gene.lower() in geneL_na:
x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity']
if gene.lower() in geneL_na_ppi2:
x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
X_strFN = ['asa'
, 'rsa'
, 'kd_values'
, 'rd_values']
X_evolFN = ['consurf_score'
, 'snap2_score'
, 'snap2_accuracy_pc']
# X_genomicFN = ['af'
# , 'or_mychisq'
# , 'or_logistic'
# , 'or_fisher'
# , 'pval_fisher']
#%% Construct numerical and categorical column names
numerical_FN = x_stabilityN + X_strFN + X_evolFN
# separate ones for foldx?
categorical_FN = ['ss_class'
, 'wt_prop_water'
# , 'lineage_labels' # misleading if using merged_df3
, 'mut_prop_water'
, 'wt_prop_polarity'
, 'mut_prop_polarity'
, 'wt_calcprop'
, 'mut_calcprop'
, 'active_aa_pos']
#%% extracting dfs based on numerical, categorical column names
#----------------------------------
# WITHOUT the target var included
#----------------------------------
num_df = my_df[numerical_FN]
num_df.shape
cat_df = my_df[categorical_FN]
cat_df.shape
all_df = my_df[numerical_FN + categorical_FN]
all_df.shape
#------------------------------
# WITH the target var included:
#'wtgt': with target
#------------------------------
num_df_wtgt = my_df[numerical_FN + ['mutation_class']]
num_df_wtgt.shape
cat_df_wtgt = my_df[categorical_FN + ['mutation_class']]
cat_df_wtgt.shape
all_df_wtgt = my_df[numerical_FN + categorical_FN + ['mutation_class']]
all_df_wtgt.shape
#%%
#%% Get train-test split and scoring functions
# X = num_df_wtgt[numerical_FN]
# y = num_df_wtgt['mutation_class']
# X_train, X_test, y_train, y_test = train_test_split(X
# , y
# , test_size = 0.33
# , **rs
# , shuffle = True
# , stratify = y)

itertools.py (new file, 27 lines)
@@ -0,0 +1,27 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 22 17:12:12 2022
@author: tanu
"""
# itertools
# https://datagy.io/python-combinations-of-a-list/
from itertools import combinations
sample_list = ['a', 'b', 'c']
list_combinations = list()
for n in range(len(sample_list) + 1):
list_combinations += list(combinations(sample_list, n))
print(list_combinations)
#%%
col_list = num_df_wtgt.columns
col_list = col_list[1:4] # subset of columns to combine
len(col_list)
col_L_combinations = list()
for n in range(1, len(col_list) + 1):
col_L_combinations += list(combinations(col_list, n))
print(col_L_combinations)
print(len(col_L_combinations))
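# Hedged sketch (an assumption, not in the original script): one way these column
# combinations could drive a brute-force feature-subset search, scoring each subset
# with MCC via cross-validation. Assumes num_df_wtgt (with a 'mutation_class' target)
# from imports_unsup.py is in scope; the classifier and fold count are illustrative.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, matthews_corrcoef

subset_scores = {}
for cols in col_L_combinations:
    X_sub = num_df_wtgt[list(cols)]
    y_sub = num_df_wtgt['mutation_class']
    cv_scores = cross_val_score(LogisticRegression(max_iter = 1000)
                                , X_sub, y_sub
                                , cv = 10
                                , scoring = make_scorer(matthews_corrcoef))
    subset_scores[cols] = cv_scores.mean()
# top 5 feature subsets by mean CV MCC
print(sorted(subset_scores.items(), key = lambda kv: kv[1], reverse = True)[:5])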

umap_fs.py (new file, 57 lines)
@@ -0,0 +1,57 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 23 13:36:46 2022
@author: tanu
"""
#https://umap-learn.readthedocs.io/en/latest/auto_examples/plot_feature_extraction_classification.html
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from umap import UMAP
# Make a toy dataset
X, y = make_classification(
n_samples=1000,
n_features=300,
n_informative=250,
n_redundant=0,
n_repeated=0,
n_classes=2,
random_state=1212,
)
# Split the dataset into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
# Classification with a linear SVM
svc = LinearSVC(dual=False, random_state=123)
params_grid = {"C": [10 ** k for k in range(-3, 4)]}
clf = GridSearchCV(svc, params_grid)
clf.fit(X_train, y_train)
print(
"Accuracy on the test set with raw data: {:.3f}".format(clf.score(X_test, y_test))
)
# Transformation with UMAP followed by classification with a linear SVM
umap = UMAP(random_state=456)
pipeline = Pipeline([("umap", umap), ("svc", svc)])
params_grid_pipeline = {
"umap__n_neighbors": [5, 20],
"umap__n_components": [15, 25, 50],
"svc__C": [10 ** k for k in range(-3, 4)],
}
clf_pipeline = GridSearchCV(pipeline, params_grid_pipeline)
clf_pipeline.fit(X_train, y_train)
print(
"Accuracy on the test set with UMAP transformation: {:.3f}".format(
clf_pipeline.score(X_test, y_test)
)
)

unsup_v1.py (new file, 778 lines)
@@ -0,0 +1,778 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 16 16:55:06 2022
@author: tanu
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import cluster, datasets
import warnings
import sys
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, confusion_matrix, adjusted_rand_score
from sklearn.metrics import v_measure_score
# Algo:https://machinelearningmastery.com/clustering-algorithms-with-python/
# K-means
# Hierarchical clustering (HC)
# Gaussian mixture models
# FP-growth
# PCA
# Mean shift
# DBSCAN
# Model assessment (see the hedged metric sketch below):
# Mutual information
# Silhouette score
# v_measure_score
# itertools.combinations()
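# Hedged sketch (an assumption, not in the original script): the assessment metrics
# listed above applied to a toy KMeans clustering; the blob parameters are illustrative.
from sklearn.metrics import adjusted_mutual_info_score, silhouette_score

X_toy, y_toy = datasets.make_blobs(n_samples = 200, centers = 3, random_state = 42)
toy_labels = cluster.KMeans(n_clusters = 3, random_state = 42).fit_predict(X_toy)
print('Adjusted mutual information : %.3f' % adjusted_mutual_info_score(y_toy, toy_labels))
print('Silhouette score            : %.3f' % silhouette_score(X_toy, toy_labels))
print('V-measure                   : %.3f' % v_measure_score(y_toy, toy_labels))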
#%% Example 1: Kmeans, https://coderzcolumn.com/tutorials/machine-learning/unsupervised-learning-clustering-kmeans-using-scikit-learn-sklearn
# For K-means clustering, the model is that all clusters
# have equal, spherical variance.
samples, clusters = datasets.make_blobs(n_samples=250, n_features=2, centers=5, cluster_std=0.7, random_state=12345)
print('Dataset size : ', samples.shape, clusters.shape)
print('Cluster names : ',set(clusters))
with plt.style.context(('ggplot', 'seaborn')):
plt.figure(figsize=(8,6))
for i, c, m in zip(range(5),['red','green','blue','orange','purple'], ['s','+','^','o', 'x']):
plt.scatter(samples[clusters == i,0],samples[clusters == i,1], color=c, marker=m, s=80, alpha = 0.8, label= 'Cluster %d'%i)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Visualizing Dataset')
plt.legend(loc='best')
kmeans = cluster.KMeans(n_clusters=5) # the blobs above have 5 centres and the plots below assume 5 clusters
#kmeans = cluster.KMeans(n_clusters=2)
kmeans.fit(samples)
preds = kmeans.predict(samples)
print('Accuracy : %.3f'%accuracy_score(y_true = clusters, y_pred=preds))
print('Confusion Matrix : \n', confusion_matrix(y_true=clusters, y_pred=preds))
print('Adjusted Accuracy : %.3f'%adjusted_rand_score(labels_true=clusters, labels_pred=preds))
print('Cluster Centers : \n', str(kmeans.cluster_centers_))
print('Sum of squared distances of samples to their closest cluster center : %.2f'%kmeans.inertia_,)
with plt.style.context(('ggplot', 'seaborn')):
plt.figure(figsize=(10,6))
plt.scatter(samples[preds == 0,0],samples[preds == 0,1], color='red', marker='s', s=80, alpha = 0.8, label= 'Cluster 0')
plt.scatter(samples[preds == 1,0],samples[preds == 1,1], color='green', marker='^', s=80, alpha = 0.8, label= 'Cluster 1')
plt.scatter(samples[preds == 2,0],samples[preds == 2,1], color='blue', marker='*', s=80, alpha = 0.8, label= 'Cluster 2')
plt.scatter(samples[preds == 3,0],samples[preds == 3,1], color='orange', marker='o', s=80, alpha = 0.8, label= 'Cluster 3')
plt.scatter(samples[preds == 4,0],samples[preds == 4,1], color='purple', marker='+', s=80, alpha = 0.8, label= 'Cluster 4')
for x,y in zip(samples[preds == 0,0],samples[preds == 0,1]):
plt.plot([kmeans.cluster_centers_[0][0],x],[kmeans.cluster_centers_[0][1],y], color='red')
for x,y in zip(samples[preds == 1,0],samples[preds == 1,1]):
plt.plot([kmeans.cluster_centers_[1][0],x],[kmeans.cluster_centers_[1][1],y], color='green')
for x,y in zip(samples[preds == 2,0],samples[preds == 2,1]):
plt.plot([kmeans.cluster_centers_[2][0],x],[kmeans.cluster_centers_[2][1],y], color='blue')
for x,y in zip(samples[preds == 3,0],samples[preds == 3,1]):
plt.plot([kmeans.cluster_centers_[3][0],x],[kmeans.cluster_centers_[3][1],y], color='orange')
for x,y in zip(samples[preds == 4,0],samples[preds == 4,1]):
plt.plot([kmeans.cluster_centers_[4][0],x],[kmeans.cluster_centers_[4][1],y], color='purple')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Visualizing Predictions & Cluster Centers')
plt.legend(loc='best')
# The Elbow Method: To decide the numbers of cluster i.e. 'k'
plt.figure(figsize=(8,5))
distortions = []
for i in range(1,11):
kmeans = cluster.KMeans(n_clusters=i)
kmeans.fit(samples)
distortions.append(kmeans.inertia_)
print('Distortions (Sum Of Squared Distance of Samples from Closest Cluster Center) : ',distortions)
with plt.style.context(('ggplot', 'seaborn')):
plt.plot(range(1,11), distortions, )
plt.scatter(range(1,11), distortions, color='red', marker='o', s=80)
plt.xlabel('Number Of Clusters')
plt.ylabel('Distortions')
plt.title('The Elbow Method (Num of Clusters vs Distortions)')
plt.xticks(range(1,11));
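# Hedged sketch (an assumption, not in the original script): locating the elbow
# programmatically with the kneed package (imported, commented out, further below);
# requires `pip install kneed`.
from kneed import KneeLocator
kl = KneeLocator(range(1, 11), distortions, curve = 'convex', direction = 'decreasing')
print('Elbow located at k =', kl.elbow)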
#%% Example 1: My data
X_unsup = num_df_wtgt[['ligand_affinity_change'
, 'duet_stability_change']]
kmeans = Pipeline([
#('pca', PCA())
('pre', MinMaxScaler())
, ('clf', KMeans(n_clusters = 2))
])
#kmeans = KMeans(n_clusters = 2)
#kmeans.fit(X_unsup)
kmeans.fit(X_unsup)
y_kmeans = kmeans.predict(X_unsup)
plt.scatter(X_unsup.loc [:, 'ligand_affinity_change']
, X_unsup.loc[:, 'duet_stability_change']
, c = y_kmeans
, s = 50
, cmap = 'viridis')
# cluster centres live on the 'clf' step and are in MinMax-scaled space;
# map them back to the original units before overplotting
centers = kmeans.named_steps['pre'].inverse_transform(kmeans.named_steps['clf'].cluster_centers_)
plt.scatter(centers[:, 0], centers[:, 1]
, c = 'black'
, s = 200
, alpha = 0.5);
plt.show()
#%% Example 2: https://builtin.com/data-science/unsupervised-learning-python
# Loading dataset
iris_df = datasets.load_iris()
# Available methods on dataset
print(dir(iris_df))
# Features
print(iris_df.feature_names)
# Targets
print(iris_df.target)
# Target Names
print(iris_df.target_names)
label = {0: 'red', 1: 'blue', 2: 'green'}
# Dataset Slicing
x_axis = iris_df.data[:, 0] # Sepal Length
y_axis = iris_df.data[:, 2] # Petal Length
# Plotting
plt.scatter(x_axis, y_axis, c=iris_df.target)
plt.show()
# Use Kmeans: Declaring Model
model = KMeans(n_clusters = 3)
# Fitting Model
model.fit(iris_df.data)
# Predicting a single input
predicted_label = model.predict([[7.2, 3.5, 0.8, 1.6]])
# Prediction on the entire data
all_predictions = model.predict(iris_df.data)
# Printing Predictions
print(predicted_label)
print(all_predictions)
#%% Example 2: My data
X_unsup = num_df_wtgt[[#'ligand_affinity_change'
#, 'duet_stability_change'
'ddg_foldx'
,'deepddg']]
y_unsup = num_df_wtgt[['mutation_class']]
# X_train, X_test, y_train, y_test = train_test_split(X_unsup
# , y_unsup
# , test_size = 0.33
# , **rs
# , shuffle = True
# , stratify = y_unsup)
#model = KMeans(n_clusters=2)
model = Pipeline([
#('pca', PCA())
('pre', MinMaxScaler())
, ('clf', KMeans(n_clusters = 2))
])
label = {0: 'blue', 1: 'red'}
model.fit(X_unsup)
#predicted_label = model.predict(X_test) # X_test only exists if the commented-out split above is run
all_predictions = model.predict(X_unsup)
print(all_predictions)
# plot the two features actually used here, coloured by the true class
plt.scatter(X_unsup.loc[:, 'ddg_foldx']
, X_unsup.loc[:, 'deepddg']
, c = y_unsup.loc[:, 'mutation_class'])
from yellowbrick.cluster import KElbowVisualizer
model = KMeans()
visualizer = KElbowVisualizer(model, k=(1,12)).fit(X_unsup)
visualizer.show()
#%% Example 3: My data Vscores, https://www.geeksforgeeks.org/ml-v-measure-for-evaluating-clustering-performance/
# List of V-Measure Scores for different models
v_scores = []
# List of different types of covariance parameters
N_Clusters = [2, 3]
# Building the clustering model
kmeans2 = Pipeline([
#('pca', PCA())
('pre', MinMaxScaler())
# ('pre'), StandardScaler())
, ('clf', KMeans(n_clusters = 2))
])
# Training the clustering model
kmeans2.fit(X_unsup)
# Storing the predicted Clustering labels
labels2 = kmeans2.predict(X_unsup)
# Evaluating the performance
v_scores.append(v_measure_score(y_unsup.loc[:,'mutation_class'], labels2))
# Building the clustering model
kmeans3 = Pipeline([
#('pca', PCA())
('pre', MinMaxScaler())
# ('pre'), StandardScaler())
, ('clf', KMeans(n_clusters = 3))
])
# Training the clustering model
kmeans3.fit(X_unsup)
# Storing the predicted Clustering labels
labels3 = kmeans3.predict(X_unsup)
# Evaluating the performance
v_scores.append(v_measure_score(y_unsup.loc[:,'mutation_class'], labels3))
# Plotting a Bar Graph to compare the models
plt.bar(N_Clusters, v_scores)
plt.xlabel('Number of Clusters')
plt.ylabel('V-Measure Score')
plt.title('Comparison of different Clustering Models')
plt.show()
# Score: silhouette
kmeans_kwargs = {
"init": "random",
"n_init": 10,
"max_iter": 300,
"random_state": 42}
from sklearn.metrics import silhouette_score
#https://realpython.com/k-means-clustering-python/
#import kneed
#from kneed import KneeLocator
scaler = StandardScaler()
X_unsup_scaled = scaler.fit_transform(X_unsup)
silhouette_coefficients = []
# Notice you start at 2 clusters for silhouette coefficient
for k in range(2, 5):
kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
# kmeans = Pipeline([
# # ('pre', MinMaxScaler())
# ('pre', StandardScaler())
# , ('clf', KMeans(n_clusters=k, **kmeans_kwargs))
# ])
kmeans.fit(X_unsup_scaled)
score = silhouette_score(X_unsup_scaled, kmeans.labels_)
silhouette_coefficients.append(score)
plt.style.use("fivethirtyeight")
plt.plot(range(2, 5), silhouette_coefficients)
plt.xticks(range(2, 5))
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Coefficient")
plt.show()
plt.bar(range(2, 5), silhouette_coefficients)
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons
from sklearn.metrics import adjusted_rand_score
# Instantiate k-means and dbscan algorithms
kmeans = KMeans(n_clusters=2)
dbscan = DBSCAN(eps=0.3)
# Fit the algorithms to the features
kmeans.fit(X_unsup_scaled)
dbscan.fit(X_unsup_scaled)
# Compute the silhouette scores for each algorithm
kmeans_silhouette = silhouette_score(
X_unsup_scaled, kmeans.labels_).round(2)
dbscan_silhouette = silhouette_score(
X_unsup_scaled, dbscan.labels_).round(2)
kmeans_silhouette
dbscan_silhouette
ari_kmeans = adjusted_rand_score(y_unsup.iloc[:,0], kmeans.labels_)
ari_dbscan = adjusted_rand_score(y_unsup.iloc[:,0], dbscan.labels_)
round(ari_kmeans, 2)
round(ari_dbscan, 2)
# Crescent plot
fig, (ax1, ax2) = plt.subplots(
1, 2, figsize=(8, 6), sharex=True, sharey=True
)
fig.suptitle(f"Clustering Algorithm Comparison: Crescents", fontsize=16)
fte_colors = {
0: "#008fd5",
1: "#fc4f30",
}
# The k-means plot
km_colors = [fte_colors[label] for label in kmeans.labels_]
ax1.scatter(X_unsup_scaled[:, 0], X_unsup_scaled[:, 1], c=km_colors)
ax1.set_title(
f"k-means\nSilhouette: {kmeans_silhouette}", fontdict={"fontsize": 12}
)
# The dbscan plot
db_colors = [fte_colors.get(label, "#999999") for label in dbscan.labels_] # grey for DBSCAN noise points (label -1)
ax2.scatter(X_unsup_scaled[:, 0], X_unsup_scaled[:, 1], c=db_colors)
ax2.set_title(
f"DBSCAN\nSilhouette: {dbscan_silhouette}", fontdict={"fontsize": 12}
)
plt.show()
#%% Example 4: Machinelearning mastery, https://machinelearningmastery.com/clustering-algorithms-with-python/
from sklearn.cluster import AffinityPropagation
from matplotlib import pyplot
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import Birch
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import OPTICS
from sklearn.cluster import SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
# Affinity propagation
# takes as input measures of similarity between pairs of data points. Real-valued
# messages are exchanged between data points until a high-quality set of exemplars
# and corresponding clusters gradually emerges.
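# Hedged sketch (an assumption, not in the original script): a minimal standalone
# AffinityPropagation run on the toy blob data from Example 1, just to show the
# interface; the damping value is illustrative.
ap_model = AffinityPropagation(damping=0.9)
ap_model.fit(samples) # `samples` were generated in Example 1 above
print('Exemplars found :', len(ap_model.cluster_centers_indices_))
print('First 10 labels :', ap_model.labels_[:10])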
#=============
XA = np.array(X_unsup)
# define the model
#model = AffinityPropagation(damping=0.9)
model = Pipeline([
#('pca', PCA(n_components = 2))
('pre', MinMaxScaler())
#('pre', StandardScaler())
#, ('clf', AffinityPropagation(damping=0.9))
, ('clf', AgglomerativeClustering(n_clusters=2)) #y
#, ('clf', Birch(threshold=0.01, n_clusters=2)) #y
#, ('clf', DBSCAN(eps=0.30, min_samples=9) ) #n
#, ('clf', KMeans(n_clusters=2)) #y
#, ('clf', MiniBatchKMeans(n_clusters=2))
#, ('clf', OPTICS(eps=0.8, min_samples=10))
#, ('clf', SpectralClustering(n_clusters=2))
# , ('clf', GaussianMixture(n_components=2))
])
model
# fit the model
#model.fit(X_unsup)
#yhat = model.predict(X_unsup)
yhat = model.fit_predict(X_unsup)
# retrieve unique clusters
clusters = unique(yhat)
# create scatter plot for samples from each cluster
for cluster in clusters:
print(cluster)
# get row indexes for samples with this cluster
row_ix = where(yhat == cluster)
print(row_ix)
# create scatter of these samples
#pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
pyplot.scatter(XA[row_ix, 0], XA[row_ix, 1])
# show the plot
pyplot.show()
#%%Example 5:https://github.com/AntonsRuberts/datascience_marketing/blob/master/KMeans_vs_KPrototypes.ipynb
import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from datetime import datetime
from tqdm import tqdm
from sklearn.preprocessing import PowerTransformer
import umap
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from scipy import stats
from sklearn.cluster import KMeans
from kmodes.kprototypes import KPrototypes
from lightgbm import LGBMClassifier
import shap
from sklearn.model_selection import cross_val_score
import lightgbm
from IPython.display import display # display() is used below; needed when running outside a notebook
full_data = pd.read_csv('/home/tanu/git/ML_AI_training/ml_data/ga_customers.csv')
full_data.head()
#Preprocessing numerical
numerical = full_data.select_dtypes(exclude='object')
for c in numerical.columns:
print(c)
pt = PowerTransformer()
numerical.loc[:, c] = pt.fit_transform(np.array(numerical[c]).reshape(-1, 1))
##preprocessing categorical
categorical = full_data.select_dtypes(include='object')
categorical.head()
for col in categorical.columns:
#print('-' * 40 + col + '-' * 40 , end=' - ')
display(categorical[col].value_counts().head(10))
categorical = pd.get_dummies(categorical)
categorical_weight = len(full_data.select_dtypes(include='object').columns) / full_data.shape[1]
#Embedding numerical & categorical
# fit1 = PCA(n_components=2).fit(numerical)
# fit2 = PCA(n_components=2).fit(categorical)
fit1 = umap.UMAP(metric='l2').fit(numerical)
fit2 = umap.UMAP(metric='dice').fit(categorical)
intersection = umap.umap_.general_simplicial_set_intersection(fit1.graph_, fit2.graph_, weight=categorical_weight)
intersection = umap.umap_.reset_local_connectivity(intersection)
#https://github.com/lmcinnes/umap/issues/561
embedding = umap.umap_.simplicial_set_embedding(fit1._raw_data
, intersection
, fit1.n_components
, fit1._initial_alpha
, fit1._a
, fit1._b
, fit1.repulsion_strength
, fit1.negative_sample_rate
, 200 # n_epochs
, 'random' # init
, np.random # random_state
, fit1.metric
, fit1._metric_kwds
, False
, densmap_kwds = {}
, output_dens = False)
plt.figure(figsize=(20, 10))
plt.scatter(*np.array(embedding)[0].T, s=2, cmap='Spectral', alpha=1.0)
plt.show()
#One-Hot-Encoding
data = pd.get_dummies(full_data)
#Pre-processing
for c in data.columns:
pt = PowerTransformer()
data.loc[:, c] = pt.fit_transform(np.array(data[c]).reshape(-1, 1))
#Actual Clustering
kmeans = KMeans(n_clusters=15).fit(data)
kmeans_labels = kmeans.labels_
#OPTIONAL: Elbow plot with inertia
#Elbow method to choose the optimal number of clusters
# sse = {}
# for k in tqdm(range(2, 50)):
# kmeans = KMeans(n_clusters=k, max_iter=1000).fit(data)
# sse[k] = kmeans.inertia_ # Inertia: Sum of distances of samples to their closest cluster center
# fig = go.Figure(data=go.Scatter(x=list(sse.keys()), y=list(sse.values())))
# fig.show()
# K-Prototypes
kprot_data = full_data.copy()
#Pre-processing
for c in full_data.select_dtypes(exclude='object').columns:
pt = PowerTransformer()
kprot_data[c] = pt.fit_transform(np.array(kprot_data[c]).reshape(-1, 1))
#categorical_columns = [0, 4, 5, 7, 11] #make sure to specify correct indices
categorical_columns = [1, 5, 6, 8, 12] #make sure to specify correct indices
#Actual clustering
kproto = KPrototypes(n_clusters= 15, init='Cao', n_jobs = 4)
clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)
#Prints the count of each cluster group
pd.Series(clusters).value_counts()
#OPTIONAL: Elbow plot with cost (will take a LONG time)
costs = []
n_clusters = []
clusters_assigned = []
for i in tqdm(range(2, 25)):
#for i in tqdm(range(2, 10)):
try:
kproto = KPrototypes(n_clusters= i, init='Cao', verbose=2, n_jobs = 10)
clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)
costs.append(kproto.cost_)
n_clusters.append(i)
clusters_assigned.append(clusters)
except:
print(f"Can't cluster with {i} clusters")
fig = go.Figure(data=go.Scatter(x=n_clusters, y=costs ))
fig.show()
# Visual Evaluation: Kmeans
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
#scatter = ax.scatter(embedding[:, 0], embedding[:, 1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)
scatter = ax.scatter(embedding[0][:,0], embedding[0][:,1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)
# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=15),
loc="lower left", title="Classes")
ax.add_artist(legend1)
# Visual Evaluation: K-Prototypes
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
scatter = ax.scatter(embedding[0][:,0], embedding[0][:,1], s=2, c=clusters, cmap='tab20b', alpha=1.0) # colour by the K-Prototypes labels, not the KMeans ones
# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=15),
loc="lower left", title="Classes")
ax.add_artist(legend1)
# Evaluation by Classification
#Setting the objects to category
lgbm_data = full_data.copy()
for c in lgbm_data.select_dtypes(include='object'):
lgbm_data[c] = lgbm_data[c].astype('category')
#KMeans clusters
clf_km = LGBMClassifier(colsample_bytree=0.8)
cv_scores_km = cross_val_score(clf_km, lgbm_data, kmeans_labels, scoring='f1_weighted')
print(f'CV F1 score for K-Means clusters is {np.mean(cv_scores_km)}')
#Fit the model
clf_km.fit(lgbm_data, kmeans_labels)
#SHAP values
explainer_km = shap.TreeExplainer(clf_km)
shap_values_km = explainer_km.shap_values(lgbm_data)
shap.summary_plot(shap_values_km, lgbm_data, plot_type="bar", plot_size=(15, 10))
#K-Prototypes
clf_kp = LGBMClassifier(colsample_bytree=0.8)
#cv_scores_kp = cross_val_score(clf_kp, lgbm_data, proto_clusters, scoring='f1_weighted')
cv_scores_kp = cross_val_score(clf_kp, lgbm_data, clusters, scoring='f1_weighted')
print(f'CV F1 score for K-Prototypes clusters is {np.mean(cv_scores_kp)}')
#clf_kp.fit(lgbm_data, proto_clusters)
clf_kp.fit(lgbm_data, clusters)
explainer_kp = shap.TreeExplainer(clf_kp)
shap_values_kp = explainer_kp.shap_values(lgbm_data)
shap.summary_plot(shap_values_kp, lgbm_data, plot_type="bar", plot_size=(15, 10))
#%%Example 5: My data
# FIXME: clusters and proto_clusters?
full_data = all_df.copy()
full_data.head()
full_data.shape
#Preprocessing numerical
numerical = full_data.select_dtypes(exclude='object')
numerical.shape
for c in numerical.columns:
print(c)
pt = PowerTransformer()
numerical.loc[:, c] = pt.fit_transform(np.array(numerical[c]).reshape(-1, 1))
##preprocessing categorical
categorical = full_data.select_dtypes(include='object')
categorical.head()
for col in categorical.columns:
#print('-' * 40 + col + '-' * 40 , end=' - ')
display(categorical[col].value_counts().head(10))
categorical = pd.get_dummies(categorical)
categorical_weight = len(full_data.select_dtypes(include='object').columns) / full_data.shape[1]
#Embedding numerical & categorical
# fit1 = PCA(n_components=2).fit(numerical)
# fit2 = PCA(n_components=2).fit(categorical)
fit1 = umap.UMAP(metric='l2').fit(numerical)
fit2 = umap.UMAP(metric='dice').fit(categorical)
intersection = umap.umap_.general_simplicial_set_intersection(fit1.graph_, fit2.graph_, weight=categorical_weight)
intersection = umap.umap_.reset_local_connectivity(intersection)
#https://github.com/lmcinnes/umap/issues/561
embedding = umap.umap_.simplicial_set_embedding(fit1._raw_data
, intersection
, fit1.n_components
, fit1._initial_alpha
, fit1._a
, fit1._b
, fit1.repulsion_strength
, fit1.negative_sample_rate
, 200
, 'random'
, np.random
, fit1.metric
, fit1._metric_kwds
, False
, densmap_kwds = {}
, output_dens = False)
plt.figure(figsize=(20, 10))
plt.scatter(*np.array(embedding)[0].T, s=2, cmap='Spectral', alpha=1.0)
plt.show()
#One-Hot-Encoding
data = pd.get_dummies(full_data)
#Pre-processing
for c in data.columns:
pt = PowerTransformer()
data.loc[:, c] = pt.fit_transform(np.array(data[c]).reshape(-1, 1))
#Actual Clustering
kmeans = KMeans(n_clusters=2).fit(data)
kmeans_labels = kmeans.labels_
#OPTIONAL: Elbow plot with inertia
#Elbow method to choose the optimal number of clusters
# sse = {}
# for k in tqdm(range(2, 50)):
# kmeans = KMeans(n_clusters=k, max_iter=1000).fit(data)
# sse[k] = kmeans.inertia_ # Inertia: Sum of distances of samples to their closest cluster center
# fig = go.Figure(data=go.Scatter(x=list(sse.keys()), y=list(sse.values())))
# fig.show()
# K-Prototypes
kprot_data = full_data.copy()
#Pre-processing
for c in full_data.select_dtypes(exclude='object').columns:
pt = PowerTransformer()
kprot_data[c] = pt.fit_transform(np.array(kprot_data[c]).reshape(-1, 1))
#categorical_columns = [0, 4, 5, 7, 11] #make sure to specify correct indices
#categorical_columns = [1, 5, 6, 8, 12] #make sure to specify correct indices
categorical_columns = []
for col in cat_df.columns:
print(col)
#categ_i += all_df.columns.get_loc(col)
categorical_columns.append(full_data.columns.get_loc(col))
print(categorical_columns)
print(len(categorical_columns))
#Actual clustering
kproto = KPrototypes(n_clusters= 2, init='Cao', n_jobs = 10)
proto_clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)
clusters = proto_clusters # same labels under both names; downstream code uses either
#Prints the count of each cluster group
pd.Series(clusters).value_counts()
#OPTIONAL: Elbow plot with cost (will take a LONG time)
costs = []
n_clusters = []
clusters_assigned = []
for i in tqdm(range(2, 25)):
#for i in tqdm(range(2, 10)):
#print(i)
try:
kproto = KPrototypes(n_clusters= i, init='Cao', verbose=2, n_jobs = 10)
#clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)
proto_clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)
costs.append(kproto.cost_)
n_clusters.append(i)
clusters_assigned.append(proto_clusters)
except Exception:
print(f"Can't cluster with {i} clusters")
fig = go.Figure(data=go.Scatter(x=n_clusters, y=costs ))
fig.show()
# Visual Evaluation: Kmeans
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
#scatter = ax.scatter(embedding[:, 0], embedding[:, 1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)
scatter = ax.scatter(embedding[0][:,0], embedding[0][:,1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)
# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=15),
loc="lower left", title="Classes")
ax.add_artist(legend1)
# Visual Evaluation: K-Prototypes
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
scatter = ax.scatter(embedding[0][:,0], embedding[0][:,1], s=2, c=proto_clusters, cmap='tab20b', alpha=1.0) # colour by the K-Prototypes labels
# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=2), #15
loc="lower left", title="Classes")
ax.add_artist(legend1)
# Evaluation by Classification
#Setting the objects to category
lgbm_data = full_data.copy()
for c in lgbm_data.select_dtypes(include='object'):
lgbm_data[c] = lgbm_data[c].astype('category')
#KMeans clusters
clf_km = LGBMClassifier(colsample_bytree=0.8)
cv_scores_km = cross_val_score(clf_km, lgbm_data, kmeans_labels, scoring='f1_weighted')
print(f'CV F1 score for K-Means clusters is {np.mean(cv_scores_km)}')
#Fit the model
clf_km.fit(lgbm_data, kmeans_labels)
#SHAP values
explainer_km = shap.TreeExplainer(clf_km)
shap_values_km = explainer_km.shap_values(lgbm_data)
shap.summary_plot(shap_values_km, lgbm_data, plot_type="bar", plot_size=(15, 10))
#K-Prototypes
clf_kp = LGBMClassifier(colsample_bytree=0.8)
cv_scores_kp = cross_val_score(clf_kp, lgbm_data, proto_clusters, scoring='f1_weighted')
print(f'CV F1 score for K-Prototypes clusters is {np.mean(cv_scores_kp)}')
clf_kp.fit(lgbm_data, proto_clusters)
explainer_kp = shap.TreeExplainer(clf_kp)
shap_values_kp = explainer_kp.shap_values(lgbm_data)
shap.summary_plot(shap_values_kp, lgbm_data, plot_type="bar", plot_size=(15, 10))