added tutorial examples and my data walkthrough examples in unsup_v1.py
This commit is contained in:
parent
ad5ebad7f8
commit
89a0c3a58a
4 changed files with 1123 additions and 0 deletions
261 imports_unsup.py Normal file
@@ -0,0 +1,261 @@
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sun Mar 6 13:41:54 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
import os, sys
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import pprint as pp
|
||||
from copy import deepcopy
|
||||
import sklearn
|
||||
from sklearn import linear_model
|
||||
from sklearn.linear_model import LogisticRegression, LinearRegression
|
||||
from sklearn.naive_bayes import BernoulliNB
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.svm import SVC
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
|
||||
from sklearn.ensemble import AdaBoostClassifier
|
||||
from sklearn.ensemble import GradientBoostingClassifier
|
||||
from sklearn.neural_network import MLPClassifier
|
||||
from xgboost import XGBClassifier
|
||||
from sklearn.naive_bayes import MultinomialNB
|
||||
from sklearn.linear_model import SGDClassifier
|
||||
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
|
||||
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.compose import make_column_transformer
|
||||
|
||||
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
|
||||
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, matthews_corrcoef
|
||||
from sklearn.metrics import jaccard_score
|
||||
|
||||
from sklearn.metrics import make_scorer
|
||||
from sklearn.metrics import classification_report
|
||||
|
||||
from sklearn.metrics import average_precision_score
|
||||
|
||||
from sklearn.model_selection import cross_validate
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.model_selection import StratifiedKFold
|
||||
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.pipeline import make_pipeline
|
||||
|
||||
from sklearn.feature_selection import RFE
|
||||
from sklearn.feature_selection import RFECV
|
||||
import itertools
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
print(np.__version__)
|
||||
print(pd.__version__)
|
||||
from statistics import mean, stdev, median, mode
|
||||
|
||||
from imblearn.over_sampling import RandomOverSampler
|
||||
from imblearn.over_sampling import SMOTE
|
||||
from imblearn.pipeline import Pipeline
|
||||
#from sklearn.datasets import make_classification
|
||||
from sklearn.model_selection import cross_validate, cross_val_score
|
||||
from sklearn.model_selection import RepeatedStratifiedKFold
|
||||
from sklearn.ensemble import AdaBoostClassifier
|
||||
from imblearn.combine import SMOTEENN
|
||||
from imblearn.under_sampling import EditedNearestNeighbours
|
||||
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.base import BaseEstimator
|
||||
|
||||
from sklearn import cluster, datasets
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.metrics import accuracy_score, confusion_matrix, adjusted_rand_score
|
||||
|
||||
print("Python Version : ",sys.version)
|
||||
print("Python Version : ",sys.version)
|
||||
print("Scikit-Learn Version : ",sklearn.__version__)
|
||||
#warnings.filterwarnings('ignore') ## We'll silent future warnings using this command.
|
||||
np.set_printoptions(precision=3)
|
||||
#fits plot inside of current notebook.
|
||||
#%matplotlib inline
|
||||
|
||||
#%%
|
||||
scoring_fn = ({'accuracy' : make_scorer(accuracy_score)
|
||||
, 'fscore' : make_scorer(f1_score)
|
||||
, 'mcc' : make_scorer(matthews_corrcoef)
|
||||
, 'precision' : make_scorer(precision_score)
|
||||
, 'recall' : make_scorer(recall_score)
|
||||
, 'roc_auc' : make_scorer(roc_auc_score)
|
||||
, 'jcc' : make_scorer(jaccard_score)
|
||||
})
|
||||
|
||||
rs = {'random_state': 42}
|
||||
njobs = {'n_jobs': 10}
|
||||
skf_cv = StratifiedKFold(n_splits = 10
|
||||
#, shuffle = False, random_state= None)
|
||||
, shuffle = True,**rs)
|
||||
rskf_cv = RepeatedStratifiedKFold(n_splits = 10
|
||||
, n_repeats=3
|
||||
#, shuffle = False, random_state= None)
|
||||
#, shuffle = True
|
||||
,**rs)
|
||||
#my_mcc = make_scorer({'mcc':make_scorer(matthews_corrcoef})
|
||||
mcc_score_fn = {'mcc': make_scorer(matthews_corrcoef)}
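# Hedged usage sketch (illustration only, kept commented so this setup script stays
# side-effect free): scoring_fn and skf_cv above are meant to be handed to
# cross_validate(); X_demo/y_demo below are placeholder toy data, not project data.
# X_demo, y_demo = datasets.make_classification(n_samples = 200, n_features = 6, **rs)
# demo_cv = cross_validate(LogisticRegression(**rs), X_demo, y_demo
#                          , cv = skf_cv, scoring = scoring_fn, **njobs)
# pd.DataFrame(demo_cv).filter(like = 'test_').mean()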
|
||||
|
||||
#%%
|
||||
homedir = os.path.expanduser("~")
|
||||
os.chdir(homedir + "/git/ML_AI_training/")
|
||||
|
||||
# my function
|
||||
#from MultClassPipe import MultClassPipeline
|
||||
from MultClassPipe2 import MultClassPipeline2
|
||||
from loopity_loop import MultClassPipeSKFLoop
|
||||
from MultClassPipe3 import MultClassPipeSKFCV
|
||||
|
||||
|
||||
gene = 'pncA'
|
||||
drug = 'pyrazinamide'
|
||||
|
||||
#==============
|
||||
# directories
|
||||
#==============
|
||||
datadir = homedir + '/git/Data/'
|
||||
indir = datadir + drug + '/input/'
|
||||
outdir = datadir + drug + '/output/'
|
||||
|
||||
#=======
|
||||
# input
|
||||
#=======
|
||||
infile_ml1 = outdir + gene.lower() + '_merged_df3.csv'
|
||||
#infile_ml2 = outdir + gene.lower() + '_merged_df2.csv'
|
||||
|
||||
my_df = pd.read_csv(infile_ml1)
|
||||
my_df.dtypes
|
||||
my_df_cols = my_df.columns
|
||||
|
||||
geneL_basic = ['pnca']
|
||||
geneL_na = ['gid']
|
||||
geneL_na_ppi2 = ['rpob']
|
||||
geneL_ppi2 = ['alr', 'embb', 'katg']
|
||||
#%% get cols
|
||||
mycols = my_df.columns
|
||||
|
||||
# change active_aa_pos from numeric to object (categorical)
|
||||
num_type = ['int64', 'float64']
|
||||
cat_type = ['object', 'bool']
|
||||
|
||||
if my_df['active_aa_pos'].dtype in num_type:
|
||||
my_df['active_aa_pos'] = my_df['active_aa_pos'].astype(object)
|
||||
my_df['active_aa_pos'].dtype
|
||||
|
||||
# FIXME: if this is not structural, remove from source..
|
||||
# Drop NA where numerical cols have them
|
||||
if gene.lower() in geneL_na_ppi2:
|
||||
#D1148 get rid of
|
||||
na_index = my_df['mutationinformation'].index[my_df['mcsm_na_affinity'].apply(np.isnan)]
|
||||
my_df = my_df.drop(index=na_index)
|
||||
|
||||
# FIXME: either impute or remove!
|
||||
# for embb (L114M, F115L, V123L, V125I, V131M) delete for now
|
||||
if gene.lower() in ['embb']:
|
||||
na_index = my_df['mutationinformation'].index[my_df['ligand_distance'].apply(np.isnan)]
|
||||
my_df = my_df.drop(index=na_index)
|
||||
#%%============================================================================
|
||||
|
||||
# Target1: mutation_info_labels, convert to
|
||||
dm_om_map = {'DM': 1, 'OM': 0} # pnca, OM is minority, other genes: DM is minority
|
||||
my_df['mutation_class'] = my_df['mutation_info_labels'].map(dm_om_map)
|
||||
my_df['mutation_class'].value_counts()
|
||||
my_df['mutation_info_labels'].value_counts()
|
||||
|
||||
#%%
|
||||
# GET X
|
||||
common_cols_stabiltyN = ['ligand_distance'
|
||||
, 'ligand_affinity_change'
|
||||
, 'duet_stability_change'
|
||||
, 'ddg_foldx'
|
||||
, 'deepddg'
|
||||
, 'ddg_dynamut2']
|
||||
|
||||
# Build stability columns ~ gene
|
||||
if gene.lower() in geneL_basic:
|
||||
x_stabilityN = common_cols_stabiltyN
|
||||
|
||||
if gene.lower() in geneL_ppi2:
|
||||
x_stabilityN = common_cols_stabiltyN + ['mcsm_ppi2_affinity'
|
||||
, 'interface_dist']
|
||||
if gene.lower() in geneL_na:
|
||||
x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity']
|
||||
|
||||
if gene.lower() in geneL_na_ppi2:
|
||||
x_stabilityN = common_cols_stabiltyN + ['mcsm_na_affinity'] + ['mcsm_ppi2_affinity', 'interface_dist']
|
||||
|
||||
X_strFN = ['asa'
|
||||
, 'rsa'
|
||||
, 'kd_values'
|
||||
, 'rd_values']
|
||||
|
||||
X_evolFN = ['consurf_score'
|
||||
, 'snap2_score'
|
||||
, 'snap2_accuracy_pc']
|
||||
|
||||
# X_genomicFN = ['af'
|
||||
# , 'or_mychisq'
|
||||
# , 'or_logistic'
|
||||
# , 'or_fisher'
|
||||
# , 'pval_fisher']
|
||||
|
||||
#%% Construct numerical and categorical column names
|
||||
numerical_FN = x_stabilityN + X_strFN + X_evolFN
|
||||
|
||||
# separate ones for foldx?
|
||||
categorical_FN = ['ss_class'
|
||||
, 'wt_prop_water'
|
||||
# , 'lineage_labels' # misleading if using merged_df3
|
||||
, 'mut_prop_water'
|
||||
, 'wt_prop_polarity'
|
||||
, 'mut_prop_polarity'
|
||||
, 'wt_calcprop'
|
||||
, 'mut_calcprop'
|
||||
, 'active_aa_pos']
|
||||
|
||||
#%% extracting dfs based on numerical, categorical column names
|
||||
#----------------------------------
|
||||
# WITHOUT the target var included
|
||||
#----------------------------------
|
||||
num_df = my_df[numerical_FN]
|
||||
num_df.shape
|
||||
|
||||
cat_df = my_df[categorical_FN]
|
||||
cat_df.shape
|
||||
|
||||
all_df = my_df[numerical_FN + categorical_FN]
|
||||
all_df.shape
|
||||
|
||||
#------------------------------
|
||||
# WITH the target var included:
|
||||
#'wtgt': with target
|
||||
#------------------------------
|
||||
num_df_wtgt = my_df[numerical_FN + ['mutation_class']]
|
||||
num_df_wtgt.shape
|
||||
|
||||
cat_df_wtgt = my_df[categorical_FN + ['mutation_class']]
|
||||
cat_df_wtgt.shape
|
||||
|
||||
all_df_wtgt = my_df[numerical_FN + categorical_FN + ['mutation_class']]
|
||||
all_df_wtgt.shape
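# Hedged sketch (not wired into anything downstream): one way the column lists above
# could feed a preprocessing ColumnTransformer; the MinMaxScaler/OneHotEncoder choices
# here are illustrative assumptions, not the project's settled preprocessing.
demo_preprocessor = ColumnTransformer(
    [('num', MinMaxScaler(), numerical_FN)
     , ('cat', OneHotEncoder(handle_unknown = 'ignore'), categorical_FN)]
    , remainder = 'drop')
# e.g. demo_preprocessor.fit_transform(all_df) gives the scaled + one-hot encoded matrix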
|
||||
|
||||
#%%
|
||||
#%% Get train-test split and scoring functions
|
||||
# X = num_df_wtgt[numerical_FN]
|
||||
# y = num_df_wtgt['mutation_class']
|
||||
|
||||
# X_train, X_test, y_train, y_test = train_test_split(X
|
||||
# , y
|
||||
# , test_size = 0.33
|
||||
# , **rs
|
||||
# , shuffle = True
|
||||
# , stratify = y)
|
||||
|
27 itertools.py Normal file
@@ -0,0 +1,27 @@
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Tue Mar 22 17:12:12 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
# itertools
|
||||
# https://datagy.io/python-combinations-of-a-list/
|
||||
from itertools import combinations
|
||||
sample_list = ['a', 'b', 'c']
|
||||
list_combinations = list()
|
||||
for n in range(len(sample_list) + 1):
|
||||
list_combinations += list(combinations(sample_list, n))
|
||||
print(list_combinations)
|
||||
|
||||
|
||||
#%%
|
||||
col_list = num_df_wtgt.columns
|
||||
col_list = col_list[1:4]
|
||||
len(col_list)
|
||||
|
||||
col_L_combinations = list()
|
||||
for n in range(1, len(col_list) + 1):
|
||||
col_L_combinations += list(combinations(col_list, n))
|
||||
print(col_L_combinations)
|
||||
print(len(col_L_combinations))
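# Hedged sketch of the presumed downstream use (kept commented; needs sklearn and the
# num_df_wtgt frame from the main scripts): score a model on every feature subset.
# LogisticRegression here is only a stand-in estimator for illustration.
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import cross_val_score
# for cols in col_L_combinations:
#     subset_scores = cross_val_score(LogisticRegression(max_iter = 1000)
#                                     , num_df_wtgt[list(cols)]
#                                     , num_df_wtgt['mutation_class']
#                                     , cv = 5)
#     print(cols, round(subset_scores.mean(), 3))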
|
57 umap_fs.py Normal file
@@ -0,0 +1,57 @@
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Wed Mar 23 13:36:46 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
#https://umap-learn.readthedocs.io/en/latest/auto_examples/plot_feature_extraction_classification.html
|
||||
from sklearn.datasets import make_classification
|
||||
from sklearn.model_selection import train_test_split, GridSearchCV
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.svm import LinearSVC
|
||||
|
||||
from umap import UMAP
|
||||
|
||||
# Make a toy dataset
|
||||
X, y = make_classification(
|
||||
n_samples=1000,
|
||||
n_features=300,
|
||||
n_informative=250,
|
||||
n_redundant=0,
|
||||
n_repeated=0,
|
||||
n_classes=2,
|
||||
random_state=1212,
|
||||
)
|
||||
|
||||
# Split the dataset into a training set and a test set
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
X, y, test_size=0.2, random_state=42
|
||||
)
|
||||
|
||||
# Classification with a linear SVM
|
||||
svc = LinearSVC(dual=False, random_state=123)
|
||||
params_grid = {"C": [10 ** k for k in range(-3, 4)]}
|
||||
clf = GridSearchCV(svc, params_grid)
|
||||
clf.fit(X_train, y_train)
|
||||
print(
|
||||
"Accuracy on the test set with raw data: {:.3f}".format(clf.score(X_test, y_test))
|
||||
)
|
||||
|
||||
# Transformation with UMAP followed by classification with a linear SVM
|
||||
umap = UMAP(random_state=456)
|
||||
pipeline = Pipeline([("umap", umap), ("svc", svc)])
|
||||
params_grid_pipeline = {
|
||||
"umap__n_neighbors": [5, 20],
|
||||
"umap__n_components": [15, 25, 50],
|
||||
"svc__C": [10 ** k for k in range(-3, 4)],
|
||||
}
|
||||
|
||||
|
||||
clf_pipeline = GridSearchCV(pipeline, params_grid_pipeline)
|
||||
clf_pipeline.fit(X_train, y_train)
|
||||
print(
|
||||
"Accuracy on the test set with UMAP transformation: {:.3f}".format(
|
||||
clf_pipeline.score(X_test, y_test)
|
||||
)
|
||||
)
|
778 unsup_v1.py Normal file
@@ -0,0 +1,778 @@
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Wed Mar 16 16:55:06 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
import sklearn
|
||||
from sklearn import cluster, datasets
|
||||
import warnings
|
||||
import sys
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.metrics import accuracy_score, confusion_matrix, adjusted_rand_score
|
||||
from sklearn.metrics import v_measure_score
|
||||
# Algo:https://machinelearningmastery.com/clustering-algorithms-with-python/
|
||||
# K-means
|
||||
# HC
|
||||
# Gaussian mixture model
|
||||
# FP growth
|
||||
# PCA
|
||||
# Meanshift
|
||||
# DBScan
|
||||
|
||||
# Model assessment:
|
||||
# Mutual information
|
||||
# Silhouette score
|
||||
# v_measure_score
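# Hedged, self-contained sketch of those assessment scores on toy blobs (purely
# illustrative; adjusted_mutual_info_score / silhouette_score are extra imports
# beyond what this script uses later).
from sklearn.metrics import adjusted_mutual_info_score, silhouette_score
_X_demo, _y_demo = datasets.make_blobs(n_samples = 200, centers = 3, random_state = 0)
_demo_labels = KMeans(n_clusters = 3, random_state = 0).fit_predict(_X_demo)
print('AMI        : %.3f' % adjusted_mutual_info_score(_y_demo, _demo_labels))
print('Silhouette : %.3f' % silhouette_score(_X_demo, _demo_labels))
print('V-measure  : %.3f' % v_measure_score(_y_demo, _demo_labels))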
|
||||
|
||||
# Itertools.combinations()
|
||||
|
||||
#%% Example 1: Kmeans, https://coderzcolumn.com/tutorials/machine-learning/unsupervised-learning-clustering-kmeans-using-scikit-learn-sklearn
|
||||
|
||||
# For K-means clustering, the model is that all clusters
|
||||
# have equal, spherical variance.
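# Hedged illustration of that assumption (an aside, not part of the tutorial below):
# stretch the blobs anisotropically and the equal, spherical-variance model breaks,
# which shows up as a lower adjusted Rand index for K-means.
_Xb, _yb = datasets.make_blobs(n_samples = 300, centers = 3, random_state = 0)
_Xb_aniso = np.dot(_Xb, np.array([[0.6, -0.6], [-0.4, 0.8]]))   # shear the blobs
for _tag, _dat in [('spherical  ', _Xb), ('anisotropic', _Xb_aniso)]:
    _pred = KMeans(n_clusters = 3, random_state = 0).fit_predict(_dat)
    print('%s blobs, ARI : %.3f' % (_tag, adjusted_rand_score(_yb, _pred)))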
|
||||
|
||||
samples, clusters = datasets.make_blobs(n_samples=250, n_features=2, centers=5, cluster_std=0.7, random_state=12345)
|
||||
print('Dataset size : ', samples.shape, clusters.shape)
|
||||
print('Cluster names : ',set(clusters))
|
||||
with plt.style.context(('ggplot', 'seaborn')):
|
||||
plt.figure(figsize=(8,6))
|
||||
for i, c, m in zip(range(5),['red','green','blue','orange','purple'], ['s','+','^','o', 'x']):
|
||||
plt.scatter(samples[clusters == i,0],samples[clusters == i,1], color=c, marker=m, s=80, alpha = 0.8, label= 'Cluster %d'%i)
|
||||
|
||||
plt.xlabel('Feature 1')
|
||||
plt.ylabel('Feature 2')
|
||||
plt.title('Visualizing Dataset')
|
||||
plt.legend(loc='best')
|
||||
|
||||
#kmeans = cluster.KMeans(n_clusters=5)
|
||||
kmeans = cluster.KMeans(n_clusters=2)
|
||||
|
||||
kmeans.fit(samples)
|
||||
preds = kmeans.predict(samples)
|
||||
|
||||
print('Accuracy : %.3f'%accuracy_score(y_true = clusters, y_pred=preds))
|
||||
print('Confusion Matrix : \n', confusion_matrix(y_true=clusters, y_pred=preds))
|
||||
print('Adjusted Accuracy : %.3f'%adjusted_rand_score(labels_true=clusters, labels_pred=preds))
|
||||
print('Cluster Centers : \n', str(kmeans.cluster_centers_))
|
||||
|
||||
print('Sum of squared distances of samples to their closest cluster center : %.2f'%kmeans.inertia_,)
|
||||
with plt.style.context(('ggplot', 'seaborn')):
|
||||
plt.figure(figsize=(10,6))
|
||||
|
||||
plt.scatter(samples[preds == 0,0],samples[preds == 0,1], color='red', marker='s', s=80, alpha = 0.8, label= 'Cluster 0')
|
||||
plt.scatter(samples[preds == 1,0],samples[preds == 1,1], color='green', marker='^', s=80, alpha = 0.8, label= 'Cluster 1')
|
||||
plt.scatter(samples[preds == 2,0],samples[preds == 2,1], color='blue', marker='*', s=80, alpha = 0.8, label= 'Cluster 2')
|
||||
plt.scatter(samples[preds == 3,0],samples[preds == 3,1], color='orange', marker='o', s=80, alpha = 0.8, label= 'Cluster 3')
|
||||
plt.scatter(samples[preds == 4,0],samples[preds == 4,1], color='purple', marker='+', s=80, alpha = 0.8, label= 'Cluster 4')
|
||||
|
||||
for x,y in zip(samples[preds == 0,0],samples[preds == 0,1]):
|
||||
plt.plot([kmeans.cluster_centers_[0][0],x],[kmeans.cluster_centers_[0][1],y], color='red')
|
||||
for x,y in zip(samples[preds == 1,0],samples[preds == 1,1]):
|
||||
plt.plot([kmeans.cluster_centers_[1][0],x],[kmeans.cluster_centers_[1][1],y], color='green')
|
||||
for x,y in zip(samples[preds == 2,0],samples[preds == 2,1]):
|
||||
plt.plot([kmeans.cluster_centers_[2][0],x],[kmeans.cluster_centers_[2][1],y], color='blue')
|
||||
for x,y in zip(samples[preds == 3,0],samples[preds == 3,1]):
|
||||
plt.plot([kmeans.cluster_centers_[3][0],x],[kmeans.cluster_centers_[3][1],y], color='orange')
|
||||
for x,y in zip(samples[preds == 4,0],samples[preds == 4,1]):
|
||||
plt.plot([kmeans.cluster_centers_[4][0],x],[kmeans.cluster_centers_[4][1],y], color='purple')
|
||||
|
||||
plt.xlabel('Feature 1')
|
||||
plt.ylabel('Feature 2')
|
||||
plt.title('Visualizing Predictions & Cluster Centers')
|
||||
plt.legend(loc='best')
|
||||
|
||||
# The Elbow Method: To decide the numbers of cluster i.e. 'k'
|
||||
plt.figure(figsize=(8,5))
|
||||
distortions = []
|
||||
for i in range(1,11):
|
||||
kmeans = cluster.KMeans(n_clusters=i)
|
||||
kmeans.fit(samples)
|
||||
distortions.append(kmeans.inertia_)
|
||||
|
||||
print('Distortions (Sum Of Squared Distance of Samples from Closest Cluster Center) : ',distortions)
|
||||
|
||||
with plt.style.context(('ggplot', 'seaborn')):
|
||||
plt.plot(range(1,11), distortions, )
|
||||
plt.scatter(range(1,11), distortions, color='red', marker='o', s=80)
|
||||
plt.xlabel('Number Of Clusters')
|
||||
plt.ylabel('Distortions')
|
||||
plt.title('The Elbow Method (Num of Clusters vs Distortions)')
|
||||
plt.xticks(range(1,11));
|
||||
|
||||
#%% Example 1: My data
|
||||
X_unsup = num_df_wtgt[['ligand_affinity_change'
|
||||
, 'duet_stability_change']]
|
||||
|
||||
kmeans = Pipeline([
|
||||
#('pca', PCA())
|
||||
('pre', MinMaxScaler())
|
||||
, ('clf', KMeans(n_clusters = 2))
|
||||
])
|
||||
|
||||
#kmeans = KMeans(n_clusters = 2)
|
||||
#kmeans.fit(X_unsup)
|
||||
kmeans.fit(X_unsup)
|
||||
y_kmeans = kmeans.predict(X_unsup)
|
||||
|
||||
plt.scatter(X_unsup.loc [:, 'ligand_affinity_change']
|
||||
, X_unsup.loc[:, 'duet_stability_change']
|
||||
, c = y_kmeans
|
||||
, s = 50
|
||||
, cmap = 'viridis')
|
||||
|
||||
# the KMeans step sits inside the Pipeline; pull its centres out and map them back
# from MinMax-scaled space so they overlay the raw-feature scatter above
centers = kmeans.named_steps['pre'].inverse_transform(kmeans.named_steps['clf'].cluster_centers_)
|
||||
plt.scatter(centers[:, 0], centers[:, 1]
|
||||
, c = 'black'
|
||||
, s = 200
|
||||
, alpha = 0.5);
|
||||
plt.show()
|
||||
#%% Example 2: https://builtin.com/data-science/unsupervised-learning-python
|
||||
|
||||
# Loading dataset
|
||||
iris_df = datasets.load_iris()
|
||||
|
||||
# Available methods on dataset
|
||||
print(dir(iris_df))
|
||||
|
||||
# Features
|
||||
print(iris_df.feature_names)
|
||||
|
||||
# Targets
|
||||
print(iris_df.target)
|
||||
|
||||
# Target Names
|
||||
print(iris_df.target_names)
|
||||
label = {0: 'red', 1: 'blue', 2: 'green'}
|
||||
|
||||
# Dataset Slicing
|
||||
x_axis = iris_df.data[:, 0] # Sepal Length
|
||||
y_axis = iris_df.data[:, 2]  # Petal Length
|
||||
|
||||
# Plotting
|
||||
plt.scatter(x_axis, y_axis, c=iris_df.target)
|
||||
plt.show()
|
||||
|
||||
# Use Kmeans: Declaring Model
|
||||
model = KMeans(n_clusters = 3)
|
||||
|
||||
# Fitting Model
|
||||
model.fit(iris_df.data)
|
||||
|
||||
# Predicting a single input
|
||||
predicted_label = model.predict([[7.2, 3.5, 0.8, 1.6]])
|
||||
|
||||
# Prediction on the entire data
|
||||
all_predictions = model.predict(iris_df.data)
|
||||
|
||||
# Printing Predictions
|
||||
print(predicted_label)
|
||||
print(all_predictions)
|
||||
|
||||
#%% Example 2: My data
|
||||
X_unsup = num_df_wtgt[[#'ligand_affinity_change'
|
||||
#, 'duet_stability_change'
|
||||
'ddg_foldx'
|
||||
,'deepddg']]
|
||||
y_unsup = num_df_wtgt[['mutation_class']]
|
||||
|
||||
|
||||
# X_train, X_test, y_train, y_test = train_test_split(X_unsup
|
||||
# , y_unsup
|
||||
# , test_size = 0.33
|
||||
# , **rs
|
||||
# , shuffle = True
|
||||
# , stratify = y_unsup)
|
||||
|
||||
#model = KMeans(n_clusters=2)
|
||||
model = Pipeline([
|
||||
#('pca', PCA())
|
||||
('pre', MinMaxScaler())
|
||||
, ('clf', KMeans(n_clusters = 2))
|
||||
])
|
||||
|
||||
label = {0: 'blue', 1: 'red'}
|
||||
|
||||
model.fit(X_unsup)
|
||||
#predicted_label = model.predict(X_test) # X_test needs the commented-out train_test_split above
predicted_label = model.predict(X_unsup[:5])   # predict on a few rows instead
|
||||
all_predictions = model.predict(X_unsup)
|
||||
print(predicted_label)
|
||||
print(all_predictions)
|
||||
|
||||
plt.scatter(X_unsup.loc[:, 'ddg_foldx']
            , X_unsup.loc[:, 'deepddg']
            , c = y_unsup.loc[:, 'mutation_class'])
|
||||
|
||||
from yellowbrick.cluster import KElbowVisualizer
model = KMeans()
visualizer = KElbowVisualizer(model, k=(1,12)).fit(X_unsup)   # 'df' was undefined here
visualizer.show()
|
||||
|
||||
#%% Example 3: My data Vscores, https://www.geeksforgeeks.org/ml-v-measure-for-evaluating-clustering-performance/
|
||||
|
||||
# List of V-Measure Scores for different models
|
||||
v_scores = []
|
||||
|
||||
# List of different types of covariance parameters
|
||||
N_Clusters = [2, 3]
|
||||
|
||||
# Building the clustering model
|
||||
kmeans2 = Pipeline([
|
||||
#('pca', PCA())
|
||||
('pre', MinMaxScaler())
|
||||
# ('pre'), StandardScaler())
|
||||
, ('clf', KMeans(n_clusters = 2))
|
||||
])
|
||||
|
||||
# Training the clustering model
|
||||
kmeans2.fit(X_unsup)
|
||||
|
||||
# Storing the predicted Clustering labels
|
||||
labels2 = kmeans2.predict(X_unsup)
|
||||
|
||||
# Evaluating the performance
|
||||
v_scores.append(v_measure_score(y_unsup.loc[:,'mutation_class'], labels2))
|
||||
|
||||
# Building the clustering model
|
||||
|
||||
kmeans3 = Pipeline([
|
||||
#('pca', PCA())
|
||||
('pre', MinMaxScaler())
|
||||
# ('pre'), StandardScaler())
|
||||
, ('clf', KMeans(n_clusters = 3))
|
||||
])
|
||||
|
||||
# Training the clustering model
|
||||
kmeans3.fit(X_unsup)
|
||||
|
||||
# Storing the predicted Clustering labels
|
||||
labels3 = kmeans3.predict(X_unsup)
|
||||
|
||||
# Evaluating the performance
|
||||
v_scores.append(v_measure_score(y_unsup.loc[:,'mutation_class'], labels3))
|
||||
|
||||
# Plotting a Bar Graph to compare the models
|
||||
plt.bar(N_Clusters, v_scores)
|
||||
plt.xlabel('Number of Clusters')
|
||||
plt.ylabel('V-Measure Score')
|
||||
plt.title('Comparison of different Clustering Models')
|
||||
plt.show()
|
||||
|
||||
|
||||
# Score: silhouette
|
||||
kmeans_kwargs = {
|
||||
"init": "random",
|
||||
"n_init": 10,
|
||||
"max_iter": 300,
|
||||
"random_state": 42}
|
||||
|
||||
from sklearn.metrics import silhouette_score
|
||||
#https://realpython.com/k-means-clustering-python/
|
||||
#import kneed
|
||||
#from kneed import KneeLocator
|
||||
scaler = StandardScaler()
|
||||
X_unsup_scaled = scaler.fit_transform(X_unsup)
|
||||
|
||||
silhouette_coefficients = []
|
||||
# Notice you start at 2 clusters for silhouette coefficient
|
||||
for k in range(2, 5):
|
||||
kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
|
||||
# kmeans = Pipeline([
|
||||
# # ('pre', MinMaxScaler())
|
||||
# ('pre', StandardScaler())
|
||||
# , ('clf', KMeans(n_clusters=k, **kmeans_kwargs))
|
||||
# ])
|
||||
kmeans.fit(X_unsup_scaled)
|
||||
score = silhouette_score(X_unsup_scaled, kmeans.labels_)
|
||||
silhouette_coefficients.append(score)
|
||||
|
||||
plt.style.use("fivethirtyeight")
|
||||
plt.plot(range(2, 5), silhouette_coefficients)
|
||||
plt.xticks(range(2, 5))
|
||||
plt.xlabel("Number of Clusters")
|
||||
plt.ylabel("Silhouette Coefficient")
|
||||
plt.show()
|
||||
plt.bar(range(2, 5), silhouette_coefficients)
|
||||
|
||||
from sklearn.cluster import DBSCAN
|
||||
from sklearn.datasets import make_moons
|
||||
from sklearn.metrics import adjusted_rand_score
|
||||
|
||||
# Instantiate k-means and dbscan algorithms
|
||||
kmeans = KMeans(n_clusters=2)
|
||||
dbscan = DBSCAN(eps=0.3)
|
||||
|
||||
# Fit the algorithms to the features
|
||||
kmeans.fit(X_unsup_scaled)
|
||||
dbscan.fit(X_unsup_scaled)
|
||||
|
||||
# Compute the silhouette scores for each algorithm
|
||||
kmeans_silhouette = silhouette_score(
|
||||
X_unsup_scaled, kmeans.labels_ ).round(2)
|
||||
dbscan_silhouette = silhouette_score(
|
||||
X_unsup_scaled, dbscan.labels_).round (2)
|
||||
|
||||
kmeans_silhouette
|
||||
dbscan_silhouette
|
||||
|
||||
ari_kmeans = adjusted_rand_score(y_unsup.iloc[:,0], kmeans.labels_)
|
||||
ari_dbscan = adjusted_rand_score(y_unsup.iloc[:,0], dbscan.labels_)
|
||||
|
||||
round(ari_kmeans, 2)
|
||||
round(ari_dbscan, 2)
|
||||
|
||||
# Crescent plot
|
||||
fig, (ax1, ax2) = plt.subplots(
|
||||
1, 2, figsize=(8, 6), sharex=True, sharey=True
|
||||
)
|
||||
|
||||
fig.suptitle(f"Clustering Algorithm Comparison: Crescents", fontsize=16)
|
||||
fte_colors = {
|
||||
0: "#008fd5",
|
||||
1: "#fc4f30",
|
||||
}
|
||||
|
||||
# The k-means plot
|
||||
km_colors = [fte_colors[label] for label in kmeans.labels_]
|
||||
ax1.scatter(X_unsup_scaled[:, 0], X_unsup_scaled[:, 1], c=km_colors)
|
||||
ax1.set_title(
|
||||
f"k-means\nSilhouette: {kmeans_silhouette}", fontdict={"fontsize": 12}
|
||||
)
|
||||
|
||||
# The dbscan plot
|
||||
db_colors = [fte_colors[label] for label in dbscan.labels_]
|
||||
ax2.scatter(X_unsup_scaled[:, 0], X_unsup_scaled[:, 1], c=db_colors)
|
||||
ax2.set_title(
|
||||
f"DBSCAN\nSilhouette: {dbscan_silhouette}", fontdict={"fontsize": 12}
|
||||
)
|
||||
plt.show()
|
||||
|
||||
|
||||
#%% Example 4: Machinelearning mastery, https://machinelearningmastery.com/clustering-algorithms-with-python/
|
||||
from sklearn.cluster import AffinityPropagation
|
||||
from matplotlib import pyplot
|
||||
from numpy import unique
|
||||
from numpy import where
|
||||
from sklearn.datasets import make_classification
|
||||
from sklearn.cluster import AgglomerativeClustering
|
||||
from sklearn.cluster import Birch
|
||||
from sklearn.cluster import DBSCAN
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.cluster import MiniBatchKMeans
|
||||
from sklearn.cluster import OPTICS
|
||||
from sklearn.cluster import SpectralClustering
|
||||
from sklearn.mixture import GaussianMixture
|
||||
from sklearn.decomposition import PCA
|
||||
# affinity propagation
|
||||
#
|
||||
# which takes as input measures of similarity between pairs of data points. Real-valued messages are exchanged between data points until a high-quality set of exemplars and corresponding clusters gradually emerges.
|
||||
#=============
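# Hedged mini-example of AffinityPropagation itself, since the pipeline below swaps in
# other algorithms; run on a MinMax-scaled copy of X_unsup purely for illustration
# (damping=0.9 is an assumed setting, not a tuned value).
_ap = AffinityPropagation(damping = 0.9, random_state = 42)
_ap_labels = _ap.fit_predict(MinMaxScaler().fit_transform(X_unsup))
print('AffinityPropagation exemplars found :', len(_ap.cluster_centers_indices_))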
|
||||
XA = np.array(X_unsup)
|
||||
|
||||
# define the model
|
||||
#model = AffinityPropagation(damping=0.9)
|
||||
model = Pipeline([
|
||||
#('pca', PCA(n_components = 2))
|
||||
('pre', MinMaxScaler())
|
||||
#('pre', StandardScaler())
|
||||
#, ('clf', AffinityPropagation(damping=0.9))
|
||||
, ('clf', AgglomerativeClustering(n_clusters=2)) #y
|
||||
#, ('clf', Birch(threshold=0.01, n_clusters=2)) #y
|
||||
#, ('clf', DBSCAN(eps=0.30, min_samples=9) ) #n
|
||||
#, ('clf', KMeans(n_clusters=2)) #y
|
||||
#, ('clf', MiniBatchKMeans(n_clusters=2))
|
||||
#, ('clf', OPTICS(eps=0.8, min_samples=10))
|
||||
#, ('clf', SpectralClustering(n_clusters=2))
|
||||
# , ('clf', GaussianMixture(n_components=2))
|
||||
|
||||
])
|
||||
model
|
||||
# fit the model
|
||||
#model.fit(X_unsup)
|
||||
#yhat = model.predict(X_unsup)
|
||||
yhat = model.fit_predict(X_unsup)
|
||||
# retrieve unique clusters
|
||||
clusters = unique(yhat)
|
||||
|
||||
# create scatter plot for samples from each cluster
|
||||
for cluster in clusters:
|
||||
print(cluster)
|
||||
# get row indexes for samples with this cluster
|
||||
row_ix = where(yhat == cluster)
|
||||
print(row_ix)
|
||||
|
||||
# create scatter of these samples
|
||||
#pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
|
||||
pyplot.scatter(XA[row_ix, 0], XA[row_ix, 1])
|
||||
|
||||
# show the plot
|
||||
pyplot.show()
|
||||
#%%Example 5:https://github.com/AntonsRuberts/datascience_marketing/blob/master/KMeans_vs_KPrototypes.ipynb
|
||||
import os
|
||||
import json
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pandas.io.json import json_normalize
|
||||
from datetime import datetime
|
||||
from tqdm import tqdm
|
||||
from sklearn.preprocessing import PowerTransformer
|
||||
import umap
|
||||
import matplotlib.pyplot as plt
|
||||
import plotly.graph_objects as go
|
||||
from scipy import stats
|
||||
from sklearn.cluster import KMeans
|
||||
from kmodes.kprototypes import KPrototypes
|
||||
from lightgbm import LGBMClassifier
|
||||
import shap
|
||||
from sklearn.model_selection import cross_val_score
|
||||
from kmodes.kprototypes import KPrototypes
|
||||
import lightgbm
|
||||
from lightgbm import LGBMClassifier
|
||||
|
||||
full_data = pd.read_csv('/home/tanu/git/ML_AI_training/ml_data/ga_customers.csv')
|
||||
full_data.head()
|
||||
|
||||
#Preprocessing numerical
|
||||
numerical = full_data.select_dtypes(exclude='object')
|
||||
|
||||
for c in numerical.columns:
|
||||
print(c)
|
||||
pt = PowerTransformer()
|
||||
numerical.loc[:, c] = pt.fit_transform(np.array(numerical[c]).reshape(-1, 1))
|
||||
|
||||
##preprocessing categorical
|
||||
categorical = full_data.select_dtypes(include='object')
|
||||
categorical.head()
|
||||
|
||||
for col in categorical.columns:
|
||||
#print('-' * 40 + col + '-' * 40 , end=' - ')
|
||||
display(categorical[col].value_counts().head(10))
|
||||
|
||||
categorical = pd.get_dummies(categorical)
|
||||
categorical_weight = len(full_data.select_dtypes(include='object').columns) / full_data.shape[1]
|
||||
|
||||
#Embedding numerical & categorical
|
||||
# fit1 = PCA(n_components=2).fit(numerical)
|
||||
# fit2 = PCA(n_components=2).fit(categorical)
|
||||
fit1 = umap.UMAP(metric='l2').fit(numerical)
|
||||
fit2 = umap.UMAP(metric='dice').fit(categorical)
|
||||
|
||||
intersection = umap.umap_.general_simplicial_set_intersection(fit1.graph_, fit2.graph_, weight=categorical_weight)
|
||||
intersection = umap.umap_.reset_local_connectivity(intersection)
|
||||
#https://github.com/lmcinnes/umap/issues/561
|
||||
embedding = umap.umap_.simplicial_set_embedding(fit1._raw_data
|
||||
, intersection
|
||||
, fit1.n_components
|
||||
, fit1._initial_alpha
|
||||
, fit1._a
|
||||
, fit1._b
|
||||
, fit1.repulsion_strength
|
||||
, fit1.negative_sample_rate
|
||||
, 200
|
||||
, 'random'
|
||||
, np.random
|
||||
, fit1.metric
|
||||
, fit1._metric_kwds
|
||||
, False
|
||||
, densmap_kwds = {}
|
||||
, output_dens = False)
|
||||
plt.figure(figsize=(20, 10))
|
||||
plt.scatter(*np.array(embedding)[0].T, s=2, cmap='Spectral', alpha=1.0)
|
||||
plt.show()
|
||||
|
||||
#One-Hot-Encoding
|
||||
data = pd.get_dummies(full_data)
|
||||
|
||||
#Pre-processing
|
||||
for c in data.columns:
|
||||
pt = PowerTransformer()
|
||||
data.loc[:, c] = pt.fit_transform(np.array(data[c]).reshape(-1, 1))
|
||||
#Actual Clustering
|
||||
kmeans = KMeans(n_clusters=15).fit(data)
|
||||
kmeans_labels = kmeans.labels_
|
||||
#OPTIONAL: Elbow plot with inertia
|
||||
#Elbow method to choose the optimal number of clusters
|
||||
# sse = {}
|
||||
# for k in tqdm(range(2, 50)):
|
||||
# kmeans = KMeans(n_clusters=k, max_iter=1000).fit(data)
|
||||
# sse[k] = kmeans.inertia_ # Inertia: Sum of distances of samples to their closest cluster center
|
||||
|
||||
# fig = go.Figure(data=go.Scatter(x=list(sse.keys()), y=list(sse.values())))
|
||||
# fig.show()
|
||||
|
||||
# K-Prototypes
|
||||
kprot_data = full_data.copy()
|
||||
#Pre-processing
|
||||
for c in full_data.select_dtypes(exclude='object').columns:
|
||||
pt = PowerTransformer()
|
||||
kprot_data[c] = pt.fit_transform(np.array(kprot_data[c]).reshape(-1, 1))
|
||||
|
||||
#categorical_columns = [0, 4, 5, 7, 11] #make sure to specify correct indices
|
||||
categorical_columns = [1, 5, 6, 8, 12] #make sure to specify correct indices
|
||||
|
||||
#Actual clustering
|
||||
kproto = KPrototypes(n_clusters= 15, init='Cao', n_jobs = 4)
|
||||
clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)
|
||||
|
||||
#Prints the count of each cluster group
|
||||
pd.Series(clusters).value_counts()
|
||||
|
||||
#OPTIONAL: Elbow plot with cost (will take a LONG time)
|
||||
costs = []
|
||||
n_clusters = []
|
||||
clusters_assigned = []
|
||||
for i in tqdm(range(2, 25)):
|
||||
#for i in tqdm(range(2, 10)):
|
||||
|
||||
try:
|
||||
kproto = KPrototypes(n_clusters= i, init='Cao', verbose=2, n_jobs = 10)
|
||||
clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)
|
||||
costs.append(kproto.cost_)
|
||||
n_clusters.append(i)
|
||||
clusters_assigned.append(clusters)
|
||||
except:
|
||||
print(f"Can't cluster with {i} clusters")
|
||||
|
||||
fig = go.Figure(data=go.Scatter(x=n_clusters, y=costs ))
|
||||
fig.show()
|
||||
|
||||
# Visual Evaluation: Kmeans
|
||||
fig, ax = plt.subplots()
|
||||
fig.set_size_inches((20, 10))
|
||||
#scatter = ax.scatter(embedding[:, 0], embedding[:, 1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)
|
||||
scatter = ax.scatter(embedding[0][:,0], embedding[0][:,1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)
|
||||
|
||||
# produce a legend with the unique colors from the scatter
|
||||
legend1 = ax.legend(*scatter.legend_elements(num=15),
|
||||
loc="lower left", title="Classes")
|
||||
ax.add_artist(legend1)
|
||||
|
||||
|
||||
|
||||
# Visual Evaluation: K-Prototypes
|
||||
fig, ax = plt.subplots()
|
||||
fig.set_size_inches((20, 10))
|
||||
scatter = ax.scatter(embedding[0][:,0], embedding[0][:,1], s=2, c=clusters, cmap='tab20b', alpha=1.0)
|
||||
|
||||
# produce a legend with the unique colors from the scatter
|
||||
legend1 = ax.legend(*scatter.legend_elements(num=15),
|
||||
loc="lower left", title="Classes")
|
||||
ax.add_artist(legend1)
|
||||
|
||||
# Evaluation by Classification
|
||||
#Setting the objects to category
|
||||
lgbm_data = full_data.copy()
|
||||
for c in lgbm_data.select_dtypes(include='object'):
|
||||
lgbm_data[c] = lgbm_data[c].astype('category')
|
||||
|
||||
#KMeans clusters
|
||||
clf_km = LGBMClassifier(colsample_bytree=0.8) # 'colsample_by_tree' is not a LightGBM parameter
|
||||
cv_scores_km = cross_val_score(clf_km, lgbm_data, kmeans_labels, scoring='f1_weighted')
|
||||
print(f'CV F1 score for K-Means clusters is {np.mean(cv_scores_km)}')
|
||||
|
||||
#Fit the model
|
||||
clf_km.fit(lgbm_data, kmeans_labels)
|
||||
#SHAP values
|
||||
explainer_km = shap.TreeExplainer(clf_km)
|
||||
shap_values_km = explainer_km.shap_values(lgbm_data)
|
||||
shap.summary_plot(shap_values_km, lgbm_data, plot_type="bar", plot_size=(15, 10))
|
||||
|
||||
#K-Prototypes
|
||||
clf_kp = LGBMClassifier(colsample_bytree=0.8)
|
||||
#cv_scores_kp = cross_val_score(clf_kp, lgbm_data, proto_clusters, scoring='f1_weighted')
|
||||
cv_scores_kp = cross_val_score(clf_kp, lgbm_data, clusters, scoring='f1_weighted')
|
||||
print(f'CV F1 score for K-Prototypes clusters is {np.mean(cv_scores_kp)}')
|
||||
|
||||
#clf_kp.fit(lgbm_data, proto_clusters)
|
||||
clf_kp.fit(lgbm_data, clusters)
|
||||
|
||||
explainer_kp = shap.TreeExplainer(clf_kp)
|
||||
shap_values_kp = explainer_kp.shap_values(lgbm_data)
|
||||
shap.summary_plot(shap_values_kp, lgbm_data, plot_type="bar", plot_size=(15, 10))
|
||||
#%%Example 5: My data
|
||||
# FIXME: clusters and proto_clusters?
|
||||
|
||||
full_data = all_df.copy()
|
||||
full_data.head()
|
||||
full_data.shape
|
||||
|
||||
#Preprocessing numerical
|
||||
numerical = full_data.select_dtypes(exclude='object')
|
||||
numerical.shape
|
||||
|
||||
for c in numerical.columns:
|
||||
print(c)
|
||||
pt = PowerTransformer()
|
||||
numerical.loc[:, c] = pt.fit_transform(np.array(numerical[c]).reshape(-1, 1))
|
||||
|
||||
##preprocessing categorical
|
||||
categorical = full_data.select_dtypes(include='object')
|
||||
categorical.head()
|
||||
|
||||
for col in categorical.columns:
|
||||
#print('-' * 40 + col + '-' * 40 , end=' - ')
|
||||
display(categorical[col].value_counts().head(10))
|
||||
|
||||
categorical = pd.get_dummies(categorical)
|
||||
categorical_weight = len(full_data.select_dtypes(include='object').columns) / full_data.shape[1]
|
||||
|
||||
#Embedding numerical & categorical
|
||||
# fit1 = PCA(n_components=2).fit(numerical)
|
||||
# fit2 = PCA(n_components=2).fit(categorical)
|
||||
fit1 = umap.UMAP(metric='l2').fit(numerical)
|
||||
fit2 = umap.UMAP(metric='dice').fit(categorical)
|
||||
|
||||
intersection = umap.umap_.general_simplicial_set_intersection(fit1.graph_, fit2.graph_, weight=categorical_weight)
|
||||
intersection = umap.umap_.reset_local_connectivity(intersection)
|
||||
#https://github.com/lmcinnes/umap/issues/561
|
||||
embedding = umap.umap_.simplicial_set_embedding(fit1._raw_data
|
||||
, intersection
|
||||
, fit1.n_components
|
||||
, fit1._initial_alpha
|
||||
, fit1._a
|
||||
, fit1._b
|
||||
, fit1.repulsion_strength
|
||||
, fit1.negative_sample_rate
|
||||
, 200
|
||||
, 'random'
|
||||
, np.random
|
||||
, fit1.metric
|
||||
, fit1._metric_kwds
|
||||
, False
|
||||
, densmap_kwds = {}
|
||||
, output_dens = False)
|
||||
plt.figure(figsize=(20, 10))
|
||||
plt.scatter(*np.array(embedding)[0].T, s=2, cmap='Spectral', alpha=1.0)
|
||||
plt.show()
|
||||
|
||||
#One-Hot-Encoding
|
||||
data = pd.get_dummies(full_data)
|
||||
|
||||
#Pre-processing
|
||||
for c in data.columns:
|
||||
pt = PowerTransformer()
|
||||
data.loc[:, c] = pt.fit_transform(np.array(data[c]).reshape(-1, 1))
|
||||
#Actual Clustering
|
||||
kmeans = KMeans(n_clusters=2).fit(data)
|
||||
kmeans_labels = kmeans.labels_
|
||||
#OPTIONAL: Elbow plot with inertia
|
||||
#Elbow method to choose the optimal number of clusters
|
||||
# sse = {}
|
||||
# for k in tqdm(range(2, 50)):
|
||||
# kmeans = KMeans(n_clusters=k, max_iter=1000).fit(data)
|
||||
# sse[k] = kmeans.inertia_ # Inertia: Sum of distances of samples to their closest cluster center
|
||||
|
||||
# fig = go.Figure(data=go.Scatter(x=list(sse.keys()), y=list(sse.values())))
|
||||
# fig.show()
|
||||
|
||||
# K-Prototypes
|
||||
kprot_data = full_data.copy()
|
||||
#Pre-processing
|
||||
for c in full_data.select_dtypes(exclude='object').columns:
|
||||
pt = PowerTransformer()
|
||||
kprot_data[c] = pt.fit_transform(np.array(kprot_data[c]).reshape(-1, 1))
|
||||
|
||||
#categorical_columns = [0, 4, 5, 7, 11] #make sure to specify correct indices
|
||||
#categorical_columns = [1, 5, 6, 8, 12] #make sure to specify correct indices
|
||||
categorical_columns = []
|
||||
for col in cat_df.columns:
|
||||
print(col)
|
||||
#categ_i += all_df.columns.get_loc(col)
|
||||
categorical_columns.append(full_data.columns.get_loc(col))
|
||||
|
||||
print(categorical_columns)
|
||||
print(len(categorical_columns))
|
||||
|
||||
#Actual clustering
|
||||
kproto = KPrototypes(n_clusters= 2, init='Cao', n_jobs = 10)
|
||||
clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)
proto_clusters = clusters   # same fitted labels; no need to refit an identical model
|
||||
|
||||
#Prints the count of each cluster group
|
||||
pd.Series(clusters).value_counts()
|
||||
|
||||
#OPTIONAL: Elbow plot with cost (will take a LONG time)
|
||||
costs = []
|
||||
n_clusters = []
|
||||
clusters_assigned = []
|
||||
for i in tqdm(range(2, 25)):
|
||||
#for i in tqdm(range(2, 10)):
|
||||
#print(i)
|
||||
|
||||
try:
|
||||
kproto = KPrototypes(n_clusters= i, init='Cao', verbose=2, n_jobs = 10)
|
||||
#clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)
|
||||
proto_clusters = kproto.fit_predict(kprot_data, categorical=categorical_columns)
|
||||
|
||||
costs.append(kproto.cost_)
|
||||
n_clusters.append(i)
|
||||
clusters_assigned.append(clusters)
|
||||
except:
|
||||
print(f"Can't cluster with {i} clusters")
|
||||
|
||||
fig = go.Figure(data=go.Scatter(x=n_clusters, y=costs ))
|
||||
fig.show()
|
||||
|
||||
# Visual Evaluation: Kmeans
|
||||
fig, ax = plt.subplots()
|
||||
fig.set_size_inches((20, 10))
|
||||
#scatter = ax.scatter(embedding[:, 0], embedding[:, 1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)
|
||||
scatter = ax.scatter(embedding[0][:,0], embedding[0][:,1], s=2, c=kmeans_labels, cmap='tab20b', alpha=1.0)
|
||||
|
||||
# produce a legend with the unique colors from the scatter
|
||||
legend1 = ax.legend(*scatter.legend_elements(num=15),
|
||||
loc="lower left", title="Classes")
|
||||
ax.add_artist(legend1)
|
||||
|
||||
|
||||
|
||||
# Visual Evaluation: K-Prototypes
|
||||
fig, ax = plt.subplots()
|
||||
fig.set_size_inches((20, 10))
|
||||
scatter = ax.scatter(embedding[0][:,0], embedding[0][:,1], s=2, c=proto_clusters, cmap='tab20b', alpha=1.0)
|
||||
|
||||
# produce a legend with the unique colors from the scatter
|
||||
legend1 = ax.legend(*scatter.legend_elements(num=2), #15
|
||||
loc="lower left", title="Classes")
|
||||
ax.add_artist(legend1)
|
||||
|
||||
# Evaluation by Classification
|
||||
#Setting the objects to category
|
||||
lgbm_data = full_data.copy()
|
||||
for c in lgbm_data.select_dtypes(include='object'):
|
||||
lgbm_data[c] = lgbm_data[c].astype('category')
|
||||
|
||||
#KMeans clusters
|
||||
clf_km = LGBMClassifier(colsample_bytree=0.8)
|
||||
cv_scores_km = cross_val_score(clf_km, lgbm_data, kmeans_labels, scoring='f1_weighted')
|
||||
print(f'CV F1 score for K-Means clusters is {np.mean(cv_scores_km)}')
|
||||
|
||||
#Fit the model
|
||||
clf_km.fit(lgbm_data, kmeans_labels)
|
||||
#SHAP values
|
||||
explainer_km = shap.TreeExplainer(clf_km)
|
||||
shap_values_km = explainer_km.shap_values(lgbm_data)
|
||||
shap.summary_plot(shap_values_km, lgbm_data, plot_type="bar", plot_size=(15, 10))
|
||||
|
||||
#K-Prototypes
|
||||
clf_kp = LGBMClassifier(colsample_bytree=0.8)
|
||||
cv_scores_kp = cross_val_score(clf_kp, lgbm_data, proto_clusters, scoring='f1_weighted')
#cv_scores_kp = cross_val_score(clf_kp, lgbm_data, clusters, scoring='f1_weighted')
|
||||
print(f'CV F1 score for K-Prototypes clusters is {np.mean(cv_scores_kp)}')
|
||||
|
||||
clf_kp.fit(lgbm_data, proto_clusters)
#clf_kp.fit(lgbm_data, clusters)
|
||||
|
||||
explainer_kp = shap.TreeExplainer(clf_kp)
|
||||
shap_values_kp = explainer_kp.shap_values(lgbm_data)
|
||||
shap.summary_plot(shap_values_kp, lgbm_data, plot_type="bar", plot_size=(15, 10))
|