|
|
|
|
|
""" |
|
Created on Sun Jul 17 06:46:02 PM EDT 2022 |
|
author: Ryan Hildebrandt, github.com/ryancahildebrandt |
|
""" |
|
|
|
import pandas as pd |
|
import random |
|
|
|
from hdbscan import HDBSCAN |
|
from sklearn.cluster import AffinityPropagation |
|
from sklearn.cluster import AgglomerativeClustering |
|
from sklearn.cluster import Birch |
|
from sklearn.cluster import DBSCAN |
|
from sklearn.cluster import KMeans |
|
from sklearn.cluster import MeanShift |
|
from sklearn.cluster import MiniBatchKMeans |
|
from sklearn.cluster import OPTICS |
|
from sklearn.cluster import SpectralClustering |
|
|
|
random.seed(42) |
|
|
|
def cluster_kmeans(in_embs, n_clusters, algorithm): |
|
""" |
|
KMeans(n_clusters=8, *, init='k-means++', n_init=10, max_iter=300, tol=0.0001, verbose=0, random_state=None, copy_x=True, algorithm='lloyd') |
|
""" |
|
return KMeans(n_clusters = n_clusters, algorithm = algorithm).fit(in_embs).labels_ |
|
|
|
def cluster_affinity(in_embs): |
|
""" |
|
AffinityPropagation(*, damping=0.5, max_iter=200, convergence_iter=15, copy=True, preference=None, affinity='euclidean', verbose=False, random_state=None)[source]¶ |
|
""" |
|
return AffinityPropagation().fit(in_embs).labels_ |
|
|
|
def cluster_agglom(in_embs, n_clusters, affinity, linkage): |
|
""" |
|
AgglomerativeClustering(n_clusters=2, *, affinity='euclidean', memory=None, connectivity=None, compute_full_tree='auto', linkage='ward', distance_threshold=None, compute_distances=False)[source]¶ |
|
""" |
|
return AgglomerativeClustering(n_clusters = n_clusters, affinity = affinity, linkage = linkage).fit(in_embs).labels_ |
|
|
|
def cluster_birch(in_embs, branching_factor, n_clusters): |
|
""" |
|
Birch(*, threshold=0.5, branching_factor=50, n_clusters=3, compute_labels=True, copy=True)[source]¶ |
|
""" |
|
return Birch(branching_factor = branching_factor, n_clusters = n_clusters).fit(in_embs).labels_ |
|
|
|
def cluster_dbscan(in_embs, eps, min_samples, metric): |
|
""" |
|
DBSCAN(eps=0.5, *, min_samples=5, metric='euclidean', metric_params=None, algorithm='auto', leaf_size=30, p=None, n_jobs=None)[source]¶ |
|
""" |
|
return DBSCAN(eps = eps, min_samples = min_samples, metric = metric).fit(in_embs).labels_ |
|
|
|
def cluster_minikmeans(in_embs, n_clusters): |
|
""" |
|
MiniBatchKMeans(n_clusters=8, *, init='k-means++', max_iter=100, batch_size=1024, verbose=0, compute_labels=True, random_state=None, tol=0.0, max_no_improvement=10, init_size=None, n_init=3, reassignment_ratio=0.01)[source]¶ |
|
""" |
|
return MiniBatchKMeans(n_clusters = n_clusters).fit(in_embs).labels_ |
|
|
|
def cluster_meanshift(in_embs, bin_seeding, cluster_all): |
|
""" |
|
MeanShift(*, bandwidth=None, seeds=None, bin_seeding=False, min_bin_freq=1, cluster_all=True, n_jobs=None, max_iter=300) |
|
""" |
|
return MeanShift(bin_seeding = bin_seeding, cluster_all = cluster_all).fit(in_embs).labels_ |
|
|
|
def cluster_optics(in_embs, min_samples, metric, min_cluster_size): |
|
""" |
|
OPTICS(*, min_samples=5, max_eps=inf, metric='minkowski', p=2, metric_params=None, cluster_method='xi', eps=None, xi=0.05, predecessor_correction=True, min_cluster_size=None, algorithm='auto', leaf_size=30, memory=None, n_jobs=None)[source]¶ |
|
""" |
|
return OPTICS(min_samples = min_samples, metric = metric, min_cluster_size = min_cluster_size).fit(in_embs).labels_ |
|
|
|
def cluster_spectral(in_embs, n_clusters, affinity): |
|
""" |
|
SpectralClustering(n_clusters=8, *, eigen_solver=None, n_components=None, random_state=None, n_init=10, gamma=1.0, affinity='rbf', n_neighbors=10, eigen_tol=0.0, assign_labels='kmeans', degree=3, coef0=1, kernel_params=None, n_jobs=None, verbose=False)[source]¶ |
|
""" |
|
return SpectralClustering(n_clusters = n_clusters, affinity = affinity).fit(in_embs).labels_ |
|
|
|
def cluster_hdbscan(in_embs, alpha, metric, min_cluster_size): |
|
""" |
|
HDBSCAN(algorithm='best', alpha=1.0, approx_min_span_tree=True, gen_min_span_tree=True, leaf_size=40, memory=Memory(cachedir=None), metric='euclidean', min_cluster_size=5, min_samples=None, p=None) |
|
""" |
|
return HDBSCAN(alpha = alpha, metric = metric, min_cluster_size = min_cluster_size).fit(in_embs).labels_ |
|
|
|
metrics_list = ["cityblock", "cosine", "euclidean", "l1", "l2", "manhattan", "braycurtis", "canberra", "chebyshev", "correlation", "dice", "hamming", "jaccard", "kulsinski", "mahalanobis", "minkowski", "rogerstanimoto", "russellrao", "seuclidean", "sokalmichener", "sokalsneath", "sqeuclidean", "yule"] |
|
|
|
def cluster_ex(in_text, labels): |
|
out = pd.DataFrame({"Text" : in_text , "Cluster" : labels}).sort_values(by = "Cluster") |
|
return out |