|
"""Algorithms for spectral clustering""" |
|
|
|
|
|
|
|
|
|
import warnings |
|
from numbers import Integral, Real |
|
|
|
import numpy as np |
|
from scipy.linalg import LinAlgError, qr, svd |
|
from scipy.sparse import csc_matrix |
|
|
|
from ..base import BaseEstimator, ClusterMixin, _fit_context |
|
from ..manifold._spectral_embedding import _spectral_embedding |
|
from ..metrics.pairwise import KERNEL_PARAMS, pairwise_kernels |
|
from ..neighbors import NearestNeighbors, kneighbors_graph |
|
from ..utils import as_float_array, check_random_state |
|
from ..utils._param_validation import Interval, StrOptions, validate_params |
|
from ..utils.validation import validate_data |
|
from ._kmeans import k_means |
|
|
|
|
|
def cluster_qr(vectors):
    """Find the discrete partition closest to the eigenvector embedding.

    This implementation was proposed in [1]_.

    .. versionadded:: 1.1

    Parameters
    ----------
    vectors : array-like, shape: (n_samples, n_clusters)
        The embedding space of the samples.

    Returns
    -------
    labels : array of integers, shape: n_samples
        The cluster labels of vectors.

    References
    ----------
    .. [1] :doi:`Simple, direct, and efficient multi-way spectral clustering, 2019
        Anil Damle, Victor Minden, Lexing Ying
        <10.1093/imaiai/iay008>`

    """
    n_clusters = vectors.shape[1]
    # Column-pivoted QR on the transposed embedding picks out n_clusters
    # well-conditioned sample rows, one candidate per cluster.
    _, _, pivot = qr(vectors.T, pivoting=True)
    representatives = vectors[pivot[:n_clusters], :]
    # The polar factor (U @ Vh) of the representative sub-matrix is the
    # orthogonal rotation that best aligns the embedding with the axes.
    left, _, right = svd(representatives.T)
    rotated = np.abs(vectors @ (left @ right.conj()))
    # Each sample is assigned to the axis it aligns with most strongly.
    return rotated.argmax(axis=1)
|
|
|
|
|
def discretize(
    vectors, *, copy=True, max_svd_restarts=30, n_iter_max=20, random_state=None
):
    """Search for a partition matrix which is closest to the eigenvector embedding.

    This implementation was proposed in [1]_.

    Parameters
    ----------
    vectors : array-like of shape (n_samples, n_clusters)
        The embedding space of the samples.

    copy : bool, default=True
        Whether to copy vectors, or perform in-place normalization.

    max_svd_restarts : int, default=30
        Maximum number of attempts to restart SVD if convergence fails

    n_iter_max : int, default=20
        Maximum number of iterations to attempt in rotation and partition
        matrix search if machine precision convergence is not reached

    random_state : int, RandomState instance, default=None
        Determines random number generation for rotation matrix initialization.
        Use an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    labels : array of integers, shape: n_samples
        The labels of the clusters.

    References
    ----------

    .. [1] `Multiclass spectral clustering, 2003
           Stella X. Yu, Jianbo Shi
           <https://people.eecs.berkeley.edu/~jordan/courses/281B-spring04/readings/yu-shi.pdf>`_

    Notes
    -----

    The eigenvector embedding is used to iteratively search for the
    closest discrete partition. First, the eigenvector embedding is
    normalized to the space of partition matrices. An optimal discrete
    partition matrix closest to this normalized embedding multiplied by
    an initial rotation is calculated. Fixing this discrete partition
    matrix, an optimal rotation matrix is calculated. These two
    calculations are performed until convergence. The discrete partition
    matrix is returned as the clustering solution. Used in spectral
    clustering, this method tends to be faster and more robust to random
    initialization than k-means.

    """

    random_state = check_random_state(random_state)

    vectors = as_float_array(vectors, copy=copy)

    eps = np.finfo(float).eps
    n_samples, n_components = vectors.shape

    # Normalize each eigenvector to the length of a vector of ones, then
    # flip its sign so that the first entry is non-positive.  This fixes
    # the sign ambiguity of eigenvectors and keeps them in a consistent
    # orthant before the discretization search.
    norm_ones = np.sqrt(n_samples)
    for i in range(vectors.shape[1]):
        vectors[:, i] = (vectors[:, i] / np.linalg.norm(vectors[:, i])) * norm_ones
        if vectors[0, i] != 0:
            vectors[:, i] = -1 * vectors[:, i] * np.sign(vectors[0, i])

    # Normalize the rows of the embedding: each sample is projected onto
    # the unit hypersphere centered at the origin.
    vectors = vectors / np.sqrt((vectors**2).sum(axis=1))[:, np.newaxis]

    svd_restarts = 0
    has_converged = False

    # If the SVD fails to converge, re-randomize the initial rotation and
    # retry, up to max_svd_restarts times.
    while (svd_restarts < max_svd_restarts) and not has_converged:
        # Initialize the first column of the rotation matrix with one of
        # the (row-normalized) embedded samples, chosen at random.
        rotation = np.zeros((n_components, n_components))
        rotation[:, 0] = vectors[random_state.randint(n_samples), :].T

        # Fill the remaining columns greedily with samples that are as
        # orthogonal as possible to all previously chosen columns.
        c = np.zeros(n_samples)
        for j in range(1, n_components):
            # c accumulates each sample's total alignment with the columns
            # picked so far; the least aligned sample is picked next.
            c += np.abs(np.dot(vectors, rotation[:, j - 1]))
            rotation[:, j] = vectors[c.argmin(), :].T

        last_objective_value = 0.0
        n_iter = 0

        # Alternate between (a) the discrete partition closest to the
        # rotated embedding and (b) the rotation best matching that
        # partition, until the objective stops improving.
        while not has_converged:
            n_iter += 1

            t_discrete = np.dot(vectors, rotation)

            # Discretization step: each sample joins the cluster whose
            # rotated axis it aligns with best (one-hot indicator matrix).
            labels = t_discrete.argmax(axis=1)
            vectors_discrete = csc_matrix(
                (np.ones(len(labels)), (np.arange(0, n_samples), labels)),
                shape=(n_samples, n_components),
            )

            t_svd = vectors_discrete.T * vectors

            try:
                U, S, Vh = np.linalg.svd(t_svd)
            except LinAlgError:
                svd_restarts += 1
                print("SVD did not converge, randomizing and trying again")
                break

            # The Ncut-style objective; converged when it no longer moves
            # by more than machine precision, or the iteration cap is hit.
            ncut_value = 2.0 * (n_samples - S.sum())
            if (abs(ncut_value - last_objective_value) < eps) or (n_iter > n_iter_max):
                has_converged = True
            else:
                # Rotation step: the polar factor Vh.T @ U.T is the
                # orthogonal matrix closest to the current partition.
                last_objective_value = ncut_value
                rotation = np.dot(Vh.T, U.T)

    if not has_converged:
        raise LinAlgError("SVD did not converge")
    return labels
|
|
|
|
|
@validate_params(
    {"affinity": ["array-like", "sparse matrix"]},
    prefer_skip_nested_validation=False,
)
def spectral_clustering(
    affinity,
    *,
    n_clusters=8,
    n_components=None,
    eigen_solver=None,
    random_state=None,
    n_init=10,
    eigen_tol="auto",
    assign_labels="kmeans",
    verbose=False,
):
    """Apply clustering to a projection of the normalized Laplacian.

    In practice Spectral Clustering is very useful when the structure of
    the individual clusters is highly non-convex or more generally when
    a measure of the center and spread of the cluster is not a suitable
    description of the complete cluster. For instance, when clusters are
    nested circles on the 2D plane.

    If affinity is the adjacency matrix of a graph, this method can be
    used to find normalized graph cuts [1]_, [2]_.

    Read more in the :ref:`User Guide <spectral_clustering>`.

    Parameters
    ----------
    affinity : {array-like, sparse matrix} of shape (n_samples, n_samples)
        The affinity matrix describing the relationship of the samples to
        embed. **Must be symmetric**.

        Possible examples:
          - adjacency matrix of a graph,
          - heat kernel of the pairwise distance matrix of the samples,
          - symmetric k-nearest neighbours connectivity matrix of the samples.

    n_clusters : int, default=8
        Number of clusters to extract.

    n_components : int, default=n_clusters
        Number of eigenvectors to use for the spectral embedding.

    eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}
        The eigenvalue decomposition method. If None then ``'arpack'`` is used.
        See [4]_ for more details regarding ``'lobpcg'``.
        Eigensolver ``'amg'`` runs ``'lobpcg'`` with optional
        Algebraic MultiGrid preconditioning and requires pyamg to be installed.
        It can be faster on very large sparse problems [6]_ and [7]_.

    random_state : int, RandomState instance, default=None
        A pseudo random number generator used for the initialization
        of the lobpcg eigenvectors decomposition when `eigen_solver ==
        'amg'`, and for the K-Means initialization. Use an int to make
        the results deterministic across calls (See
        :term:`Glossary <random_state>`).

        .. note::
            When using `eigen_solver == 'amg'`,
            it is necessary to also fix the global numpy seed with
            `np.random.seed(int)` to get deterministic results. See
            https://github.com/pyamg/pyamg/issues/139 for further
            information.

    n_init : int, default=10
        Number of time the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of n_init
        consecutive runs in terms of inertia. Only used if
        ``assign_labels='kmeans'``.

    eigen_tol : float, default="auto"
        Stopping criterion for eigendecomposition of the Laplacian matrix.
        If `eigen_tol="auto"` then the passed tolerance will depend on the
        `eigen_solver`:

        - If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
        - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
          `eigen_tol=None` which configures the underlying `lobpcg` solver to
          automatically resolve the value according to their heuristics. See,
          :func:`scipy.sparse.linalg.lobpcg` for details.

        Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"`
        values of `tol<1e-5` may lead to convergence issues and should be
        avoided.

        .. versionadded:: 1.2
           Added 'auto' option.

    assign_labels : {'kmeans', 'discretize', 'cluster_qr'}, default='kmeans'
        The strategy to use to assign labels in the embedding
        space. There are three ways to assign labels after the Laplacian
        embedding. k-means can be applied and is a popular choice. But it can
        also be sensitive to initialization. Discretization is another
        approach which is less sensitive to random initialization [3]_.
        The cluster_qr method [5]_ directly extracts clusters from eigenvectors
        in spectral clustering. In contrast to k-means and discretization, cluster_qr
        has no tuning parameters and is not an iterative method, yet may outperform
        k-means and discretization in terms of both quality and speed. For a detailed
        comparison of clustering strategies, refer to the following example:
        :ref:`sphx_glr_auto_examples_cluster_plot_coin_segmentation.py`.

        .. versionchanged:: 1.1
           Added new labeling method 'cluster_qr'.

    verbose : bool, default=False
        Verbosity mode.

        .. versionadded:: 0.24

    Returns
    -------
    labels : array of integers, shape: n_samples
        The labels of the clusters.

    Notes
    -----
    The graph should contain only one connected component, elsewhere
    the results make little sense.

    This algorithm solves the normalized cut for `k=2`: it is a
    normalized spectral clustering.

    References
    ----------

    .. [1] :doi:`Normalized cuts and image segmentation, 2000
           Jianbo Shi, Jitendra Malik
           <10.1109/34.868688>`

    .. [2] :doi:`A Tutorial on Spectral Clustering, 2007
           Ulrike von Luxburg
           <10.1007/s11222-007-9033-z>`

    .. [3] `Multiclass spectral clustering, 2003
           Stella X. Yu, Jianbo Shi
           <https://people.eecs.berkeley.edu/~jordan/courses/281B-spring04/readings/yu-shi.pdf>`_

    .. [4] :doi:`Toward the Optimal Preconditioned Eigensolver:
           Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001
           A. V. Knyazev
           SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541.
           <10.1137/S1064827500366124>`

    .. [5] :doi:`Simple, direct, and efficient multi-way spectral clustering, 2019
           Anil Damle, Victor Minden, Lexing Ying
           <10.1093/imaiai/iay008>`

    .. [6] :doi:`Multiscale Spectral Image Segmentation Multiscale preconditioning
           for computing eigenvalues of graph Laplacians in image segmentation, 2006
           Andrew Knyazev
           <10.13140/RG.2.2.35280.02565>`

    .. [7] :doi:`Preconditioned spectral clustering for stochastic block partition
           streaming graph challenge (Preliminary version at arXiv.)
           David Zhuzhunashvili, Andrew Knyazev
           <10.1109/HPEC.2017.8091045>`

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.metrics.pairwise import pairwise_kernels
    >>> from sklearn.cluster import spectral_clustering
    >>> X = np.array([[1, 1], [2, 1], [1, 0],
    ...               [4, 7], [3, 5], [3, 6]])
    >>> affinity = pairwise_kernels(X, metric='rbf')
    >>> spectral_clustering(
    ...     affinity=affinity, n_clusters=2, assign_labels="discretize", random_state=0
    ... )
    array([1, 1, 1, 0, 0, 0])
    """

    # Delegate to the estimator with a precomputed affinity so that the
    # functional and object-oriented APIs share one implementation.
    clusterer = SpectralClustering(
        n_clusters=n_clusters,
        n_components=n_components,
        eigen_solver=eigen_solver,
        random_state=random_state,
        n_init=n_init,
        affinity="precomputed",
        eigen_tol=eigen_tol,
        assign_labels=assign_labels,
        verbose=verbose,
    ).fit(affinity)

    return clusterer.labels_
|
|
|
|
|
class SpectralClustering(ClusterMixin, BaseEstimator):
    """Apply clustering to a projection of the normalized Laplacian.

    In practice Spectral Clustering is very useful when the structure of
    the individual clusters is highly non-convex, or more generally when
    a measure of the center and spread of the cluster is not a suitable
    description of the complete cluster, such as when clusters are
    nested circles on the 2D plane.

    If the affinity matrix is the adjacency matrix of a graph, this method
    can be used to find normalized graph cuts [1]_, [2]_.

    When calling ``fit``, an affinity matrix is constructed using either
    a kernel function such the Gaussian (aka RBF) kernel with Euclidean
    distance ``d(X, X)``::

            np.exp(-gamma * d(X,X) ** 2)

    or a k-nearest neighbors connectivity matrix.

    Alternatively, a user-provided affinity matrix can be specified by
    setting ``affinity='precomputed'``.

    Read more in the :ref:`User Guide <spectral_clustering>`.

    Parameters
    ----------
    n_clusters : int, default=8
        The dimension of the projection subspace.

    eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities. If None, then ``'arpack'`` is
        used. See [4]_ for more details regarding `'lobpcg'`.

    n_components : int, default=None
        Number of eigenvectors to use for the spectral embedding. If None,
        defaults to `n_clusters`.

    random_state : int, RandomState instance, default=None
        A pseudo random number generator used for the initialization
        of the lobpcg eigenvectors decomposition when `eigen_solver ==
        'amg'`, and for the K-Means initialization. Use an int to make
        the results deterministic across calls (See
        :term:`Glossary <random_state>`).

        .. note::
            When using `eigen_solver == 'amg'`,
            it is necessary to also fix the global numpy seed with
            `np.random.seed(int)` to get deterministic results. See
            https://github.com/pyamg/pyamg/issues/139 for further
            information.

    n_init : int, default=10
        Number of time the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of n_init
        consecutive runs in terms of inertia. Only used if
        ``assign_labels='kmeans'``.

    gamma : float, default=1.0
        Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels.
        Ignored for ``affinity='nearest_neighbors'``, ``affinity='precomputed'``
        or ``affinity='precomputed_nearest_neighbors'``.

    affinity : str or callable, default='rbf'
        How to construct the affinity matrix.
         - 'nearest_neighbors': construct the affinity matrix by computing a
           graph of nearest neighbors.
         - 'rbf': construct the affinity matrix using a radial basis function
           (RBF) kernel.
         - 'precomputed': interpret ``X`` as a precomputed affinity matrix,
           where larger values indicate greater similarity between instances.
         - 'precomputed_nearest_neighbors': interpret ``X`` as a sparse graph
           of precomputed distances, and construct a binary affinity matrix
           from the ``n_neighbors`` nearest neighbors of each instance.
         - one of the kernels supported by
           :func:`~sklearn.metrics.pairwise.pairwise_kernels`.

        Only kernels that produce similarity scores (non-negative values that
        increase with similarity) should be used. This property is not checked
        by the clustering algorithm.

    n_neighbors : int, default=10
        Number of neighbors to use when constructing the affinity matrix using
        the nearest neighbors method. Ignored for ``affinity='rbf'``.

    eigen_tol : float, default="auto"
        Stopping criterion for eigen decomposition of the Laplacian matrix.
        If `eigen_tol="auto"` then the passed tolerance will depend on the
        `eigen_solver`:

        - If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
        - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
          `eigen_tol=None` which configures the underlying `lobpcg` solver to
          automatically resolve the value according to their heuristics. See,
          :func:`scipy.sparse.linalg.lobpcg` for details.

        Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"`
        values of `tol<1e-5` may lead to convergence issues and should be
        avoided.

        .. versionadded:: 1.2
           Added 'auto' option.

    assign_labels : {'kmeans', 'discretize', 'cluster_qr'}, default='kmeans'
        The strategy for assigning labels in the embedding space. There are
        three ways to assign labels after the Laplacian embedding. k-means is
        a popular choice, but it can be sensitive to initialization.
        Discretization is another approach which is less sensitive to random
        initialization [3]_.
        The cluster_qr method [5]_ directly extract clusters from eigenvectors
        in spectral clustering. In contrast to k-means and discretization, cluster_qr
        has no tuning parameters and runs no iterations, yet may outperform
        k-means and discretization in terms of both quality and speed.

        .. versionchanged:: 1.1
           Added new labeling method 'cluster_qr'.

    degree : float, default=3
        Degree of the polynomial kernel. Ignored by other kernels.

    coef0 : float, default=1
        Zero coefficient for polynomial and sigmoid kernels.
        Ignored by other kernels.

    kernel_params : dict of str to any, default=None
        Parameters (keyword arguments) and values for kernel passed as
        callable object. Ignored by other kernels.

    n_jobs : int, default=None
        The number of parallel jobs to run when `affinity='nearest_neighbors'`
        or `affinity='precomputed_nearest_neighbors'`. The neighbors search
        will be done in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    verbose : bool, default=False
        Verbosity mode.

        .. versionadded:: 0.24

    Attributes
    ----------
    affinity_matrix_ : array-like of shape (n_samples, n_samples)
        Affinity matrix used for clustering. Available only after calling
        ``fit``.

    labels_ : ndarray of shape (n_samples,)
        Labels of each point.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    sklearn.cluster.KMeans : K-Means clustering.
    sklearn.cluster.DBSCAN : Density-Based Spatial Clustering of
        Applications with Noise.

    Notes
    -----
    A distance matrix for which 0 indicates identical elements and high values
    indicate very dissimilar elements can be transformed into an affinity /
    similarity matrix that is well-suited for the algorithm by
    applying the Gaussian (aka RBF, heat) kernel::

        np.exp(- dist_matrix ** 2 / (2. * delta ** 2))

    where ``delta`` is a free parameter representing the width of the Gaussian
    kernel.

    An alternative is to take a symmetric version of the k-nearest neighbors
    connectivity matrix of the points.

    If the pyamg package is installed, it is used: this greatly
    speeds up computation.

    References
    ----------
    .. [1] :doi:`Normalized cuts and image segmentation, 2000
           Jianbo Shi, Jitendra Malik
           <10.1109/34.868688>`

    .. [2] :doi:`A Tutorial on Spectral Clustering, 2007
           Ulrike von Luxburg
           <10.1007/s11222-007-9033-z>`

    .. [3] `Multiclass spectral clustering, 2003
           Stella X. Yu, Jianbo Shi
           <https://people.eecs.berkeley.edu/~jordan/courses/281B-spring04/readings/yu-shi.pdf>`_

    .. [4] :doi:`Toward the Optimal Preconditioned Eigensolver:
           Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001
           A. V. Knyazev
           SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541.
           <10.1137/S1064827500366124>`

    .. [5] :doi:`Simple, direct, and efficient multi-way spectral clustering, 2019
           Anil Damle, Victor Minden, Lexing Ying
           <10.1093/imaiai/iay008>`

    Examples
    --------
    >>> from sklearn.cluster import SpectralClustering
    >>> import numpy as np
    >>> X = np.array([[1, 1], [2, 1], [1, 0],
    ...               [4, 7], [3, 5], [3, 6]])
    >>> clustering = SpectralClustering(n_clusters=2,
    ...         assign_labels='discretize',
    ...         random_state=0).fit(X)
    >>> clustering.labels_
    array([1, 1, 1, 0, 0, 0])
    >>> clustering
    SpectralClustering(assign_labels='discretize', n_clusters=2,
        random_state=0)
    """

    _parameter_constraints: dict = {
        "n_clusters": [Interval(Integral, 1, None, closed="left")],
        "eigen_solver": [StrOptions({"arpack", "lobpcg", "amg"}), None],
        "n_components": [Interval(Integral, 1, None, closed="left"), None],
        "random_state": ["random_state"],
        "n_init": [Interval(Integral, 1, None, closed="left")],
        "gamma": [Interval(Real, 0, None, closed="left")],
        "affinity": [
            callable,
            StrOptions(
                set(KERNEL_PARAMS)
                | {"nearest_neighbors", "precomputed", "precomputed_nearest_neighbors"}
            ),
        ],
        "n_neighbors": [Interval(Integral, 1, None, closed="left")],
        "eigen_tol": [
            Interval(Real, 0.0, None, closed="left"),
            StrOptions({"auto"}),
        ],
        "assign_labels": [StrOptions({"kmeans", "discretize", "cluster_qr"})],
        "degree": [Interval(Real, 0, None, closed="left")],
        "coef0": [Interval(Real, None, None, closed="neither")],
        "kernel_params": [dict, None],
        "n_jobs": [Integral, None],
        "verbose": ["verbose"],
    }

    def __init__(
        self,
        n_clusters=8,
        *,
        eigen_solver=None,
        n_components=None,
        random_state=None,
        n_init=10,
        gamma=1.0,
        affinity="rbf",
        n_neighbors=10,
        eigen_tol="auto",
        assign_labels="kmeans",
        degree=3,
        coef0=1,
        kernel_params=None,
        n_jobs=None,
        verbose=False,
    ):
        self.n_clusters = n_clusters
        self.eigen_solver = eigen_solver
        self.n_components = n_components
        self.random_state = random_state
        self.n_init = n_init
        self.gamma = gamma
        self.affinity = affinity
        self.n_neighbors = n_neighbors
        self.eigen_tol = eigen_tol
        self.assign_labels = assign_labels
        self.degree = degree
        self.coef0 = coef0
        self.kernel_params = kernel_params
        self.n_jobs = n_jobs
        self.verbose = verbose

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Perform spectral clustering from features, or affinity matrix.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
                (n_samples, n_samples)
            Training instances to cluster, similarities / affinities between
            instances if ``affinity='precomputed'``, or distances between
            instances if ``affinity='precomputed_nearest_neighbors``. If a
            sparse matrix is provided in a format other than ``csr_matrix``,
            ``csc_matrix``, or ``coo_matrix``, it will be converted into a
            sparse ``csr_matrix``.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            A fitted instance of the estimator.
        """
        X = validate_data(
            self,
            X,
            accept_sparse=["csr", "csc", "coo"],
            dtype=np.float64,
            ensure_min_samples=2,
        )
        # A square input with a non-precomputed affinity is likely a user
        # passing an affinity matrix under the legacy API; warn but proceed.
        allow_squared = self.affinity in [
            "precomputed",
            "precomputed_nearest_neighbors",
        ]
        if X.shape[0] == X.shape[1] and not allow_squared:
            # Fixed message: adjacent literals previously concatenated
            # without separating spaces ("``fit``now constructs ...").
            warnings.warn(
                "The spectral clustering API has changed. ``fit`` now "
                "constructs an affinity matrix from data. To use a custom "
                "affinity matrix, set ``affinity=precomputed``."
            )

        # Build the affinity matrix according to the requested strategy.
        if self.affinity == "nearest_neighbors":
            connectivity = kneighbors_graph(
                X, n_neighbors=self.n_neighbors, include_self=True, n_jobs=self.n_jobs
            )
            # Symmetrize: the kNN graph is directed in general.
            self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
        elif self.affinity == "precomputed_nearest_neighbors":
            estimator = NearestNeighbors(
                n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, metric="precomputed"
            ).fit(X)
            connectivity = estimator.kneighbors_graph(X=X, mode="connectivity")
            self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
        elif self.affinity == "precomputed":
            self.affinity_matrix_ = X
        else:
            # Kernel-based affinity; kernel hyper-parameters are filtered by
            # pairwise_kernels for kernels that do not accept them.
            params = self.kernel_params
            if params is None:
                params = {}
            if not callable(self.affinity):
                params["gamma"] = self.gamma
                params["degree"] = self.degree
                params["coef0"] = self.coef0
            self.affinity_matrix_ = pairwise_kernels(
                X, metric=self.affinity, filter_params=True, **params
            )

        random_state = check_random_state(self.random_state)
        n_components = (
            self.n_clusters if self.n_components is None else self.n_components
        )

        # Compute the spectral embedding from the affinity graph.  The first
        # eigenvector is kept (drop_first=False): it is constant only for
        # fully connected graphs and is informative for spectral clustering.
        maps = _spectral_embedding(
            self.affinity_matrix_,
            n_components=n_components,
            eigen_solver=self.eigen_solver,
            random_state=random_state,
            eigen_tol=self.eigen_tol,
            drop_first=False,
        )
        if self.verbose:
            print(f"Computing label assignment using {self.assign_labels}")

        # Turn the continuous embedding into discrete cluster labels.
        if self.assign_labels == "kmeans":
            _, self.labels_, _ = k_means(
                maps,
                self.n_clusters,
                random_state=random_state,
                n_init=self.n_init,
                verbose=self.verbose,
            )
        elif self.assign_labels == "cluster_qr":
            self.labels_ = cluster_qr(maps)
        else:
            self.labels_ = discretize(maps, random_state=random_state)

        return self

    def fit_predict(self, X, y=None):
        """Perform spectral clustering on `X` and return cluster labels.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
                (n_samples, n_samples)
            Training instances to cluster, similarities / affinities between
            instances if ``affinity='precomputed'``, or distances between
            instances if ``affinity='precomputed_nearest_neighbors``. If a
            sparse matrix is provided in a format other than ``csr_matrix``,
            ``csc_matrix``, or ``coo_matrix``, it will be converted into a
            sparse ``csr_matrix``.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Cluster labels.
        """
        return super().fit_predict(X, y)

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.input_tags.sparse = True
        # A pairwise estimator expects (n_samples, n_samples) input.
        tags.input_tags.pairwise = self.affinity in [
            "precomputed",
            "precomputed_nearest_neighbors",
        ]
        return tags
|
|