""" |
|
K-means clustering and vector quantization (:mod:`scipy.cluster.vq`) |
|
==================================================================== |
|
|
|
Provides routines for k-means clustering, generating code books |
|
from k-means models and quantizing vectors by comparing them with |
|
centroids in a code book. |
|
|
|
.. autosummary:: |
|
:toctree: generated/ |
|
|
|
whiten -- Normalize a group of observations so each feature has unit variance |
|
vq -- Calculate code book membership of a set of observation vectors |
|
kmeans -- Perform k-means on a set of observation vectors forming k clusters |
|
kmeans2 -- A different implementation of k-means with more methods |
|
-- for initializing centroids |
|
|
|
Background information |
|
---------------------- |
|
The k-means algorithm takes as input the number of clusters to |
|
generate, k, and a set of observation vectors to cluster. It |
|
returns a set of centroids, one for each of the k clusters. An |
|
observation vector is classified with the cluster number or |
|
centroid index of the centroid closest to it. |
|
|
|
A vector v belongs to cluster i if it is closer to centroid i than |
|
any other centroid. If v belongs to i, we say centroid i is the |
|
dominating centroid of v. The k-means algorithm tries to |
|
minimize distortion, which is defined as the sum of the squared distances |
|
between each observation vector and its dominating centroid. |
|
The minimization is achieved by iteratively reclassifying |
|
the observations into clusters and recalculating the centroids until |
|
a configuration is reached in which the centroids are stable. One can |
|
also define a maximum number of iterations. |
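
As a small, hand-checkable illustration of this definition of
distortion (purely illustrative, not part of the API), consider three
1-D observations and two centroids:

>>> import numpy as np
>>> obs = np.array([0., 1., 4.])
>>> centroids = np.array([0., 4.])
>>> dist = np.abs(obs[:, None] - centroids[None, :])  # distance to each centroid
>>> nearest = dist.min(axis=1)  # distance to the dominating centroid
>>> float((nearest**2).sum())  # distortion
1.0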

Since vector quantization is a natural application for k-means,
information theory terminology is often used. The centroid index
or cluster index is also referred to as a "code" and the table
mapping codes to centroids and vice versa is often referred to as a
"code book". The result of k-means, a set of centroids, can be
used to quantize vectors. Quantization aims to find an encoding of
vectors that reduces the expected distortion.

All routines expect obs to be an M by N array, where the rows are
the observation vectors. The codebook is a k by N array, where the
ith row is the centroid of code word i. The observation vectors
and centroids have the same feature dimension.
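
For instance, the following shapes are consistent (a hypothetical
M=5, N=3, k=2 setup):

>>> import numpy as np
>>> obs = np.zeros((5, 3))        # M=5 observations, N=3 features
>>> code_book = np.zeros((2, 3))  # k=2 centroids with the same N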

As an example, suppose we wish to compress a 24-bit color image
(each pixel is represented by one byte for red, one for blue, and
one for green) before sending it over the web. By using a smaller
8-bit encoding, we can reduce the amount of data by two
thirds. Ideally, the colors for each of the 256 possible 8-bit
encoding values should be chosen to minimize distortion of the
color. Running k-means with k=256 generates a code book of 256
codes, which fills up all possible 8-bit sequences. Instead of
sending a 3-byte value for each pixel, the 8-bit centroid index
(or code word) of the dominating centroid is transmitted. The code
book is also sent over the wire so each 8-bit code can be
translated back to a 24-bit pixel value representation. If the
image of interest was of an ocean, we would expect many 24-bit
blues to be represented by 8-bit codes. If it was an image of a
human face, more flesh-tone colors would be represented in the
code book.
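
A minimal sketch of this round trip (with random "pixels" and a small
code book; illustrative only, and whitening is skipped for brevity):

>>> import numpy as np
>>> from scipy.cluster.vq import kmeans, vq
>>> rng = np.random.default_rng(1234)
>>> pixels = rng.random((100, 3))      # 100 RGB-like observations
>>> code_book, _ = kmeans(pixels, 8)   # 8 codes instead of 256
>>> codes, _ = vq(pixels, code_book)   # one small code per pixel
>>> decoded = code_book[codes]         # decode by code book lookup
>>> decoded.shape
(100, 3)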

"""
import warnings
import numpy as np
from collections import deque
from scipy._lib._array_api import (
    _asarray, array_namespace, xp_size, xp_copy
)
from scipy._lib._util import (check_random_state, rng_integers,
                              _transition_to_rng)
from scipy._lib import array_api_extra as xpx
from scipy.spatial.distance import cdist

from . import _vq

__docformat__ = 'restructuredtext'

__all__ = ['whiten', 'vq', 'kmeans', 'kmeans2']


class ClusterError(Exception):
    pass


def whiten(obs, check_finite=True):
    """
    Normalize a group of observations on a per feature basis.

    Before running k-means, it is beneficial to rescale each feature
    dimension of the observation set by its standard deviation (i.e. "whiten"
    it - as in "white noise" where each frequency has equal power).
    Each feature is divided by its standard deviation across all observations
    to give it unit variance.

    Parameters
    ----------
    obs : ndarray
        Each row of the array is an observation. The
        columns are the features seen during each observation.

        >>> #         f0    f1    f2
        >>> obs = [[  1.,   1.,   1.],  #o0
        ...        [  2.,   2.,   2.],  #o1
        ...        [  3.,   3.,   3.],  #o2
        ...        [  4.,   4.,   4.]]  #o3

    check_finite : bool, optional
        Whether to check that the input matrices contain only finite numbers.
        Disabling may give a performance gain, but may result in problems
        (crashes, non-termination) if the inputs do contain infinities or NaNs.
        Default: True

    Returns
    -------
    result : ndarray
        Contains the values in `obs` scaled by the standard deviation
        of each column.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.cluster.vq import whiten
    >>> features = np.array([[1.9, 2.3, 1.7],
    ...                      [1.5, 2.5, 2.2],
    ...                      [0.8, 0.6, 1.7]])
    >>> whiten(features)
    array([[ 4.17944278,  2.69811351,  7.21248917],
           [ 3.29956009,  2.93273208,  9.33380951],
           [ 1.75976538,  0.7038557 ,  7.21248917]])

    """
    xp = array_namespace(obs)
    obs = _asarray(obs, check_finite=check_finite, xp=xp)
    std_dev = xp.std(obs, axis=0)
    zero_std_mask = std_dev == 0
    if xp.any(zero_std_mask):
        std_dev[zero_std_mask] = 1.0
        warnings.warn("Some columns have standard deviation zero. "
                      "The values of these columns will not change.",
                      RuntimeWarning, stacklevel=2)
    return obs / std_dev


def vq(obs, code_book, check_finite=True):
    """
    Assign codes from a code book to observations.

    Assigns a code from a code book to each observation. Each
    observation vector in the 'M' by 'N' `obs` array is compared with the
    centroids in the code book and assigned the code of the closest
    centroid.

    The features in `obs` should have unit variance, which can be
    achieved by passing them through the whiten function. The code
    book can be created with the k-means algorithm or a different
    encoding algorithm.

    Parameters
    ----------
    obs : ndarray
        Each row of the 'M' x 'N' array is an observation. The columns are
        the "features" seen during each observation. The features must be
        whitened first using the whiten function or something equivalent.
    code_book : ndarray
        The code book is usually generated using the k-means algorithm.
        Each row of the array holds a different code, and the columns are
        the features of the code.

        >>> #              f0    f1    f2    f3
        >>> code_book = [
        ...             [  1.,   2.,   3.,   4.],  #c0
        ...             [  1.,   2.,   3.,   4.],  #c1
        ...             [  1.,   2.,   3.,   4.]]  #c2

    check_finite : bool, optional
        Whether to check that the input matrices contain only finite numbers.
        Disabling may give a performance gain, but may result in problems
        (crashes, non-termination) if the inputs do contain infinities or NaNs.
        Default: True

    Returns
    -------
    code : ndarray
        A length M array holding the code book index for each observation.
    dist : ndarray
        The distortion (distance) between the observation and its nearest
        code.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.cluster.vq import vq
    >>> code_book = np.array([[1., 1., 1.],
    ...                       [2., 2., 2.]])
    >>> features = np.array([[1.9, 2.3, 1.7],
    ...                      [1.5, 2.5, 2.2],
    ...                      [0.8, 0.6, 1.7]])
    >>> vq(features, code_book)
    (array([1, 1, 0], dtype=int32), array([0.43588989, 0.73484692, 0.83066239]))

    """
    xp = array_namespace(obs, code_book)
    obs = _asarray(obs, xp=xp, check_finite=check_finite)
    code_book = _asarray(code_book, xp=xp, check_finite=check_finite)
    ct = xp.result_type(obs, code_book)

    c_obs = xp.astype(obs, ct, copy=False)
    c_code_book = xp.astype(code_book, ct, copy=False)

    if xp.isdtype(ct, kind='real floating'):
        c_obs = np.asarray(c_obs)
        c_code_book = np.asarray(c_code_book)
        result = _vq.vq(c_obs, c_code_book)
        return xp.asarray(result[0]), xp.asarray(result[1])
    # fall back to the pure-Python implementation for other dtypes
    return py_vq(obs, code_book, check_finite=False)


def py_vq(obs, code_book, check_finite=True):
    """ Python version of vq algorithm.

    The algorithm computes the Euclidean distance between each
    observation and every frame in the code_book.

    Parameters
    ----------
    obs : ndarray
        Expects a rank 2 array. Each row is one observation.
    code_book : ndarray
        Code book to use. Same format as obs. Should have the same number
        of features (e.g., columns) as obs.
    check_finite : bool, optional
        Whether to check that the input matrices contain only finite numbers.
        Disabling may give a performance gain, but may result in problems
        (crashes, non-termination) if the inputs do contain infinities or NaNs.
        Default: True

    Returns
    -------
    code : ndarray
        code[i] gives the label of the ith observation; its code is
        code_book[code[i]].
    min_dist : ndarray
        min_dist[i] gives the distance between the ith observation and its
        corresponding code.

    Notes
    -----
    This function is slower than the C version but works for
    all input types. If the inputs have the wrong types for the
    C versions of the function, this one is called as a last resort.

    It is about 20 times slower than the C version.
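
    Examples
    --------
    A quick check of the pure-Python path (illustrative only):

    >>> import numpy as np
    >>> from scipy.cluster.vq import py_vq
    >>> obs = np.array([[0., 0.], [1., 1.]])
    >>> code_book = np.array([[0., 0.], [2., 2.]])
    >>> code, dist = py_vq(obs, code_book)
    >>> code
    array([0, 0])
    >>> dist
    array([0.        , 1.41421356])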

    """
    xp = array_namespace(obs, code_book)
    obs = _asarray(obs, xp=xp, check_finite=check_finite)
    code_book = _asarray(code_book, xp=xp, check_finite=check_finite)

    if obs.ndim != code_book.ndim:
        raise ValueError("Observation and code_book should have the same rank")

    if obs.ndim == 1:
        obs = obs[:, xp.newaxis]
        code_book = code_book[:, xp.newaxis]

    # compute the full M x k distance matrix and reduce over the codes
    dist = xp.asarray(cdist(obs, code_book))
    code = xp.argmin(dist, axis=1)
    min_dist = xp.min(dist, axis=1)
    return code, min_dist


def _kmeans(obs, guess, thresh=1e-5, xp=None):
    """ "raw" version of k-means.

    Returns
    -------
    code_book
        The lowest distortion codebook found.
    avg_dist
        The average distance an observation is from a code in the book.
        Lower means the code_book matches the data better.

    See Also
    --------
    kmeans : wrapper around k-means

    Examples
    --------
    Note: not whitened in this example.

    >>> import numpy as np
    >>> from scipy.cluster.vq import _kmeans
    >>> features = np.array([[ 1.9, 2.3],
    ...                      [ 1.5, 2.5],
    ...                      [ 0.8, 0.6],
    ...                      [ 0.4, 1.8],
    ...                      [ 1.0, 1.0]])
    >>> book = np.array((features[0], features[2]))
    >>> _kmeans(features, book)
    (array([[ 1.7       ,  2.4       ],
           [ 0.73333333,  1.13333333]]), 0.40563916697728591)

    """
    xp = np if xp is None else xp
    code_book = guess
    diff = xp.inf
    prev_avg_dists = deque([diff], maxlen=2)
    while diff > thresh:
        # compute membership and distances between obs and code_book
        obs_code, distort = vq(obs, code_book, check_finite=False)
        prev_avg_dists.append(xp.mean(distort, axis=-1))
        # recalc code_book as centroids of associated obs
        obs = np.asarray(obs)
        obs_code = np.asarray(obs_code)
        code_book, has_members = _vq.update_cluster_means(obs, obs_code,
                                                          code_book.shape[0])
        obs = xp.asarray(obs)
        obs_code = xp.asarray(obs_code)
        code_book = xp.asarray(code_book)
        has_members = xp.asarray(has_members)
        code_book = code_book[has_members]
        diff = xp.abs(prev_avg_dists[0] - prev_avg_dists[1])

    return code_book, prev_avg_dists[1]


@_transition_to_rng("seed")
def kmeans(obs, k_or_guess, iter=20, thresh=1e-5, check_finite=True,
           *, rng=None):
    """
    Performs k-means on a set of observation vectors forming k clusters.

    The k-means algorithm adjusts the classification of the observations
    into clusters and updates the cluster centroids until the position of
    the centroids is stable over successive iterations. In this
    implementation of the algorithm, the stability of the centroids is
    determined by comparing the absolute value of the change in the average
    Euclidean distance between the observations and their corresponding
    centroids against a threshold. This yields
    a code book mapping centroids to codes and vice versa.

    Parameters
    ----------
    obs : ndarray
        Each row of the M by N array is an observation vector. The
        columns are the features seen during each observation.
        The features must be whitened first with the `whiten` function.

    k_or_guess : int or ndarray
        The number of centroids to generate. A code is assigned to
        each centroid, which is also the row index of the centroid
        in the code_book matrix generated.

        The initial k centroids are chosen by randomly selecting
        observations from the observation matrix. Alternatively,
        passing a k by N array specifies the initial k centroids.

    iter : int, optional
        The number of times to run k-means, returning the codebook
        with the lowest distortion. This argument is ignored if
        initial centroids are specified with an array for the
        ``k_or_guess`` parameter. This parameter does not represent the
        number of iterations of the k-means algorithm.

    thresh : float, optional
        Terminates the k-means algorithm if the change in
        distortion since the last k-means iteration is less than
        or equal to threshold.

    check_finite : bool, optional
        Whether to check that the input matrices contain only finite numbers.
        Disabling may give a performance gain, but may result in problems
        (crashes, non-termination) if the inputs do contain infinities or NaNs.
        Default: True
    rng : `numpy.random.Generator`, optional
        Pseudorandom number generator state. When `rng` is None, a new
        `numpy.random.Generator` is created using entropy from the
        operating system. Types other than `numpy.random.Generator` are
        passed to `numpy.random.default_rng` to instantiate a ``Generator``.

    Returns
    -------
    codebook : ndarray
        A k by N array of k centroids. The ith centroid
        codebook[i] is represented with the code i. The centroids
        and codes generated represent the lowest distortion seen,
        not necessarily the globally minimal distortion.
        Note that the number of centroids is not necessarily the same as the
        ``k_or_guess`` parameter, because centroids assigned to no observations
        are removed during iterations.

    distortion : float
        The mean (non-squared) Euclidean distance between the observations
        passed and the centroids generated. Note the difference to the standard
        definition of distortion in the context of the k-means algorithm, which
        is the sum of the squared distances.

    See Also
    --------
    kmeans2 : a different implementation of k-means clustering
       with more methods for generating initial centroids but without
       using a distortion change threshold as a stopping criterion.

    whiten : must be called prior to passing an observation matrix
       to kmeans.

    Notes
    -----
    For more functionality or optimal performance, you can use
    `sklearn.cluster.KMeans <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html>`_.
    `This <https://hdbscan.readthedocs.io/en/latest/performance_and_scalability.html#comparison-of-high-performance-implementations>`_
    is a benchmark result of several implementations.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.cluster.vq import vq, kmeans, whiten
    >>> import matplotlib.pyplot as plt
    >>> features = np.array([[ 1.9, 2.3],
    ...                      [ 1.5, 2.5],
    ...                      [ 0.8, 0.6],
    ...                      [ 0.4, 1.8],
    ...                      [ 0.1, 0.1],
    ...                      [ 0.2, 1.8],
    ...                      [ 2.0, 0.5],
    ...                      [ 0.3, 1.5],
    ...                      [ 1.0, 1.0]])
    >>> whitened = whiten(features)
    >>> book = np.array((whitened[0], whitened[2]))
    >>> kmeans(whitened, book)
    (array([[ 2.3110306 ,  2.86287398],    # random
           [ 0.93218041,  1.24398691]]), 0.85684700941625547)

    >>> codes = 3
    >>> kmeans(whitened, codes)
    (array([[ 2.3110306 ,  2.86287398],    # random
           [ 1.32544402,  0.65607529],
           [ 0.40782893,  2.02786907]]), 0.5196582527686241)

    >>> # Create 50 datapoints in two clusters a and b
    >>> pts = 50
    >>> rng = np.random.default_rng()
    >>> a = rng.multivariate_normal([0, 0], [[4, 1], [1, 4]], size=pts)
    >>> b = rng.multivariate_normal([30, 10],
    ...                             [[10, 2], [2, 1]],
    ...                             size=pts)
    >>> features = np.concatenate((a, b))
    >>> # Whiten data
    >>> whitened = whiten(features)
    >>> # Find 2 clusters in the data
    >>> codebook, distortion = kmeans(whitened, 2)
    >>> # Plot whitened data and cluster centers in red
    >>> plt.scatter(whitened[:, 0], whitened[:, 1])
    >>> plt.scatter(codebook[:, 0], codebook[:, 1], c='r')
    >>> plt.show()

    """
    if isinstance(k_or_guess, int):
        xp = array_namespace(obs)
    else:
        xp = array_namespace(obs, k_or_guess)
    obs = _asarray(obs, xp=xp, check_finite=check_finite)
    guess = _asarray(k_or_guess, xp=xp, check_finite=check_finite)
    if iter < 1:
        raise ValueError(f"iter must be at least 1, got {iter}")

    # Determine whether a count (scalar) or an initial guess (array)
    # was passed.
    if xp_size(guess) != 1:
        if xp_size(guess) < 1:
            raise ValueError(f"Asked for 0 clusters. Initial book was {guess}")
        return _kmeans(obs, guess, thresh=thresh, xp=xp)

    # k_or_guess is a scalar, use the k-points initialization
    k = int(guess)
    if k != guess:
        raise ValueError("If k_or_guess is a scalar, it must be an integer.")
    if k < 1:
        raise ValueError("Asked for %d clusters." % k)

    rng = check_random_state(rng)

    # initialize best distance value to a large value
    best_dist = xp.inf
    for i in range(iter):
        # the initial code book is randomly selected from observations
        guess = _kpoints(obs, k, rng, xp)
        book, dist = _kmeans(obs, guess, thresh=thresh, xp=xp)
        if dist < best_dist:
            best_book = book
            best_dist = dist
    return best_book, best_dist


def _kpoints(data, k, rng, xp):
    """Pick k points at random in data (one row = one observation).

    Parameters
    ----------
    data : ndarray
        Expect a rank 1 or 2 array. Rank 1 is assumed to describe 1-D
        data, rank 2 multidimensional data, in which case one
        row is one observation.
    k : int
        Number of samples to generate.
    rng : `numpy.random.Generator` or `numpy.random.RandomState`
        Random number generator.

    Returns
    -------
    x : ndarray
        A 'k' by 'N' array containing the initial centroids.
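
    Examples
    --------
    A quick shape check (illustrative only; this is a private helper):

    >>> import numpy as np
    >>> from scipy.cluster.vq import _kpoints
    >>> rng = np.random.default_rng(1234)
    >>> data = np.arange(10.).reshape(5, 2)
    >>> _kpoints(data, 2, rng, np).shape
    (2, 2)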

    """
    idx = rng.choice(data.shape[0], size=int(k), replace=False)
    # index with the namespace's default integer dtype
    idx = xp.asarray(idx, dtype=xp.asarray([1]).dtype)
    return xp.take(data, idx, axis=0)


def _krandinit(data, k, rng, xp):
    """Returns k samples of a random variable whose parameters depend on data.

    More precisely, it returns k observations sampled from a Gaussian random
    variable whose mean and covariances are the ones estimated from the data.

    Parameters
    ----------
    data : ndarray
        Expect a rank 1 or 2 array. Rank 1 is assumed to describe 1-D
        data, rank 2 multidimensional data, in which case one
        row is one observation.
    k : int
        Number of samples to generate.
    rng : `numpy.random.Generator` or `numpy.random.RandomState`
        Random number generator.

    Returns
    -------
    x : ndarray
        A 'k' by 'N' array containing the initial centroids.
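
    Examples
    --------
    A quick shape check (illustrative only; this is a private helper):

    >>> import numpy as np
    >>> from scipy.cluster.vq import _krandinit
    >>> rng = np.random.default_rng(1234)
    >>> data = rng.standard_normal((50, 3))
    >>> _krandinit(data, 4, rng, np).shape
    (4, 3)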

    """
    mu = xp.mean(data, axis=0)
    k = np.asarray(k)

    if data.ndim == 1:
        _cov = xpx.cov(data, xp=xp)
        x = rng.standard_normal(size=k)
        x = xp.asarray(x)
        x *= xp.sqrt(_cov)
    elif data.shape[1] > data.shape[0]:
        # initialize when the covariance matrix is rank deficient
        _, s, vh = xp.linalg.svd(data - mu, full_matrices=False)
        x = rng.standard_normal(size=(k, xp_size(s)))
        x = xp.asarray(x)
        sVh = s[:, None] * vh / xp.sqrt(data.shape[0] - xp.asarray(1.))
        x = x @ sVh
    else:
        _cov = xpx.atleast_nd(xpx.cov(data.T, xp=xp), ndim=2, xp=xp)

        # generate k rows, N cols: samples of a random variable
        # ~ Gaussian(mu, cov)
        x = rng.standard_normal(size=(k, xp_size(mu)))
        x = xp.asarray(x)
        x = x @ xp.linalg.cholesky(_cov).T

    x += mu
    return x


def _kpp(data, k, rng, xp):
    """ Picks k points in the data based on the kmeans++ method.

    Parameters
    ----------
    data : ndarray
        Expect a rank 1 or 2 array. Rank 1 is assumed to describe 1-D
        data, rank 2 multidimensional data, in which case one
        row is one observation.
    k : int
        Number of samples to generate.
    rng : `numpy.random.Generator` or `numpy.random.RandomState`
        Random number generator.

    Returns
    -------
    init : ndarray
        A 'k' by 'N' array containing the initial centroids.

    References
    ----------
    .. [1] D. Arthur and S. Vassilvitskii, "k-means++: the advantages of
       careful seeding", Proceedings of the Eighteenth Annual ACM-SIAM Symposium
       on Discrete Algorithms, 2007.
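
    Examples
    --------
    A quick shape check (illustrative only; this is a private helper):

    >>> import numpy as np
    >>> from scipy.cluster.vq import _kpp
    >>> rng = np.random.default_rng(1234)
    >>> data = rng.standard_normal((20, 2))
    >>> _kpp(data, 3, rng, np).shape
    (3, 2)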

    """
    ndim = len(data.shape)
    if ndim == 1:
        data = data[:, None]

    dims = data.shape[1]

    init = xp.empty((int(k), dims))

    for i in range(k):
        if i == 0:
            # the first centroid is a data point drawn uniformly at random
            init[i, :] = data[rng_integers(rng, data.shape[0]), :]
        else:
            # further centroids are drawn with probability proportional to
            # the squared distance to the closest centroid chosen so far
            D2 = cdist(init[:i, :], data, metric='sqeuclidean').min(axis=0)
            probs = D2 / D2.sum()
            cumprobs = probs.cumsum()
            r = rng.uniform()
            cumprobs = np.asarray(cumprobs)
            init[i, :] = data[np.searchsorted(cumprobs, r), :]

    if ndim == 1:
        init = init[:, 0]
    return init


_valid_init_meth = {'random': _krandinit, 'points': _kpoints, '++': _kpp}


def _missing_warn():
    """Print a warning when called."""
    warnings.warn("One of the clusters is empty. "
                  "Re-run kmeans with a different initialization.",
                  stacklevel=3)


def _missing_raise():
    """Raise a ClusterError when called."""
    raise ClusterError("One of the clusters is empty. "
                       "Re-run kmeans with a different initialization.")


_valid_miss_meth = {'warn': _missing_warn, 'raise': _missing_raise}


@_transition_to_rng("seed")
def kmeans2(data, k, iter=10, thresh=1e-5, minit='random',
            missing='warn', check_finite=True, *, rng=None):
    """
    Classify a set of observations into k clusters using the k-means algorithm.

    The algorithm attempts to minimize the Euclidean distance between
    observations and centroids. Several initialization methods are
    included.

    Parameters
    ----------
    data : ndarray
        A 'M' by 'N' array of 'M' observations in 'N' dimensions or a length
        'M' array of 'M' 1-D observations.
    k : int or ndarray
        The number of clusters to form as well as the number of
        centroids to generate. If `minit` initialization string is
        'matrix', or if an ndarray is given instead, it is
        interpreted as the initial clusters to use.
    iter : int, optional
        Number of iterations of the k-means algorithm to run. Note
        that this differs in meaning from the iters parameter to
        the kmeans function.
    thresh : float, optional
        (not used yet)
    minit : str, optional
        Method for initialization. Available methods are 'random',
        'points', '++' and 'matrix':

        'random': generate k centroids from a Gaussian with mean and
        variance estimated from the data.

        'points': choose k observations (rows) at random from data for
        the initial centroids.

        '++': choose k observations according to the kmeans++ method
        (careful seeding)

        'matrix': interpret the k parameter as a k by N (or length k
        array for 1-D data) array of initial centroids.
    missing : str, optional
        Method to deal with empty clusters. Available methods are
        'warn' and 'raise':

        'warn': give a warning and continue.

        'raise': raise a ClusterError and terminate the algorithm.
    check_finite : bool, optional
        Whether to check that the input matrices contain only finite numbers.
        Disabling may give a performance gain, but may result in problems
        (crashes, non-termination) if the inputs do contain infinities or NaNs.
        Default: True
    rng : `numpy.random.Generator`, optional
        Pseudorandom number generator state. When `rng` is None, a new
        `numpy.random.Generator` is created using entropy from the
        operating system. Types other than `numpy.random.Generator` are
        passed to `numpy.random.default_rng` to instantiate a ``Generator``.

    Returns
    -------
    centroid : ndarray
        A 'k' by 'N' array of centroids found at the last iteration of
        k-means.
    label : ndarray
        label[i] is the code or index of the centroid the
        ith observation is closest to.

    See Also
    --------
    kmeans

    References
    ----------
    .. [1] D. Arthur and S. Vassilvitskii, "k-means++: the advantages of
       careful seeding", Proceedings of the Eighteenth Annual ACM-SIAM Symposium
       on Discrete Algorithms, 2007.

    Examples
    --------
    >>> from scipy.cluster.vq import kmeans2
    >>> import matplotlib.pyplot as plt
    >>> import numpy as np

    Create z, an array with shape (100, 2) containing a mixture of samples
    from three multivariate normal distributions.

    >>> rng = np.random.default_rng()
    >>> a = rng.multivariate_normal([0, 6], [[2, 1], [1, 1.5]], size=45)
    >>> b = rng.multivariate_normal([2, 0], [[1, -1], [-1, 3]], size=30)
    >>> c = rng.multivariate_normal([6, 4], [[5, 0], [0, 1.2]], size=25)
    >>> z = np.concatenate((a, b, c))
    >>> rng.shuffle(z)

    Compute three clusters.

    >>> centroid, label = kmeans2(z, 3, minit='points')
    >>> centroid
    array([[ 2.22274463, -0.61666946],  # may vary
           [ 0.54069047,  5.86541444],
           [ 6.73846769,  4.01991898]])

    How many points are in each cluster?

    >>> counts = np.bincount(label)
    >>> counts
    array([29, 51, 20])  # may vary

    Plot the clusters.

    >>> w0 = z[label == 0]
    >>> w1 = z[label == 1]
    >>> w2 = z[label == 2]
    >>> plt.plot(w0[:, 0], w0[:, 1], 'o', alpha=0.5, label='cluster 0')
    >>> plt.plot(w1[:, 0], w1[:, 1], 'd', alpha=0.5, label='cluster 1')
    >>> plt.plot(w2[:, 0], w2[:, 1], 's', alpha=0.5, label='cluster 2')
    >>> plt.plot(centroid[:, 0], centroid[:, 1], 'k*', label='centroids')
    >>> plt.axis('equal')
    >>> plt.legend(shadow=True)
    >>> plt.show()
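
    A minimal sketch of the ``minit='matrix'`` path, passing explicit
    initial centroids instead of ``k`` (values here are illustrative):

    >>> init = np.array([[0., 6.], [2., 0.], [6., 4.]])
    >>> centroid, label = kmeans2(z, init, minit='matrix')
    >>> centroid.shape
    (3, 2)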

    """
    if int(iter) < 1:
        raise ValueError(f"Invalid iter ({iter}), must be a positive integer.")
    try:
        miss_meth = _valid_miss_meth[missing]
    except KeyError as e:
        raise ValueError(f"Unknown missing method {missing!r}") from e

    if isinstance(k, int):
        xp = array_namespace(data)
    else:
        xp = array_namespace(data, k)
    data = _asarray(data, xp=xp, check_finite=check_finite)
    code_book = xp_copy(k, xp=xp)
    if data.ndim == 1:
        d = 1
    elif data.ndim == 2:
        d = data.shape[1]
    else:
        raise ValueError("Input of rank > 2 is not supported.")

    if xp_size(data) < 1 or xp_size(code_book) < 1:
        raise ValueError("Empty input is not supported.")

    # If k is not a single value, it should be compatible with data's shape
    if minit == 'matrix' or xp_size(code_book) > 1:
        if data.ndim != code_book.ndim:
            raise ValueError("k array doesn't match data rank")
        nc = code_book.shape[0]
        if data.ndim > 1 and code_book.shape[1] != d:
            raise ValueError("k array doesn't match data dimension")
    else:
        nc = int(code_book)

        if nc < 1:
            raise ValueError("Cannot ask kmeans2 for %d clusters"
                             " (k was %s)" % (nc, code_book))
        elif nc != code_book:
            warnings.warn("k was not an integer, was converted.", stacklevel=2)

        try:
            init_meth = _valid_init_meth[minit]
        except KeyError as e:
            raise ValueError(f"Unknown init method {minit!r}") from e
        else:
            rng = check_random_state(rng)
            code_book = init_meth(data, code_book, rng, xp)

    data = np.asarray(data)
    code_book = np.asarray(code_book)
    for i in range(iter):
        # Compute the nearest neighbor for each obs using the current code book
        label = vq(data, code_book, check_finite=check_finite)[0]
        # Update the code book by computing centroids
        new_code_book, has_members = _vq.update_cluster_means(data, label, nc)
        if not has_members.all():
            miss_meth()
            # Set the empty clusters to their previous positions
            new_code_book[~has_members] = code_book[~has_members]
        code_book = new_code_book

    return xp.asarray(code_book), xp.asarray(label)