|
|
|
|
|
|
|
import warnings |
|
from math import sqrt |
|
from numbers import Integral, Real |
|
|
|
import numpy as np |
|
from scipy import sparse |
|
|
|
from .._config import config_context |
|
from ..base import ( |
|
BaseEstimator, |
|
ClassNamePrefixFeaturesOutMixin, |
|
ClusterMixin, |
|
TransformerMixin, |
|
_fit_context, |
|
) |
|
from ..exceptions import ConvergenceWarning |
|
from ..metrics import pairwise_distances_argmin |
|
from ..metrics.pairwise import euclidean_distances |
|
from ..utils._param_validation import Hidden, Interval, StrOptions |
|
from ..utils.extmath import row_norms |
|
from ..utils.validation import check_is_fitted, validate_data |
|
from . import AgglomerativeClustering |
|
|
|
|
|
def _iterate_sparse_X(X):
    """Yield every row of the CSR matrix `X` as a dense 1-D ndarray.

    Densifying one row at a time is much cheaper than materializing a
    sparse matrix object per row while iterating.
    """
    n_rows, n_cols = X.shape
    indices, data, indptr = X.indices, X.data, X.indptr

    for row_idx in range(n_rows):
        dense_row = np.zeros(n_cols)
        lo, hi = indptr[row_idx], indptr[row_idx + 1]
        # Scatter the stored nonzeros of this row into the dense buffer.
        dense_row[indices[lo:hi]] = data[lo:hi]
        yield dense_row
|
|
|
|
|
def _split_node(node, threshold, branching_factor):
    """The node has to be split if there is no place for a new subcluster
    in the node.
    1. Two empty nodes and two empty subclusters are initialized.
    2. The pair of distant subclusters are found.
    3. The properties of the empty subclusters and nodes are updated
       according to the nearest distance between the subclusters to the
       pair of distant subclusters.
    4. The two nodes are set as children to the two subclusters.

    Returns the two new parent subclusters; their child nodes hold the
    redistributed contents of ``node``.
    """
    new_subcluster1 = _CFSubcluster()
    new_subcluster2 = _CFSubcluster()
    new_node1 = _CFNode(
        threshold=threshold,
        branching_factor=branching_factor,
        is_leaf=node.is_leaf,
        n_features=node.n_features,
        dtype=node.init_centroids_.dtype,
    )
    new_node2 = _CFNode(
        threshold=threshold,
        branching_factor=branching_factor,
        is_leaf=node.is_leaf,
        n_features=node.n_features,
        dtype=node.init_centroids_.dtype,
    )
    new_subcluster1.child_ = new_node1
    new_subcluster2.child_ = new_node2

    # Splice the two new nodes into the doubly linked list of leaves, in
    # place of the node being split.
    if node.is_leaf:
        if node.prev_leaf_ is not None:
            node.prev_leaf_.next_leaf_ = new_node1
        new_node1.prev_leaf_ = node.prev_leaf_
        new_node1.next_leaf_ = new_node2
        new_node2.prev_leaf_ = new_node1
        new_node2.next_leaf_ = node.next_leaf_
        if node.next_leaf_ is not None:
            node.next_leaf_.prev_leaf_ = new_node2

    # Pairwise squared distances between all centroids of the node; the
    # cached squared norms avoid recomputing them.
    dist = euclidean_distances(
        node.centroids_, Y_norm_squared=node.squared_norm_, squared=True
    )
    n_clusters = dist.shape[0]

    # The two subclusters that are farthest apart seed the two halves.
    farthest_idx = np.unravel_index(dist.argmax(), (n_clusters, n_clusters))
    # Fancy-indexing with the pair of row indices: node1_dist and node2_dist
    # are the distance rows from seed 1 and seed 2 to every centroid.
    node1_dist, node2_dist = dist[(farthest_idx,)]

    # node1_closer[i] is True when centroid i is strictly closer to seed 1.
    node1_closer = node1_dist < node2_dist

    # Force seed 1 into node1 even when every distance is equal (which can
    # happen when all of node.centroids_ are duplicates, making all pairwise
    # distances zero); otherwise node1 could end up empty.
    node1_closer[farthest_idx[0]] = True

    for idx, subcluster in enumerate(node.subclusters_):
        if node1_closer[idx]:
            new_node1.append_subcluster(subcluster)
            new_subcluster1.update(subcluster)
        else:
            new_node2.append_subcluster(subcluster)
            new_subcluster2.update(subcluster)
    return new_subcluster1, new_subcluster2
|
|
|
|
|
class _CFNode:
    """Each node in a CFTree is called a CFNode.

    The CFNode can have a maximum of branching_factor
    number of CFSubclusters.

    Parameters
    ----------
    threshold : float
        Threshold needed for a new subcluster to enter a CFSubcluster.

    branching_factor : int
        Maximum number of CF subclusters in each node.

    is_leaf : bool
        We need to know if the CFNode is a leaf or not, in order to
        retrieve the final subclusters.

    n_features : int
        The number of features.

    dtype : numpy dtype
        dtype used to allocate the centroid and squared-norm buffers.

    Attributes
    ----------
    subclusters_ : list
        List of subclusters for a particular CFNode.

    prev_leaf_ : _CFNode
        Useful only if is_leaf is True.

    next_leaf_ : _CFNode
        next_leaf. Useful only if is_leaf is True.
        the final subclusters.

    init_centroids_ : ndarray of shape (branching_factor + 1, n_features)
        Manipulate ``init_centroids_`` throughout rather than centroids_ since
        the centroids are just a view of the ``init_centroids_`` .

    init_sq_norm_ : ndarray of shape (branching_factor + 1,)
        manipulate init_sq_norm_ throughout. similar to ``init_centroids_``.

    centroids_ : ndarray of shape (branching_factor + 1, n_features)
        View of ``init_centroids_``.

    squared_norm_ : ndarray of shape (branching_factor + 1,)
        View of ``init_sq_norm_``.

    """

    def __init__(self, *, threshold, branching_factor, is_leaf, n_features, dtype):
        self.threshold = threshold
        self.branching_factor = branching_factor
        self.is_leaf = is_leaf
        self.n_features = n_features

        # Pre-allocate one extra slot (branching_factor + 1) so a node can
        # temporarily hold the overflowing subcluster before being split.
        self.subclusters_ = []
        self.init_centroids_ = np.zeros((branching_factor + 1, n_features), dtype=dtype)
        self.init_sq_norm_ = np.zeros((branching_factor + 1), dtype)
        self.squared_norm_ = []
        self.prev_leaf_ = None
        self.next_leaf_ = None

    def append_subcluster(self, subcluster):
        """Append ``subcluster`` and copy its cached stats into the buffers."""
        n_samples = len(self.subclusters_)
        self.subclusters_.append(subcluster)
        self.init_centroids_[n_samples] = subcluster.centroid_
        self.init_sq_norm_[n_samples] = subcluster.sq_norm_

        # Keep centroids_ and squared_norm_ as views over the occupied prefix
        # of the pre-allocated buffers, so writes to init_centroids_ /
        # init_sq_norm_ are immediately visible through them.
        self.centroids_ = self.init_centroids_[: n_samples + 1, :]
        self.squared_norm_ = self.init_sq_norm_[: n_samples + 1]

    def update_split_subclusters(self, subcluster, new_subcluster1, new_subcluster2):
        """Remove a subcluster from a node and update it with the
        split subclusters.
        """
        # Reuse the removed subcluster's slot for the first half; the second
        # half goes into a fresh slot at the end.
        ind = self.subclusters_.index(subcluster)
        self.subclusters_[ind] = new_subcluster1
        self.init_centroids_[ind] = new_subcluster1.centroid_
        self.init_sq_norm_[ind] = new_subcluster1.sq_norm_
        self.append_subcluster(new_subcluster2)

    def insert_cf_subcluster(self, subcluster):
        """Insert a new subcluster into the node.

        Returns True when this node now exceeds ``branching_factor``
        subclusters and must be split by the caller.
        """
        if not self.subclusters_:
            self.append_subcluster(subcluster)
            return False

        threshold = self.threshold
        branching_factor = self.branching_factor

        # Find the closest existing subcluster. Using
        # ||c_i - x||^2 = ||c_i||^2 - 2 c_i.x + ||x||^2 and dropping the
        # ||x||^2 term (constant over i) is enough for the argmin.
        dist_matrix = np.dot(self.centroids_, subcluster.centroid_)
        dist_matrix *= -2.0
        dist_matrix += self.squared_norm_
        closest_index = np.argmin(dist_matrix)
        closest_subcluster = self.subclusters_[closest_index]

        # Internal entry: recurse into the child node.
        if closest_subcluster.child_ is not None:
            split_child = closest_subcluster.child_.insert_cf_subcluster(subcluster)

            if not split_child:
                # The child absorbed the sample without splitting; just fold
                # the sample's statistics into the parent entry and refresh
                # its cached centroid / squared norm.
                closest_subcluster.update(subcluster)
                self.init_centroids_[closest_index] = self.subclusters_[
                    closest_index
                ].centroid_
                self.init_sq_norm_[closest_index] = self.subclusters_[
                    closest_index
                ].sq_norm_
                return False

            # The child overflowed: redistribute its subclusters over two new
            # nodes and replace this entry with the two resulting parents.
            # That may in turn overflow this node.
            else:
                new_subcluster1, new_subcluster2 = _split_node(
                    closest_subcluster.child_,
                    threshold,
                    branching_factor,
                )
                self.update_split_subclusters(
                    closest_subcluster, new_subcluster1, new_subcluster2
                )

                if len(self.subclusters_) > self.branching_factor:
                    return True
                return False

        # Leaf-level entry: try to merge, then fall back to a new entry.
        else:
            merged = closest_subcluster.merge_subcluster(subcluster, self.threshold)
            if merged:
                # Merge succeeded within the radius threshold; refresh caches.
                self.init_centroids_[closest_index] = closest_subcluster.centroid_
                self.init_sq_norm_[closest_index] = closest_subcluster.sq_norm_
                return False

            # Could not merge, but there is still room in this node.
            elif len(self.subclusters_) < self.branching_factor:
                self.append_subcluster(subcluster)
                return False

            # No room either: append into the spare slot and signal the
            # caller that this node must be split.
            else:
                self.append_subcluster(subcluster)
                return True
|
|
|
|
|
class _CFSubcluster:
    """A clustering-feature (CF) entry stored inside a _CFNode.

    A CFSubcluster may itself point to a child _CFNode.

    Parameters
    ----------
    linear_sum : ndarray of shape (n_features,), default=None
        A single sample. Kept optional so that empty subclusters can be
        created (e.g. while splitting a node).

    Attributes
    ----------
    n_samples_ : int
        Number of samples absorbed into this subcluster.

    linear_sum_ : ndarray
        Component-wise sum of all absorbed samples. Storing only this sum
        avoids keeping the samples themselves in memory.

    squared_sum_ : float
        Sum of the squared l2 norms of all absorbed samples.

    centroid_ : ndarray of shape (branching_factor + 1, n_features)
        Cached centroid (``linear_sum_ / n_samples_``), maintained so that
        ``CFNode.centroids_`` never has to recompute it.

    child_ : _CFNode
        Child node of this subcluster, or None for leaf-level entries.

    sq_norm_ : ndarray of shape (branching_factor + 1,)
        Cached squared norm of the centroid, reused by the pairwise
        minimum-distance computations.
    """

    def __init__(self, *, linear_sum=None):
        if linear_sum is not None:
            self.n_samples_ = 1
            self.centroid_ = self.linear_sum_ = linear_sum
            self.squared_sum_ = self.sq_norm_ = np.dot(
                self.linear_sum_, self.linear_sum_
            )
        else:
            # Empty subcluster. Note that ``sq_norm_`` deliberately stays
            # unset until the first update()/merge_subcluster() computes a
            # centroid.
            self.n_samples_ = 0
            self.squared_sum_ = 0.0
            self.centroid_ = self.linear_sum_ = 0
        self.child_ = None

    def update(self, subcluster):
        """Absorb ``subcluster`` and refresh the cached centroid and norm."""
        self.n_samples_ += subcluster.n_samples_
        self.linear_sum_ += subcluster.linear_sum_
        self.squared_sum_ += subcluster.squared_sum_
        centroid = self.linear_sum_ / self.n_samples_
        self.centroid_ = centroid
        self.sq_norm_ = np.dot(centroid, centroid)

    def merge_subcluster(self, nominee_cluster, threshold):
        """Merge ``nominee_cluster`` into this subcluster if the merged
        radius stays within ``threshold``; return True iff merged.
        """
        merged_ss = self.squared_sum_ + nominee_cluster.squared_sum_
        merged_ls = self.linear_sum_ + nominee_cluster.linear_sum_
        merged_n = self.n_samples_ + nominee_cluster.n_samples_
        merged_centroid = (1 / merged_n) * merged_ls
        merged_sq_norm = np.dot(merged_centroid, merged_centroid)

        # Squared radius of the would-be merged cluster, via the identity
        # r^2 = E[||x||^2] - ||E[x]||^2.
        sq_radius = merged_ss / merged_n - merged_sq_norm

        if sq_radius <= threshold**2:
            self.n_samples_ = merged_n
            self.linear_sum_ = merged_ls
            self.squared_sum_ = merged_ss
            self.centroid_ = merged_centroid
            self.sq_norm_ = merged_sq_norm
            return True
        return False

    @property
    def radius(self):
        """Return radius of the subcluster."""
        # max(0, .) guards against tiny negative values produced by
        # floating-point cancellation.
        return sqrt(max(0, self.squared_sum_ / self.n_samples_ - self.sq_norm_))
|
|
|
|
|
class Birch(
    ClassNamePrefixFeaturesOutMixin, ClusterMixin, TransformerMixin, BaseEstimator
):
    """Implements the BIRCH clustering algorithm.

    It is a memory-efficient, online-learning algorithm provided as an
    alternative to :class:`MiniBatchKMeans`. It constructs a tree
    data structure with the cluster centroids being read off the leaf.
    These can be either the final cluster centroids or can be provided as input
    to another clustering algorithm such as :class:`AgglomerativeClustering`.

    Read more in the :ref:`User Guide <birch>`.

    .. versionadded:: 0.16

    Parameters
    ----------
    threshold : float, default=0.5
        The radius of the subcluster obtained by merging a new sample and the
        closest subcluster should be lesser than the threshold. Otherwise a new
        subcluster is started. Setting this value to be very low promotes
        splitting and vice-versa.

    branching_factor : int, default=50
        Maximum number of CF subclusters in each node. If a new samples enters
        such that the number of subclusters exceed the branching_factor then
        that node is split into two nodes with the subclusters redistributed
        in each. The parent subcluster of that node is removed and two new
        subclusters are added as parents of the 2 split nodes.

    n_clusters : int, instance of sklearn.cluster model or None, default=3
        Number of clusters after the final clustering step, which treats the
        subclusters from the leaves as new samples.

        - `None` : the final clustering step is not performed and the
          subclusters are returned as they are.

        - :mod:`sklearn.cluster` Estimator : If a model is provided, the model
          is fit treating the subclusters as new samples and the initial data
          is mapped to the label of the closest subcluster.

        - `int` : the model fit is :class:`AgglomerativeClustering` with
          `n_clusters` set to be equal to the int.

    compute_labels : bool, default=True
        Whether or not to compute labels for each fit.

    copy : bool, default=True
        Whether or not to make a copy of the given data. If set to False,
        the initial data will be overwritten.

        .. deprecated:: 1.6
            `copy` was deprecated in 1.6 and will be removed in 1.8. It has no effect
            as the estimator does not perform in-place operations on the input data.

    Attributes
    ----------
    root_ : _CFNode
        Root of the CFTree.

    dummy_leaf_ : _CFNode
        Start pointer to all the leaves.

    subcluster_centers_ : ndarray
        Centroids of all subclusters read directly from the leaves.

    subcluster_labels_ : ndarray
        Labels assigned to the centroids of the subclusters after
        they are clustered globally.

    labels_ : ndarray of shape (n_samples,)
        Array of labels assigned to the input data.
        if partial_fit is used instead of fit, they are assigned to the
        last batch of data.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    MiniBatchKMeans : Alternative implementation that does incremental updates
        of the centers' positions using mini-batches.

    Notes
    -----
    The tree data structure consists of nodes with each node consisting of
    a number of subclusters. The maximum number of subclusters in a node
    is determined by the branching factor. Each subcluster maintains a
    linear sum, squared sum and the number of samples in that subcluster.
    In addition, each subcluster can also have a node as its child, if the
    subcluster is not a member of a leaf node.

    For a new point entering the root, it is merged with the subcluster closest
    to it and the linear sum, squared sum and the number of samples of that
    subcluster are updated. This is done recursively till the properties of
    the leaf node are updated.

    See :ref:`sphx_glr_auto_examples_cluster_plot_birch_vs_minibatchkmeans.py` for a
    comparison with :class:`~sklearn.cluster.MiniBatchKMeans`.

    References
    ----------
    * Tian Zhang, Raghu Ramakrishnan, Maron Livny
      BIRCH: An efficient data clustering method for large databases.
      https://www.cs.sfu.ca/CourseCentral/459/han/papers/zhang96.pdf

    * Roberto Perdisci
      JBirch - Java implementation of BIRCH clustering algorithm
      https://code.google.com/archive/p/jbirch

    Examples
    --------
    >>> from sklearn.cluster import Birch
    >>> X = [[0, 1], [0.3, 1], [-0.3, 1], [0, -1], [0.3, -1], [-0.3, -1]]
    >>> brc = Birch(n_clusters=None)
    >>> brc.fit(X)
    Birch(n_clusters=None)
    >>> brc.predict(X)
    array([0, 0, 0, 1, 1, 1])
    """

    _parameter_constraints: dict = {
        "threshold": [Interval(Real, 0.0, None, closed="neither")],
        "branching_factor": [Interval(Integral, 1, None, closed="neither")],
        "n_clusters": [None, ClusterMixin, Interval(Integral, 1, None, closed="left")],
        "compute_labels": ["boolean"],
        "copy": ["boolean", Hidden(StrOptions({"deprecated"}))],
    }

    def __init__(
        self,
        *,
        threshold=0.5,
        branching_factor=50,
        n_clusters=3,
        compute_labels=True,
        copy="deprecated",
    ):
        self.threshold = threshold
        self.branching_factor = branching_factor
        self.n_clusters = n_clusters
        self.compute_labels = compute_labels
        self.copy = copy

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """
        Build a CF Tree for the input data.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Input data.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self
            Fitted estimator.
        """
        return self._fit(X, partial=False)

    def _fit(self, X, partial):
        """Shared implementation of ``fit`` and ``partial_fit``.

        When ``partial`` is True and a tree already exists, the new batch is
        inserted into the existing tree; otherwise a new tree is built.
        """
        has_root = getattr(self, "root_", None)
        first_call = not (partial and has_root)

        # TODO(1.8): remove this deprecation handling for `copy`.
        if self.copy != "deprecated" and first_call:
            warnings.warn(
                "`copy` was deprecated in 1.6 and will be removed in 1.8 since it "
                "has no effect internally. Simply leave this parameter to its default "
                "value to avoid this warning.",
                FutureWarning,
            )

        # reset=first_call: subsequent partial_fit batches must match the
        # feature count seen on the first call.
        X = validate_data(
            self,
            X,
            accept_sparse="csr",
            reset=first_call,
            dtype=[np.float64, np.float32],
        )
        threshold = self.threshold
        branching_factor = self.branching_factor

        n_samples, n_features = X.shape

        # If partial_fit is called for the first time, or fit is called, we
        # start a new tree.
        if first_call:
            # The first root is a leaf. Manipulate this object throughout.
            self.root_ = _CFNode(
                threshold=threshold,
                branching_factor=branching_factor,
                is_leaf=True,
                n_features=n_features,
                dtype=X.dtype,
            )

            # dummy_leaf_ heads the linked list of leaves, to enable getting
            # back the subclusters later.
            self.dummy_leaf_ = _CFNode(
                threshold=threshold,
                branching_factor=branching_factor,
                is_leaf=True,
                n_features=n_features,
                dtype=X.dtype,
            )
            self.dummy_leaf_.next_leaf_ = self.root_
            self.root_.prev_leaf_ = self.dummy_leaf_

        # Iterate sample by sample; for sparse input, densify one row at a
        # time instead of building a sparse matrix per row.
        if not sparse.issparse(X):
            iter_func = iter
        else:
            iter_func = _iterate_sparse_X

        for sample in iter_func(X):
            subcluster = _CFSubcluster(linear_sum=sample)
            split = self.root_.insert_cf_subcluster(subcluster)

            if split:
                # The root overflowed: split it and grow the tree one level,
                # with a new non-leaf root holding the two halves.
                new_subcluster1, new_subcluster2 = _split_node(
                    self.root_, threshold, branching_factor
                )
                del self.root_
                self.root_ = _CFNode(
                    threshold=threshold,
                    branching_factor=branching_factor,
                    is_leaf=False,
                    n_features=n_features,
                    dtype=X.dtype,
                )
                self.root_.append_subcluster(new_subcluster1)
                self.root_.append_subcluster(new_subcluster2)

        # Read the flat array of leaf-level centroids back off the tree;
        # these are the samples for the optional global clustering step.
        centroids = np.concatenate([leaf.centroids_ for leaf in self._get_leaves()])
        self.subcluster_centers_ = centroids
        self._n_features_out = self.subcluster_centers_.shape[0]

        self._global_clustering(X)
        return self

    def _get_leaves(self):
        """
        Retrieve the leaves of the CF Node.

        Returns
        -------
        leaves : list of shape (n_leaves,)
            List of the leaf nodes.
        """
        # Walk the singly-followed next_leaf_ chain starting after the dummy
        # head node.
        leaf_ptr = self.dummy_leaf_.next_leaf_
        leaves = []
        while leaf_ptr is not None:
            leaves.append(leaf_ptr)
            leaf_ptr = leaf_ptr.next_leaf_
        return leaves

    @_fit_context(prefer_skip_nested_validation=True)
    def partial_fit(self, X=None, y=None):
        """
        Online learning. Prevents rebuilding of CFTree from scratch.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features), \
            default=None
            Input data. If X is not provided, only the global clustering
            step is done.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self
            Fitted estimator.
        """
        if X is None:
            # No new data: only redo the global clustering of the existing
            # subclusters.
            self._global_clustering()
            return self
        else:
            return self._fit(X, partial=True)

    def predict(self, X):
        """
        Predict data using the ``centroids_`` of subclusters.

        Avoid computation of the row norms of X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Input data.

        Returns
        -------
        labels : ndarray of shape(n_samples,)
            Labelled data.
        """
        check_is_fitted(self)
        X = validate_data(self, X, accept_sparse="csr", reset=False)
        return self._predict(X)

    def _predict(self, X):
        """Predict data using the ``centroids_`` of subclusters."""
        # Reuse the cached squared norms of the subcluster centers computed
        # in _global_clustering, to avoid recomputing them per call.
        kwargs = {"Y_norm_squared": self._subcluster_norms}

        with config_context(assume_finite=True):
            argmin = pairwise_distances_argmin(
                X, self.subcluster_centers_, metric_kwargs=kwargs
            )
        # Map each sample to the global label of its closest subcluster.
        return self.subcluster_labels_[argmin]

    def transform(self, X):
        """
        Transform X into subcluster centroids dimension.

        Each dimension represents the distance from the sample point to each
        cluster centroid.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Input data.

        Returns
        -------
        X_trans : {array-like, sparse matrix} of shape (n_samples, n_clusters)
            Transformed data.
        """
        check_is_fitted(self)
        X = validate_data(self, X, accept_sparse="csr", reset=False)
        with config_context(assume_finite=True):
            return euclidean_distances(X, self.subcluster_centers_)

    def _global_clustering(self, X=None):
        """
        Global clustering for the subclusters obtained after fitting
        """
        clusterer = self.n_clusters
        centroids = self.subcluster_centers_
        compute_labels = (X is not None) and self.compute_labels

        # Preprocessing for the global clustering step.
        not_enough_centroids = False
        if isinstance(clusterer, Integral):
            clusterer = AgglomerativeClustering(n_clusters=self.n_clusters)

            # Fewer subclusters than requested clusters: the global
            # clustering step is skipped (with a warning below).
            if len(centroids) < self.n_clusters:
                not_enough_centroids = True

        # Cache squared norms for reuse in _predict.
        self._subcluster_norms = row_norms(self.subcluster_centers_, squared=True)

        if clusterer is None or not_enough_centroids:
            # Each subcluster keeps its own label.
            self.subcluster_labels_ = np.arange(len(centroids))
            if not_enough_centroids:
                warnings.warn(
                    "Number of subclusters found (%d) by BIRCH is less "
                    "than (%d). Decrease the threshold."
                    % (len(centroids), self.n_clusters),
                    ConvergenceWarning,
                )
        else:
            # The global clustering step that clusters the subclusters of
            # the leaves. It assumes the centroids of the subclusters as
            # samples and finds the final centroids.
            self.subcluster_labels_ = clusterer.fit_predict(self.subcluster_centers_)

        if compute_labels:
            self.labels_ = self._predict(X)

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        # The estimator works in float32 without upcasting and accepts
        # sparse (CSR) input.
        tags.transformer_tags.preserves_dtype = ["float64", "float32"]
        tags.input_tags.sparse = True
        return tags
|
|