|
"""Isomap for manifold learning""" |
|
|
|
|
|
|
|
|
|
import warnings |
|
from numbers import Integral, Real |
|
|
|
import numpy as np |
|
from scipy.sparse import issparse |
|
from scipy.sparse.csgraph import connected_components, shortest_path |
|
|
|
from ..base import ( |
|
BaseEstimator, |
|
ClassNamePrefixFeaturesOutMixin, |
|
TransformerMixin, |
|
_fit_context, |
|
) |
|
from ..decomposition import KernelPCA |
|
from ..metrics.pairwise import _VALID_METRICS |
|
from ..neighbors import NearestNeighbors, kneighbors_graph, radius_neighbors_graph |
|
from ..preprocessing import KernelCenterer |
|
from ..utils._param_validation import Interval, StrOptions |
|
from ..utils.graph import _fix_connected_components |
|
from ..utils.validation import check_is_fitted |
|
|
|
|
|
class Isomap(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): |
|
"""Isomap Embedding. |
|
|
|
Non-linear dimensionality reduction through Isometric Mapping |
|
|
|
Read more in the :ref:`User Guide <isomap>`. |
|
|
|
Parameters |
|
---------- |
|
n_neighbors : int or None, default=5 |
|
Number of neighbors to consider for each point. If `n_neighbors` is an int, |
|
then `radius` must be `None`. |
|
|
|
radius : float or None, default=None |
|
Limiting distance of neighbors to return. If `radius` is a float, |
|
then `n_neighbors` must be set to `None`. |
|
|
|
.. versionadded:: 1.1 |
|
|
|
n_components : int, default=2 |
|
Number of coordinates for the manifold. |
|
|
|
eigen_solver : {'auto', 'arpack', 'dense'}, default='auto' |
|
'auto' : Attempt to choose the most efficient solver |
|
for the given problem. |
|
|
|
'arpack' : Use Arnoldi decomposition to find the eigenvalues |
|
and eigenvectors. |
|
|
|
'dense' : Use a direct solver (i.e. LAPACK) |
|
for the eigenvalue decomposition. |
|
|
|
tol : float, default=0 |
|
Convergence tolerance passed to arpack or lobpcg. |
|
not used if eigen_solver == 'dense'. |
|
|
|
max_iter : int, default=None |
|
Maximum number of iterations for the arpack solver. |
|
not used if eigen_solver == 'dense'. |
|
|
|
path_method : {'auto', 'FW', 'D'}, default='auto' |
|
Method to use in finding shortest path. |
|
|
|
'auto' : attempt to choose the best algorithm automatically. |
|
|
|
'FW' : Floyd-Warshall algorithm. |
|
|
|
'D' : Dijkstra's algorithm. |
|
|
|
neighbors_algorithm : {'auto', 'brute', 'kd_tree', 'ball_tree'}, \ |
|
default='auto' |
|
Algorithm to use for nearest neighbors search, |
|
passed to neighbors.NearestNeighbors instance. |
|
|
|
n_jobs : int or None, default=None |
|
The number of parallel jobs to run. |
|
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. |
|
``-1`` means using all processors. See :term:`Glossary <n_jobs>` |
|
for more details. |
|
|
|
metric : str, or callable, default="minkowski" |
|
The metric to use when calculating distance between instances in a |
|
feature array. If metric is a string or callable, it must be one of |
|
the options allowed by :func:`sklearn.metrics.pairwise_distances` for |
|
its metric parameter. |
|
If metric is "precomputed", X is assumed to be a distance matrix and |
|
must be square. X may be a :term:`Glossary <sparse graph>`. |
|
|
|
.. versionadded:: 0.22 |
|
|
|
p : float, default=2 |
|
Parameter for the Minkowski metric from |
|
sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is |
|
equivalent to using manhattan_distance (l1), and euclidean_distance |
|
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. |
|
|
|
.. versionadded:: 0.22 |
|
|
|
metric_params : dict, default=None |
|
Additional keyword arguments for the metric function. |
|
|
|
.. versionadded:: 0.22 |
|
|
|
Attributes |
|
---------- |
|
embedding_ : array-like, shape (n_samples, n_components) |
|
Stores the embedding vectors. |
|
|
|
kernel_pca_ : object |
|
:class:`~sklearn.decomposition.KernelPCA` object used to implement the |
|
embedding. |
|
|
|
nbrs_ : sklearn.neighbors.NearestNeighbors instance |
|
Stores nearest neighbors instance, including BallTree or KDtree |
|
if applicable. |
|
|
|
dist_matrix_ : array-like, shape (n_samples, n_samples) |
|
Stores the geodesic distance matrix of training data. |
|
|
|
n_features_in_ : int |
|
Number of features seen during :term:`fit`. |
|
|
|
.. versionadded:: 0.24 |
|
|
|
feature_names_in_ : ndarray of shape (`n_features_in_`,) |
|
Names of features seen during :term:`fit`. Defined only when `X` |
|
has feature names that are all strings. |
|
|
|
.. versionadded:: 1.0 |
|
|
|
See Also |
|
-------- |
|
sklearn.decomposition.PCA : Principal component analysis that is a linear |
|
dimensionality reduction method. |
|
sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using |
|
kernels and PCA. |
|
MDS : Manifold learning using multidimensional scaling. |
|
TSNE : T-distributed Stochastic Neighbor Embedding. |
|
LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding. |
|
SpectralEmbedding : Spectral embedding for non-linear dimensionality. |
|
|
|
References |
|
---------- |
|
|
|
.. [1] Tenenbaum, J.B.; De Silva, V.; & Langford, J.C. A global geometric |
|
framework for nonlinear dimensionality reduction. Science 290 (5500) |
|
|
|
Examples |
|
-------- |
|
>>> from sklearn.datasets import load_digits |
|
>>> from sklearn.manifold import Isomap |
|
>>> X, _ = load_digits(return_X_y=True) |
|
>>> X.shape |
|
(1797, 64) |
|
>>> embedding = Isomap(n_components=2) |
|
>>> X_transformed = embedding.fit_transform(X[:100]) |
|
>>> X_transformed.shape |
|
(100, 2) |
|
""" |
|
|
|
_parameter_constraints: dict = { |
|
"n_neighbors": [Interval(Integral, 1, None, closed="left"), None], |
|
"radius": [Interval(Real, 0, None, closed="both"), None], |
|
"n_components": [Interval(Integral, 1, None, closed="left")], |
|
"eigen_solver": [StrOptions({"auto", "arpack", "dense"})], |
|
"tol": [Interval(Real, 0, None, closed="left")], |
|
"max_iter": [Interval(Integral, 1, None, closed="left"), None], |
|
"path_method": [StrOptions({"auto", "FW", "D"})], |
|
"neighbors_algorithm": [StrOptions({"auto", "brute", "kd_tree", "ball_tree"})], |
|
"n_jobs": [Integral, None], |
|
"p": [Interval(Real, 1, None, closed="left")], |
|
"metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable], |
|
"metric_params": [dict, None], |
|
} |
|
|
|
def __init__( |
|
self, |
|
*, |
|
n_neighbors=5, |
|
radius=None, |
|
n_components=2, |
|
eigen_solver="auto", |
|
tol=0, |
|
max_iter=None, |
|
path_method="auto", |
|
neighbors_algorithm="auto", |
|
n_jobs=None, |
|
metric="minkowski", |
|
p=2, |
|
metric_params=None, |
|
): |
|
self.n_neighbors = n_neighbors |
|
self.radius = radius |
|
self.n_components = n_components |
|
self.eigen_solver = eigen_solver |
|
self.tol = tol |
|
self.max_iter = max_iter |
|
self.path_method = path_method |
|
self.neighbors_algorithm = neighbors_algorithm |
|
self.n_jobs = n_jobs |
|
self.metric = metric |
|
self.p = p |
|
self.metric_params = metric_params |
|
|
|
def _fit_transform(self, X): |
|
if self.n_neighbors is not None and self.radius is not None: |
|
raise ValueError( |
|
"Both n_neighbors and radius are provided. Use" |
|
f" Isomap(radius={self.radius}, n_neighbors=None) if intended to use" |
|
" radius-based neighbors" |
|
) |
|
|
|
self.nbrs_ = NearestNeighbors( |
|
n_neighbors=self.n_neighbors, |
|
radius=self.radius, |
|
algorithm=self.neighbors_algorithm, |
|
metric=self.metric, |
|
p=self.p, |
|
metric_params=self.metric_params, |
|
n_jobs=self.n_jobs, |
|
) |
|
self.nbrs_.fit(X) |
|
self.n_features_in_ = self.nbrs_.n_features_in_ |
|
if hasattr(self.nbrs_, "feature_names_in_"): |
|
self.feature_names_in_ = self.nbrs_.feature_names_in_ |
|
|
|
self.kernel_pca_ = KernelPCA( |
|
n_components=self.n_components, |
|
kernel="precomputed", |
|
eigen_solver=self.eigen_solver, |
|
tol=self.tol, |
|
max_iter=self.max_iter, |
|
n_jobs=self.n_jobs, |
|
).set_output(transform="default") |
|
|
|
if self.n_neighbors is not None: |
|
nbg = kneighbors_graph( |
|
self.nbrs_, |
|
self.n_neighbors, |
|
metric=self.metric, |
|
p=self.p, |
|
metric_params=self.metric_params, |
|
mode="distance", |
|
n_jobs=self.n_jobs, |
|
) |
|
else: |
|
nbg = radius_neighbors_graph( |
|
self.nbrs_, |
|
radius=self.radius, |
|
metric=self.metric, |
|
p=self.p, |
|
metric_params=self.metric_params, |
|
mode="distance", |
|
n_jobs=self.n_jobs, |
|
) |
|
|
|
|
|
|
|
|
|
|
|
n_connected_components, labels = connected_components(nbg) |
|
if n_connected_components > 1: |
|
if self.metric == "precomputed" and issparse(X): |
|
raise RuntimeError( |
|
"The number of connected components of the neighbors graph" |
|
f" is {n_connected_components} > 1. The graph cannot be " |
|
"completed with metric='precomputed', and Isomap cannot be" |
|
"fitted. Increase the number of neighbors to avoid this " |
|
"issue, or precompute the full distance matrix instead " |
|
"of passing a sparse neighbors graph." |
|
) |
|
warnings.warn( |
|
( |
|
"The number of connected components of the neighbors graph " |
|
f"is {n_connected_components} > 1. Completing the graph to fit" |
|
" Isomap might be slow. Increase the number of neighbors to " |
|
"avoid this issue." |
|
), |
|
stacklevel=2, |
|
) |
|
|
|
|
|
nbg = _fix_connected_components( |
|
X=self.nbrs_._fit_X, |
|
graph=nbg, |
|
n_connected_components=n_connected_components, |
|
component_labels=labels, |
|
mode="distance", |
|
metric=self.nbrs_.effective_metric_, |
|
**self.nbrs_.effective_metric_params_, |
|
) |
|
|
|
self.dist_matrix_ = shortest_path(nbg, method=self.path_method, directed=False) |
|
|
|
if self.nbrs_._fit_X.dtype == np.float32: |
|
self.dist_matrix_ = self.dist_matrix_.astype( |
|
self.nbrs_._fit_X.dtype, copy=False |
|
) |
|
|
|
G = self.dist_matrix_**2 |
|
G *= -0.5 |
|
|
|
self.embedding_ = self.kernel_pca_.fit_transform(G) |
|
self._n_features_out = self.embedding_.shape[1] |
|
|
|
def reconstruction_error(self): |
|
"""Compute the reconstruction error for the embedding. |
|
|
|
Returns |
|
------- |
|
reconstruction_error : float |
|
Reconstruction error. |
|
|
|
Notes |
|
----- |
|
The cost function of an isomap embedding is |
|
|
|
``E = frobenius_norm[K(D) - K(D_fit)] / n_samples`` |
|
|
|
Where D is the matrix of distances for the input data X, |
|
D_fit is the matrix of distances for the output embedding X_fit, |
|
and K is the isomap kernel: |
|
|
|
``K(D) = -0.5 * (I - 1/n_samples) * D^2 * (I - 1/n_samples)`` |
|
""" |
|
G = -0.5 * self.dist_matrix_**2 |
|
G_center = KernelCenterer().fit_transform(G) |
|
evals = self.kernel_pca_.eigenvalues_ |
|
return np.sqrt(np.sum(G_center**2) - np.sum(evals**2)) / G.shape[0] |
|
|
|
@_fit_context( |
|
|
|
prefer_skip_nested_validation=False |
|
) |
|
def fit(self, X, y=None): |
|
"""Compute the embedding vectors for data X. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix, BallTree, KDTree, NearestNeighbors} |
|
Sample data, shape = (n_samples, n_features), in the form of a |
|
numpy array, sparse matrix, precomputed tree, or NearestNeighbors |
|
object. |
|
|
|
y : Ignored |
|
Not used, present for API consistency by convention. |
|
|
|
Returns |
|
------- |
|
self : object |
|
Returns a fitted instance of self. |
|
""" |
|
self._fit_transform(X) |
|
return self |
|
|
|
@_fit_context( |
|
|
|
prefer_skip_nested_validation=False |
|
) |
|
def fit_transform(self, X, y=None): |
|
"""Fit the model from data in X and transform X. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix, BallTree, KDTree} |
|
Training vector, where `n_samples` is the number of samples |
|
and `n_features` is the number of features. |
|
|
|
y : Ignored |
|
Not used, present for API consistency by convention. |
|
|
|
Returns |
|
------- |
|
X_new : array-like, shape (n_samples, n_components) |
|
X transformed in the new space. |
|
""" |
|
self._fit_transform(X) |
|
return self.embedding_ |
|
|
|
def transform(self, X): |
|
"""Transform X. |
|
|
|
This is implemented by linking the points X into the graph of geodesic |
|
distances of the training data. First the `n_neighbors` nearest |
|
neighbors of X are found in the training data, and from these the |
|
shortest geodesic distances from each point in X to each point in |
|
the training data are computed in order to construct the kernel. |
|
The embedding of X is the projection of this kernel onto the |
|
embedding vectors of the training set. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix}, shape (n_queries, n_features) |
|
If neighbors_algorithm='precomputed', X is assumed to be a |
|
distance matrix or a sparse graph of shape |
|
(n_queries, n_samples_fit). |
|
|
|
Returns |
|
------- |
|
X_new : array-like, shape (n_queries, n_components) |
|
X transformed in the new space. |
|
""" |
|
check_is_fitted(self) |
|
if self.n_neighbors is not None: |
|
distances, indices = self.nbrs_.kneighbors(X, return_distance=True) |
|
else: |
|
distances, indices = self.nbrs_.radius_neighbors(X, return_distance=True) |
|
|
|
|
|
|
|
|
|
|
|
|
|
n_samples_fit = self.nbrs_.n_samples_fit_ |
|
n_queries = distances.shape[0] |
|
|
|
if hasattr(X, "dtype") and X.dtype == np.float32: |
|
dtype = np.float32 |
|
else: |
|
dtype = np.float64 |
|
|
|
G_X = np.zeros((n_queries, n_samples_fit), dtype) |
|
for i in range(n_queries): |
|
G_X[i] = np.min(self.dist_matrix_[indices[i]] + distances[i][:, None], 0) |
|
|
|
G_X **= 2 |
|
G_X *= -0.5 |
|
|
|
return self.kernel_pca_.transform(G_X) |
|
|
|
def __sklearn_tags__(self): |
|
tags = super().__sklearn_tags__() |
|
tags.transformer_tags.preserves_dtype = ["float64", "float32"] |
|
tags.input_tags.sparse = True |
|
return tags |
|
|