|
"""Metrics for pairwise distances and affinity of sets of samples.""" |
|
|
|
|
|
|
|
|
|
import itertools |
|
import warnings |
|
from functools import partial |
|
from numbers import Integral, Real |
|
|
|
import numpy as np |
|
from joblib import effective_n_jobs |
|
from scipy.sparse import csr_matrix, issparse |
|
from scipy.spatial import distance |
|
|
|
from .. import config_context |
|
from ..exceptions import DataConversionWarning |
|
from ..preprocessing import normalize |
|
from ..utils import check_array, gen_batches, gen_even_slices |
|
from ..utils._array_api import ( |
|
_fill_or_add_to_diagonal, |
|
_find_matching_floating_dtype, |
|
_is_numpy_namespace, |
|
_max_precision_float_dtype, |
|
_modify_in_place_if_numpy, |
|
device, |
|
get_namespace, |
|
get_namespace_and_device, |
|
) |
|
from ..utils._chunking import get_chunk_n_rows |
|
from ..utils._mask import _get_mask |
|
from ..utils._missing import is_scalar_nan |
|
from ..utils._param_validation import ( |
|
Hidden, |
|
Interval, |
|
MissingValues, |
|
Options, |
|
StrOptions, |
|
validate_params, |
|
) |
|
from ..utils.deprecation import _deprecate_force_all_finite |
|
from ..utils.extmath import row_norms, safe_sparse_dot |
|
from ..utils.fixes import parse_version, sp_base_version |
|
from ..utils.parallel import Parallel, delayed |
|
from ..utils.validation import _num_samples, check_non_negative |
|
from ._pairwise_distances_reduction import ArgKmin |
|
from ._pairwise_fast import _chi2_kernel_fast, _sparse_manhattan |
|
|
|
|
|
|
|
def _return_float_dtype(X, Y): |
|
""" |
|
1. If dtype of X and Y is float32, then dtype float32 is returned. |
|
2. Else dtype float is returned. |
|
""" |
|
if not issparse(X) and not isinstance(X, np.ndarray): |
|
X = np.asarray(X) |
|
|
|
if Y is None: |
|
Y_dtype = X.dtype |
|
elif not issparse(Y) and not isinstance(Y, np.ndarray): |
|
Y = np.asarray(Y) |
|
Y_dtype = Y.dtype |
|
else: |
|
Y_dtype = Y.dtype |
|
|
|
if X.dtype == Y_dtype == np.float32: |
|
dtype = np.float32 |
|
else: |
|
dtype = float |
|
|
|
return X, Y, dtype |
|
|
|
|
|
def check_pairwise_arrays(
    X,
    Y,
    *,
    precomputed=False,
    dtype="infer_float",
    accept_sparse="csr",
    force_all_finite="deprecated",
    ensure_all_finite=None,
    ensure_2d=True,
    copy=False,
):
    """Validate a pair of inputs before a pairwise computation.

    When `Y` is `None` (or is the very same object as `X`), `X` is validated
    once and returned for both outputs, so the returned `Y` is the same
    object as the returned `X` and not a copy. Otherwise `X` and `Y` are
    validated independently with identical settings.

    Once both inputs are validated their shapes are compared: for a
    precomputed matrix the number of columns of `X` must equal the number of
    rows of `Y`; otherwise, when `ensure_2d` is true, both inputs must have
    the same number of columns.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples_X, n_features)

    Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)

    precomputed : bool, default=False
        True if X is to be treated as precomputed distances to the samples in
        Y.

    dtype : str, type, list of type or None, default="infer_float"
        Data type required for X and Y. With "infer_float", a float dtype is
        inferred from the inputs (through `_return_float_dtype` for
        NumPy/sparse inputs, through `_find_matching_floating_dtype` for other
        array namespaces). Any other value is forwarded to `check_array`
        unchanged.

        .. versionadded:: 0.18

    accept_sparse : str, bool or list/tuple of str, default='csr'
        Allowed sparse formats, forwarded to `check_array`. String[s] such as
        'csc' or 'csr' name the allowed formats; an input in another sparse
        format is converted to the first listed one. True allows any format.
        False makes a sparse input an error.

    force_all_finite : bool or 'allow-nan', default="deprecated"
        Former name of `ensure_all_finite`.

        .. deprecated:: 1.6
           `force_all_finite` was renamed to `ensure_all_finite` and will be
           removed in 1.8.

    ensure_all_finite : bool or 'allow-nan', default=None
        Whether to raise an error on np.inf, np.nan, pd.NA in array. The
        possibilities are:

        - True: Force all values of array to be finite.
        - False: accepts np.inf, np.nan, pd.NA in array.
        - 'allow-nan': accepts only np.nan and pd.NA values in array. Values
          cannot be infinite.

        The value is reconciled with `force_all_finite` by
        `_deprecate_force_all_finite` before being passed to `check_array`
        (the effective default has historically been documented as `True`).

        .. versionadded:: 1.6
           `force_all_finite` was renamed to `ensure_all_finite`.

    ensure_2d : bool, default=True
        Whether to raise an error when the input arrays are not 2-dimensional.
        Setting this to `False` is necessary when using a custom metric with
        certain non-numerical inputs (e.g. a list of strings). It also
        disables the feature-count comparison between `X` and `Y`.

        .. versionadded:: 1.5

    copy : bool, default=False
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

        .. versionadded:: 0.22

    Returns
    -------
    safe_X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
        The validated version of `X`.

    safe_Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)
        The validated version of `Y`. If `Y` was `None`, this is the same
        object as `safe_X`.

    Raises
    ------
    ValueError
        If the shapes of the validated inputs are incompatible.
    """
    ensure_all_finite = _deprecate_force_all_finite(force_all_finite, ensure_all_finite)

    xp, _ = get_namespace(X, Y)
    has_sparse_input = any([issparse(X), issparse(Y)])
    if has_sparse_input or _is_numpy_namespace(xp):
        X, Y, inferred_dtype = _return_float_dtype(X, Y)
    else:
        inferred_dtype = _find_matching_floating_dtype(X, Y, xp=xp)

    if dtype == "infer_float":
        dtype = inferred_dtype

    # Both inputs are always validated with exactly the same settings.
    validation_kwargs = {
        "accept_sparse": accept_sparse,
        "dtype": dtype,
        "copy": copy,
        "ensure_all_finite": ensure_all_finite,
        "estimator": "check_pairwise_arrays",
        "ensure_2d": ensure_2d,
    }

    if Y is X or Y is None:
        # Validate once and share the result so that `Y is X` still holds.
        X = Y = check_array(X, **validation_kwargs)
    else:
        X = check_array(X, **validation_kwargs)
        Y = check_array(Y, **validation_kwargs)

    if precomputed:
        if X.shape[1] != Y.shape[0]:
            raise ValueError(
                "Precomputed metric requires shape "
                "(n_queries, n_indexed). Got (%d, %d) "
                "for %d indexed." % (X.shape[0], X.shape[1], Y.shape[0])
            )
    elif ensure_2d and X.shape[1] != Y.shape[1]:
        # The feature-count comparison is only made when 2d inputs are
        # enforced.
        raise ValueError(
            "Incompatible dimension for X and Y matrices: "
            "X.shape[1] == %d while Y.shape[1] == %d" % (X.shape[1], Y.shape[1])
        )

    return X, Y
|
|
|
|
|
def check_paired_arrays(X, Y):
    """Validate two inputs that are to be compared row by row.

    All paired distance metrics should call this function first. The inputs
    are validated with :func:`check_pairwise_arrays` using its default
    settings, after which their shapes must be exactly equal.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples_X, n_features)

    Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)

    Returns
    -------
    safe_X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
        The validated version of `X`.

    safe_Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)
        The validated version of `Y`, as returned by
        :func:`check_pairwise_arrays`.

    Raises
    ------
    ValueError
        If the validated `X` and `Y` do not have the same shape.
    """
    X, Y = check_pairwise_arrays(X, Y)
    if X.shape == Y.shape:
        return X, Y
    raise ValueError(
        "X and Y should be of same shape. They were respectively %r and %r long."
        % (X.shape, Y.shape)
    )
|
|
|
|
|
|
|
@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "Y": ["array-like", "sparse matrix", None],
        "Y_norm_squared": ["array-like", None],
        "squared": ["boolean"],
        "X_norm_squared": ["array-like", None],
    },
    prefer_skip_nested_validation=True,
)
def euclidean_distances(
    X, Y=None, *, Y_norm_squared=None, squared=False, X_norm_squared=None
):
    """
    Compute the distance matrix between each pair from a vector array X and Y.

    The distance between two row vectors x and y is evaluated through the
    expansion::

        dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y))

    This form is efficient on sparse data, and it lets `dot(x, x)` and/or
    `dot(y, y)` be pre-computed when one argument stays fixed across calls.

    It is not the most precise way to compute the distance, since the
    expansion can suffer from "catastrophic cancellation". The returned
    matrix may also fail to be exactly symmetric, as required by e.g.
    ``scipy.spatial.distance`` functions.

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
        An array where each row is a sample and each column is a feature.

    Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), \
            default=None
        An array where each row is a sample and each column is a feature.
        If `None`, method uses `Y=X`.

    Y_norm_squared : array-like of shape (n_samples_Y,) or (n_samples_Y, 1) \
            or (1, n_samples_Y), default=None
        Pre-computed dot-products of vectors in Y (e.g.,
        ``(Y**2).sum(axis=1)``)
        May be ignored in some cases, see the note below.

    squared : bool, default=False
        Return squared Euclidean distances.

    X_norm_squared : array-like of shape (n_samples_X,) or (n_samples_X, 1) \
            or (1, n_samples_X), default=None
        Pre-computed dot-products of vectors in X (e.g.,
        ``(X**2).sum(axis=1)``)
        May be ignored in some cases, see the note below.

    Returns
    -------
    distances : ndarray of shape (n_samples_X, n_samples_Y)
        Returns the distances between the row vectors of `X`
        and the row vectors of `Y`.

    Raises
    ------
    ValueError
        If `X_norm_squared` or `Y_norm_squared` has a shape that cannot be
        matched to the number of rows of `X` or `Y` respectively.

    See Also
    --------
    paired_distances : Distances between pairs of elements of X and Y.

    Notes
    -----
    To achieve a better accuracy, `X_norm_squared` and `Y_norm_squared` may be
    unused if they are passed as `np.float32`.

    Examples
    --------
    >>> from sklearn.metrics.pairwise import euclidean_distances
    >>> X = [[0, 1], [1, 1]]
    >>> # distance between rows of X
    >>> euclidean_distances(X, X)
    array([[0., 1.],
           [1., 0.]])
    >>> # get distance to origin
    >>> euclidean_distances(X, [[0, 0]])
    array([[1.        ],
           [1.41421356]])
    """
    xp, _ = get_namespace(X, Y)
    X, Y = check_pairwise_arrays(X, Y)

    if X_norm_squared is not None:
        X_norm_squared = check_array(X_norm_squared, ensure_2d=False)
        given_shape = X_norm_squared.shape
        n_rows = X.shape[0]
        # Bring the norms of X into column form, shape (n_samples_X, 1).
        if X_norm_squared.shape == (n_rows,):
            X_norm_squared = xp.reshape(X_norm_squared, (-1, 1))
        if X_norm_squared.shape == (1, n_rows):
            X_norm_squared = X_norm_squared.T
        if X_norm_squared.shape != (n_rows, 1):
            raise ValueError(
                f"Incompatible dimensions for X of shape {X.shape} and "
                f"X_norm_squared of shape {given_shape}."
            )

    if Y_norm_squared is not None:
        Y_norm_squared = check_array(Y_norm_squared, ensure_2d=False)
        given_shape = Y_norm_squared.shape
        n_rows = Y.shape[0]
        # Bring the norms of Y into row form, shape (1, n_samples_Y).
        if Y_norm_squared.shape == (n_rows,):
            Y_norm_squared = xp.reshape(Y_norm_squared, (1, -1))
        if Y_norm_squared.shape == (n_rows, 1):
            Y_norm_squared = Y_norm_squared.T
        if Y_norm_squared.shape != (1, n_rows):
            raise ValueError(
                f"Incompatible dimensions for Y of shape {Y.shape} and "
                f"Y_norm_squared of shape {given_shape}."
            )

    return _euclidean_distances(X, Y, X_norm_squared, Y_norm_squared, squared)
|
|
|
|
|
def _euclidean_distances(X, Y, X_norm_squared=None, Y_norm_squared=None, squared=False):
    """Computational part of euclidean_distances.

    Assumes inputs are already checked.

    Norms passed as float32 are not used. When an input array is float32 and
    no usable norm was passed for it, its norm is left as `None` here and the
    computation is delegated to `_euclidean_distances_upcast`.
    TODO: use a float64 accumulator in row_norms to avoid the latter.

    Parameters
    ----------
    X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)

    Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features)
        May be the same object as `X`.

    X_norm_squared : array of n_samples_X values or None, default=None
        Pre-computed squared norms of the rows of `X`.

    Y_norm_squared : array of n_samples_Y values or None, default=None
        Pre-computed squared norms of the rows of `Y`. Not looked at when
        `Y is X`.

    squared : bool, default=False
        Return squared distances instead of distances.

    Returns
    -------
    distances : array of shape (n_samples_X, n_samples_Y)
    """
    xp, _, device_ = get_namespace_and_device(X, Y)

    # Squared row norms of X as a column, or None when they must be computed
    # later on upcast chunks.
    XX = None
    if X_norm_squared is not None and X_norm_squared.dtype != xp.float32:
        XX = xp.reshape(X_norm_squared, (-1, 1))
    elif X.dtype != xp.float32:
        XX = row_norms(X, squared=True)[:, None]

    # Squared row norms of Y as a row; reuse those of X when Y is X.
    if Y is X:
        YY = XX.T if XX is not None else None
    elif Y_norm_squared is not None and Y_norm_squared.dtype != xp.float32:
        YY = xp.reshape(Y_norm_squared, (1, -1))
    elif Y.dtype != xp.float32:
        YY = row_norms(Y, squared=True)[None, :]
    else:
        YY = None

    if X.dtype == xp.float32 or Y.dtype == xp.float32:
        # float32 data goes through the chunked, upcasting implementation.
        distances = _euclidean_distances_upcast(X, XX, Y, YY)
    else:
        # ||x||^2 - 2 <x, y> + ||y||^2
        distances = -2 * safe_sparse_dot(X, Y.T, dense_output=True)
        distances += XX
        distances += YY

    # Clamp at zero: the expansion may yield slightly negative values.
    zero = xp.asarray(0, device=device_, dtype=distances.dtype)
    distances = _modify_in_place_if_numpy(
        xp, xp.maximum, distances, zero, out=distances
    )

    if X is Y:
        # Force the distance of each sample to itself to be exactly zero.
        _fill_or_add_to_diagonal(distances, 0, xp=xp, add_value=False)

    if not squared:
        distances = _modify_in_place_if_numpy(xp, xp.sqrt, distances, out=distances)
    return distances
|
|
|
|
|
@validate_params(
    {
        "X": ["array-like"],
        "Y": ["array-like", None],
        "squared": ["boolean"],
        "missing_values": [MissingValues(numeric_only=True)],
        "copy": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def nan_euclidean_distances(
    X, Y=None, *, squared=False, missing_values=np.nan, copy=True
):
    """Calculate the euclidean distances in the presence of missing values.

    Compute the euclidean distance between each pair of samples in X and Y,
    where Y=X is assumed if Y=None. For a given pair of samples, feature
    coordinates with a missing value in either sample are ignored and the
    weight of the remaining coordinates is scaled up:

    .. code-block:: text

        dist(x,y) = sqrt(weight * sq. distance from present coordinates)

    where:

    .. code-block:: text

        weight = Total # of coordinates / # of present coordinates

    For example, the distance between ``[3, na, na, 6]`` and ``[1, na, 4, 5]`` is:

    .. math::
        \\sqrt{\\frac{4}{2}((3-1)^2 + (6-5)^2)}

    If all the coordinates are missing or if there are no common present
    coordinates then NaN is returned for that pair.

    Read more in the :ref:`User Guide <metrics>`.

    .. versionadded:: 0.22

    Parameters
    ----------
    X : array-like of shape (n_samples_X, n_features)
        An array where each row is a sample and each column is a feature.

    Y : array-like of shape (n_samples_Y, n_features), default=None
        An array where each row is a sample and each column is a feature.
        If `None`, method uses `Y=X`.

    squared : bool, default=False
        Return squared Euclidean distances.

    missing_values : np.nan, float or int, default=np.nan
        Representation of missing value.

    copy : bool, default=True
        Make and use a deep copy of X and Y (if Y exists). The validated
        arrays are written to (missing entries are set to zero), so this
        matters when the caller's arrays must stay untouched.

    Returns
    -------
    distances : ndarray of shape (n_samples_X, n_samples_Y)
        Returns the distances between the row vectors of `X`
        and the row vectors of `Y`.

    See Also
    --------
    paired_distances : Distances between pairs of elements of X and Y.

    References
    ----------
    * John K. Dixon, "Pattern Recognition with Partly Missing Data",
      IEEE Transactions on Systems, Man, and Cybernetics, Volume: 9, Issue:
      10, pp. 617 - 621, Oct. 1979.
      http://ieeexplore.ieee.org/abstract/document/4310090/

    Examples
    --------
    >>> from sklearn.metrics.pairwise import nan_euclidean_distances
    >>> nan = float("NaN")
    >>> X = [[0, 1], [1, nan]]
    >>> nan_euclidean_distances(X, X) # distance between rows of X
    array([[0.        , 1.41421356],
           [1.41421356, 0.        ]])

    >>> # get distance to origin
    >>> nan_euclidean_distances(X, [[0, 0]])
    array([[1.        ],
           [1.41421356]])
    """
    # NaN has to be let through validation when it is the missing marker.
    ensure_all_finite = True
    if is_scalar_nan(missing_values):
        ensure_all_finite = "allow-nan"

    X, Y = check_pairwise_arrays(
        X, Y, accept_sparse=False, ensure_all_finite=ensure_all_finite, copy=copy
    )

    # Missing-value masks; the mask of X is shared when Y is X.
    missing_X = _get_mask(X, missing_values)
    missing_Y = missing_X if Y is X else _get_mask(Y, missing_values)

    # Replace missing entries by zero so they drop out of the dot products.
    X[missing_X] = 0
    Y[missing_Y] = 0

    distances = euclidean_distances(X, Y, squared=True)

    # Remove the squared coordinates of one sample that face a missing
    # coordinate in the other sample.
    XX = X * X
    YY = Y * Y
    distances -= np.dot(XX, missing_Y.T)
    distances -= np.dot(missing_X, YY.T)

    np.clip(distances, 0, None, out=distances)

    if X is Y:
        # Distances from a sample to itself are forced to exactly 0.0.
        np.fill_diagonal(distances, 0.0)

    # Number of coordinates present in both samples of each pair.
    present_X = 1 - missing_X
    present_Y = present_X if Y is X else ~missing_Y
    present_count = np.dot(present_X, present_Y.T)
    distances[present_count == 0] = np.nan

    # Lower-bound the count by 1 to avoid dividing by zero.
    np.maximum(1, present_count, out=present_count)
    distances /= present_count
    distances *= X.shape[1]

    if squared:
        return distances

    np.sqrt(distances, out=distances)
    return distances
|
|
|
|
|
def _euclidean_distances_upcast(X, XX=None, Y=None, YY=None, batch_size=None):
    """Euclidean distances between X and Y.

    Assumes X and Y have float32 dtype.
    Assumes XX and YY have float64 dtype or are None.

    X and Y are upcast to float64 by chunks, which size is chosen to limit
    memory increase by approximately 10% (at least 10MiB).

    Parameters
    ----------
    X : {array, sparse matrix} of shape (n_samples_X, n_features)

    XX : array or None, default=None
        Squared row norms of `X`, indexed by row slices (column layout).
        When `None`, they are computed on each upcast chunk of `X`.

    Y : {array, sparse matrix} of shape (n_samples_Y, n_features), default=None
        Although it defaults to `None`, `Y.shape` is read unconditionally,
        so an array must be passed.

    YY : array or None, default=None
        Squared row norms of `Y`, indexed by column slices (row layout).
        When `None`, they are computed on each upcast chunk of `Y`.

    batch_size : int or None, default=None
        Number of rows per chunk. When `None`, it is derived from the memory
        budget described above.

    Returns
    -------
    distances : float32 array of shape (n_samples_X, n_samples_Y)
        Values of ``XX - 2 * X @ Y.T + YY``. No clipping at zero and no
        square root is applied here.
    """
    xp, _, device_ = get_namespace_and_device(X, Y)
    n_samples_X = X.shape[0]
    n_samples_Y = Y.shape[0]
    n_features = X.shape[1]

    distances = xp.empty((n_samples_X, n_samples_Y), dtype=xp.float32, device=device_)

    if batch_size is None:
        # Fraction of stored entries; dense inputs count as fully dense.
        x_density = (
            X.nnz / xp.prod(X.shape) if issparse(X) else xp.asarray(1, device=device_)
        )
        y_density = (
            Y.nnz / xp.prod(Y.shape) if issparse(Y) else xp.asarray(1, device=device_)
        )

        # Allow 10% of what X, Y and the distance matrix take, with a floor
        # of 10 * 2**17 units (presumably 8-byte elements, which gives the
        # 10MiB mentioned in the docstring).
        maxmem = max(
            (
                (x_density * n_samples_X + y_density * n_samples_Y) * n_features
                + (x_density * n_samples_X * y_density * n_samples_Y)
            )
            / 10,
            10 * 2**17,
        )

        # One step allocates, for a batch size b:
        # - x_density * b * n_features (upcast chunk of X)
        # - y_density * b * n_features (upcast chunk of Y)
        # - b * b (chunk of the distance matrix)
        # so b solves b**2 + tmp * b = maxmem; the positive root is taken.
        tmp = (x_density + y_density) * n_features
        batch_size = (-tmp + xp.sqrt(tmp**2 + 4 * maxmem)) / 2
        batch_size = max(int(batch_size), 1)

    x_batches = gen_batches(n_samples_X, batch_size)
    xp_max_float = _max_precision_float_dtype(xp=xp, device=device_)
    for i, x_slice in enumerate(x_batches):
        X_chunk = xp.astype(X[x_slice], xp_max_float)
        if XX is None:
            XX_chunk = row_norms(X_chunk, squared=True)[:, None]
        else:
            XX_chunk = XX[x_slice]

        # gen_batches is called again for every chunk of X.
        y_batches = gen_batches(n_samples_Y, batch_size)

        for j, y_slice in enumerate(y_batches):
            if X is Y and j < i:
                # When X is Y the distance matrix is symmetric, so the tile
                # below the diagonal is the transpose of a tile that was
                # already written during an earlier iteration of the outer
                # loop.
                d = distances[y_slice, x_slice].T

            else:
                Y_chunk = xp.astype(Y[y_slice], xp_max_float)
                if YY is None:
                    YY_chunk = row_norms(Y_chunk, squared=True)[None, :]
                else:
                    YY_chunk = YY[:, y_slice]

                d = -2 * safe_sparse_dot(X_chunk, Y_chunk.T, dense_output=True)
                d += XX_chunk
                d += YY_chunk

            # Store the tile back in float32.
            distances[x_slice, y_slice] = xp.astype(d, xp.float32, copy=False)

    return distances
|
|
|
|
|
def _argmin_min_reduce(dist, start): |
|
|
|
|
|
|
|
indices = dist.argmin(axis=1) |
|
values = dist[np.arange(dist.shape[0]), indices] |
|
return indices, values |
|
|
|
|
|
def _argmin_reduce(dist, start): |
|
|
|
|
|
|
|
return dist.argmin(axis=1) |
|
|
|
|
|
# Names accepted as the `metric` argument of the pairwise distance helpers.
_VALID_METRICS = [
    "euclidean", "l2", "l1", "manhattan", "cityblock",
    "braycurtis", "canberra", "chebyshev", "correlation", "cosine",
    "dice", "hamming", "jaccard", "mahalanobis", "matching",
    "minkowski", "rogerstanimoto", "russellrao", "seuclidean", "sokalsneath",
    "sqeuclidean", "yule", "wminkowski", "nan_euclidean", "haversine",
]  # fmt: skip

# Metrics that are only registered when the installed SciPy is old enough.
if sp_base_version < parse_version("1.17"):
    # Only available with SciPy < 1.17.
    _VALID_METRICS.append("sokalmichener")
if sp_base_version < parse_version("1.11"):
    # Only available with SciPy < 1.11.
    _VALID_METRICS.append("kulsinski")
if sp_base_version < parse_version("1.9"):
    # Only available with SciPy < 1.9.
    # NOTE(review): "matching" is already part of the base list above, so
    # this produces a duplicate entry on SciPy < 1.9.
    _VALID_METRICS.append("matching")

# Metrics that accept NaN values in their inputs.
_NAN_METRICS = ["nan_euclidean"]
|
|
|
|
|
@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "Y": ["array-like", "sparse matrix"],
        "axis": [Options(Integral, {0, 1})],
        "metric": [
            StrOptions(set(_VALID_METRICS).union(ArgKmin.valid_metrics())),
            callable,
        ],
        "metric_kwargs": [dict, None],
    },
    prefer_skip_nested_validation=False,
)
def pairwise_distances_argmin_min(
    X, Y, *, axis=1, metric="euclidean", metric_kwargs=None
):
    """Compute minimum distances between one point and a set of points.

    For each row in X, find the index of the row of Y which is closest
    (according to the specified distance), and return that index together
    with the corresponding minimal distance.

    This is mostly equivalent to calling::

        (pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis),
         pairwise_distances(X, Y=Y, metric=metric).min(axis=axis))

    but uses much less memory, and is faster for large arrays.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
        Array containing points.

    Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)
        Array containing points.

    axis : int, default=1
        Axis along which the argmin and distances are to be computed.
        With ``axis=0`` the roles of `X` and `Y` are swapped.

    metric : str or callable, default='euclidean'
        Metric to use for distance computation. Any metric from scikit-learn
        or scipy.spatial.distance can be used.

        If metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays as input and return one value indicating the
        distance between them. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string.

        Distance matrices are not supported.

        Valid values for metric are:

        - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
          'manhattan', 'nan_euclidean']

        - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
          'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
          'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
          'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
          'yule']

        See the documentation for scipy.spatial.distance for details on these
        metrics.

        .. note::
           `'kulsinski'` is deprecated from SciPy 1.9 and will be removed in SciPy 1.11.

        .. note::
           `'matching'` has been removed in SciPy 1.9 (use `'hamming'` instead).

    metric_kwargs : dict, default=None
        Keyword arguments to pass to specified metric function.

    Returns
    -------
    argmin : ndarray
        Y[argmin[i], :] is the row in Y that is closest to X[i, :].

    distances : ndarray
        The array of minimum distances. `distances[i]` is the distance between
        the i-th row in X and the argmin[i]-th row in Y.

    See Also
    --------
    pairwise_distances : Distances between every pair of samples of X and Y.
    pairwise_distances_argmin : Same as `pairwise_distances_argmin_min` but only
        returns the argmins.

    Examples
    --------
    >>> from sklearn.metrics.pairwise import pairwise_distances_argmin_min
    >>> X = [[0, 0, 0], [1, 1, 1]]
    >>> Y = [[1, 0, 0], [1, 1, 0]]
    >>> argmin, distances = pairwise_distances_argmin_min(X, Y)
    >>> argmin
    array([0, 1])
    >>> distances
    array([1., 1.])
    """
    # NaN is only let through validation for the NaN-aware metric.
    ensure_all_finite = True
    if metric == "nan_euclidean":
        ensure_all_finite = "allow-nan"
    X, Y = check_pairwise_arrays(X, Y, ensure_all_finite=ensure_all_finite)

    if axis == 0:
        X, Y = Y, X

    metric_kwargs = {} if metric_kwargs is None else metric_kwargs

    if not ArgKmin.is_usable_for(X, Y, metric):
        # Generic chunked fallback. The inputs went through
        # check_pairwise_arrays above, hence assume_finite=True while the
        # chunks are generated and reduced.
        with config_context(assume_finite=True):
            chunk_results = pairwise_distances_chunked(
                X, Y, reduce_func=_argmin_min_reduce, metric=metric, **metric_kwargs
            )
            index_chunks, value_chunks = zip(*chunk_results)
        return np.concatenate(index_chunks), np.concatenate(value_chunks)

    # ArgKmin backend: "euclidean" with squared=True is requested as
    # "sqeuclidean" without extra keyword arguments.
    if metric_kwargs.get("squared", False) and metric == "euclidean":
        metric, metric_kwargs = "sqeuclidean", {}

    values, indices = ArgKmin.compute(
        X=X,
        Y=Y,
        k=1,
        metric=metric,
        metric_kwargs=metric_kwargs,
        strategy="auto",
        return_distance=True,
    )
    values = values.flatten()
    indices = indices.flatten()
    return indices, values
|
|
|
|
|
@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "Y": ["array-like", "sparse matrix"],
        "axis": [Options(Integral, {0, 1})],
        "metric": [
            StrOptions(set(_VALID_METRICS).union(ArgKmin.valid_metrics())),
            callable,
        ],
        "metric_kwargs": [dict, None],
    },
    prefer_skip_nested_validation=False,
)
def pairwise_distances_argmin(X, Y, *, axis=1, metric="euclidean", metric_kwargs=None):
    """Find, for each point, the index of the closest point in another set.

    This function computes for each row in X, the index of the row of Y which
    is closest (according to the specified distance). Only the indices are
    returned, not the distances.

    This is mostly equivalent to calling::

        pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis)

    but uses much less memory, and is faster for large arrays.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
        Array containing points.

    Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)
        Array containing points.

    axis : int, default=1
        Axis along which the argmin is to be computed. With ``axis=0`` the
        roles of `X` and `Y` are swapped.

    metric : str or callable, default="euclidean"
        Metric to use for distance computation. Any metric from scikit-learn
        or scipy.spatial.distance can be used.

        If metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays as input and return one value indicating the
        distance between them. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string.

        Distance matrices are not supported.

        Valid values for metric are:

        - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
          'manhattan', 'nan_euclidean']

        - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
          'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
          'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
          'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
          'yule']

        See the documentation for scipy.spatial.distance for details on these
        metrics.

        .. note::
           `'kulsinski'` is deprecated from SciPy 1.9 and will be removed in SciPy 1.11.

        .. note::
           `'matching'` has been removed in SciPy 1.9 (use `'hamming'` instead).

    metric_kwargs : dict, default=None
        Keyword arguments to pass to specified metric function.

    Returns
    -------
    argmin : numpy.ndarray
        Y[argmin[i], :] is the row in Y that is closest to X[i, :].

    See Also
    --------
    pairwise_distances : Distances between every pair of samples of X and Y.
    pairwise_distances_argmin_min : Same as `pairwise_distances_argmin` but also
        returns the distances.

    Examples
    --------
    >>> from sklearn.metrics.pairwise import pairwise_distances_argmin
    >>> X = [[0, 0, 0], [1, 1, 1]]
    >>> Y = [[1, 0, 0], [1, 1, 0]]
    >>> pairwise_distances_argmin(X, Y)
    array([0, 1])
    """
    # NaN is only let through validation for the NaN-aware metric.
    ensure_all_finite = "allow-nan" if metric == "nan_euclidean" else True
    X, Y = check_pairwise_arrays(X, Y, ensure_all_finite=ensure_all_finite)

    if axis == 0:
        X, Y = Y, X

    if metric_kwargs is None:
        metric_kwargs = {}

    if ArgKmin.is_usable_for(X, Y, metric):
        # ArgKmin backend: "euclidean" with squared=True is requested as
        # "sqeuclidean" without extra keyword arguments.
        if metric_kwargs.get("squared", False) and metric == "euclidean":
            metric = "sqeuclidean"
            metric_kwargs = {}

        indices = ArgKmin.compute(
            X=X,
            Y=Y,
            k=1,
            metric=metric,
            metric_kwargs=metric_kwargs,
            strategy="auto",
            return_distance=False,
        )
        indices = indices.flatten()
    else:
        # Generic chunked fallback. The inputs went through
        # check_pairwise_arrays above, hence assume_finite=True while the
        # chunks are generated and reduced.
        with config_context(assume_finite=True):
            indices = np.concatenate(
                list(
                    # The chunked helper yields one array per chunk; they are
                    # collected and joined into a single flat array.
                    pairwise_distances_chunked(
                        X, Y, reduce_func=_argmin_reduce, metric=metric, **metric_kwargs
                    )
                )
            )

    return indices
|
|
|
|
|
@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "Y": ["array-like", "sparse matrix", None],
    },
    prefer_skip_nested_validation=True,
)
def haversine_distances(X, Y=None):
    """Compute the Haversine distance between samples in X and Y.

    The Haversine (or great circle) distance is the angular distance between
    two points on the surface of a sphere. Each point is given as
    (latitude, longitude), both in radians, so the data must have exactly
    2 features.

    .. math::
       D(x, y) = 2\\arcsin[\\sqrt{\\sin^2((x_{lat} - y_{lat}) / 2)
                                + \\cos(x_{lat})\\cos(y_{lat})
                                \\sin^2((x_{lon} - y_{lon}) / 2)}]

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples_X, 2)
        A feature array.

    Y : {array-like, sparse matrix} of shape (n_samples_Y, 2), default=None
        An optional second feature array. If `None`, uses `Y=X`.

    Returns
    -------
    distances : ndarray of shape (n_samples_X, n_samples_Y)
        The distance matrix.

    Notes
    -----
    As the Earth is nearly spherical, the haversine formula provides a good
    approximation of the distance between two points of the Earth surface, with
    a less than 1% error on average.

    Examples
    --------
    We want to calculate the distance between the Ezeiza Airport
    (Buenos Aires, Argentina) and the Charles de Gaulle Airport (Paris,
    France).

    >>> from sklearn.metrics.pairwise import haversine_distances
    >>> from math import radians
    >>> bsas = [-34.83333, -58.5166646]
    >>> paris = [49.0083899664, 2.53844117956]
    >>> bsas_in_radians = [radians(_) for _ in bsas]
    >>> paris_in_radians = [radians(_) for _ in paris]
    >>> result = haversine_distances([bsas_in_radians, paris_in_radians])
    >>> result * 6371000/1000  # multiply by Earth radius to get kilometers
    array([[    0.        , 11099.54035582],
           [11099.54035582,     0.        ]])
    """
    # Imported inside the function body, as in the original implementation.
    from ..metrics import DistanceMetric

    haversine = DistanceMetric.get_metric("haversine")
    return haversine.pairwise(X, Y)
|
|
|
|
|
@validate_params(
    {"X": ["array-like", "sparse matrix"], "Y": ["array-like", "sparse matrix", None]},
    prefer_skip_nested_validation=True,
)
def manhattan_distances(X, Y=None):
    """Compute the L1 distances between the vectors in X and Y.

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
        Samples stored row-wise, one feature per column.

    Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None
        Samples stored row-wise, one feature per column.
        If `None`, method uses `Y=X`.

    Returns
    -------
    distances : ndarray of shape (n_samples_X, n_samples_Y)
        Pairwise L1 distances.

    Notes
    -----
    When X and/or Y are CSR sparse matrices and they are not already
    in canonical format, this function modifies them in-place to
    make them canonical.

    Examples
    --------
    >>> from sklearn.metrics.pairwise import manhattan_distances
    >>> manhattan_distances([[3]], [[3]])
    array([[0.]])
    >>> manhattan_distances([[3]], [[2]])
    array([[1.]])
    >>> manhattan_distances([[2]], [[3]])
    array([[1.]])
    >>> manhattan_distances([[1, 2], [3, 4]], [[1, 2], [0, 3]])
    array([[0., 2.],
           [4., 4.]])
    """
    X, Y = check_pairwise_arrays(X, Y)

    # Fully dense input: delegate to scipy.
    if not (issparse(X) or issparse(Y)):
        return distance.cdist(X, Y, "cityblock")

    # At least one operand is sparse: both are handled as CSR.
    X_csr = csr_matrix(X, copy=False)
    Y_csr = csr_matrix(Y, copy=False)
    # `copy=False` means this canonicalisation can touch the caller's data
    # (see the Notes section of the docstring).
    X_csr.sum_duplicates()
    Y_csr.sum_duplicates()

    distances = np.zeros((X_csr.shape[0], Y_csr.shape[0]))
    # The Cython helper writes its result into `distances`.
    _sparse_manhattan(
        X_csr.data,
        X_csr.indices,
        X_csr.indptr,
        Y_csr.data,
        Y_csr.indices,
        Y_csr.indptr,
        distances,
    )
    return distances
|
|
|
|
|
@validate_params(
    {"X": ["array-like", "sparse matrix"], "Y": ["array-like", "sparse matrix", None]},
    prefer_skip_nested_validation=True,
)
def cosine_distances(X, Y=None):
    """Compute cosine distance between samples in X and Y.

    The cosine distance is 1.0 minus the cosine similarity.

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
        Matrix `X`.

    Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), \
            default=None
        Matrix `Y`.

    Returns
    -------
    distances : ndarray of shape (n_samples_X, n_samples_Y)
        Returns the cosine distance between samples in X and Y.

    See Also
    --------
    cosine_similarity : Compute cosine similarity between samples in X and Y.
    scipy.spatial.distance.cosine : Dense matrices only.

    Examples
    --------
    >>> from sklearn.metrics.pairwise import cosine_distances
    >>> X = [[0, 0, 0], [1, 1, 1]]
    >>> Y = [[1, 0, 0], [1, 1, 0]]
    >>> cosine_distances(X, Y)
    array([[1.     , 1.     ],
           [0.42..., 0.18...]])
    """
    xp, _ = get_namespace(X, Y)

    # Turn the similarity into `1 - similarity` with in-place updates, so no
    # second full-size array is allocated.
    dist = cosine_similarity(X, Y)
    dist *= -1
    dist += 1

    # Clip to [0, 2]; the bounds are created with the dtype and device of
    # the result array.
    target_device = device(dist)
    lower = xp.asarray(0.0, device=target_device, dtype=dist.dtype)
    upper = xp.asarray(2.0, device=target_device, dtype=dist.dtype)
    dist = xp.clip(dist, lower, upper)

    if Y is None or X is Y:
        # Distances of samples to themselves are forced to exactly 0.0
        # (they may otherwise be slightly off, e.g. because of rounding).
        _fill_or_add_to_diagonal(dist, 0.0, xp, add_value=False)
    return dist
|
|
|
|
|
|
|
@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "Y": ["array-like", "sparse matrix"],
    },
    prefer_skip_nested_validation=True,
)
def paired_euclidean_distances(X, Y):
    """Compute the paired euclidean distances between X and Y.

    Row `i` of `X` is compared with row `i` of `Y` only.

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Input array/matrix X.

    Y : {array-like, sparse matrix} of shape (n_samples, n_features)
        Input array/matrix Y.

    Returns
    -------
    distances : ndarray of shape (n_samples,)
        Output array/matrix containing the calculated paired euclidean
        distances.

    Examples
    --------
    >>> from sklearn.metrics.pairwise import paired_euclidean_distances
    >>> X = [[0, 0, 0], [1, 1, 1]]
    >>> Y = [[1, 0, 0], [1, 1, 0]]
    >>> paired_euclidean_distances(X, Y)
    array([1., 1.])
    """
    X, Y = check_paired_arrays(X, Y)
    # The distance of each pair is the norm of the row-wise difference.
    row_differences = X - Y
    return row_norms(row_differences)
|
|
|
|
|
@validate_params(
    {"X": ["array-like", "sparse matrix"], "Y": ["array-like", "sparse matrix"]},
    prefer_skip_nested_validation=True,
)
def paired_manhattan_distances(X, Y):
    """Compute the paired L1 distances between X and Y.

    Distances are calculated between (X[0], Y[0]), (X[1], Y[1]), ...,
    (X[n_samples - 1], Y[n_samples - 1]).

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        An array-like where each row is a sample and each column is a feature.

    Y : {array-like, sparse matrix} of shape (n_samples, n_features)
        An array-like where each row is a sample and each column is a feature.

    Returns
    -------
    distances : ndarray of shape (n_samples,)
        L1 paired distances between the row vectors of `X`
        and the row vectors of `Y`. The result is always 1-dimensional,
        including when `n_samples == 1`.

    Examples
    --------
    >>> from sklearn.metrics.pairwise import paired_manhattan_distances
    >>> import numpy as np
    >>> X = np.array([[1, 1, 0], [0, 1, 0], [0, 0, 1]])
    >>> Y = np.array([[0, 1, 0], [0, 0, 1], [0, 0, 0]])
    >>> paired_manhattan_distances(X, Y)
    array([1., 2., 1.])
    """
    X, Y = check_paired_arrays(X, Y)
    diff = X - Y
    if issparse(diff):
        # Only the stored entries can be non-zero, so taking their absolute
        # value is enough.
        diff.data = np.abs(diff.data)
        # Summing a sparse matrix over axis 1 gives an (n_samples, 1) result.
        # Flatten it with `ravel` instead of `np.squeeze`: squeezing would
        # also remove the sample axis when n_samples == 1 and return a 0-d
        # array, unlike the dense branch below.
        return np.asarray(diff.sum(axis=1)).ravel()
    return np.abs(diff).sum(axis=-1)
|
|
|
|
|
@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "Y": ["array-like", "sparse matrix"],
    },
    prefer_skip_nested_validation=True,
)
def paired_cosine_distances(X, Y):
    """
    Compute the paired cosine distances between X and Y.

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        An array where each row is a sample and each column is a feature.

    Y : {array-like, sparse matrix} of shape (n_samples, n_features)
        An array where each row is a sample and each column is a feature.

    Returns
    -------
    distances : ndarray of shape (n_samples,)
        Returns the distances between the row vectors of `X`
        and the row vectors of `Y`, where `distances[i]` is the
        distance between `X[i]` and `Y[i]`.

    Notes
    -----
    The cosine distance is equivalent to half the squared
    euclidean distance if each sample is normalized to unit norm.

    Examples
    --------
    >>> from sklearn.metrics.pairwise import paired_cosine_distances
    >>> X = [[0, 0, 0], [1, 1, 1]]
    >>> Y = [[1, 0, 0], [1, 1, 0]]
    >>> paired_cosine_distances(X, Y)
    array([0.5       , 0.18...])
    """
    X, Y = check_paired_arrays(X, Y)
    # Uses the identity from the Notes section: normalise both operands,
    # then take half the squared euclidean norm of their difference.
    X_unit = normalize(X)
    Y_unit = normalize(Y)
    return 0.5 * row_norms(X_unit - Y_unit, squared=True)
|
|
|
|
|
# Registry of the metric names accepted by `paired_distances`, mapped to the
# function implementing each of them. Several names are aliases of the same
# implementation: "euclidean"/"l2" and "l1"/"manhattan"/"cityblock".
PAIRED_DISTANCES = {
    "cosine": paired_cosine_distances,
    "euclidean": paired_euclidean_distances,
    "l2": paired_euclidean_distances,
    "l1": paired_manhattan_distances,
    "manhattan": paired_manhattan_distances,
    "cityblock": paired_manhattan_distances,
}
|
|
|
|
|
@validate_params(
    {
        "X": ["array-like"],
        "Y": ["array-like"],
        "metric": [StrOptions(set(PAIRED_DISTANCES)), callable],
    },
    prefer_skip_nested_validation=True,
)
def paired_distances(X, Y, *, metric="euclidean", **kwds):
    """
    Compute the paired distances between X and Y.

    Compute the distances between (X[0], Y[0]), (X[1], Y[1]), etc...

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Array 1 for distance computation.

    Y : ndarray of shape (n_samples, n_features)
        Array 2 for distance computation.

    metric : str or callable, default="euclidean"
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        specified in PAIRED_DISTANCES, including "euclidean",
        "manhattan", or "cosine".
        Alternatively, if metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        receives `X[i]` and `Y[i]` and should return a value indicating
        the distance between them.

    **kwds : dict
        Unused parameters.

    Returns
    -------
    distances : ndarray of shape (n_samples,)
        Returns the distances between the row vectors of `X`
        and the row vectors of `Y`.

    See Also
    --------
    sklearn.metrics.pairwise_distances : Computes the distance between every pair of
        samples.

    Examples
    --------
    >>> from sklearn.metrics.pairwise import paired_distances
    >>> X = [[0, 1], [1, 1]]
    >>> Y = [[0, 1], [2, 1]]
    >>> paired_distances(X, Y)
    array([0., 1.])
    """
    # Named metric: dispatch to the registered implementation.
    if metric in PAIRED_DISTANCES:
        return PAIRED_DISTANCES[metric](X, Y)

    if callable(metric):
        # A user callable does not validate the inputs itself, so do it here.
        X, Y = check_paired_arrays(X, Y)
        n_pairs = len(X)
        result = np.zeros(n_pairs)
        for row in range(n_pairs):
            result[row] = metric(X[row], Y[row])
        return result
|
|
|
|
|
|
|
@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "Y": ["array-like", "sparse matrix", None],
        "dense_output": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def linear_kernel(X, Y=None, dense_output=True):
    """
    Compute the linear kernel between X and Y.

    Read more in the :ref:`User Guide <linear_kernel>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
        A feature array.

    Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None
        An optional second feature array. If `None`, uses `Y=X`.

    dense_output : bool, default=True
        Whether to return dense output even when the input is sparse. If
        ``False``, the output is sparse if both input arrays are sparse.

        .. versionadded:: 0.20

    Returns
    -------
    kernel : ndarray of shape (n_samples_X, n_samples_Y)
        The Gram matrix of the linear kernel, i.e. `X @ Y.T`.

    Examples
    --------
    >>> from sklearn.metrics.pairwise import linear_kernel
    >>> X = [[0, 0, 0], [1, 1, 1]]
    >>> Y = [[1, 0, 0], [1, 1, 0]]
    >>> linear_kernel(X, Y)
    array([[0., 0.],
           [1., 2.]])
    """
    X, Y = check_pairwise_arrays(X, Y)
    # The kernel is the plain matrix of inner products between rows.
    gram = safe_sparse_dot(X, Y.T, dense_output=dense_output)
    return gram
|
|
|
|
|
@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "Y": ["array-like", "sparse matrix", None],
        "degree": [Interval(Real, 1, None, closed="left")],
        "gamma": [
            Interval(Real, 0, None, closed="left"),
            None,
            Hidden(np.ndarray),
        ],
        "coef0": [Interval(Real, None, None, closed="neither")],
    },
    prefer_skip_nested_validation=True,
)
def polynomial_kernel(X, Y=None, degree=3, gamma=None, coef0=1):
    """
    Compute the polynomial kernel between X and Y.

    .. code-block:: text

        K(X, Y) = (gamma <X, Y> + coef0) ^ degree

    Read more in the :ref:`User Guide <polynomial_kernel>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
        A feature array.

    Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None
        An optional second feature array. If `None`, uses `Y=X`.

    degree : float, default=3
        Kernel degree.

    gamma : float, default=None
        Coefficient of the vector inner product. If None, defaults to 1.0 / n_features.

    coef0 : float, default=1
        Constant offset added to scaled inner product.

    Returns
    -------
    kernel : ndarray of shape (n_samples_X, n_samples_Y)
        The polynomial kernel.

    Examples
    --------
    >>> from sklearn.metrics.pairwise import polynomial_kernel
    >>> X = [[0, 0, 0], [1, 1, 1]]
    >>> Y = [[1, 0, 0], [1, 1, 0]]
    >>> polynomial_kernel(X, Y, degree=2)
    array([[1.     , 1.     ],
           [1.77..., 2.77...]])
    """
    X, Y = check_pairwise_arrays(X, Y)
    gamma = 1.0 / X.shape[1] if gamma is None else gamma

    # Build the kernel with in-place updates of the dense inner-product matrix.
    kernel = safe_sparse_dot(X, Y.T, dense_output=True)
    kernel *= gamma
    kernel += coef0
    kernel **= degree
    return kernel
|
|
|
|
|
@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "Y": ["array-like", "sparse matrix", None],
        "gamma": [
            Interval(Real, 0, None, closed="left"),
            None,
            Hidden(np.ndarray),
        ],
        "coef0": [Interval(Real, None, None, closed="neither")],
    },
    prefer_skip_nested_validation=True,
)
def sigmoid_kernel(X, Y=None, gamma=None, coef0=1):
    """Compute the sigmoid kernel between X and Y.

    .. code-block:: text

        K(X, Y) = tanh(gamma <X, Y> + coef0)

    Read more in the :ref:`User Guide <sigmoid_kernel>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
        A feature array.

    Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None
        An optional second feature array. If `None`, uses `Y=X`.

    gamma : float, default=None
        Coefficient of the vector inner product. If None, defaults to 1.0 / n_features.

    coef0 : float, default=1
        Constant offset added to scaled inner product.

    Returns
    -------
    kernel : ndarray of shape (n_samples_X, n_samples_Y)
        Sigmoid kernel between two arrays.

    Examples
    --------
    >>> from sklearn.metrics.pairwise import sigmoid_kernel
    >>> X = [[0, 0, 0], [1, 1, 1]]
    >>> Y = [[1, 0, 0], [1, 1, 0]]
    >>> sigmoid_kernel(X, Y)
    array([[0.76..., 0.76...],
           [0.87..., 0.93...]])
    """
    xp, _ = get_namespace(X, Y)
    X, Y = check_pairwise_arrays(X, Y)
    gamma = 1.0 / X.shape[1] if gamma is None else gamma

    pre_activation = safe_sparse_dot(X, Y.T, dense_output=True)
    pre_activation *= gamma
    pre_activation += coef0

    # tanh is applied through the array-API helper, which is handed the same
    # array as both input and `out`.
    return _modify_in_place_if_numpy(
        xp, xp.tanh, pre_activation, out=pre_activation
    )
|
|
|
|
|
@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "Y": ["array-like", "sparse matrix", None],
        "gamma": [
            Interval(Real, 0, None, closed="left"),
            None,
            Hidden(np.ndarray),
        ],
    },
    prefer_skip_nested_validation=True,
)
def rbf_kernel(X, Y=None, gamma=None):
    """Compute the rbf (gaussian) kernel between X and Y.

    .. code-block:: text

        K(x, y) = exp(-gamma ||x-y||^2)

    for each pair of rows x in X and y in Y.

    Read more in the :ref:`User Guide <rbf_kernel>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
        A feature array.

    Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None
        An optional second feature array. If `None`, uses `Y=X`.

    gamma : float, default=None
        If None, defaults to 1.0 / n_features.

    Returns
    -------
    kernel : ndarray of shape (n_samples_X, n_samples_Y)
        The RBF kernel.

    Examples
    --------
    >>> from sklearn.metrics.pairwise import rbf_kernel
    >>> X = [[0, 0, 0], [1, 1, 1]]
    >>> Y = [[1, 0, 0], [1, 1, 0]]
    >>> rbf_kernel(X, Y)
    array([[0.71..., 0.51...],
           [0.51..., 0.71...]])
    """
    xp, _ = get_namespace(X, Y)
    X, Y = check_pairwise_arrays(X, Y)
    gamma = 1.0 / X.shape[1] if gamma is None else gamma

    # Exponent of the kernel: -gamma * squared euclidean distance.
    exponent = euclidean_distances(X, Y, squared=True)
    exponent *= -gamma

    # exp is applied through the array-API helper, which is handed the same
    # array as both input and `out`.
    return _modify_in_place_if_numpy(xp, xp.exp, exponent, out=exponent)
|
|
|
|
|
@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "Y": ["array-like", "sparse matrix", None],
        "gamma": [
            Interval(Real, 0, None, closed="neither"),
            Hidden(np.ndarray),
            None,
        ],
    },
    prefer_skip_nested_validation=True,
)
def laplacian_kernel(X, Y=None, gamma=None):
    """Compute the laplacian kernel between X and Y.

    The laplacian kernel is defined as:

    .. code-block:: text

        K(x, y) = exp(-gamma ||x-y||_1)

    for each pair of rows x in X and y in Y.
    Read more in the :ref:`User Guide <laplacian_kernel>`.

    .. versionadded:: 0.17

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
        A feature array.

    Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None
        An optional second feature array. If `None`, uses `Y=X`.

    gamma : float, default=None
        If None, defaults to 1.0 / n_features. Otherwise it should be strictly positive.

    Returns
    -------
    kernel : ndarray of shape (n_samples_X, n_samples_Y)
        The kernel matrix.

    Examples
    --------
    >>> from sklearn.metrics.pairwise import laplacian_kernel
    >>> X = [[0, 0, 0], [1, 1, 1]]
    >>> Y = [[1, 0, 0], [1, 1, 0]]
    >>> laplacian_kernel(X, Y)
    array([[0.71..., 0.51...],
           [0.51..., 0.71...]])
    """
    X, Y = check_pairwise_arrays(X, Y)
    gamma = 1.0 / X.shape[1] if gamma is None else gamma

    # Exponent of the kernel: -gamma * L1 distance.
    exponent = -gamma * manhattan_distances(X, Y)
    # Exponentiate in place; `np.exp` returns the `out` array itself.
    return np.exp(exponent, out=exponent)
|
|
|
|
|
@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "Y": ["array-like", "sparse matrix", None],
        "dense_output": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def cosine_similarity(X, Y=None, dense_output=True):
    """Compute cosine similarity between samples in X and Y.

    Cosine similarity, or the cosine kernel, computes similarity as the
    normalized dot product of X and Y:

    .. code-block:: text

        K(X, Y) = <X, Y> / (||X||*||Y||)

    On L2-normalized data, this function is equivalent to linear_kernel.

    Read more in the :ref:`User Guide <cosine_similarity>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
        Input data.

    Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), \
            default=None
        Input data. If ``None``, the output will be the pairwise
        similarities between all samples in ``X``.

    dense_output : bool, default=True
        Whether to return dense output even when the input is sparse. If
        ``False``, the output is sparse if both input arrays are sparse.

        .. versionadded:: 0.17
           parameter ``dense_output`` for dense output.

    Returns
    -------
    similarities : ndarray or sparse matrix of shape (n_samples_X, n_samples_Y)
        Returns the cosine similarity between samples in X and Y.

    Examples
    --------
    >>> from sklearn.metrics.pairwise import cosine_similarity
    >>> X = [[0, 0, 0], [1, 1, 1]]
    >>> Y = [[1, 0, 0], [1, 1, 0]]
    >>> cosine_similarity(X, Y)
    array([[0.     , 0.     ],
           [0.57..., 0.81...]])
    """
    X, Y = check_pairwise_arrays(X, Y)

    # Normalise copies so the caller's arrays are left untouched; when both
    # operands are the same object the normalised copy is shared.
    X_normalized = normalize(X, copy=True)
    Y_normalized = X_normalized if X is Y else normalize(Y, copy=True)

    return safe_sparse_dot(X_normalized, Y_normalized.T, dense_output=dense_output)
|
|
|
|
|
@validate_params(
    {
        "X": ["array-like"],
        "Y": ["array-like", None],
    },
    prefer_skip_nested_validation=True,
)
def additive_chi2_kernel(X, Y=None):
    """Compute the additive chi-squared kernel between observations in X and Y.

    The chi-squared kernel is computed between each pair of rows in X and Y. X
    and Y have to be non-negative. This kernel is most commonly applied to
    histograms.

    The chi-squared kernel is given by:

    .. code-block:: text

        k(x, y) = -Sum [(x - y)^2 / (x + y)]

    It can be interpreted as a weighted difference per entry.

    Read more in the :ref:`User Guide <chi2_kernel>`.

    Parameters
    ----------
    X : array-like of shape (n_samples_X, n_features)
        A feature array.

    Y : array-like of shape (n_samples_Y, n_features), default=None
        An optional second feature array. If `None`, uses `Y=X`.

    Returns
    -------
    kernel : array-like of shape (n_samples_X, n_samples_Y)
        The kernel matrix.

    See Also
    --------
    chi2_kernel : The exponentiated version of the kernel, which is usually
        preferable.
    sklearn.kernel_approximation.AdditiveChi2Sampler : A Fourier approximation
        to this kernel.

    Notes
    -----
    As the negative of a distance, this kernel is only conditionally positive
    definite.

    References
    ----------
    * Zhang, J. and Marszalek, M. and Lazebnik, S. and Schmid, C.
      Local features and kernels for classification of texture and object
      categories: A comprehensive study
      International Journal of Computer Vision 2007
      https://hal.archives-ouvertes.fr/hal-00171412/document

    Examples
    --------
    >>> from sklearn.metrics.pairwise import additive_chi2_kernel
    >>> X = [[0, 0, 0], [1, 1, 1]]
    >>> Y = [[1, 0, 0], [1, 1, 0]]
    >>> additive_chi2_kernel(X, Y)
    array([[-1., -2.],
           [-2., -1.]])
    """
    xp, _ = get_namespace(X, Y)
    X, Y = check_pairwise_arrays(X, Y, accept_sparse=False)
    if xp.any(X < 0):
        raise ValueError("X contains negative values.")
    # When Y is the same object as X it has already been checked above.
    if Y is not X and xp.any(Y < 0):
        raise ValueError("Y contains negative values.")

    if not _is_numpy_namespace(xp):
        # Generic array-API path: broadcast every row of X against every row
        # of Y and reduce over the feature axis.
        float_dtype = _find_matching_floating_dtype(X, Y, xp=xp)
        x_rows = X[:, None, :]
        y_rows = Y[None, :, :]
        neg_sq_diff = -((x_rows - y_rows) ** 2)
        pair_sum = x_rows + y_rows
        # Entries where x + y == 0 would divide by zero; they are replaced by
        # 0 / 1 so that they contribute nothing to the sum.
        zero_sum = pair_sum == 0
        zero = xp.asarray(0, dtype=float_dtype)
        numerator = xp.where(zero_sum, zero, neg_sq_diff)
        one = xp.asarray(1, dtype=float_dtype)
        denominator = xp.where(zero_sum, one, pair_sum)
        return xp.sum(numerator / denominator, axis=2)

    # NumPy path: the Cython routine fills the preallocated output array.
    kernel = np.zeros((X.shape[0], Y.shape[0]), dtype=X.dtype)
    _chi2_kernel_fast(X, Y, kernel)
    return kernel
|
|
|
|
|
@validate_params(
    {
        "X": ["array-like"],
        "Y": ["array-like", None],
        "gamma": [
            Interval(Real, 0, None, closed="neither"),
            Hidden(np.ndarray),
        ],
    },
    prefer_skip_nested_validation=True,
)
def chi2_kernel(X, Y=None, gamma=1.0):
    """Compute the exponential chi-squared kernel between X and Y.

    The chi-squared kernel is computed between each pair of rows in X and Y. X
    and Y have to be non-negative. This kernel is most commonly applied to
    histograms.

    The chi-squared kernel is given by:

    .. code-block:: text

        k(x, y) = exp(-gamma Sum [(x - y)^2 / (x + y)])

    It can be interpreted as a weighted difference per entry.

    Read more in the :ref:`User Guide <chi2_kernel>`.

    Parameters
    ----------
    X : array-like of shape (n_samples_X, n_features)
        A feature array.

    Y : array-like of shape (n_samples_Y, n_features), default=None
        An optional second feature array. If `None`, uses `Y=X`.

    gamma : float, default=1
        Scaling parameter of the chi2 kernel.

    Returns
    -------
    kernel : ndarray of shape (n_samples_X, n_samples_Y)
        The kernel matrix.

    See Also
    --------
    additive_chi2_kernel : The additive version of this kernel.
    sklearn.kernel_approximation.AdditiveChi2Sampler : A Fourier approximation
        to the additive version of this kernel.

    References
    ----------
    * Zhang, J. and Marszalek, M. and Lazebnik, S. and Schmid, C.
      Local features and kernels for classification of texture and object
      categories: A comprehensive study
      International Journal of Computer Vision 2007
      https://hal.archives-ouvertes.fr/hal-00171412/document

    Examples
    --------
    >>> from sklearn.metrics.pairwise import chi2_kernel
    >>> X = [[0, 0, 0], [1, 1, 1]]
    >>> Y = [[1, 0, 0], [1, 1, 0]]
    >>> chi2_kernel(X, Y)
    array([[0.36..., 0.13...],
           [0.13..., 0.36...]])
    """
    xp, _ = get_namespace(X, Y)

    # The additive kernel already carries the minus sign of the exponent.
    exponent = additive_chi2_kernel(X, Y)
    exponent *= gamma

    if not _is_numpy_namespace(xp):
        return xp.exp(exponent)
    # NumPy arrays are exponentiated in place.
    return np.exp(exponent, out=exponent)
|
|
|
|
|
|
|
# Registry of distance metric names mapped to the function computing them.
# `distance_metrics()` returns this very dictionary and its docstring holds a
# table of the same entries, so keep that table in sync when editing it.
# "precomputed" has no function attached (None) and is not listed in that
# table.
PAIRWISE_DISTANCE_FUNCTIONS = {
    "cityblock": manhattan_distances,
    "cosine": cosine_distances,
    "euclidean": euclidean_distances,
    "haversine": haversine_distances,
    "l2": euclidean_distances,
    "l1": manhattan_distances,
    "manhattan": manhattan_distances,
    "precomputed": None,
    "nan_euclidean": nan_euclidean_distances,
}
|
|
|
|
|
def distance_metrics():
    """Valid metrics for pairwise_distances.

    This function does nothing more than return the mapping of valid
    pairwise distance metrics. Its purpose is to give a place where the
    mapping of each valid string can be described.

    The valid distance metrics, and the function they map to, are:

    =============== ========================================
    metric          Function
    =============== ========================================
    'cityblock'     metrics.pairwise.manhattan_distances
    'cosine'        metrics.pairwise.cosine_distances
    'euclidean'     metrics.pairwise.euclidean_distances
    'haversine'     metrics.pairwise.haversine_distances
    'l1'            metrics.pairwise.manhattan_distances
    'l2'            metrics.pairwise.euclidean_distances
    'manhattan'     metrics.pairwise.manhattan_distances
    'nan_euclidean' metrics.pairwise.nan_euclidean_distances
    =============== ========================================

    Read more in the :ref:`User Guide <metrics>`.

    Returns
    -------
    distance_metrics : dict
        Returns valid metrics for pairwise_distances.
    """
    # The module-level registry itself is returned, not a copy.
    registry = PAIRWISE_DISTANCE_FUNCTIONS
    return registry
|
|
|
|
|
def _dist_wrapper(dist_func, dist_matrix, slice_, *args, **kwargs):
    """Evaluate `dist_func` and store its result in columns of `dist_matrix`.

    `dist_func(*args, **kwargs)` is computed and assigned in place to
    `dist_matrix[:, slice_]`. Nothing is returned.
    """
    chunk = dist_func(*args, **kwargs)
    dist_matrix[:, slice_] = chunk
|
|
|
|
|
def _parallel_pairwise(X, Y, func, n_jobs, **kwds):
    """Break the pairwise matrix in n_jobs even slices
    and compute them using multithreading.

    With a single effective job, `func(X, Y, **kwds)` is called directly.
    Otherwise `Y` is cut into even slices and each worker writes the columns
    for its slice into a shared output array.
    """
    if Y is None:
        Y = X
    X, Y, dtype = _return_float_dtype(X, Y)

    n_workers = effective_n_jobs(n_jobs)
    if n_workers == 1:
        return func(X, Y, **kwds)

    # The threading backend lets every worker write into the same array.
    # The output is Fortran-ordered and each worker fills a block of columns.
    out = np.empty((X.shape[0], Y.shape[0]), dtype=dtype, order="F")
    write_block = delayed(_dist_wrapper)
    column_slices = gen_even_slices(_num_samples(Y), n_workers)
    Parallel(backend="threading", n_jobs=n_jobs)(
        write_block(func, out, cols, X, Y[cols], **kwds) for cols in column_slices
    )

    # Only the euclidean metric gets its diagonal zeroed when X is compared
    # with itself.
    if func is euclidean_distances and (Y is None or X is Y):
        np.fill_diagonal(out, 0)

    return out
|
|
|
|
|
def _pairwise_callable(X, Y, metric, ensure_all_finite=True, **kwds):
    """Handle the callable case for pairwise_{distances,kernels}.

    `metric(x, y, **kwds)` is evaluated for pairs of rows of `X` and `Y` and
    the results are collected in a float array of shape
    `(X.shape[0], Y.shape[0])`.
    """
    X, Y = check_pairwise_arrays(
        X,
        Y,
        dtype=None,
        ensure_all_finite=ensure_all_finite,
        # Input dimensions are not enforced for custom metrics.
        ensure_2d=False,
    )

    def _row(arr, idx):
        # Sparse inputs are indexed with a list so that the row keeps two
        # dimensions; dense inputs use plain integer indexing.
        return arr[[idx], :] if issparse(arr) else arr[idx]

    n_rows_x, n_rows_y = X.shape[0], Y.shape[0]

    if X is not Y:
        # Distinct inputs: every cell has to be evaluated.
        result = np.empty((n_rows_x, n_rows_y), dtype="float")
        for i, j in itertools.product(range(n_rows_x), range(n_rows_y)):
            x_i = _row(X, i)
            y_j = _row(Y, j)
            result[i, j] = metric(x_i, y_j, **kwds)
        return result

    # Same input on both sides: evaluate the strict upper triangle only.
    upper = np.zeros((n_rows_x, n_rows_y), dtype="float")
    for i, j in itertools.combinations(range(n_rows_x), 2):
        x_i = _row(X, i)
        y_j = _row(Y, j)
        upper[i, j] = metric(x_i, y_j, **kwds)

    # Mirror into the lower triangle. A new array is built on purpose:
    # adding the transposed view in place would give wrong results.
    result = upper + upper.T

    # The diagonal is computed explicitly rather than assumed to be zero.
    for i in range(n_rows_x):
        x_i = _row(X, i)
        result[i, i] = metric(x_i, x_i, **kwds)

    return result
|
|
|
|
|
def _check_chunk_size(reduced, chunk_size):
    """Checks chunk is a sequence of expected size or a tuple of same.

    `None` is accepted as is. Otherwise `reduced` (or each of its items when
    it is a tuple) must be a non-tuple iterable holding `chunk_size` samples;
    a `TypeError` or `ValueError` is raised when that is not the case.
    """
    if reduced is None:
        return

    returned_tuple = isinstance(reduced, tuple)
    items = reduced if returned_tuple else (reduced,)

    for item in items:
        if isinstance(item, tuple) or not hasattr(item, "__iter__"):
            # Report exactly what reduce_func returned.
            raise TypeError(
                "reduce_func returned %r. Expected sequence(s) of length %d."
                % (reduced, chunk_size)
            )

    sizes = tuple(_num_samples(item) for item in items)
    if any(size != chunk_size for size in sizes):
        # Show a tuple of lengths only if reduce_func returned a tuple.
        reported = sizes if returned_tuple else sizes[0]
        raise ValueError(
            "reduce_func returned object of length %s. "
            "Expected same length as input: %d." % (reported, chunk_size)
        )
|
|
|
|
|
def _precompute_metric_params(X, Y, metric=None, **kwds):
    """Precompute data-derived metric parameters if not provided.

    Returns a dict with the missing parameter: ``"V"`` for "seuclidean" or
    ``"VI"`` for "mahalanobis". An empty dict is returned for any other
    metric, or when the parameter is already in `kwds`. The parameter is
    only derived when `X` and `Y` are the same object; otherwise a
    `ValueError` asks the caller to supply it.
    """
    if metric == "seuclidean" and "V" not in kwds:
        if X is not Y:
            raise ValueError(
                "The 'V' parameter is required for the seuclidean metric "
                "when Y is passed."
            )
        # Per-feature variance computed with ddof=1.
        return {"V": np.var(X, axis=0, ddof=1)}

    if metric == "mahalanobis" and "VI" not in kwds:
        if X is not Y:
            raise ValueError(
                "The 'VI' parameter is required for the mahalanobis metric "
                "when Y is passed."
            )
        # Transposed inverse of the covariance matrix of the features.
        return {"VI": np.linalg.inv(np.cov(X.T)).T}

    return {}
|
|
|
|
|
@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "Y": ["array-like", "sparse matrix", None],
        "reduce_func": [callable, None],
        "metric": [StrOptions({"precomputed"}.union(_VALID_METRICS)), callable],
        "n_jobs": [Integral, None],
        "working_memory": [Interval(Real, 0, None, closed="left"), None],
    },
    prefer_skip_nested_validation=False,
)
def pairwise_distances_chunked(
    X,
    Y=None,
    *,
    reduce_func=None,
    metric="euclidean",
    n_jobs=None,
    working_memory=None,
    **kwds,
):
    """Generate a distance matrix chunk by chunk with optional reduction.

    In cases where not all of a pairwise distance matrix needs to be
    stored at once, this is used to calculate pairwise distances in
    ``working_memory``-sized chunks. If ``reduce_func`` is given, it is
    run on each chunk and its return value is yielded in place of the raw
    chunk.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples_X, n_samples_X) or \
            (n_samples_X, n_features)
        Array of pairwise distances between samples, or a feature array.
        The shape of the array should be (n_samples_X, n_samples_X) if
        metric='precomputed' and (n_samples_X, n_features) otherwise.

    Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None
        An optional second feature array. Only allowed if
        metric != "precomputed".

    reduce_func : callable, default=None
        The function which is applied on each chunk of the distance matrix,
        reducing it to needed values. ``reduce_func(D_chunk, start)``
        is called repeatedly, where ``D_chunk`` is a contiguous vertical
        slice of the pairwise distance matrix, starting at row ``start``.
        It should return one of: None; an array, a list, or a sparse matrix
        of length ``D_chunk.shape[0]``; or a tuple of such objects.
        Returning None is useful for in-place operations, rather than
        reductions.

        If None, pairwise_distances_chunked returns a generator of vertical
        chunks of the distance matrix.

    metric : str or callable, default='euclidean'
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by scipy.spatial.distance.pdist for its metric parameter,
        or a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
        If metric is "precomputed", X is assumed to be a distance matrix.
        Alternatively, if metric is a callable function, it is called on
        each pair of instances (rows) and the resulting value recorded.
        The callable should take two arrays from X as input and return a
        value indicating the distance between them.

    n_jobs : int, default=None
        The number of jobs to use for the computation. This works by
        breaking down the pairwise matrix into n_jobs even slices and
        computing them in parallel.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    working_memory : float, default=None
        The sought maximum memory for temporary distance matrix chunks.
        When None (default), the value of
        ``sklearn.get_config()['working_memory']`` is used.

    **kwds : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Yields
    ------
    D_chunk : {ndarray, sparse matrix}
        A contiguous slice of distance matrix, optionally processed by
        ``reduce_func``.

    Examples
    --------
    Without reduce_func:

    >>> import numpy as np
    >>> from sklearn.metrics import pairwise_distances_chunked
    >>> X = np.random.RandomState(0).rand(5, 3)
    >>> D_chunk = next(pairwise_distances_chunked(X))
    >>> D_chunk
    array([[0.  ..., 0.29..., 0.41..., 0.19..., 0.57...],
           [0.29..., 0.  ..., 0.57..., 0.41..., 0.76...],
           [0.41..., 0.57..., 0.  ..., 0.44..., 0.90...],
           [0.19..., 0.41..., 0.44..., 0.  ..., 0.51...],
           [0.57..., 0.76..., 0.90..., 0.51..., 0.  ...]])

    Retrieve all neighbors and average distance within radius r:

    >>> r = .2
    >>> def reduce_func(D_chunk, start):
    ...     neigh = [np.flatnonzero(d < r) for d in D_chunk]
    ...     avg_dist = (D_chunk * (D_chunk < r)).mean(axis=1)
    ...     return neigh, avg_dist
    >>> gen = pairwise_distances_chunked(X, reduce_func=reduce_func)
    >>> neigh, avg_dist = next(gen)
    >>> neigh
    [array([0, 3]), array([1]), array([2]), array([0, 3]), array([4])]
    >>> avg_dist
    array([0.039..., 0.        , 0.        , 0.039..., 0.        ])

    Where r is defined per sample, we need to make use of ``start``:

    >>> r = [.2, .4, .4, .3, .1]
    >>> def reduce_func(D_chunk, start):
    ...     neigh = [np.flatnonzero(d < r[i])
    ...              for i, d in enumerate(D_chunk, start)]
    ...     return neigh
    >>> neigh = next(pairwise_distances_chunked(X, reduce_func=reduce_func))
    >>> neigh
    [array([0, 3]), array([0, 1]), array([2]), array([0, 3]), array([4])]

    Force row-by-row generation by reducing ``working_memory``:

    >>> gen = pairwise_distances_chunked(X, reduce_func=reduce_func,
    ...                                  working_memory=0)
    >>> next(gen)
    [array([0, 3])]
    >>> next(gen)
    [array([0, 1])]
    """
    n_samples_X = _num_samples(X)
    if metric == "precomputed":
        # A precomputed matrix is handed over as one slice spanning all rows.
        row_slices = (slice(0, n_samples_X),)
    else:
        Y = X if Y is None else Y
        # Size the chunks against ``working_memory``, budgeting 8 bytes per
        # distance and one distance per sample of Y in every output row.
        rows_per_chunk = get_chunk_n_rows(
            row_bytes=8 * _num_samples(Y),
            max_n_rows=n_samples_X,
            working_memory=working_memory,
        )
        row_slices = gen_batches(n_samples_X, rows_per_chunk)

    # Data-derived metric parameters are computed once from the full inputs,
    # not separately for every chunk.
    kwds.update(_precompute_metric_params(X, Y, metric=metric, **kwds))

    # The diagonal is reset only for self-distances computed by
    # ``euclidean_distances``; going through the registry also covers every
    # metric name that maps to that same function.
    zero_diagonal = (
        X is Y or Y is None
    ) and PAIRWISE_DISTANCE_FUNCTIONS.get(metric) is euclidean_distances

    for row_slice in row_slices:
        if row_slice.start == 0 and row_slice.stop == n_samples_X:
            # Hand over X itself (not a sliced copy) so that identity checks
            # such as ``X is Y`` keep working in ``pairwise_distances``.
            X_chunk = X
        else:
            X_chunk = X[row_slice]

        D_chunk = pairwise_distances(X_chunk, Y, metric=metric, n_jobs=n_jobs, **kwds)

        if zero_diagonal:
            # Row i of the chunk is row ``row_slice.start + i`` of the full
            # matrix, so its diagonal entry sits at that column offset.
            D_chunk.flat[row_slice.start :: n_samples_X + 1] = 0

        if reduce_func is None:
            yield D_chunk
            continue

        n_chunk_rows = D_chunk.shape[0]
        reduced = reduce_func(D_chunk, row_slice.start)
        _check_chunk_size(reduced, n_chunk_rows)
        yield reduced
|
|
|
|
|
@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "Y": ["array-like", "sparse matrix", None],
        "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable],
        "n_jobs": [Integral, None],
        "force_all_finite": [
            "boolean",
            StrOptions({"allow-nan"}),
            Hidden(StrOptions({"deprecated"})),
        ],
        "ensure_all_finite": ["boolean", StrOptions({"allow-nan"}), Hidden(None)],
    },
    prefer_skip_nested_validation=True,
)
def pairwise_distances(
    X,
    Y=None,
    metric="euclidean",
    *,
    n_jobs=None,
    force_all_finite="deprecated",
    ensure_all_finite=None,
    **kwds,
):
    """Compute the distance matrix from a vector array X and optional Y.

    This method takes either a vector array or a distance matrix, and returns
    a distance matrix.
    If the input is a vector array, the distances are computed.
    If the input is a distance matrix, it is returned instead.
    If the input is a collection of non-numeric data (e.g. a list of strings or a
    boolean array), a custom metric must be passed.

    This method provides a safe way to take a distance matrix as input, while
    preserving compatibility with many other algorithms that take a vector
    array.

    If Y is given (default is None), then the returned matrix is the pairwise
    distance between the arrays from both X and Y.

    Valid values for metric are:

    - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
      'manhattan']. These metrics support sparse matrix
      inputs.
      ['nan_euclidean'] but it does not yet support sparse matrices.

    - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
      'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',
      'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
      'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
      See the documentation for scipy.spatial.distance for details on these
      metrics. These metrics do not support sparse matrix inputs.

    .. note::
        `'kulsinski'` is deprecated from SciPy 1.9 and will be removed in SciPy 1.11.

    .. note::
        `'matching'` has been removed in SciPy 1.9 (use `'hamming'` instead).

    Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are
    valid scipy.spatial.distance metrics), the scikit-learn implementation
    will be used, which is faster and has support for sparse matrices (except
    for 'cityblock'). For a verbose description of the metrics from
    scikit-learn, see :func:`sklearn.metrics.pairwise.distance_metrics`
    function.

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples_X, n_samples_X) or \
            (n_samples_X, n_features)
        Array of pairwise distances between samples, or a feature array.
        The shape of the array should be (n_samples_X, n_samples_X) if
        metric == "precomputed" and (n_samples_X, n_features) otherwise.

    Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None
        An optional second feature array. Only allowed if
        metric != "precomputed".

    metric : str or callable, default='euclidean'
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by scipy.spatial.distance.pdist for its metric parameter, or
        a metric listed in ``pairwise.PAIRWISE_DISTANCE_FUNCTIONS``.
        If metric is "precomputed", X is assumed to be a distance matrix.
        Alternatively, if metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays from X as input and return a value indicating
        the distance between them.

    n_jobs : int, default=None
        The number of jobs to use for the computation. This works by breaking
        down the pairwise matrix into n_jobs even slices and computing them
        using multithreading.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

        The "euclidean" and "cosine" metrics rely heavily on BLAS which is already
        multithreaded. So, increasing `n_jobs` would likely cause oversubscription
        and quickly degrade performance.

    force_all_finite : bool or 'allow-nan', default=True
        Whether to raise an error on np.inf, np.nan, pd.NA in array. Ignored
        for a metric listed in ``pairwise.PAIRWISE_DISTANCE_FUNCTIONS``. The
        possibilities are:

        - True: Force all values of array to be finite.
        - False: accepts np.inf, np.nan, pd.NA in array.
        - 'allow-nan': accepts only np.nan and pd.NA values in array. Values
          cannot be infinite.

        .. versionadded:: 0.22
           ``force_all_finite`` accepts the string ``'allow-nan'``.

        .. versionchanged:: 0.23
           Accepts `pd.NA` and converts it into `np.nan`.

        .. deprecated:: 1.6
           `force_all_finite` was renamed to `ensure_all_finite` and will be removed
           in 1.8.

    ensure_all_finite : bool or 'allow-nan', default=True
        Whether to raise an error on np.inf, np.nan, pd.NA in array. Ignored
        for a metric listed in ``pairwise.PAIRWISE_DISTANCE_FUNCTIONS``. The
        possibilities are:

        - True: Force all values of array to be finite.
        - False: accepts np.inf, np.nan, pd.NA in array.
        - 'allow-nan': accepts only np.nan and pd.NA values in array. Values
          cannot be infinite.

        .. versionadded:: 1.6
           `force_all_finite` was renamed to `ensure_all_finite`.

    **kwds : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    D : ndarray of shape (n_samples_X, n_samples_X) or \
            (n_samples_X, n_samples_Y)
        A distance matrix D such that D_{i, j} is the distance between the
        ith and jth vectors of the given matrix X, if Y is None.
        If Y is not None, then D_{i, j} is the distance between the ith array
        from X and the jth array from Y.

    See Also
    --------
    pairwise_distances_chunked : Performs the same calculation as this
        function, but returns a generator of chunks of the distance matrix, in
        order to limit memory usage.
    sklearn.metrics.pairwise.paired_distances : Computes the distances between
        corresponding elements of two arrays.

    Examples
    --------
    >>> from sklearn.metrics.pairwise import pairwise_distances
    >>> X = [[0, 0, 0], [1, 1, 1]]
    >>> Y = [[1, 0, 0], [1, 1, 0]]
    >>> pairwise_distances(X, Y, metric='sqeuclidean')
    array([[1., 2.],
           [2., 1.]])
    """
    ensure_all_finite = _deprecate_force_all_finite(force_all_finite, ensure_all_finite)

    if metric == "precomputed":
        X, _ = check_pairwise_arrays(
            X, Y, precomputed=True, ensure_all_finite=ensure_all_finite
        )

        whom = (
            "`pairwise_distances`. Precomputed distance "
            " need to have non-negative values."
        )
        check_non_negative(X, whom=whom)
        return X
    elif metric in PAIRWISE_DISTANCE_FUNCTIONS:
        func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
    elif callable(metric):
        func = partial(
            _pairwise_callable,
            metric=metric,
            ensure_all_finite=ensure_all_finite,
            **kwds,
        )
    else:
        # Remaining string metrics are delegated to scipy.spatial.distance.
        if issparse(X) or issparse(Y):
            raise TypeError("scipy distance metrics do not support sparse matrices.")

        dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else "infer_float"

        if dtype is bool:
            # X and Y have not been converted yet and are only required to be
            # array-like, so they may be plain lists or other containers with
            # no ``dtype`` attribute. Inspect those through ``np.asarray``
            # instead of failing with an AttributeError.
            inputs = (X,) if Y is None else (X, Y)
            if any(
                (arr.dtype if hasattr(arr, "dtype") else np.asarray(arr).dtype) != bool
                for arr in inputs
            ):
                msg = "Data was converted to boolean for metric %s" % metric
                warnings.warn(msg, DataConversionWarning)

        X, Y = check_pairwise_arrays(
            X, Y, dtype=dtype, ensure_all_finite=ensure_all_finite
        )

        # Fill in data-derived metric parameters the caller did not provide.
        params = _precompute_metric_params(X, Y, metric=metric, **kwds)
        kwds.update(**params)

        # Sequential self-distances go through pdist + squareform rather than
        # the generic cdist path.
        if effective_n_jobs(n_jobs) == 1 and X is Y:
            return distance.squareform(distance.pdist(X, metric=metric, **kwds))
        func = partial(distance.cdist, metric=metric, **kwds)

    return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
|
|
|
|
|
|
|
# Names of scipy.spatial.distance metrics for which ``pairwise_distances``
# converts its inputs to boolean arrays.
PAIRWISE_BOOLEAN_FUNCTIONS = [
    "dice",
    "jaccard",
    "rogerstanimoto",
    "russellrao",
    "sokalsneath",
    "yule",
]
# The names below are only registered when the installed SciPy is older than
# the version used in the corresponding check.
if sp_base_version < parse_version("1.17"):
    PAIRWISE_BOOLEAN_FUNCTIONS.append("sokalmichener")
if sp_base_version < parse_version("1.11"):
    PAIRWISE_BOOLEAN_FUNCTIONS.append("kulsinski")
if sp_base_version < parse_version("1.9"):
    PAIRWISE_BOOLEAN_FUNCTIONS.append("matching")
|
|
|
|
|
# Registry mapping every kernel name accepted by ``pairwise_kernels`` to the
# function that implements it.
PAIRWISE_KERNEL_FUNCTIONS = {
    # When editing this mapping, keep ``KERNEL_PARAMS`` and the lists in the
    # ``kernel_metrics`` and ``pairwise_kernels`` docstrings in sync.
    "additive_chi2": additive_chi2_kernel,
    "chi2": chi2_kernel,
    "linear": linear_kernel,
    # "polynomial" and "poly" are two names for the same kernel function.
    "polynomial": polynomial_kernel,
    "poly": polynomial_kernel,
    "rbf": rbf_kernel,
    "laplacian": laplacian_kernel,
    "sigmoid": sigmoid_kernel,
    "cosine": cosine_similarity,
}
|
|
|
|
|
def kernel_metrics():
    """Valid metrics for pairwise_kernels.

    This function simply returns the valid pairwise kernel metrics.
    It exists, however, to allow for a verbose description of the mapping for
    each of the valid strings.

    The valid kernel metrics, and the function they map to, are:

    ===============   ================================================
    metric            Function
    ===============   ================================================
    'additive_chi2'   sklearn.metrics.pairwise.additive_chi2_kernel
    'chi2'            sklearn.metrics.pairwise.chi2_kernel
    'linear'          sklearn.metrics.pairwise.linear_kernel
    'poly'            sklearn.metrics.pairwise.polynomial_kernel
    'polynomial'      sklearn.metrics.pairwise.polynomial_kernel
    'rbf'             sklearn.metrics.pairwise.rbf_kernel
    'laplacian'       sklearn.metrics.pairwise.laplacian_kernel
    'sigmoid'         sklearn.metrics.pairwise.sigmoid_kernel
    'cosine'          sklearn.metrics.pairwise.cosine_similarity
    ===============   ================================================

    Read more in the :ref:`User Guide <metrics>`.

    Returns
    -------
    kernel_metrics : dict
        Returns valid metrics for pairwise_kernels. This is the module-level
        ``PAIRWISE_KERNEL_FUNCTIONS`` mapping itself, not a copy.
    """
    return PAIRWISE_KERNEL_FUNCTIONS
|
|
|
|
|
# Keyword arguments understood by each built-in kernel. ``pairwise_kernels``
# consults this table to drop unsupported keywords when ``filter_params`` is
# true.
KERNEL_PARAMS = {
    "additive_chi2": (),
    "chi2": frozenset({"gamma"}),
    "cosine": (),
    "linear": (),
    "poly": frozenset({"gamma", "degree", "coef0"}),
    "polynomial": frozenset({"gamma", "degree", "coef0"}),
    "rbf": frozenset({"gamma"}),
    "laplacian": frozenset({"gamma"}),
    "sigmoid": frozenset({"gamma", "coef0"}),
}
|
|
|
|
|
@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "Y": ["array-like", "sparse matrix", None],
        "metric": [
            StrOptions(set(PAIRWISE_KERNEL_FUNCTIONS) | {"precomputed"}),
            callable,
        ],
        "filter_params": ["boolean"],
        "n_jobs": [Integral, None],
    },
    prefer_skip_nested_validation=True,
)
def pairwise_kernels(
    X, Y=None, metric="linear", *, filter_params=False, n_jobs=None, **kwds
):
    """Compute the kernel between arrays X and optional array Y.

    This method takes either a vector array or a kernel matrix, and returns
    a kernel matrix. If the input is a vector array, the kernels are
    computed. If the input is a kernel matrix, it is returned instead.

    This method provides a safe way to take a kernel matrix as input, while
    preserving compatibility with many other algorithms that take a vector
    array.

    If Y is given (default is None), then the returned matrix is the pairwise
    kernel between the arrays from both X and Y.

    Valid values for metric are:
    ['additive_chi2', 'chi2', 'linear', 'poly', 'polynomial', 'rbf',
    'laplacian', 'sigmoid', 'cosine']

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples_X, n_samples_X) or \
            (n_samples_X, n_features)
        Array of pairwise kernels between samples, or a feature array.
        The shape of the array should be (n_samples_X, n_samples_X) if
        metric == "precomputed" and (n_samples_X, n_features) otherwise.

    Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None
        A second feature array only if X has shape (n_samples_X, n_features).

    metric : str or callable, default="linear"
        The metric to use when calculating kernel between instances in a
        feature array. If metric is a string, it must be one of the metrics
        in ``pairwise.PAIRWISE_KERNEL_FUNCTIONS``.
        If metric is "precomputed", X is assumed to be a kernel matrix.
        Alternatively, if metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two rows from X as input and return the corresponding
        kernel value as a single number. This means that callables from
        :mod:`sklearn.metrics.pairwise` are not allowed, as they operate on
        matrices, not single samples. Use the string identifying the kernel
        instead.

    filter_params : bool, default=False
        Whether to filter invalid parameters or not. When True and `metric`
        is one of the kernel names, only the keyword arguments listed for
        that kernel in ``pairwise.KERNEL_PARAMS`` are kept.

    n_jobs : int, default=None
        The number of jobs to use for the computation. This works by breaking
        down the pairwise matrix into n_jobs even slices and computing them
        using multithreading.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    **kwds : optional keyword parameters
        Any further parameters are passed directly to the kernel function.

    Returns
    -------
    K : ndarray of shape (n_samples_X, n_samples_X) or (n_samples_X, n_samples_Y)
        A kernel matrix K such that K_{i, j} is the kernel between the
        ith and jth vectors of the given matrix X, if Y is None.
        If Y is not None, then K_{i, j} is the kernel between the ith array
        from X and the jth array from Y.

    Raises
    ------
    ValueError
        If `metric` is neither "precomputed", a known kernel name, nor a
        callable.

    Notes
    -----
    If metric is 'precomputed', Y is ignored and X is returned.

    Examples
    --------
    >>> from sklearn.metrics.pairwise import pairwise_kernels
    >>> X = [[0, 0, 0], [1, 1, 1]]
    >>> Y = [[1, 0, 0], [1, 1, 0]]
    >>> pairwise_kernels(X, Y, metric='linear')
    array([[0., 0.],
           [1., 2.]])
    """
    # NOTE(review): imported inside the function rather than at module level,
    # presumably to avoid a circular import -- confirm before moving it.
    from ..gaussian_process.kernels import Kernel as GPKernel

    if metric == "precomputed":
        X, _ = check_pairwise_arrays(X, Y, precomputed=True)
        return X
    elif isinstance(metric, GPKernel):
        # Gaussian-process kernel objects are called directly on (X, Y)
        # instead of going through the per-pair callable wrapper.
        func = metric.__call__
    elif metric in PAIRWISE_KERNEL_FUNCTIONS:
        if filter_params:
            kwds = {k: kwds[k] for k in kwds if k in KERNEL_PARAMS[metric]}
        func = PAIRWISE_KERNEL_FUNCTIONS[metric]
    elif callable(metric):
        func = partial(_pairwise_callable, metric=metric, **kwds)
    else:
        # Defensive: the decorator's constraints should make this unreachable,
        # but if validation is bypassed an explicit error is clearer than the
        # UnboundLocalError that a missing ``func`` would otherwise trigger.
        raise ValueError("Unknown kernel %r" % metric)

    return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
|
|