{{py: | |
implementation_specific_values = [ | |
# Values are the following ones: | |
# | |
# name_suffix, INPUT_DTYPE_t, INPUT_DTYPE | |
('64', 'float64_t', 'np.float64'), | |
('32', 'float32_t', 'np.float32') | |
] | |
}} | |
# By Jake Vanderplas (2013) <[email protected]> | |
# written for the scikit-learn project | |
# SPDX-License-Identifier: BSD-3-Clause | |
import numpy as np | |
cimport numpy as cnp | |
cnp.import_array() # required in order to use C-API | |
from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin | |
from scipy.sparse import csr_matrix, issparse | |
from ..utils._typedefs cimport float64_t, float32_t, int32_t, intp_t | |
from ..utils import check_array | |
from ..utils.fixes import parse_version, sp_base_version | |
cdef inline double fmax(double a, double b) noexcept nogil: | |
return max(a, b) | |
###################################################################### | |
# newObj function | |
# this is a helper function for pickling | |
def newObj(obj): | |
return obj.__new__(obj) | |
BOOL_METRICS = [ | |
"hamming", | |
"jaccard", | |
"dice", | |
"rogerstanimoto", | |
"russellrao", | |
"sokalsneath", | |
] | |
DEPRECATED_METRICS = [] | |
if sp_base_version < parse_version("1.17"): | |
# Deprecated in SciPy 1.15 and removed in SciPy 1.17 | |
BOOL_METRICS += ["sokalmichener"] | |
if sp_base_version >= parse_version("1.15"): | |
DEPRECATED_METRICS.append("sokalmichener") | |
if sp_base_version < parse_version("1.11"): | |
# Deprecated in SciPy 1.9 and removed in SciPy 1.11 | |
BOOL_METRICS += ["kulsinski"] | |
if sp_base_version >= parse_version("1.9"): | |
DEPRECATED_METRICS.append("kulsinski") | |
if sp_base_version < parse_version("1.9"): | |
# Deprecated in SciPy 1.0 and removed in SciPy 1.9 | |
BOOL_METRICS += ["matching"] | |
if sp_base_version >= parse_version("1.0"): | |
DEPRECATED_METRICS.append("matching") | |
def get_valid_metric_ids(L): | |
"""Given an iterable of metric class names or class identifiers, | |
return a list of metric IDs which map to those classes. | |
Example: | |
>>> L = get_valid_metric_ids([EuclideanDistance, 'ManhattanDistance']) | |
>>> sorted(L) | |
['cityblock', 'euclidean', 'l1', 'l2', 'manhattan'] | |
""" | |
return [key for (key, val) in METRIC_MAPPING64.items() | |
if (val.__name__ in L) or (val in L)] | |
cdef class DistanceMetric: | |
"""Uniform interface for fast distance metric functions. | |
The `DistanceMetric` class provides a convenient way to compute pairwise distances | |
between samples. It supports various distance metrics, such as Euclidean distance, | |
Manhattan distance, and more. | |
The `pairwise` method can be used to compute pairwise distances between samples in | |
the input arrays. It returns a distance matrix representing the distances between | |
all pairs of samples. | |
The :meth:`get_metric` method allows you to retrieve a specific metric using its | |
string identifier. | |
Examples | |
-------- | |
>>> from sklearn.metrics import DistanceMetric | |
>>> dist = DistanceMetric.get_metric('euclidean') | |
>>> X = [[1, 2], [3, 4], [5, 6]] | |
>>> Y = [[7, 8], [9, 10]] | |
>>> dist.pairwise(X,Y) | |
array([[7.81..., 10.63...] | |
[5.65..., 8.48...] | |
[1.41..., 4.24...]]) | |
.. rubric:: Available Metrics | |
The following lists the string metric identifiers and the associated | |
distance metric classes: | |
**Metrics intended for real-valued vector spaces:** | |
============== ==================== ======== =============================== | |
identifier class name args distance function | |
-------------- -------------------- -------- ------------------------------- | |
"euclidean" EuclideanDistance - ``sqrt(sum((x - y)^2))`` | |
"manhattan" ManhattanDistance - ``sum(|x - y|)`` | |
"chebyshev" ChebyshevDistance - ``max(|x - y|)`` | |
"minkowski" MinkowskiDistance p, w ``sum(w * |x - y|^p)^(1/p)`` | |
"seuclidean" SEuclideanDistance V ``sqrt(sum((x - y)^2 / V))`` | |
"mahalanobis" MahalanobisDistance V or VI ``sqrt((x - y)' V^-1 (x - y))`` | |
============== ==================== ======== =============================== | |
**Metrics intended for two-dimensional vector spaces:** Note that the haversine | |
distance metric requires data in the form of [latitude, longitude] and both | |
inputs and outputs are in units of radians. | |
============ ================== =============================================================== | |
identifier class name distance function | |
------------ ------------------ --------------------------------------------------------------- | |
"haversine" HaversineDistance ``2 arcsin(sqrt(sin^2(0.5*dx) + cos(x1)cos(x2)sin^2(0.5*dy)))`` | |
============ ================== =============================================================== | |
**Metrics intended for integer-valued vector spaces:** Though intended | |
for integer-valued vectors, these are also valid metrics in the case of | |
real-valued vectors. | |
============= ==================== ======================================== | |
identifier class name distance function | |
------------- -------------------- ---------------------------------------- | |
"hamming" HammingDistance ``N_unequal(x, y) / N_tot`` | |
"canberra" CanberraDistance ``sum(|x - y| / (|x| + |y|))`` | |
"braycurtis" BrayCurtisDistance ``sum(|x - y|) / (sum(|x|) + sum(|y|))`` | |
============= ==================== ======================================== | |
**Metrics intended for boolean-valued vector spaces:** Any nonzero entry | |
is evaluated to "True". In the listings below, the following | |
abbreviations are used: | |
- N: number of dimensions | |
- NTT: number of dims in which both values are True | |
- NTF: number of dims in which the first value is True, second is False | |
- NFT: number of dims in which the first value is False, second is True | |
- NFF: number of dims in which both values are False | |
- NNEQ: number of non-equal dimensions, NNEQ = NTF + NFT | |
- NNZ: number of nonzero dimensions, NNZ = NTF + NFT + NTT | |
================= ======================= =============================== | |
identifier class name distance function | |
----------------- ----------------------- ------------------------------- | |
"jaccard" JaccardDistance NNEQ / NNZ | |
"matching" MatchingDistance NNEQ / N | |
"dice" DiceDistance NNEQ / (NTT + NNZ) | |
"kulsinski" KulsinskiDistance (NNEQ + N - NTT) / (NNEQ + N) | |
"rogerstanimoto" RogersTanimotoDistance 2 * NNEQ / (N + NNEQ) | |
"russellrao" RussellRaoDistance (N - NTT) / N | |
"sokalmichener" SokalMichenerDistance 2 * NNEQ / (N + NNEQ) | |
"sokalsneath" SokalSneathDistance NNEQ / (NNEQ + 0.5 * NTT) | |
================= ======================= =============================== | |
**User-defined distance:** | |
=========== =============== ======= | |
identifier class name args | |
----------- --------------- ------- | |
"pyfunc" PyFuncDistance func | |
=========== =============== ======= | |
Here ``func`` is a function which takes two one-dimensional numpy | |
arrays, and returns a distance. Note that in order to be used within | |
the BallTree, the distance must be a true metric: | |
i.e. it must satisfy the following properties | |
1) Non-negativity: d(x, y) >= 0 | |
2) Identity: d(x, y) = 0 if and only if x == y | |
3) Symmetry: d(x, y) = d(y, x) | |
4) Triangle Inequality: d(x, y) + d(y, z) >= d(x, z) | |
Because of the Python object overhead involved in calling the python | |
function, this will be fairly slow, but it will have the same | |
scaling as other distances. | |
""" | |
@classmethod | |
def get_metric(cls, metric, dtype=np.float64, **kwargs): | |
"""Get the given distance metric from the string identifier. | |
See the docstring of DistanceMetric for a list of available metrics. | |
Parameters | |
---------- | |
metric : str or class name | |
The string identifier or class name of the desired distance metric. | |
See the documentation of the `DistanceMetric` class for a list of | |
available metrics. | |
dtype : {np.float32, np.float64}, default=np.float64 | |
The data type of the input on which the metric will be applied. | |
This affects the precision of the computed distances. | |
By default, it is set to `np.float64`. | |
**kwargs | |
Additional keyword arguments that will be passed to the requested metric. | |
These arguments can be used to customize the behavior of the specific | |
metric. | |
Returns | |
------- | |
metric_obj : instance of the requested metric | |
An instance of the requested distance metric class. | |
""" | |
if dtype == np.float32: | |
specialized_class = DistanceMetric32 | |
elif dtype == np.float64: | |
specialized_class = DistanceMetric64 | |
else: | |
raise ValueError( | |
f"Unexpected dtype {dtype} provided. Please select a dtype from" | |
" {np.float32, np.float64}" | |
) | |
return specialized_class.get_metric(metric, **kwargs) | |
{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} | |
###################################################################### | |
# metric mappings | |
# These map from metric id strings to class names | |
METRIC_MAPPING{{name_suffix}} = { | |
'euclidean': EuclideanDistance{{name_suffix}}, | |
'l2': EuclideanDistance{{name_suffix}}, | |
'minkowski': MinkowskiDistance{{name_suffix}}, | |
'p': MinkowskiDistance{{name_suffix}}, | |
'manhattan': ManhattanDistance{{name_suffix}}, | |
'cityblock': ManhattanDistance{{name_suffix}}, | |
'l1': ManhattanDistance{{name_suffix}}, | |
'chebyshev': ChebyshevDistance{{name_suffix}}, | |
'infinity': ChebyshevDistance{{name_suffix}}, | |
'seuclidean': SEuclideanDistance{{name_suffix}}, | |
'mahalanobis': MahalanobisDistance{{name_suffix}}, | |
'hamming': HammingDistance{{name_suffix}}, | |
'canberra': CanberraDistance{{name_suffix}}, | |
'braycurtis': BrayCurtisDistance{{name_suffix}}, | |
'matching': MatchingDistance{{name_suffix}}, | |
'jaccard': JaccardDistance{{name_suffix}}, | |
'dice': DiceDistance{{name_suffix}}, | |
'kulsinski': KulsinskiDistance{{name_suffix}}, | |
'rogerstanimoto': RogersTanimotoDistance{{name_suffix}}, | |
'russellrao': RussellRaoDistance{{name_suffix}}, | |
'sokalmichener': SokalMichenerDistance{{name_suffix}}, | |
'sokalsneath': SokalSneathDistance{{name_suffix}}, | |
'haversine': HaversineDistance{{name_suffix}}, | |
'pyfunc': PyFuncDistance{{name_suffix}}, | |
} | |
cdef inline object _buffer_to_ndarray{{name_suffix}}(const {{INPUT_DTYPE_t}}* x, intp_t n): | |
# Wrap a memory buffer with an ndarray. Warning: this is not robust. | |
# In particular, if x is deallocated before the returned array goes | |
# out of scope, this could cause memory errors. Since there is not | |
# a possibility of this for our use-case, this should be safe. | |
# Note: this Segfaults unless np.import_array() is called above | |
# TODO: remove the explicit cast to cnp.intp_t* when cython min version >= 3.0 | |
return cnp.PyArray_SimpleNewFromData(1, <cnp.intp_t*>&n, cnp.NPY_FLOAT64, <void*>x) | |
cdef {{INPUT_DTYPE_t}} INF{{name_suffix}} = np.inf | |
###################################################################### | |
# Distance Metric Classes | |
cdef class DistanceMetric{{name_suffix}}(DistanceMetric): | |
"""DistanceMetric class | |
This class provides a uniform interface to fast distance metric | |
functions. The various metrics can be accessed via the :meth:`get_metric` | |
class method and the metric string identifier (see below). | |
Examples | |
-------- | |
>>> from sklearn.metrics import DistanceMetric | |
>>> dist = DistanceMetric.get_metric('euclidean') | |
>>> X = [[0, 1, 2], | |
[3, 4, 5]] | |
>>> dist.pairwise(X) | |
array([[ 0. , 5.19615242], | |
[ 5.19615242, 0. ]]) | |
Available Metrics | |
The following lists the string metric identifiers and the associated | |
distance metric classes: | |
**Metrics intended for real-valued vector spaces:** | |
============== ==================== ======== =============================== | |
identifier class name args distance function | |
-------------- -------------------- -------- ------------------------------- | |
"euclidean" EuclideanDistance - ``sqrt(sum((x - y)^2))`` | |
"manhattan" ManhattanDistance - ``sum(|x - y|)`` | |
"chebyshev" ChebyshevDistance - ``max(|x - y|)`` | |
"minkowski" MinkowskiDistance p, w ``sum(w * |x - y|^p)^(1/p)`` | |
"seuclidean" SEuclideanDistance V ``sqrt(sum((x - y)^2 / V))`` | |
"mahalanobis" MahalanobisDistance V or VI ``sqrt((x - y)' V^-1 (x - y))`` | |
============== ==================== ======== =============================== | |
**Metrics intended for two-dimensional vector spaces:** Note that the haversine | |
distance metric requires data in the form of [latitude, longitude] and both | |
inputs and outputs are in units of radians. | |
============ ================== =============================================================== | |
identifier class name distance function | |
------------ ------------------ --------------------------------------------------------------- | |
"haversine" HaversineDistance ``2 arcsin(sqrt(sin^2(0.5*dx) + cos(x1)cos(x2)sin^2(0.5*dy)))`` | |
============ ================== =============================================================== | |
**Metrics intended for integer-valued vector spaces:** Though intended | |
for integer-valued vectors, these are also valid metrics in the case of | |
real-valued vectors. | |
============= ==================== ======================================== | |
identifier class name distance function | |
------------- -------------------- ---------------------------------------- | |
"hamming" HammingDistance ``N_unequal(x, y) / N_tot`` | |
"canberra" CanberraDistance ``sum(|x - y| / (|x| + |y|))`` | |
"braycurtis" BrayCurtisDistance ``sum(|x - y|) / (sum(|x|) + sum(|y|))`` | |
============= ==================== ======================================== | |
**Metrics intended for boolean-valued vector spaces:** Any nonzero entry | |
is evaluated to "True". In the listings below, the following | |
abbreviations are used: | |
- N: number of dimensions | |
- NTT: number of dims in which both values are True | |
- NTF: number of dims in which the first value is True, second is False | |
- NFT: number of dims in which the first value is False, second is True | |
- NFF: number of dims in which both values are False | |
- NNEQ: number of non-equal dimensions, NNEQ = NTF + NFT | |
- NNZ: number of nonzero dimensions, NNZ = NTF + NFT + NTT | |
================= ======================= =============================== | |
identifier class name distance function | |
----------------- ----------------------- ------------------------------- | |
"jaccard" JaccardDistance NNEQ / NNZ | |
"matching" MatchingDistance NNEQ / N | |
"dice" DiceDistance NNEQ / (NTT + NNZ) | |
"kulsinski" KulsinskiDistance (NNEQ + N - NTT) / (NNEQ + N) | |
"rogerstanimoto" RogersTanimotoDistance 2 * NNEQ / (N + NNEQ) | |
"russellrao" RussellRaoDistance (N - NTT) / N | |
"sokalmichener" SokalMichenerDistance 2 * NNEQ / (N + NNEQ) | |
"sokalsneath" SokalSneathDistance NNEQ / (NNEQ + 0.5 * NTT) | |
================= ======================= =============================== | |
**User-defined distance:** | |
=========== =============== ======= | |
identifier class name args | |
----------- --------------- ------- | |
"pyfunc" PyFuncDistance func | |
=========== =============== ======= | |
Here ``func`` is a function which takes two one-dimensional numpy | |
arrays, and returns a distance. Note that in order to be used within | |
the BallTree, the distance must be a true metric: | |
i.e. it must satisfy the following properties | |
1) Non-negativity: d(x, y) >= 0 | |
2) Identity: d(x, y) = 0 if and only if x == y | |
3) Symmetry: d(x, y) = d(y, x) | |
4) Triangle Inequality: d(x, y) + d(y, z) >= d(x, z) | |
Because of the Python object overhead involved in calling the python | |
function, this will be fairly slow, but it will have the same | |
scaling as other distances. | |
""" | |
def __cinit__(self): | |
self.p = 2 | |
self.vec = np.zeros(1, dtype=np.float64, order='C') | |
self.mat = np.zeros((1, 1), dtype=np.float64, order='C') | |
self.size = 1 | |
def __reduce__(self): | |
""" | |
reduce method used for pickling | |
""" | |
return (newObj, (self.__class__,), self.__getstate__()) | |
def __getstate__(self): | |
""" | |
get state for pickling | |
""" | |
if self.__class__.__name__ == "PyFuncDistance{{name_suffix}}": | |
return (float(self.p), np.asarray(self.vec), np.asarray(self.mat), self.func, self.kwargs) | |
return (float(self.p), np.asarray(self.vec), np.asarray(self.mat)) | |
def __setstate__(self, state): | |
""" | |
set state for pickling | |
""" | |
self.p = state[0] | |
self.vec = state[1] | |
self.mat = state[2] | |
if self.__class__.__name__ == "PyFuncDistance{{name_suffix}}": | |
self.func = state[3] | |
self.kwargs = state[4] | |
self.size = self.vec.shape[0] | |
@classmethod | |
def get_metric(cls, metric, **kwargs): | |
"""Get the given distance metric from the string identifier. | |
See the docstring of DistanceMetric for a list of available metrics. | |
Parameters | |
---------- | |
metric : str or class name | |
The distance metric to use | |
**kwargs | |
additional arguments will be passed to the requested metric | |
""" | |
if isinstance(metric, DistanceMetric{{name_suffix}}): | |
return metric | |
if callable(metric): | |
return PyFuncDistance{{name_suffix}}(metric, **kwargs) | |
# Map the metric string ID to the metric class | |
if isinstance(metric, type) and issubclass(metric, DistanceMetric{{name_suffix}}): | |
pass | |
else: | |
try: | |
metric = METRIC_MAPPING{{name_suffix}}[metric] | |
except: | |
raise ValueError("Unrecognized metric '%s'" % metric) | |
# In Minkowski special cases, return more efficient methods | |
if metric is MinkowskiDistance{{name_suffix}}: | |
p = kwargs.pop('p', 2) | |
w = kwargs.pop('w', None) | |
if p == 1 and w is None: | |
return ManhattanDistance{{name_suffix}}(**kwargs) | |
elif p == 2 and w is None: | |
return EuclideanDistance{{name_suffix}}(**kwargs) | |
elif np.isinf(p) and w is None: | |
return ChebyshevDistance{{name_suffix}}(**kwargs) | |
else: | |
return MinkowskiDistance{{name_suffix}}(p, w, **kwargs) | |
else: | |
return metric(**kwargs) | |
def __init__(self): | |
if self.__class__ is DistanceMetric{{name_suffix}}: | |
raise NotImplementedError("DistanceMetric{{name_suffix}} is an abstract class") | |
def _validate_data(self, X): | |
"""Validate the input data. | |
This should be overridden in a base class if a specific input format | |
is required. | |
""" | |
return | |
cdef {{INPUT_DTYPE_t}} dist( | |
self, | |
const {{INPUT_DTYPE_t}}* x1, | |
const {{INPUT_DTYPE_t}}* x2, | |
intp_t size, | |
) except -1 nogil: | |
"""Compute the distance between vectors x1 and x2 | |
This should be overridden in a base class. | |
""" | |
return -999 | |
cdef {{INPUT_DTYPE_t}} rdist( | |
self, | |
const {{INPUT_DTYPE_t}}* x1, | |
const {{INPUT_DTYPE_t}}* x2, | |
intp_t size, | |
) except -1 nogil: | |
"""Compute the rank-preserving surrogate distance between vectors x1 and x2. | |
This can optionally be overridden in a base class. | |
The rank-preserving surrogate distance is any measure that yields the same | |
rank as the distance, but is more efficient to compute. For example, the | |
rank-preserving surrogate distance of the Euclidean metric is the | |
squared-euclidean distance. | |
""" | |
return self.dist(x1, x2, size) | |
cdef int pdist( | |
self, | |
const {{INPUT_DTYPE_t}}[:, ::1] X, | |
{{INPUT_DTYPE_t}}[:, ::1] D, | |
) except -1: | |
"""Compute the pairwise distances between points in X""" | |
cdef intp_t i1, i2 | |
for i1 in range(X.shape[0]): | |
for i2 in range(i1, X.shape[0]): | |
D[i1, i2] = self.dist(&X[i1, 0], &X[i2, 0], X.shape[1]) | |
D[i2, i1] = D[i1, i2] | |
return 0 | |
cdef int cdist( | |
self, | |
const {{INPUT_DTYPE_t}}[:, ::1] X, | |
const {{INPUT_DTYPE_t}}[:, ::1] Y, | |
{{INPUT_DTYPE_t}}[:, ::1] D, | |
) except -1: | |
"""Compute the cross-pairwise distances between arrays X and Y""" | |
cdef intp_t i1, i2 | |
if X.shape[1] != Y.shape[1]: | |
raise ValueError('X and Y must have the same second dimension') | |
for i1 in range(X.shape[0]): | |
for i2 in range(Y.shape[0]): | |
D[i1, i2] = self.dist(&X[i1, 0], &Y[i2, 0], X.shape[1]) | |
return 0 | |
cdef {{INPUT_DTYPE_t}} dist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t* x1_indices, | |
const {{INPUT_DTYPE_t}}* x2_data, | |
const int32_t* x2_indices, | |
const int32_t x1_start, | |
const int32_t x1_end, | |
const int32_t x2_start, | |
const int32_t x2_end, | |
const intp_t size, | |
) except -1 nogil: | |
"""Compute the distance between vectors x1 and x2 represented | |
under the CSR format. | |
This must be overridden in a subclass. | |
Notes | |
----- | |
0. The implementation of this method in subclasses must be robust to the | |
presence of explicit zeros in the CSR representation. | |
1. The `data` arrays are passed using pointers to be able to support an | |
alternative representation of the CSR data structure for supporting | |
fused sparse-dense datasets pairs with minimum overhead. | |
See the explanations in `SparseDenseDatasetsPair.__init__`. | |
2. An alternative signature would be: | |
cdef {{INPUT_DTYPE_t}} dist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t* x1_indices, | |
const {{INPUT_DTYPE_t}}* x2_data, | |
const int32_t* x2_indices, | |
) except -1 nogil: | |
Where callers would use slicing on the original CSR data and indices | |
memoryviews: | |
x1_start = X1_csr.indices_ptr[i] | |
x1_end = X1_csr.indices_ptr[i+1] | |
x2_start = X2_csr.indices_ptr[j] | |
x2_end = X2_csr.indices_ptr[j+1] | |
self.dist_csr( | |
&x1_data[x1_start], | |
x1_indices[x1_start:x1_end], | |
&x2_data[x2_start], | |
x2_indices[x2_start:x2_end], | |
) | |
Yet, slicing on memoryview slows down execution as it takes the GIL. | |
See: https://github.com/scikit-learn/scikit-learn/issues/17299 | |
Hence, to avoid slicing the data and indices arrays of the sparse | |
matrices containing respectively x1 and x2 (namely x{1,2}_{data,indices}) | |
are passed as well as their indices pointers (namely x{1,2}_{start,end}). | |
3. For reference about the CSR format, see section 3.4 of | |
Saad, Y. (2003), Iterative Methods for Sparse Linear Systems, SIAM. | |
https://www-users.cse.umn.edu/~saad/IterMethBook_2ndEd.pdf | |
""" | |
return -999 | |
cdef {{INPUT_DTYPE_t}} rdist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t* x1_indices, | |
const {{INPUT_DTYPE_t}}* x2_data, | |
const int32_t* x2_indices, | |
const int32_t x1_start, | |
const int32_t x1_end, | |
const int32_t x2_start, | |
const int32_t x2_end, | |
const intp_t size, | |
) except -1 nogil: | |
"""Distance between rows of CSR matrices x1 and x2. | |
This can optionally be overridden in a subclass. | |
The rank-preserving surrogate distance is any measure that yields the same | |
rank as the distance, but is more efficient to compute. For example, the | |
rank-preserving surrogate distance of the Euclidean metric is the | |
squared-euclidean distance. | |
Notes | |
----- | |
The implementation of this method in subclasses must be robust to the | |
presence of explicit zeros in the CSR representation. | |
More information about the motives for this method signature is given | |
in the docstring of dist_csr. | |
""" | |
return self.dist_csr( | |
x1_data, | |
x1_indices, | |
x2_data, | |
x2_indices, | |
x1_start, | |
x1_end, | |
x2_start, | |
x2_end, | |
size, | |
) | |
cdef int pdist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t[::1] x1_indices, | |
const int32_t[::1] x1_indptr, | |
const intp_t size, | |
{{INPUT_DTYPE_t}}[:, ::1] D, | |
) except -1 nogil: | |
"""Pairwise distances between rows in CSR matrix X. | |
Note that this implementation is twice faster than cdist_csr(X, X) | |
because it leverages the symmetry of the problem. | |
""" | |
cdef: | |
intp_t i1, i2 | |
intp_t n_x1 = x1_indptr.shape[0] - 1 | |
intp_t x1_start, x1_end, x2_start, x2_end | |
for i1 in range(n_x1): | |
x1_start = x1_indptr[i1] | |
x1_end = x1_indptr[i1 + 1] | |
for i2 in range(i1, n_x1): | |
x2_start = x1_indptr[i2] | |
x2_end = x1_indptr[i2 + 1] | |
D[i1, i2] = D[i2, i1] = self.dist_csr( | |
x1_data, | |
&x1_indices[0], | |
x1_data, | |
&x1_indices[0], | |
x1_start, | |
x1_end, | |
x2_start, | |
x2_end, | |
size, | |
) | |
return 0 | |
cdef int cdist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t[::1] x1_indices, | |
const int32_t[::1] x1_indptr, | |
const {{INPUT_DTYPE_t}}* x2_data, | |
const int32_t[::1] x2_indices, | |
const int32_t[::1] x2_indptr, | |
const intp_t size, | |
{{INPUT_DTYPE_t}}[:, ::1] D, | |
) except -1 nogil: | |
"""Compute the cross-pairwise distances between arrays X and Y | |
represented in the CSR format.""" | |
cdef: | |
intp_t i1, i2 | |
intp_t n_x1 = x1_indptr.shape[0] - 1 | |
intp_t n_x2 = x2_indptr.shape[0] - 1 | |
intp_t x1_start, x1_end, x2_start, x2_end | |
for i1 in range(n_x1): | |
x1_start = x1_indptr[i1] | |
x1_end = x1_indptr[i1 + 1] | |
for i2 in range(n_x2): | |
x2_start = x2_indptr[i2] | |
x2_end = x2_indptr[i2 + 1] | |
D[i1, i2] = self.dist_csr( | |
x1_data, | |
&x1_indices[0], | |
x2_data, | |
&x2_indices[0], | |
x1_start, | |
x1_end, | |
x2_start, | |
x2_end, | |
size, | |
) | |
return 0 | |
cdef {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: | |
"""Convert the rank-preserving surrogate distance to the distance""" | |
return rdist | |
cdef {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: | |
"""Convert the distance to the rank-preserving surrogate distance""" | |
return dist | |
def rdist_to_dist(self, rdist): | |
"""Convert the rank-preserving surrogate distance to the distance. | |
The surrogate distance is any measure that yields the same rank as the | |
distance, but is more efficient to compute. For example, the | |
rank-preserving surrogate distance of the Euclidean metric is the | |
squared-euclidean distance. | |
Parameters | |
---------- | |
rdist : double | |
Surrogate distance. | |
Returns | |
------- | |
double | |
True distance. | |
""" | |
return rdist | |
def dist_to_rdist(self, dist): | |
"""Convert the true distance to the rank-preserving surrogate distance. | |
The surrogate distance is any measure that yields the same rank as the | |
distance, but is more efficient to compute. For example, the | |
rank-preserving surrogate distance of the Euclidean metric is the | |
squared-euclidean distance. | |
Parameters | |
---------- | |
dist : double | |
True distance. | |
Returns | |
------- | |
double | |
Surrogate distance. | |
""" | |
return dist | |
def _pairwise_dense_dense(self, X, Y): | |
cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr | |
cdef const {{INPUT_DTYPE_t}}[:, ::1] Yarr | |
cdef {{INPUT_DTYPE_t}}[:, ::1] Darr | |
Xarr = np.asarray(X, dtype={{INPUT_DTYPE}}, order='C') | |
self._validate_data(Xarr) | |
if X is Y: | |
Darr = np.empty((Xarr.shape[0], Xarr.shape[0]), dtype={{INPUT_DTYPE}}, order='C') | |
self.pdist(Xarr, Darr) | |
else: | |
Yarr = np.asarray(Y, dtype={{INPUT_DTYPE}}, order='C') | |
self._validate_data(Yarr) | |
Darr = np.empty((Xarr.shape[0], Yarr.shape[0]), dtype={{INPUT_DTYPE}}, order='C') | |
self.cdist(Xarr, Yarr, Darr) | |
return np.asarray(Darr) | |
def _pairwise_sparse_sparse(self, X: csr_matrix , Y: csr_matrix): | |
cdef: | |
intp_t n_X, n_features | |
const {{INPUT_DTYPE_t}}[::1] X_data | |
const int32_t[::1] X_indices | |
const int32_t[::1] X_indptr | |
intp_t n_Y | |
const {{INPUT_DTYPE_t}}[::1] Y_data | |
const int32_t[::1] Y_indices | |
const int32_t[::1] Y_indptr | |
{{INPUT_DTYPE_t}}[:, ::1] Darr | |
X_csr = X.tocsr() | |
n_X, n_features = X_csr.shape | |
X_data = np.asarray(X_csr.data, dtype={{INPUT_DTYPE}}) | |
X_indices = np.asarray(X_csr.indices, dtype=np.int32) | |
X_indptr = np.asarray(X_csr.indptr, dtype=np.int32) | |
if X is Y: | |
Darr = np.empty((n_X, n_X), dtype={{INPUT_DTYPE}}, order='C') | |
self.pdist_csr( | |
x1_data=&X_data[0], | |
x1_indices=X_indices, | |
x1_indptr=X_indptr, | |
size=n_features, | |
D=Darr, | |
) | |
else: | |
Y_csr = Y.tocsr() | |
n_Y, _ = Y_csr.shape | |
Y_data = np.asarray(Y_csr.data, dtype={{INPUT_DTYPE}}) | |
Y_indices = np.asarray(Y_csr.indices, dtype=np.int32) | |
Y_indptr = np.asarray(Y_csr.indptr, dtype=np.int32) | |
Darr = np.empty((n_X, n_Y), dtype={{INPUT_DTYPE}}, order='C') | |
self.cdist_csr( | |
x1_data=&X_data[0], | |
x1_indices=X_indices, | |
x1_indptr=X_indptr, | |
x2_data=&Y_data[0], | |
x2_indices=Y_indices, | |
x2_indptr=Y_indptr, | |
size=n_features, | |
D=Darr, | |
) | |
return np.asarray(Darr) | |
def _pairwise_sparse_dense(self, X: csr_matrix, Y): | |
cdef: | |
intp_t n_X = X.shape[0] | |
intp_t n_features = X.shape[1] | |
const {{INPUT_DTYPE_t}}[::1] X_data = np.asarray( | |
X.data, dtype={{INPUT_DTYPE}}, | |
) | |
const int32_t[::1] X_indices = np.asarray( | |
X.indices, dtype=np.int32, | |
) | |
const int32_t[::1] X_indptr = np.asarray( | |
X.indptr, dtype=np.int32, | |
) | |
const {{INPUT_DTYPE_t}}[:, ::1] Y_data = np.asarray( | |
Y, dtype={{INPUT_DTYPE}}, order="C", | |
) | |
intp_t n_Y = Y_data.shape[0] | |
const int32_t[::1] Y_indices = ( | |
np.arange(n_features, dtype=np.int32) | |
) | |
{{INPUT_DTYPE_t}}[:, ::1] Darr = np.empty((n_X, n_Y), dtype={{INPUT_DTYPE}}, order='C') | |
intp_t i1, i2 | |
intp_t x1_start, x1_end | |
{{INPUT_DTYPE_t}} * x2_data | |
with nogil: | |
# Use the exact same adaptation for CSR than in SparseDenseDatasetsPair | |
# for supporting the sparse-dense case with minimal overhead. | |
# Note: at this point this method is only a convenience method | |
# used in the tests via the DistanceMetric.pairwise method. | |
# Therefore, there is no need to attempt parallelization of those | |
# nested for-loops. | |
# Efficient parallel computation of pairwise distances can be | |
# achieved via the PairwiseDistances class instead. The latter | |
# internally calls into vector-wise distance computation from | |
# the DistanceMetric subclass while benefiting from the generic | |
# Cython/OpenMP parallelization template for the generic pairwise | |
# distance + reduction computational pattern. | |
for i1 in range(n_X): | |
x1_start = X_indptr[i1] | |
x1_end = X_indptr[i1 + 1] | |
for i2 in range(n_Y): | |
x2_data = &Y_data[0, 0] + i2 * n_features | |
Darr[i1, i2] = self.dist_csr( | |
x1_data=&X_data[0], | |
x1_indices=&X_indices[0], | |
x2_data=x2_data, | |
x2_indices=&Y_indices[0], | |
x1_start=x1_start, | |
x1_end=x1_end, | |
x2_start=0, | |
x2_end=n_features, | |
size=n_features, | |
) | |
return np.asarray(Darr) | |
def _pairwise_dense_sparse(self, X, Y: csr_matrix): | |
# We could have implemented this method using _pairwise_dense_sparse by | |
# swapping argument and by transposing the results, but this would | |
# have come with an extra copy to ensure C-contiguity of the result. | |
cdef: | |
intp_t n_X = X.shape[0] | |
intp_t n_features = X.shape[1] | |
const {{INPUT_DTYPE_t}}[:, ::1] X_data = np.asarray( | |
X, dtype={{INPUT_DTYPE}}, order="C", | |
) | |
const int32_t[::1] X_indices = np.arange( | |
n_features, dtype=np.int32, | |
) | |
intp_t n_Y = Y.shape[0] | |
const {{INPUT_DTYPE_t}}[::1] Y_data = np.asarray( | |
Y.data, dtype={{INPUT_DTYPE}}, | |
) | |
const int32_t[::1] Y_indices = np.asarray( | |
Y.indices, dtype=np.int32, | |
) | |
const int32_t[::1] Y_indptr = np.asarray( | |
Y.indptr, dtype=np.int32, | |
) | |
{{INPUT_DTYPE_t}}[:, ::1] Darr = np.empty((n_X, n_Y), dtype={{INPUT_DTYPE}}, order='C') | |
intp_t i1, i2 | |
{{INPUT_DTYPE_t}} * x1_data | |
intp_t x2_start, x2_end | |
with nogil: | |
# Use the exact same adaptation for CSR than in SparseDenseDatasetsPair | |
# for supporting the dense-sparse case with minimal overhead. | |
# Note: at this point this method is only a convenience method | |
# used in the tests via the DistanceMetric.pairwise method. | |
# Therefore, there is no need to attempt parallelization of those | |
# nested for-loops. | |
# Efficient parallel computation of pairwise distances can be | |
# achieved via the PairwiseDistances class instead. The latter | |
# internally calls into vector-wise distance computation from | |
# the DistanceMetric subclass while benefiting from the generic | |
# Cython/OpenMP parallelization template for the generic pairwise | |
# distance + reduction computational pattern. | |
for i1 in range(n_X): | |
x1_data = &X_data[0, 0] + i1 * n_features | |
for i2 in range(n_Y): | |
x2_start = Y_indptr[i2] | |
x2_end = Y_indptr[i2 + 1] | |
Darr[i1, i2] = self.dist_csr( | |
x1_data=x1_data, | |
x1_indices=&X_indices[0], | |
x2_data=&Y_data[0], | |
x2_indices=&Y_indices[0], | |
x1_start=0, | |
x1_end=n_features, | |
x2_start=x2_start, | |
x2_end=x2_end, | |
size=n_features, | |
) | |
return np.asarray(Darr) | |
def pairwise(self, X, Y=None): | |
"""Compute the pairwise distances between X and Y | |
This is a convenience routine for the sake of testing. For many | |
metrics, the utilities in scipy.spatial.distance.cdist and | |
scipy.spatial.distance.pdist will be faster. | |
Parameters | |
---------- | |
X : ndarray or CSR matrix of shape (n_samples_X, n_features) | |
Input data. | |
Y : ndarray or CSR matrix of shape (n_samples_Y, n_features) | |
Input data. | |
If not specified, then Y=X. | |
Returns | |
------- | |
dist : ndarray of shape (n_samples_X, n_samples_Y) | |
The distance matrix of pairwise distances between points in X and Y. | |
""" | |
X = check_array(X, accept_sparse=['csr']) | |
if Y is None: | |
Y = X | |
else: | |
Y = check_array(Y, accept_sparse=['csr']) | |
X_is_sparse = issparse(X) | |
Y_is_sparse = issparse(Y) | |
if not X_is_sparse and not Y_is_sparse: | |
return self._pairwise_dense_dense(X, Y) | |
if X_is_sparse and Y_is_sparse: | |
return self._pairwise_sparse_sparse(X, Y) | |
if X_is_sparse and not Y_is_sparse: | |
return self._pairwise_sparse_dense(X, Y) | |
return self._pairwise_dense_sparse(X, Y) | |
#------------------------------------------------------------ | |
# Euclidean Distance | |
# d = sqrt(sum(x_i^2 - y_i^2)) | |
cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): | |
r"""Euclidean Distance metric | |
.. math:: | |
D(x, y) = \sqrt{ \sum_i (x_i - y_i) ^ 2 } | |
""" | |
def __init__(self): | |
self.p = 2 | |
cdef inline {{INPUT_DTYPE_t}} dist(self, | |
const {{INPUT_DTYPE_t}}* x1, | |
const {{INPUT_DTYPE_t}}* x2, | |
intp_t size, | |
) except -1 nogil: | |
return euclidean_dist{{name_suffix}}(x1, x2, size) | |
cdef inline {{INPUT_DTYPE_t}} rdist(self, | |
const {{INPUT_DTYPE_t}}* x1, | |
const {{INPUT_DTYPE_t}}* x2, | |
intp_t size, | |
) except -1 nogil: | |
return euclidean_rdist{{name_suffix}}(x1, x2, size) | |
cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: | |
return sqrt(rdist) | |
cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: | |
return dist * dist | |
def rdist_to_dist(self, rdist): | |
return np.sqrt(rdist) | |
def dist_to_rdist(self, dist): | |
return dist ** 2 | |
cdef inline {{INPUT_DTYPE_t}} rdist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t* x1_indices, | |
const {{INPUT_DTYPE_t}}* x2_data, | |
const int32_t* x2_indices, | |
const int32_t x1_start, | |
const int32_t x1_end, | |
const int32_t x2_start, | |
const int32_t x2_end, | |
const intp_t size, | |
) except -1 nogil: | |
cdef: | |
intp_t ix1, ix2 | |
intp_t i1 = x1_start | |
intp_t i2 = x2_start | |
float64_t d = 0.0 | |
float64_t unsquared = 0 | |
while i1 < x1_end and i2 < x2_end: | |
ix1 = x1_indices[i1] | |
ix2 = x2_indices[i2] | |
if ix1 == ix2: | |
unsquared = x1_data[i1] - x2_data[i2] | |
d = d + (unsquared * unsquared) | |
i1 = i1 + 1 | |
i2 = i2 + 1 | |
elif ix1 < ix2: | |
unsquared = x1_data[i1] | |
d = d + (unsquared * unsquared) | |
i1 = i1 + 1 | |
else: | |
unsquared = x2_data[i2] | |
d = d + (unsquared * unsquared) | |
i2 = i2 + 1 | |
if i1 == x1_end: | |
while i2 < x2_end: | |
unsquared = x2_data[i2] | |
d = d + (unsquared * unsquared) | |
i2 = i2 + 1 | |
else: | |
while i1 < x1_end: | |
unsquared = x1_data[i1] | |
d = d + (unsquared * unsquared) | |
i1 = i1 + 1 | |
return d | |
cdef inline {{INPUT_DTYPE_t}} dist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t* x1_indices, | |
const {{INPUT_DTYPE_t}}* x2_data, | |
const int32_t* x2_indices, | |
const int32_t x1_start, | |
const int32_t x1_end, | |
const int32_t x2_start, | |
const int32_t x2_end, | |
const intp_t size, | |
) except -1 nogil: | |
return sqrt( | |
self.rdist_csr( | |
x1_data, | |
x1_indices, | |
x2_data, | |
x2_indices, | |
x1_start, | |
x1_end, | |
x2_start, | |
x2_end, | |
size, | |
)) | |
#------------------------------------------------------------ | |
# SEuclidean Distance | |
# d = sqrt(sum((x_i - y_i2)^2 / v_i)) | |
cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): | |
r"""Standardized Euclidean Distance metric | |
.. math:: | |
D(x, y) = \sqrt{ \sum_i \frac{ (x_i - y_i) ^ 2}{V_i} } | |
""" | |
def __init__(self, V): | |
self.vec = np.asarray(V, dtype=np.float64) | |
self.size = self.vec.shape[0] | |
self.p = 2 | |
def _validate_data(self, X): | |
if X.shape[1] != self.size: | |
raise ValueError('SEuclidean dist: size of V does not match') | |
cdef inline {{INPUT_DTYPE_t}} rdist( | |
self, | |
const {{INPUT_DTYPE_t}}* x1, | |
const {{INPUT_DTYPE_t}}* x2, | |
intp_t size, | |
) except -1 nogil: | |
cdef float64_t tmp, d=0 | |
cdef intp_t j | |
for j in range(size): | |
tmp = x1[j] - x2[j] | |
d += (tmp * tmp / self.vec[j]) | |
return d | |
cdef inline {{INPUT_DTYPE_t}} dist( | |
self, | |
const {{INPUT_DTYPE_t}}* x1, | |
const {{INPUT_DTYPE_t}}* x2, | |
intp_t size, | |
) except -1 nogil: | |
return sqrt(self.rdist(x1, x2, size)) | |
cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: | |
return sqrt(rdist) | |
cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: | |
return dist * dist | |
def rdist_to_dist(self, rdist): | |
return np.sqrt(rdist) | |
def dist_to_rdist(self, dist): | |
return dist ** 2 | |
cdef inline {{INPUT_DTYPE_t}} rdist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t* x1_indices, | |
const {{INPUT_DTYPE_t}}* x2_data, | |
const int32_t* x2_indices, | |
const int32_t x1_start, | |
const int32_t x1_end, | |
const int32_t x2_start, | |
const int32_t x2_end, | |
const intp_t size, | |
) except -1 nogil: | |
cdef: | |
intp_t ix1, ix2 | |
intp_t i1 = x1_start | |
intp_t i2 = x2_start | |
float64_t d = 0.0 | |
float64_t unsquared = 0 | |
while i1 < x1_end and i2 < x2_end: | |
ix1 = x1_indices[i1] | |
ix2 = x2_indices[i2] | |
if ix1 == ix2: | |
unsquared = x1_data[i1] - x2_data[i2] | |
d = d + (unsquared * unsquared) / self.vec[ix1] | |
i1 = i1 + 1 | |
i2 = i2 + 1 | |
elif ix1 < ix2: | |
unsquared = x1_data[i1] | |
d = d + (unsquared * unsquared) / self.vec[ix1] | |
i1 = i1 + 1 | |
else: | |
unsquared = x2_data[i2] | |
d = d + (unsquared * unsquared) / self.vec[ix2] | |
i2 = i2 + 1 | |
if i1 == x1_end: | |
while i2 < x2_end: | |
ix2 = x2_indices[i2] | |
unsquared = x2_data[i2] | |
d = d + (unsquared * unsquared) / self.vec[ix2] | |
i2 = i2 + 1 | |
else: | |
while i1 < x1_end: | |
ix1 = x1_indices[i1] | |
unsquared = x1_data[i1] | |
d = d + (unsquared * unsquared) / self.vec[ix1] | |
i1 = i1 + 1 | |
return d | |
cdef inline {{INPUT_DTYPE_t}} dist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t* x1_indices, | |
const {{INPUT_DTYPE_t}}* x2_data, | |
const int32_t* x2_indices, | |
const int32_t x1_start, | |
const int32_t x1_end, | |
const int32_t x2_start, | |
const int32_t x2_end, | |
const intp_t size, | |
) except -1 nogil: | |
return sqrt( | |
self.rdist_csr( | |
x1_data, | |
x1_indices, | |
x2_data, | |
x2_indices, | |
x1_start, | |
x1_end, | |
x2_start, | |
x2_end, | |
size, | |
)) | |
#------------------------------------------------------------ | |
# Manhattan Distance | |
# d = sum(abs(x_i - y_i)) | |
cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): | |
r"""Manhattan/City-block Distance metric | |
.. math:: | |
D(x, y) = \sum_i |x_i - y_i| | |
""" | |
def __init__(self): | |
self.p = 1 | |
cdef inline {{INPUT_DTYPE_t}} dist( | |
self, | |
const {{INPUT_DTYPE_t}}* x1, | |
const {{INPUT_DTYPE_t}}* x2, | |
intp_t size, | |
) except -1 nogil: | |
cdef float64_t d = 0 | |
cdef intp_t j | |
for j in range(size): | |
d += fabs(x1[j] - x2[j]) | |
return d | |
cdef inline {{INPUT_DTYPE_t}} dist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t* x1_indices, | |
const {{INPUT_DTYPE_t}}* x2_data, | |
const int32_t* x2_indices, | |
const int32_t x1_start, | |
const int32_t x1_end, | |
const int32_t x2_start, | |
const int32_t x2_end, | |
const intp_t size, | |
) except -1 nogil: | |
cdef: | |
intp_t ix1, ix2 | |
intp_t i1 = x1_start | |
intp_t i2 = x2_start | |
{{INPUT_DTYPE_t}} d = 0.0 | |
while i1 < x1_end and i2 < x2_end: | |
ix1 = x1_indices[i1] | |
ix2 = x2_indices[i2] | |
if ix1 == ix2: | |
d = d + fabs(x1_data[i1] - x2_data[i2]) | |
i1 = i1 + 1 | |
i2 = i2 + 1 | |
elif ix1 < ix2: | |
d = d + fabs(x1_data[i1]) | |
i1 = i1 + 1 | |
else: | |
d = d + fabs(x2_data[i2]) | |
i2 = i2 + 1 | |
if i1 == x1_end: | |
while i2 < x2_end: | |
d = d + fabs(x2_data[i2]) | |
i2 = i2 + 1 | |
else: | |
while i1 < x1_end: | |
d = d + fabs(x1_data[i1]) | |
i1 = i1 + 1 | |
return d | |
#------------------------------------------------------------ | |
# Chebyshev Distance | |
# d = max_i(abs(x_i - y_i)) | |
cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): | |
"""Chebyshev/Infinity Distance | |
.. math:: | |
D(x, y) = max_i (|x_i - y_i|) | |
Examples | |
-------- | |
>>> from sklearn.metrics.dist_metrics import DistanceMetric | |
>>> dist = DistanceMetric.get_metric('chebyshev') | |
>>> X = [[0, 1, 2], | |
... [3, 4, 5]] | |
>>> Y = [[-1, 0, 1], | |
... [3, 4, 5]] | |
>>> dist.pairwise(X, Y) | |
array([[1.732..., 5.196...], | |
[6.928..., 0.... ]]) | |
""" | |
def __init__(self): | |
self.p = INF{{name_suffix}} | |
cdef inline {{INPUT_DTYPE_t}} dist( | |
self, | |
const {{INPUT_DTYPE_t}}* x1, | |
const {{INPUT_DTYPE_t}}* x2, | |
intp_t size, | |
) except -1 nogil: | |
cdef float64_t d = 0 | |
cdef intp_t j | |
for j in range(size): | |
d = fmax(d, fabs(x1[j] - x2[j])) | |
return d | |
cdef inline {{INPUT_DTYPE_t}} dist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t* x1_indices, | |
const {{INPUT_DTYPE_t}}* x2_data, | |
const int32_t* x2_indices, | |
const int32_t x1_start, | |
const int32_t x1_end, | |
const int32_t x2_start, | |
const int32_t x2_end, | |
const intp_t size, | |
) except -1 nogil: | |
cdef: | |
intp_t ix1, ix2 | |
intp_t i1 = x1_start | |
intp_t i2 = x2_start | |
float64_t d = 0.0 | |
while i1 < x1_end and i2 < x2_end: | |
ix1 = x1_indices[i1] | |
ix2 = x2_indices[i2] | |
if ix1 == ix2: | |
d = fmax(d, fabs(x1_data[i1] - x2_data[i2])) | |
i1 = i1 + 1 | |
i2 = i2 + 1 | |
elif ix1 < ix2: | |
d = fmax(d, fabs(x1_data[i1])) | |
i1 = i1 + 1 | |
else: | |
d = fmax(d, fabs(x2_data[i2])) | |
i2 = i2 + 1 | |
if i1 == x1_end: | |
while i2 < x2_end: | |
d = fmax(d, fabs(x2_data[i2])) | |
i2 = i2 + 1 | |
else: | |
while i1 < x1_end: | |
d = fmax(d, fabs(x1_data[i1])) | |
i1 = i1 + 1 | |
return d | |
#------------------------------------------------------------ | |
# Minkowski Distance | |
cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): | |
r"""Minkowski Distance | |
.. math:: | |
D(x, y) = {||u-v||}_p | |
when w is None. | |
Here is the more general expanded expression for the weighted case: | |
.. math:: | |
D(x, y) = [\sum_i w_i *|x_i - y_i|^p] ^ (1/p) | |
Parameters | |
---------- | |
p : float | |
The order of the p-norm of the difference (see above). | |
.. versionchanged:: 1.4.0 | |
Minkowski distance allows `p` to be `0<p<1`. | |
w : (N,) array-like (optional) | |
The weight vector. | |
Minkowski Distance requires p > 0 and finite. | |
When :math:`p \in (0,1)`, it isn't a true metric but is permissible when | |
the triangular inequality isn't necessary. | |
For p = infinity, use ChebyshevDistance. | |
Note that for p=1, ManhattanDistance is more efficient, and for | |
p=2, EuclideanDistance is more efficient. | |
""" | |
def __init__(self, p, w=None): | |
if p <= 0: | |
raise ValueError("p must be greater than 0") | |
elif np.isinf(p): | |
raise ValueError("MinkowskiDistance requires finite p. " | |
"For p=inf, use ChebyshevDistance.") | |
self.p = p | |
if w is not None: | |
w_array = check_array( | |
w, ensure_2d=False, dtype=np.float64, input_name="w" | |
) | |
if (w_array < 0).any(): | |
raise ValueError("w cannot contain negative weights") | |
self.vec = w_array | |
self.size = self.vec.shape[0] | |
else: | |
self.vec = np.asarray([], dtype=np.float64) | |
self.size = 0 | |
def _validate_data(self, X): | |
if self.size > 0 and X.shape[1] != self.size: | |
raise ValueError("MinkowskiDistance: the size of w must match " | |
f"the number of features ({X.shape[1]}). " | |
f"Currently len(w)={self.size}.") | |
cdef inline {{INPUT_DTYPE_t}} rdist( | |
self, | |
const {{INPUT_DTYPE_t}}* x1, | |
const {{INPUT_DTYPE_t}}* x2, | |
intp_t size, | |
) except -1 nogil: | |
cdef float64_t d=0 | |
cdef intp_t j | |
cdef bint has_w = self.size > 0 | |
if has_w: | |
for j in range(size): | |
d += (self.vec[j] * pow(fabs(x1[j] - x2[j]), self.p)) | |
else: | |
for j in range(size): | |
d += (pow(fabs(x1[j] - x2[j]), self.p)) | |
return d | |
cdef inline {{INPUT_DTYPE_t}} dist( | |
self, | |
const {{INPUT_DTYPE_t}}* x1, | |
const {{INPUT_DTYPE_t}}* x2, | |
intp_t size, | |
) except -1 nogil: | |
return pow(self.rdist(x1, x2, size), 1. / self.p) | |
cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: | |
return pow(rdist, 1. / self.p) | |
cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: | |
return pow(dist, self.p) | |
def rdist_to_dist(self, rdist): | |
return rdist ** (1. / self.p) | |
def dist_to_rdist(self, dist): | |
return dist ** self.p | |
cdef inline {{INPUT_DTYPE_t}} rdist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t* x1_indices, | |
const {{INPUT_DTYPE_t}}* x2_data, | |
const int32_t* x2_indices, | |
const int32_t x1_start, | |
const int32_t x1_end, | |
const int32_t x2_start, | |
const int32_t x2_end, | |
const intp_t size, | |
) except -1 nogil: | |
cdef: | |
intp_t ix1, ix2 | |
intp_t i1 = x1_start | |
intp_t i2 = x2_start | |
float64_t d = 0.0 | |
bint has_w = self.size > 0 | |
if has_w: | |
while i1 < x1_end and i2 < x2_end: | |
ix1 = x1_indices[i1] | |
ix2 = x2_indices[i2] | |
if ix1 == ix2: | |
d = d + (self.vec[ix1] * pow(fabs( | |
x1_data[i1] - x2_data[i2] | |
), self.p)) | |
i1 = i1 + 1 | |
i2 = i2 + 1 | |
elif ix1 < ix2: | |
d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) | |
i1 = i1 + 1 | |
else: | |
d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) | |
i2 = i2 + 1 | |
if i1 == x1_end: | |
while i2 < x2_end: | |
ix2 = x2_indices[i2] | |
d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) | |
i2 = i2 + 1 | |
else: | |
while i1 < x1_end: | |
ix1 = x1_indices[i1] | |
d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) | |
i1 = i1 + 1 | |
return d | |
else: | |
while i1 < x1_end and i2 < x2_end: | |
ix1 = x1_indices[i1] | |
ix2 = x2_indices[i2] | |
if ix1 == ix2: | |
d = d + (pow(fabs( | |
x1_data[i1] - x2_data[i2] | |
), self.p)) | |
i1 = i1 + 1 | |
i2 = i2 + 1 | |
elif ix1 < ix2: | |
d = d + (pow(fabs(x1_data[i1]), self.p)) | |
i1 = i1 + 1 | |
else: | |
d = d + (pow(fabs(x2_data[i2]), self.p)) | |
i2 = i2 + 1 | |
if i1 == x1_end: | |
while i2 < x2_end: | |
d = d + (pow(fabs(x2_data[i2]), self.p)) | |
i2 = i2 + 1 | |
else: | |
while i1 < x1_end: | |
d = d + (pow(fabs(x1_data[i1]), self.p)) | |
i1 = i1 + 1 | |
return d | |
cdef inline {{INPUT_DTYPE_t}} dist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t* x1_indices, | |
const {{INPUT_DTYPE_t}}* x2_data, | |
const int32_t* x2_indices, | |
const int32_t x1_start, | |
const int32_t x1_end, | |
const int32_t x2_start, | |
const int32_t x2_end, | |
const intp_t size, | |
) except -1 nogil: | |
return pow( | |
self.rdist_csr( | |
x1_data, | |
x1_indices, | |
x2_data, | |
x2_indices, | |
x1_start, | |
x1_end, | |
x2_start, | |
x2_end, | |
size, | |
), | |
1 / self.p | |
) | |
#------------------------------------------------------------ | |
# Mahalanobis Distance | |
# d = sqrt( (x - y)^T V^-1 (x - y) ) | |
cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): | |
"""Mahalanobis Distance | |
.. math:: | |
D(x, y) = \sqrt{ (x - y)^T V^{-1} (x - y) } | |
Parameters | |
---------- | |
V : array-like | |
Symmetric positive-definite covariance matrix. | |
The inverse of this matrix will be explicitly computed. | |
VI : array-like | |
optionally specify the inverse directly. If VI is passed, | |
then V is not referenced. | |
""" | |
cdef float64_t[::1] buffer | |
def __init__(self, V=None, VI=None): | |
if VI is None: | |
if V is None: | |
raise ValueError("Must provide either V or VI " | |
"for Mahalanobis distance") | |
VI = np.linalg.inv(V) | |
if VI.ndim != 2 or VI.shape[0] != VI.shape[1]: | |
raise ValueError("V/VI must be square") | |
self.mat = np.asarray(VI, dtype=np.float64, order='C') | |
self.size = self.mat.shape[0] | |
# We need to create a buffer to store the vectors' coordinates' differences | |
self.buffer = np.zeros(self.size, dtype=np.float64) | |
def __setstate__(self, state): | |
super().__setstate__(state) | |
self.size = self.mat.shape[0] | |
self.buffer = np.zeros(self.size, dtype=np.float64) | |
def _validate_data(self, X): | |
if X.shape[1] != self.size: | |
raise ValueError('Mahalanobis dist: size of V does not match') | |
cdef inline {{INPUT_DTYPE_t}} rdist( | |
self, | |
const {{INPUT_DTYPE_t}}* x1, | |
const {{INPUT_DTYPE_t}}* x2, | |
intp_t size, | |
) except -1 nogil: | |
cdef float64_t tmp, d = 0 | |
cdef intp_t i, j | |
# compute (x1 - x2).T * VI * (x1 - x2) | |
for i in range(size): | |
self.buffer[i] = x1[i] - x2[i] | |
for i in range(size): | |
tmp = 0 | |
for j in range(size): | |
tmp += self.mat[i, j] * self.buffer[j] | |
d += tmp * self.buffer[i] | |
return d | |
cdef inline {{INPUT_DTYPE_t}} dist( | |
self, | |
const {{INPUT_DTYPE_t}}* x1, | |
const {{INPUT_DTYPE_t}}* x2, | |
intp_t size, | |
) except -1 nogil: | |
return sqrt(self.rdist(x1, x2, size)) | |
cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: | |
return sqrt(rdist) | |
cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: | |
return dist * dist | |
def rdist_to_dist(self, rdist): | |
return np.sqrt(rdist) | |
def dist_to_rdist(self, dist): | |
return dist ** 2 | |
cdef inline {{INPUT_DTYPE_t}} rdist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t* x1_indices, | |
const {{INPUT_DTYPE_t}}* x2_data, | |
const int32_t* x2_indices, | |
const int32_t x1_start, | |
const int32_t x1_end, | |
const int32_t x2_start, | |
const int32_t x2_end, | |
const intp_t size, | |
) except -1 nogil: | |
cdef: | |
intp_t ix1, ix2 | |
intp_t i1 = x1_start | |
intp_t i2 = x2_start | |
float64_t tmp, d = 0.0 | |
while i1 < x1_end and i2 < x2_end: | |
ix1 = x1_indices[i1] | |
ix2 = x2_indices[i2] | |
if ix1 == ix2: | |
self.buffer[ix1] = x1_data[i1] - x2_data[i2] | |
i1 = i1 + 1 | |
i2 = i2 + 1 | |
elif ix1 < ix2: | |
self.buffer[ix1] = x1_data[i1] | |
i1 = i1 + 1 | |
else: | |
self.buffer[ix2] = - x2_data[i2] | |
i2 = i2 + 1 | |
if i1 == x1_end: | |
while i2 < x2_end: | |
ix2 = x2_indices[i2] | |
self.buffer[ix2] = - x2_data[i2] | |
i2 = i2 + 1 | |
else: | |
while i1 < x1_end: | |
ix1 = x1_indices[i1] | |
self.buffer[ix1] = x1_data[i1] | |
i1 = i1 + 1 | |
for i in range(size): | |
tmp = 0 | |
for j in range(size): | |
tmp += self.mat[i, j] * self.buffer[j] | |
d += tmp * self.buffer[i] | |
return d | |
cdef inline {{INPUT_DTYPE_t}} dist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t* x1_indices, | |
const {{INPUT_DTYPE_t}}* x2_data, | |
const int32_t* x2_indices, | |
const int32_t x1_start, | |
const int32_t x1_end, | |
const int32_t x2_start, | |
const int32_t x2_end, | |
const intp_t size, | |
) except -1 nogil: | |
return sqrt( | |
self.rdist_csr( | |
x1_data, | |
x1_indices, | |
x2_data, | |
x2_indices, | |
x1_start, | |
x1_end, | |
x2_start, | |
x2_end, | |
size, | |
)) | |
#------------------------------------------------------------ | |
# Hamming Distance | |
# d = N_unequal(x, y) / N_tot | |
cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): | |
r"""Hamming Distance | |
Hamming distance is meant for discrete-valued vectors, though it is | |
a valid metric for real-valued vectors. | |
.. math:: | |
D(x, y) = \frac{1}{N} \sum_i \delta_{x_i, y_i} | |
""" | |
cdef inline {{INPUT_DTYPE_t}} dist( | |
self, | |
const {{INPUT_DTYPE_t}}* x1, | |
const {{INPUT_DTYPE_t}}* x2, | |
intp_t size, | |
) except -1 nogil: | |
cdef int n_unequal = 0 | |
cdef intp_t j | |
for j in range(size): | |
if x1[j] != x2[j]: | |
n_unequal += 1 | |
return float(n_unequal) / size | |
cdef inline {{INPUT_DTYPE_t}} dist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t* x1_indices, | |
const {{INPUT_DTYPE_t}}* x2_data, | |
const int32_t* x2_indices, | |
const int32_t x1_start, | |
const int32_t x1_end, | |
const int32_t x2_start, | |
const int32_t x2_end, | |
const intp_t size, | |
) except -1 nogil: | |
cdef: | |
intp_t ix1, ix2 | |
intp_t i1 = x1_start | |
intp_t i2 = x2_start | |
float64_t d = 0.0 | |
while i1 < x1_end and i2 < x2_end: | |
ix1 = x1_indices[i1] | |
ix2 = x2_indices[i2] | |
if ix1 == ix2: | |
d += (x1_data[i1] != x2_data[i2]) | |
i1 = i1 + 1 | |
i2 = i2 + 1 | |
elif ix1 < ix2: | |
d += (x1_data[i1] != 0) | |
i1 = i1 + 1 | |
else: | |
d += (x2_data[i2] != 0) | |
i2 = i2 + 1 | |
if i1 == x1_end: | |
while i2 < x2_end: | |
d += (x2_data[i2] != 0) | |
i2 = i2 + 1 | |
else: | |
while i1 < x1_end: | |
d += (x1_data[i1] != 0) | |
i1 = i1 + 1 | |
d /= size | |
return d | |
#------------------------------------------------------------ | |
# Canberra Distance | |
# D(x, y) = sum[ abs(x_i - y_i) / (abs(x_i) + abs(y_i)) ] | |
cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): | |
r"""Canberra Distance | |
Canberra distance is meant for discrete-valued vectors, though it is | |
a valid metric for real-valued vectors. | |
.. math:: | |
D(x, y) = \sum_i \frac{|x_i - y_i|}{|x_i| + |y_i|} | |
""" | |
cdef inline {{INPUT_DTYPE_t}} dist( | |
self, | |
const {{INPUT_DTYPE_t}}* x1, | |
const {{INPUT_DTYPE_t}}* x2, | |
intp_t size, | |
) except -1 nogil: | |
cdef float64_t denom, d = 0 | |
cdef intp_t j | |
for j in range(size): | |
denom = fabs(x1[j]) + fabs(x2[j]) | |
if denom > 0: | |
d += fabs(x1[j] - x2[j]) / denom | |
return d | |
cdef inline {{INPUT_DTYPE_t}} dist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t* x1_indices, | |
const {{INPUT_DTYPE_t}}* x2_data, | |
const int32_t* x2_indices, | |
const int32_t x1_start, | |
const int32_t x1_end, | |
const int32_t x2_start, | |
const int32_t x2_end, | |
const intp_t size, | |
) except -1 nogil: | |
cdef: | |
intp_t ix1, ix2 | |
intp_t i1 = x1_start | |
intp_t i2 = x2_start | |
float64_t d = 0.0 | |
while i1 < x1_end and i2 < x2_end: | |
ix1 = x1_indices[i1] | |
ix2 = x2_indices[i2] | |
if ix1 == ix2: | |
d += ( | |
fabs(x1_data[i1] - x2_data[i2]) / | |
(fabs(x1_data[i1]) + fabs(x2_data[i2])) | |
) | |
i1 = i1 + 1 | |
i2 = i2 + 1 | |
elif ix1 < ix2: | |
d += 1. | |
i1 = i1 + 1 | |
else: | |
d += 1. | |
i2 = i2 + 1 | |
if i1 == x1_end: | |
while i2 < x2_end: | |
d += 1. | |
i2 = i2 + 1 | |
else: | |
while i1 < x1_end: | |
d += 1. | |
i1 = i1 + 1 | |
return d | |
#------------------------------------------------------------ | |
# Bray-Curtis Distance | |
# D(x, y) = sum[abs(x_i - y_i)] / sum[abs(x_i) + abs(y_i)] | |
cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): | |
r"""Bray-Curtis Distance | |
Bray-Curtis distance is meant for discrete-valued vectors, though it is | |
a valid metric for real-valued vectors. | |
.. math:: | |
D(x, y) = \frac{\sum_i |x_i - y_i|}{\sum_i(|x_i| + |y_i|)} | |
""" | |
cdef inline {{INPUT_DTYPE_t}} dist( | |
self, | |
const {{INPUT_DTYPE_t}}* x1, | |
const {{INPUT_DTYPE_t}}* x2, | |
intp_t size, | |
) except -1 nogil: | |
cdef float64_t num = 0, denom = 0 | |
cdef intp_t j | |
for j in range(size): | |
num += fabs(x1[j] - x2[j]) | |
denom += fabs(x1[j]) + fabs(x2[j]) | |
if denom > 0: | |
return num / denom | |
else: | |
return 0.0 | |
cdef inline {{INPUT_DTYPE_t}} dist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t* x1_indices, | |
const {{INPUT_DTYPE_t}}* x2_data, | |
const int32_t* x2_indices, | |
const int32_t x1_start, | |
const int32_t x1_end, | |
const int32_t x2_start, | |
const int32_t x2_end, | |
const intp_t size, | |
) except -1 nogil: | |
cdef: | |
intp_t ix1, ix2 | |
intp_t i1 = x1_start | |
intp_t i2 = x2_start | |
float64_t num = 0.0 | |
float64_t denom = 0.0 | |
while i1 < x1_end and i2 < x2_end: | |
ix1 = x1_indices[i1] | |
ix2 = x2_indices[i2] | |
if ix1 == ix2: | |
num += fabs(x1_data[i1] - x2_data[i2]) | |
denom += fabs(x1_data[i1]) + fabs(x2_data[i2]) | |
i1 = i1 + 1 | |
i2 = i2 + 1 | |
elif ix1 < ix2: | |
num += fabs(x1_data[i1]) | |
denom += fabs(x1_data[i1]) | |
i1 = i1 + 1 | |
else: | |
num += fabs(x2_data[i2]) | |
denom += fabs(x2_data[i2]) | |
i2 = i2 + 1 | |
if i1 == x1_end: | |
while i2 < x2_end: | |
num += fabs(x1_data[i1]) | |
denom += fabs(x1_data[i1]) | |
i2 = i2 + 1 | |
else: | |
while i1 < x1_end: | |
num += fabs(x2_data[i2]) | |
denom += fabs(x2_data[i2]) | |
i1 = i1 + 1 | |
return num / denom | |
#------------------------------------------------------------ | |
# Jaccard Distance (boolean) | |
# D(x, y) = N_unequal(x, y) / N_nonzero(x, y) | |
cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): | |
r"""Jaccard Distance | |
Jaccard Distance is a dissimilarity measure for boolean-valued | |
vectors. All nonzero entries will be treated as True, zero entries will | |
be treated as False. | |
D(x, y) = (N_TF + N_FT) / (N_TT + N_TF + N_FT) | |
""" | |
cdef inline {{INPUT_DTYPE_t}} dist( | |
self, | |
const {{INPUT_DTYPE_t}}* x1, | |
const {{INPUT_DTYPE_t}}* x2, | |
intp_t size, | |
) except -1 nogil: | |
cdef int tf1, tf2, n_eq = 0, nnz = 0 | |
cdef intp_t j | |
for j in range(size): | |
tf1 = x1[j] != 0 | |
tf2 = x2[j] != 0 | |
nnz += (tf1 or tf2) | |
n_eq += (tf1 and tf2) | |
# Based on https://github.com/scipy/scipy/pull/7373 | |
# When comparing two all-zero vectors, scipy>=1.2.0 jaccard metric | |
# was changed to return 0, instead of nan. | |
if nnz == 0: | |
return 0 | |
return (nnz - n_eq) * 1.0 / nnz | |
cdef inline {{INPUT_DTYPE_t}} dist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t* x1_indices, | |
const {{INPUT_DTYPE_t}}* x2_data, | |
const int32_t* x2_indices, | |
const int32_t x1_start, | |
const int32_t x1_end, | |
const int32_t x2_start, | |
const int32_t x2_end, | |
const intp_t size, | |
) except -1 nogil: | |
cdef: | |
intp_t ix1, ix2 | |
intp_t i1 = x1_start | |
intp_t i2 = x2_start | |
intp_t tf1, tf2, n_tt = 0, nnz = 0 | |
while i1 < x1_end and i2 < x2_end: | |
ix1 = x1_indices[i1] | |
ix2 = x2_indices[i2] | |
tf1 = x1_data[i1] != 0 | |
tf2 = x2_data[i2] != 0 | |
if ix1 == ix2: | |
nnz += (tf1 or tf2) | |
n_tt += (tf1 and tf2) | |
i1 = i1 + 1 | |
i2 = i2 + 1 | |
elif ix1 < ix2: | |
nnz += tf1 | |
i1 = i1 + 1 | |
else: | |
nnz += tf2 | |
i2 = i2 + 1 | |
if i1 == x1_end: | |
while i2 < x2_end: | |
tf2 = x2_data[i2] != 0 | |
nnz += tf2 | |
i2 = i2 + 1 | |
else: | |
while i1 < x1_end: | |
tf1 = x1_data[i1] != 0 | |
nnz += tf1 | |
i1 = i1 + 1 | |
# Based on https://github.com/scipy/scipy/pull/7373 | |
# When comparing two all-zero vectors, scipy>=1.2.0 jaccard metric | |
# was changed to return 0, instead of nan. | |
if nnz == 0: | |
return 0 | |
return (nnz - n_tt) * 1.0 / nnz | |
#------------------------------------------------------------ | |
# Matching Distance (boolean) | |
# D(x, y) = n_neq / n | |
cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): | |
r"""Matching Distance | |
Matching Distance is a dissimilarity measure for boolean-valued | |
vectors. All nonzero entries will be treated as True, zero entries will | |
be treated as False. | |
D(x, y) = (N_TF + N_FT) / N | |
""" | |
cdef inline {{INPUT_DTYPE_t}} dist( | |
self, | |
const {{INPUT_DTYPE_t}}* x1, | |
const {{INPUT_DTYPE_t}}* x2, | |
intp_t size, | |
) except -1 nogil: | |
cdef int tf1, tf2, n_neq = 0 | |
cdef intp_t j | |
for j in range(size): | |
tf1 = x1[j] != 0 | |
tf2 = x2[j] != 0 | |
n_neq += (tf1 != tf2) | |
return n_neq * 1. / size | |
cdef inline {{INPUT_DTYPE_t}} dist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t* x1_indices, | |
const {{INPUT_DTYPE_t}}* x2_data, | |
const int32_t* x2_indices, | |
const int32_t x1_start, | |
const int32_t x1_end, | |
const int32_t x2_start, | |
const int32_t x2_end, | |
const intp_t size, | |
) except -1 nogil: | |
cdef: | |
intp_t ix1, ix2 | |
intp_t i1 = x1_start | |
intp_t i2 = x2_start | |
intp_t tf1, tf2, n_neq = 0 | |
while i1 < x1_end and i2 < x2_end: | |
ix1 = x1_indices[i1] | |
ix2 = x2_indices[i2] | |
if ix1 == ix2: | |
tf1 = x1_data[i1] != 0 | |
tf2 = x2_data[i2] != 0 | |
n_neq += (tf1 != tf2) | |
i1 = i1 + 1 | |
i2 = i2 + 1 | |
elif ix1 < ix2: | |
n_neq += (x1_data[i1] != 0) | |
i1 = i1 + 1 | |
else: | |
n_neq += (x2_data[i2] != 0) | |
i2 = i2 + 1 | |
if i1 == x1_end: | |
while i2 < x2_end: | |
n_neq += (x2_data[i2] != 0) | |
i2 = i2 + 1 | |
else: | |
while i1 < x1_end: | |
n_neq += (x1_data[i1] != 0) | |
i1 = i1 + 1 | |
return n_neq * 1.0 / size | |
#------------------------------------------------------------ | |
# Dice Distance (boolean) | |
# D(x, y) = n_neq / (2 * ntt + n_neq) | |
cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): | |
r"""Dice Distance | |
Dice Distance is a dissimilarity measure for boolean-valued | |
vectors. All nonzero entries will be treated as True, zero entries will | |
be treated as False. | |
D(x, y) = (N_TF + N_FT) / (2 * N_TT + N_TF + N_FT) | |
""" | |
cdef inline {{INPUT_DTYPE_t}} dist( | |
self, | |
const {{INPUT_DTYPE_t}}* x1, | |
const {{INPUT_DTYPE_t}}* x2, | |
intp_t size, | |
) except -1 nogil: | |
cdef int tf1, tf2, n_neq = 0, n_tt = 0 | |
cdef intp_t j | |
for j in range(size): | |
tf1 = x1[j] != 0 | |
tf2 = x2[j] != 0 | |
n_tt += (tf1 and tf2) | |
n_neq += (tf1 != tf2) | |
return n_neq / (2.0 * n_tt + n_neq) | |
cdef inline {{INPUT_DTYPE_t}} dist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t* x1_indices, | |
const {{INPUT_DTYPE_t}}* x2_data, | |
const int32_t* x2_indices, | |
const int32_t x1_start, | |
const int32_t x1_end, | |
const int32_t x2_start, | |
const int32_t x2_end, | |
const intp_t size, | |
) except -1 nogil: | |
cdef: | |
intp_t ix1, ix2 | |
intp_t i1 = x1_start | |
intp_t i2 = x2_start | |
intp_t tf1, tf2, n_tt = 0, n_neq = 0 | |
while i1 < x1_end and i2 < x2_end: | |
ix1 = x1_indices[i1] | |
ix2 = x2_indices[i2] | |
tf1 = x1_data[i1] != 0 | |
tf2 = x2_data[i2] != 0 | |
if ix1 == ix2: | |
n_tt += (tf1 and tf2) | |
n_neq += (tf1 != tf2) | |
i1 = i1 + 1 | |
i2 = i2 + 1 | |
elif ix1 < ix2: | |
n_neq += tf1 | |
i1 = i1 + 1 | |
else: | |
n_neq += tf2 | |
i2 = i2 + 1 | |
if i1 == x1_end: | |
while i2 < x2_end: | |
tf2 = x2_data[i2] != 0 | |
n_neq += tf2 | |
i2 = i2 + 1 | |
else: | |
while i1 < x1_end: | |
tf1 = x1_data[i1] != 0 | |
n_neq += tf1 | |
i1 = i1 + 1 | |
return n_neq / (2.0 * n_tt + n_neq) | |
#------------------------------------------------------------ | |
# Kulsinski Distance (boolean) | |
# D(x, y) = (ntf + nft - ntt + n) / (n_neq + n) | |
cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): | |
r"""Kulsinski Distance | |
Kulsinski Distance is a dissimilarity measure for boolean-valued | |
vectors. All nonzero entries will be treated as True, zero entries will | |
be treated as False. | |
D(x, y) = 1 - N_TT / (N + N_TF + N_FT) | |
""" | |
cdef inline {{INPUT_DTYPE_t}} dist( | |
self, | |
const {{INPUT_DTYPE_t}}* x1, | |
const {{INPUT_DTYPE_t}}* x2, | |
intp_t size, | |
) except -1 nogil: | |
cdef int tf1, tf2, n_tt = 0, n_neq = 0 | |
cdef intp_t j | |
for j in range(size): | |
tf1 = x1[j] != 0 | |
tf2 = x2[j] != 0 | |
n_neq += (tf1 != tf2) | |
n_tt += (tf1 and tf2) | |
return (n_neq - n_tt + size) * 1.0 / (n_neq + size) | |
cdef inline {{INPUT_DTYPE_t}} dist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t* x1_indices, | |
const {{INPUT_DTYPE_t}}* x2_data, | |
const int32_t* x2_indices, | |
const int32_t x1_start, | |
const int32_t x1_end, | |
const int32_t x2_start, | |
const int32_t x2_end, | |
const intp_t size, | |
) except -1 nogil: | |
cdef: | |
intp_t ix1, ix2 | |
intp_t i1 = x1_start | |
intp_t i2 = x2_start | |
intp_t tf1, tf2, n_tt = 0, n_neq = 0 | |
while i1 < x1_end and i2 < x2_end: | |
ix1 = x1_indices[i1] | |
ix2 = x2_indices[i2] | |
tf1 = x1_data[i1] != 0 | |
tf2 = x2_data[i2] != 0 | |
if ix1 == ix2: | |
n_tt += (tf1 and tf2) | |
n_neq += (tf1 != tf2) | |
i1 = i1 + 1 | |
i2 = i2 + 1 | |
elif ix1 < ix2: | |
n_neq += tf1 | |
i1 = i1 + 1 | |
else: | |
n_neq += tf2 | |
i2 = i2 + 1 | |
if i1 == x1_end: | |
while i2 < x2_end: | |
tf2 = x2_data[i2] != 0 | |
n_neq += tf2 | |
i2 = i2 + 1 | |
else: | |
while i1 < x1_end: | |
tf1 = x1_data[i1] != 0 | |
n_neq += tf1 | |
i1 = i1 + 1 | |
return (n_neq - n_tt + size) * 1.0 / (n_neq + size) | |
#------------------------------------------------------------ | |
# Rogers-Tanimoto Distance (boolean) | |
# D(x, y) = 2 * n_neq / (n + n_neq) | |
cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): | |
r"""Rogers-Tanimoto Distance | |
Rogers-Tanimoto Distance is a dissimilarity measure for boolean-valued | |
vectors. All nonzero entries will be treated as True, zero entries will | |
be treated as False. | |
D(x, y) = 2 (N_TF + N_FT) / (N + N_TF + N_FT) | |
""" | |
cdef inline {{INPUT_DTYPE_t}} dist( | |
self, | |
const {{INPUT_DTYPE_t}}* x1, | |
const {{INPUT_DTYPE_t}}* x2, | |
intp_t size, | |
) except -1 nogil: | |
cdef int tf1, tf2, n_neq = 0 | |
cdef intp_t j | |
for j in range(size): | |
tf1 = x1[j] != 0 | |
tf2 = x2[j] != 0 | |
n_neq += (tf1 != tf2) | |
return (2.0 * n_neq) / (size + n_neq) | |
cdef inline {{INPUT_DTYPE_t}} dist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t* x1_indices, | |
const {{INPUT_DTYPE_t}}* x2_data, | |
const int32_t* x2_indices, | |
const int32_t x1_start, | |
const int32_t x1_end, | |
const int32_t x2_start, | |
const int32_t x2_end, | |
const intp_t size, | |
) except -1 nogil: | |
cdef: | |
intp_t ix1, ix2 | |
intp_t i1 = x1_start | |
intp_t i2 = x2_start | |
intp_t tf1, tf2, n_neq = 0 | |
while i1 < x1_end and i2 < x2_end: | |
ix1 = x1_indices[i1] | |
ix2 = x2_indices[i2] | |
tf1 = x1_data[i1] != 0 | |
tf2 = x2_data[i2] != 0 | |
if ix1 == ix2: | |
n_neq += (tf1 != tf2) | |
i1 = i1 + 1 | |
i2 = i2 + 1 | |
elif ix1 < ix2: | |
n_neq += tf1 | |
i1 = i1 + 1 | |
else: | |
n_neq += tf2 | |
i2 = i2 + 1 | |
if i1 == x1_end: | |
while i2 < x2_end: | |
tf2 = x2_data[i2] != 0 | |
n_neq += tf2 | |
i2 = i2 + 1 | |
else: | |
while i1 < x1_end: | |
tf1 = x1_data[i1] != 0 | |
n_neq += tf1 | |
i1 = i1 + 1 | |
return (2.0 * n_neq) / (size + n_neq) | |
#------------------------------------------------------------ | |
# Russell-Rao Distance (boolean) | |
# D(x, y) = (n - ntt) / n | |
cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): | |
r"""Russell-Rao Distance | |
Russell-Rao Distance is a dissimilarity measure for boolean-valued | |
vectors. All nonzero entries will be treated as True, zero entries will | |
be treated as False. | |
D(x, y) = (N - N_TT) / N | |
""" | |
cdef inline {{INPUT_DTYPE_t}} dist( | |
self, | |
const {{INPUT_DTYPE_t}}* x1, | |
const {{INPUT_DTYPE_t}}* x2, | |
intp_t size, | |
) except -1 nogil: | |
cdef int tf1, tf2, n_tt = 0 | |
cdef intp_t j | |
for j in range(size): | |
tf1 = x1[j] != 0 | |
tf2 = x2[j] != 0 | |
n_tt += (tf1 and tf2) | |
return (size - n_tt) * 1. / size | |
cdef inline {{INPUT_DTYPE_t}} dist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t* x1_indices, | |
const {{INPUT_DTYPE_t}}* x2_data, | |
const int32_t* x2_indices, | |
const int32_t x1_start, | |
const int32_t x1_end, | |
const int32_t x2_start, | |
const int32_t x2_end, | |
const intp_t size, | |
) except -1 nogil: | |
cdef: | |
intp_t ix1, ix2 | |
intp_t i1 = x1_start | |
intp_t i2 = x2_start | |
intp_t tf1, tf2, n_tt = 0 | |
while i1 < x1_end and i2 < x2_end: | |
ix1 = x1_indices[i1] | |
ix2 = x2_indices[i2] | |
tf1 = x1_data[i1] != 0 | |
tf2 = x2_data[i2] != 0 | |
if ix1 == ix2: | |
n_tt += (tf1 and tf2) | |
i1 = i1 + 1 | |
i2 = i2 + 1 | |
elif ix1 < ix2: | |
i1 = i1 + 1 | |
else: | |
i2 = i2 + 1 | |
# We don't need to go through all the longest | |
# vector because tf1 or tf2 will be false | |
# and thus n_tt won't be increased. | |
return (size - n_tt) * 1. / size | |
#------------------------------------------------------------ | |
# Sokal-Michener Distance (boolean) | |
# D(x, y) = 2 * n_neq / (n + n_neq) | |
cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): | |
r"""Sokal-Michener Distance | |
Sokal-Michener Distance is a dissimilarity measure for boolean-valued | |
vectors. All nonzero entries will be treated as True, zero entries will | |
be treated as False. | |
D(x, y) = 2 (N_TF + N_FT) / (N + N_TF + N_FT) | |
""" | |
cdef inline {{INPUT_DTYPE_t}} dist( | |
self, | |
const {{INPUT_DTYPE_t}}* x1, | |
const {{INPUT_DTYPE_t}}* x2, | |
intp_t size, | |
) except -1 nogil: | |
cdef int tf1, tf2, n_neq = 0 | |
cdef intp_t j | |
for j in range(size): | |
tf1 = x1[j] != 0 | |
tf2 = x2[j] != 0 | |
n_neq += (tf1 != tf2) | |
return (2.0 * n_neq) / (size + n_neq) | |
cdef inline {{INPUT_DTYPE_t}} dist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t* x1_indices, | |
const {{INPUT_DTYPE_t}}* x2_data, | |
const int32_t* x2_indices, | |
const int32_t x1_start, | |
const int32_t x1_end, | |
const int32_t x2_start, | |
const int32_t x2_end, | |
const intp_t size, | |
) except -1 nogil: | |
cdef: | |
intp_t ix1, ix2 | |
intp_t i1 = x1_start | |
intp_t i2 = x2_start | |
intp_t tf1, tf2, n_neq = 0 | |
while i1 < x1_end and i2 < x2_end: | |
ix1 = x1_indices[i1] | |
ix2 = x2_indices[i2] | |
tf1 = x1_data[i1] != 0 | |
tf2 = x2_data[i2] != 0 | |
if ix1 == ix2: | |
n_neq += (tf1 != tf2) | |
i1 = i1 + 1 | |
i2 = i2 + 1 | |
elif ix1 < ix2: | |
n_neq += tf1 | |
i1 = i1 + 1 | |
else: | |
n_neq += tf2 | |
i2 = i2 + 1 | |
if i1 == x1_end: | |
while i2 < x2_end: | |
tf2 = x2_data[i2] != 0 | |
n_neq += tf2 | |
i2 = i2 + 1 | |
else: | |
while i1 < x1_end: | |
tf1 = x1_data[i1] != 0 | |
n_neq += tf1 | |
i1 = i1 + 1 | |
return (2.0 * n_neq) / (size + n_neq) | |
#------------------------------------------------------------ | |
# Sokal-Sneath Distance (boolean) | |
# D(x, y) = n_neq / (0.5 * n_tt + n_neq) | |
cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): | |
r"""Sokal-Sneath Distance | |
Sokal-Sneath Distance is a dissimilarity measure for boolean-valued | |
vectors. All nonzero entries will be treated as True, zero entries will | |
be treated as False. | |
D(x, y) = (N_TF + N_FT) / (N_TT / 2 + N_FT + N_TF) | |
""" | |
cdef inline {{INPUT_DTYPE_t}} dist( | |
self, | |
const {{INPUT_DTYPE_t}}* x1, | |
const {{INPUT_DTYPE_t}}* x2, | |
intp_t size, | |
) except -1 nogil: | |
cdef int tf1, tf2, n_tt = 0, n_neq = 0 | |
cdef intp_t j | |
for j in range(size): | |
tf1 = x1[j] != 0 | |
tf2 = x2[j] != 0 | |
n_neq += (tf1 != tf2) | |
n_tt += (tf1 and tf2) | |
return n_neq / (0.5 * n_tt + n_neq) | |
cdef inline {{INPUT_DTYPE_t}} dist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t* x1_indices, | |
const {{INPUT_DTYPE_t}}* x2_data, | |
const int32_t* x2_indices, | |
const int32_t x1_start, | |
const int32_t x1_end, | |
const int32_t x2_start, | |
const int32_t x2_end, | |
const intp_t size, | |
) except -1 nogil: | |
cdef: | |
intp_t ix1, ix2 | |
intp_t i1 = x1_start | |
intp_t i2 = x2_start | |
intp_t tf1, tf2, n_tt = 0, n_neq = 0 | |
while i1 < x1_end and i2 < x2_end: | |
ix1 = x1_indices[i1] | |
ix2 = x2_indices[i2] | |
tf1 = x1_data[i1] != 0 | |
tf2 = x2_data[i2] != 0 | |
if ix1 == ix2: | |
n_tt += (tf1 and tf2) | |
n_neq += (tf1 != tf2) | |
i1 = i1 + 1 | |
i2 = i2 + 1 | |
elif ix1 < ix2: | |
n_neq += tf1 | |
i1 = i1 + 1 | |
else: | |
n_neq += tf2 | |
i2 = i2 + 1 | |
if i1 == x1_end: | |
while i2 < x2_end: | |
tf2 = x2_data[i2] != 0 | |
n_neq += tf2 | |
i2 = i2 + 1 | |
else: | |
while i1 < x1_end: | |
tf1 = x1_data[i1] != 0 | |
n_neq += tf1 | |
i1 = i1 + 1 | |
return n_neq / (0.5 * n_tt + n_neq) | |
#------------------------------------------------------------ | |
# Haversine Distance (2 dimensional) | |
# D(x, y) = 2 arcsin{sqrt[sin^2 ((x1 - y1) / 2) | |
# + cos(x1) cos(y1) sin^2 ((x2 - y2) / 2)]} | |
cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): | |
"""Haversine (Spherical) Distance | |
The Haversine distance is the angular distance between two points on | |
the surface of a sphere. The first distance of each point is assumed | |
to be the latitude, the second is the longitude, given in radians. | |
The dimension of the points must be 2: | |
D(x, y) = 2 arcsin[sqrt{sin^2((x1 - y1) / 2) + cos(x1)cos(y1)sin^2((x2 - y2) / 2)}] | |
""" | |
def _validate_data(self, X): | |
if X.shape[1] != 2: | |
raise ValueError("Haversine distance only valid " | |
"in 2 dimensions") | |
cdef inline {{INPUT_DTYPE_t}} rdist(self, | |
const {{INPUT_DTYPE_t}}* x1, | |
const {{INPUT_DTYPE_t}}* x2, | |
intp_t size, | |
) except -1 nogil: | |
cdef float64_t sin_0 = sin(0.5 * ((x1[0]) - (x2[0]))) | |
cdef float64_t sin_1 = sin(0.5 * ((x1[1]) - (x2[1]))) | |
return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1) | |
cdef inline {{INPUT_DTYPE_t}} dist(self, | |
const {{INPUT_DTYPE_t}}* x1, | |
const {{INPUT_DTYPE_t}}* x2, | |
intp_t size, | |
) except -1 nogil: | |
return 2 * asin(sqrt(self.rdist(x1, x2, size))) | |
cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: | |
return 2 * asin(sqrt(rdist)) | |
cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: | |
cdef float64_t tmp = sin(0.5 * dist) | |
return tmp * tmp | |
def rdist_to_dist(self, rdist): | |
return 2 * np.arcsin(np.sqrt(rdist)) | |
def dist_to_rdist(self, dist): | |
tmp = np.sin(0.5 * dist) | |
return tmp * tmp | |
cdef inline {{INPUT_DTYPE_t}} dist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t* x1_indices, | |
const {{INPUT_DTYPE_t}}* x2_data, | |
const int32_t* x2_indices, | |
const int32_t x1_start, | |
const int32_t x1_end, | |
const int32_t x2_start, | |
const int32_t x2_end, | |
const intp_t size, | |
) except -1 nogil: | |
return 2 * asin(sqrt(self.rdist_csr( | |
x1_data, | |
x1_indices, | |
x2_data, | |
x2_indices, | |
x1_start, | |
x1_end, | |
x2_start, | |
x2_end, | |
size, | |
))) | |
cdef inline {{INPUT_DTYPE_t}} rdist_csr( | |
self, | |
const {{INPUT_DTYPE_t}}* x1_data, | |
const int32_t* x1_indices, | |
const {{INPUT_DTYPE_t}}* x2_data, | |
const int32_t* x2_indices, | |
const int32_t x1_start, | |
const int32_t x1_end, | |
const int32_t x2_start, | |
const int32_t x2_end, | |
const intp_t size, | |
) except -1 nogil: | |
cdef: | |
intp_t ix1, ix2 | |
intp_t i1 = x1_start | |
intp_t i2 = x2_start | |
float64_t x1_0 = 0 | |
float64_t x1_1 = 0 | |
float64_t x2_0 = 0 | |
float64_t x2_1 = 0 | |
float64_t sin_0 | |
float64_t sin_1 | |
while i1 < x1_end and i2 < x2_end: | |
ix1 = x1_indices[i1] | |
ix2 = x2_indices[i2] | |
# Find the components in the 2D vectors to work with | |
x1_component = ix1 if (x1_start == 0) else ix1 % x1_start | |
x2_component = ix2 if (x2_start == 0) else ix2 % x2_start | |
if x1_component == 0: | |
x1_0 = x1_data[i1] | |
else: | |
x1_1 = x1_data[i1] | |
if x2_component == 0: | |
x2_0 = x2_data[i2] | |
else: | |
x2_1 = x2_data[i2] | |
i1 = i1 + 1 | |
i2 = i2 + 1 | |
if i1 == x1_end: | |
while i2 < x2_end: | |
ix2 = x2_indices[i2] | |
x2_component = ix2 if (x2_start == 0) else ix2 % x2_start | |
if x2_component == 0: | |
x2_0 = x2_data[i2] | |
else: | |
x2_1 = x2_data[i2] | |
i2 = i2 + 1 | |
else: | |
while i1 < x1_end: | |
ix1 = x1_indices[i1] | |
x1_component = ix1 if (x1_start == 0) else ix1 % x1_start | |
if x1_component == 0: | |
x1_0 = x1_data[i1] | |
else: | |
x1_1 = x1_data[i1] | |
i1 = i1 + 1 | |
sin_0 = sin(0.5 * (x1_0 - x2_0)) | |
sin_1 = sin(0.5 * (x1_1 - x2_1)) | |
return (sin_0 * sin_0 + cos(x1_0) * cos(x2_0) * sin_1 * sin_1) | |
#------------------------------------------------------------ | |
# User-defined distance | |
# | |
cdef class PyFuncDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): | |
"""PyFunc Distance | |
A user-defined distance | |
Parameters | |
---------- | |
func : function | |
func should take two numpy arrays as input, and return a distance. | |
""" | |
def __init__(self, func, **kwargs): | |
self.func = func | |
self.kwargs = kwargs | |
# in cython < 0.26, GIL was required to be acquired during definition of | |
# the function and inside the body of the function. This behaviour is not | |
# allowed in cython >= 0.26 since it is a redundant GIL acquisition. The | |
# only way to be back compatible is to inherit `dist` from the base class | |
# without GIL and called an inline `_dist` which acquire GIL. | |
cdef inline {{INPUT_DTYPE_t}} dist( | |
self, | |
const {{INPUT_DTYPE_t}}* x1, | |
const {{INPUT_DTYPE_t}}* x2, | |
intp_t size, | |
) except -1 nogil: | |
return self._dist(x1, x2, size) | |
cdef inline {{INPUT_DTYPE_t}} _dist( | |
self, | |
const {{INPUT_DTYPE_t}}* x1, | |
const {{INPUT_DTYPE_t}}* x2, | |
intp_t size, | |
) except -1 with gil: | |
cdef: | |
object x1arr = _buffer_to_ndarray{{name_suffix}}(x1, size) | |
object x2arr = _buffer_to_ndarray{{name_suffix}}(x2, size) | |
d = self.func(x1arr, x2arr, **self.kwargs) | |
try: | |
# Cython generates code here that results in a TypeError | |
# if d is the wrong type. | |
return d | |
except TypeError: | |
raise TypeError("Custom distance function must accept two " | |
"vectors and return a float.") | |
{{endfor}} | |