|
|
|
|
|
import numpy as np |
|
from ._ckdtree import cKDTree, cKDTreeNode |
|
|
|
__all__ = ['minkowski_distance_p', 'minkowski_distance', |
|
'distance_matrix', |
|
'Rectangle', 'KDTree'] |
|
|
|
|
|
def minkowski_distance_p(x, y, p=2): |
|
"""Compute the pth power of the L**p distance between two arrays. |
|
|
|
For efficiency, this function computes the L**p distance but does |
|
not extract the pth root. If `p` is 1 or infinity, this is equal to |
|
the actual L**p distance. |
|
|
|
The last dimensions of `x` and `y` must be the same length. Any |
|
other dimensions must be compatible for broadcasting. |
|
|
|
Parameters |
|
---------- |
|
x : (..., K) array_like |
|
Input array. |
|
y : (..., K) array_like |
|
Input array. |
|
p : float, 1 <= p <= infinity |
|
Which Minkowski p-norm to use. |
|
|
|
Returns |
|
------- |
|
dist : ndarray |
|
pth power of the distance between the input arrays. |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.spatial import minkowski_distance_p |
|
>>> minkowski_distance_p([[0, 0], [0, 0]], [[1, 1], [0, 1]]) |
|
array([2., 1.]) |
|
|
|
""" |
|
x = np.asarray(x) |
|
y = np.asarray(y) |
|
|
|
|
|
|
|
|
|
common_datatype = np.promote_types(np.promote_types(x.dtype, y.dtype), |
|
'float64') |
|
|
|
|
|
x = x.astype(common_datatype) |
|
y = y.astype(common_datatype) |
|
|
|
if p == np.inf: |
|
return np.amax(np.abs(y-x), axis=-1) |
|
elif p == 1: |
|
return np.sum(np.abs(y-x), axis=-1) |
|
else: |
|
return np.sum(np.abs(y-x)**p, axis=-1) |
|
|
|
|
|
def minkowski_distance(x, y, p=2): |
|
"""Compute the L**p distance between two arrays. |
|
|
|
The last dimensions of `x` and `y` must be the same length. Any |
|
other dimensions must be compatible for broadcasting. |
|
|
|
Parameters |
|
---------- |
|
x : (..., K) array_like |
|
Input array. |
|
y : (..., K) array_like |
|
Input array. |
|
p : float, 1 <= p <= infinity |
|
Which Minkowski p-norm to use. |
|
|
|
Returns |
|
------- |
|
dist : ndarray |
|
Distance between the input arrays. |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.spatial import minkowski_distance |
|
>>> minkowski_distance([[0, 0], [0, 0]], [[1, 1], [0, 1]]) |
|
array([ 1.41421356, 1. ]) |
|
|
|
""" |
|
x = np.asarray(x) |
|
y = np.asarray(y) |
|
if p == np.inf or p == 1: |
|
return minkowski_distance_p(x, y, p) |
|
else: |
|
return minkowski_distance_p(x, y, p)**(1./p) |
|
|
|
|
|
class Rectangle: |
|
"""Hyperrectangle class. |
|
|
|
Represents a Cartesian product of intervals. |
|
""" |
|
def __init__(self, maxes, mins): |
|
"""Construct a hyperrectangle.""" |
|
self.maxes = np.maximum(maxes,mins).astype(float) |
|
self.mins = np.minimum(maxes,mins).astype(float) |
|
self.m, = self.maxes.shape |
|
|
|
def __repr__(self): |
|
return f"<Rectangle {list(zip(self.mins, self.maxes))}>" |
|
|
|
def volume(self): |
|
"""Total volume.""" |
|
return np.prod(self.maxes-self.mins) |
|
|
|
def split(self, d, split): |
|
"""Produce two hyperrectangles by splitting. |
|
|
|
In general, if you need to compute maximum and minimum |
|
distances to the children, it can be done more efficiently |
|
by updating the maximum and minimum distances to the parent. |
|
|
|
Parameters |
|
---------- |
|
d : int |
|
Axis to split hyperrectangle along. |
|
split : float |
|
Position along axis `d` to split at. |
|
|
|
""" |
|
mid = np.copy(self.maxes) |
|
mid[d] = split |
|
less = Rectangle(self.mins, mid) |
|
mid = np.copy(self.mins) |
|
mid[d] = split |
|
greater = Rectangle(mid, self.maxes) |
|
return less, greater |
|
|
|
def min_distance_point(self, x, p=2.): |
|
""" |
|
Return the minimum distance between input and points in the |
|
hyperrectangle. |
|
|
|
Parameters |
|
---------- |
|
x : array_like |
|
Input. |
|
p : float, optional |
|
Input. |
|
|
|
""" |
|
return minkowski_distance( |
|
0, np.maximum(0, np.maximum(self.mins-x, x-self.maxes)), |
|
p |
|
) |
|
|
|
def max_distance_point(self, x, p=2.): |
|
""" |
|
Return the maximum distance between input and points in the hyperrectangle. |
|
|
|
Parameters |
|
---------- |
|
x : array_like |
|
Input array. |
|
p : float, optional |
|
Input. |
|
|
|
""" |
|
return minkowski_distance(0, np.maximum(self.maxes-x, x-self.mins), p) |
|
|
|
def min_distance_rectangle(self, other, p=2.): |
|
""" |
|
Compute the minimum distance between points in the two hyperrectangles. |
|
|
|
Parameters |
|
---------- |
|
other : hyperrectangle |
|
Input. |
|
p : float |
|
Input. |
|
|
|
""" |
|
return minkowski_distance( |
|
0, |
|
np.maximum(0, np.maximum(self.mins-other.maxes, |
|
other.mins-self.maxes)), |
|
p |
|
) |
|
|
|
def max_distance_rectangle(self, other, p=2.): |
|
""" |
|
Compute the maximum distance between points in the two hyperrectangles. |
|
|
|
Parameters |
|
---------- |
|
other : hyperrectangle |
|
Input. |
|
p : float, optional |
|
Input. |
|
|
|
""" |
|
return minkowski_distance( |
|
0, np.maximum(self.maxes-other.mins, other.maxes-self.mins), p) |
|
|
|
|
|
class KDTree(cKDTree): |
|
"""kd-tree for quick nearest-neighbor lookup. |
|
|
|
This class provides an index into a set of k-dimensional points |
|
which can be used to rapidly look up the nearest neighbors of any |
|
point. |
|
|
|
Parameters |
|
---------- |
|
data : array_like, shape (n,m) |
|
The n data points of dimension m to be indexed. This array is |
|
not copied unless this is necessary to produce a contiguous |
|
array of doubles, and so modifying this data will result in |
|
bogus results. The data are also copied if the kd-tree is built |
|
with copy_data=True. |
|
leafsize : positive int, optional |
|
The number of points at which the algorithm switches over to |
|
brute-force. Default: 10. |
|
compact_nodes : bool, optional |
|
If True, the kd-tree is built to shrink the hyperrectangles to |
|
the actual data range. This usually gives a more compact tree that |
|
is robust against degenerated input data and gives faster queries |
|
at the expense of longer build time. Default: True. |
|
copy_data : bool, optional |
|
If True the data is always copied to protect the kd-tree against |
|
data corruption. Default: False. |
|
balanced_tree : bool, optional |
|
If True, the median is used to split the hyperrectangles instead of |
|
the midpoint. This usually gives a more compact tree and |
|
faster queries at the expense of longer build time. Default: True. |
|
boxsize : array_like or scalar, optional |
|
Apply a m-d toroidal topology to the KDTree.. The topology is generated |
|
by :math:`x_i + n_i L_i` where :math:`n_i` are integers and :math:`L_i` |
|
is the boxsize along i-th dimension. The input data shall be wrapped |
|
into :math:`[0, L_i)`. A ValueError is raised if any of the data is |
|
outside of this bound. |
|
|
|
Notes |
|
----- |
|
The algorithm used is described in Maneewongvatana and Mount 1999. |
|
The general idea is that the kd-tree is a binary tree, each of whose |
|
nodes represents an axis-aligned hyperrectangle. Each node specifies |
|
an axis and splits the set of points based on whether their coordinate |
|
along that axis is greater than or less than a particular value. |
|
|
|
During construction, the axis and splitting point are chosen by the |
|
"sliding midpoint" rule, which ensures that the cells do not all |
|
become long and thin. |
|
|
|
The tree can be queried for the r closest neighbors of any given point |
|
(optionally returning only those within some maximum distance of the |
|
point). It can also be queried, with a substantial gain in efficiency, |
|
for the r approximate closest neighbors. |
|
|
|
For large dimensions (20 is already large) do not expect this to run |
|
significantly faster than brute force. High-dimensional nearest-neighbor |
|
queries are a substantial open problem in computer science. |
|
|
|
Attributes |
|
---------- |
|
data : ndarray, shape (n,m) |
|
The n data points of dimension m to be indexed. This array is |
|
not copied unless this is necessary to produce a contiguous |
|
array of doubles. The data are also copied if the kd-tree is built |
|
with ``copy_data=True``. |
|
leafsize : positive int |
|
The number of points at which the algorithm switches over to |
|
brute-force. |
|
m : int |
|
The dimension of a single data-point. |
|
n : int |
|
The number of data points. |
|
maxes : ndarray, shape (m,) |
|
The maximum value in each dimension of the n data points. |
|
mins : ndarray, shape (m,) |
|
The minimum value in each dimension of the n data points. |
|
size : int |
|
The number of nodes in the tree. |
|
|
|
""" |
|
|
|
class node: |
|
@staticmethod |
|
def _create(ckdtree_node=None): |
|
"""Create either an inner or leaf node, wrapping a cKDTreeNode instance""" |
|
if ckdtree_node is None: |
|
return KDTree.node(ckdtree_node) |
|
elif ckdtree_node.split_dim == -1: |
|
return KDTree.leafnode(ckdtree_node) |
|
else: |
|
return KDTree.innernode(ckdtree_node) |
|
|
|
def __init__(self, ckdtree_node=None): |
|
if ckdtree_node is None: |
|
ckdtree_node = cKDTreeNode() |
|
self._node = ckdtree_node |
|
|
|
def __lt__(self, other): |
|
return id(self) < id(other) |
|
|
|
def __gt__(self, other): |
|
return id(self) > id(other) |
|
|
|
def __le__(self, other): |
|
return id(self) <= id(other) |
|
|
|
def __ge__(self, other): |
|
return id(self) >= id(other) |
|
|
|
def __eq__(self, other): |
|
return id(self) == id(other) |
|
|
|
class leafnode(node): |
|
@property |
|
def idx(self): |
|
return self._node.indices |
|
|
|
@property |
|
def children(self): |
|
return self._node.children |
|
|
|
class innernode(node): |
|
def __init__(self, ckdtreenode): |
|
assert isinstance(ckdtreenode, cKDTreeNode) |
|
super().__init__(ckdtreenode) |
|
self.less = KDTree.node._create(ckdtreenode.lesser) |
|
self.greater = KDTree.node._create(ckdtreenode.greater) |
|
|
|
@property |
|
def split_dim(self): |
|
return self._node.split_dim |
|
|
|
@property |
|
def split(self): |
|
return self._node.split |
|
|
|
@property |
|
def children(self): |
|
return self._node.children |
|
|
|
@property |
|
def tree(self): |
|
if not hasattr(self, "_tree"): |
|
self._tree = KDTree.node._create(super().tree) |
|
|
|
return self._tree |
|
|
|
def __init__(self, data, leafsize=10, compact_nodes=True, copy_data=False, |
|
balanced_tree=True, boxsize=None): |
|
data = np.asarray(data) |
|
if data.dtype.kind == 'c': |
|
raise TypeError("KDTree does not work with complex data") |
|
|
|
|
|
super().__init__(data, leafsize, compact_nodes, copy_data, |
|
balanced_tree, boxsize) |
|
|
|
def query( |
|
self, x, k=1, eps=0, p=2, distance_upper_bound=np.inf, workers=1): |
|
r"""Query the kd-tree for nearest neighbors. |
|
|
|
Parameters |
|
---------- |
|
x : array_like, last dimension self.m |
|
An array of points to query. |
|
k : int or Sequence[int], optional |
|
Either the number of nearest neighbors to return, or a list of the |
|
k-th nearest neighbors to return, starting from 1. |
|
eps : nonnegative float, optional |
|
Return approximate nearest neighbors; the kth returned value |
|
is guaranteed to be no further than (1+eps) times the |
|
distance to the real kth nearest neighbor. |
|
p : float, 1<=p<=infinity, optional |
|
Which Minkowski p-norm to use. |
|
1 is the sum-of-absolute-values distance ("Manhattan" distance). |
|
2 is the usual Euclidean distance. |
|
infinity is the maximum-coordinate-difference distance. |
|
A large, finite p may cause a ValueError if overflow can occur. |
|
distance_upper_bound : nonnegative float, optional |
|
Return only neighbors within this distance. This is used to prune |
|
tree searches, so if you are doing a series of nearest-neighbor |
|
queries, it may help to supply the distance to the nearest neighbor |
|
of the most recent point. |
|
workers : int, optional |
|
Number of workers to use for parallel processing. If -1 is given |
|
all CPU threads are used. Default: 1. |
|
|
|
.. versionadded:: 1.6.0 |
|
|
|
Returns |
|
------- |
|
d : float or array of floats |
|
The distances to the nearest neighbors. |
|
If ``x`` has shape ``tuple+(self.m,)``, then ``d`` has shape |
|
``tuple+(k,)``. |
|
When k == 1, the last dimension of the output is squeezed. |
|
Missing neighbors are indicated with infinite distances. |
|
Hits are sorted by distance (nearest first). |
|
|
|
.. versionchanged:: 1.9.0 |
|
Previously if ``k=None``, then `d` was an object array of |
|
shape ``tuple``, containing lists of distances. This behavior |
|
has been removed, use `query_ball_point` instead. |
|
|
|
i : integer or array of integers |
|
The index of each neighbor in ``self.data``. |
|
``i`` is the same shape as d. |
|
Missing neighbors are indicated with ``self.n``. |
|
|
|
Examples |
|
-------- |
|
|
|
>>> import numpy as np |
|
>>> from scipy.spatial import KDTree |
|
>>> x, y = np.mgrid[0:5, 2:8] |
|
>>> tree = KDTree(np.c_[x.ravel(), y.ravel()]) |
|
|
|
To query the nearest neighbours and return squeezed result, use |
|
|
|
>>> dd, ii = tree.query([[0, 0], [2.2, 2.9]], k=1) |
|
>>> print(dd, ii, sep='\n') |
|
[2. 0.2236068] |
|
[ 0 13] |
|
|
|
To query the nearest neighbours and return unsqueezed result, use |
|
|
|
>>> dd, ii = tree.query([[0, 0], [2.2, 2.9]], k=[1]) |
|
>>> print(dd, ii, sep='\n') |
|
[[2. ] |
|
[0.2236068]] |
|
[[ 0] |
|
[13]] |
|
|
|
To query the second nearest neighbours and return unsqueezed result, |
|
use |
|
|
|
>>> dd, ii = tree.query([[0, 0], [2.2, 2.9]], k=[2]) |
|
>>> print(dd, ii, sep='\n') |
|
[[2.23606798] |
|
[0.80622577]] |
|
[[ 6] |
|
[19]] |
|
|
|
To query the first and second nearest neighbours, use |
|
|
|
>>> dd, ii = tree.query([[0, 0], [2.2, 2.9]], k=2) |
|
>>> print(dd, ii, sep='\n') |
|
[[2. 2.23606798] |
|
[0.2236068 0.80622577]] |
|
[[ 0 6] |
|
[13 19]] |
|
|
|
or, be more specific |
|
|
|
>>> dd, ii = tree.query([[0, 0], [2.2, 2.9]], k=[1, 2]) |
|
>>> print(dd, ii, sep='\n') |
|
[[2. 2.23606798] |
|
[0.2236068 0.80622577]] |
|
[[ 0 6] |
|
[13 19]] |
|
|
|
""" |
|
x = np.asarray(x) |
|
if x.dtype.kind == 'c': |
|
raise TypeError("KDTree does not work with complex data") |
|
|
|
if k is None: |
|
raise ValueError("k must be an integer or a sequence of integers") |
|
|
|
d, i = super().query(x, k, eps, p, distance_upper_bound, workers) |
|
if isinstance(i, int): |
|
i = np.intp(i) |
|
return d, i |
|
|
|
def query_ball_point(self, x, r, p=2., eps=0, workers=1, |
|
return_sorted=None, return_length=False): |
|
"""Find all points within distance r of point(s) x. |
|
|
|
Parameters |
|
---------- |
|
x : array_like, shape tuple + (self.m,) |
|
The point or points to search for neighbors of. |
|
r : array_like, float |
|
The radius of points to return, must broadcast to the length of x. |
|
p : float, optional |
|
Which Minkowski p-norm to use. Should be in the range [1, inf]. |
|
A finite large p may cause a ValueError if overflow can occur. |
|
eps : nonnegative float, optional |
|
Approximate search. Branches of the tree are not explored if their |
|
nearest points are further than ``r / (1 + eps)``, and branches are |
|
added in bulk if their furthest points are nearer than |
|
``r * (1 + eps)``. |
|
workers : int, optional |
|
Number of jobs to schedule for parallel processing. If -1 is given |
|
all processors are used. Default: 1. |
|
|
|
.. versionadded:: 1.6.0 |
|
return_sorted : bool, optional |
|
Sorts returned indices if True and does not sort them if False. If |
|
None, does not sort single point queries, but does sort |
|
multi-point queries which was the behavior before this option |
|
was added. |
|
|
|
.. versionadded:: 1.6.0 |
|
return_length : bool, optional |
|
Return the number of points inside the radius instead of a list |
|
of the indices. |
|
|
|
.. versionadded:: 1.6.0 |
|
|
|
Returns |
|
------- |
|
results : list or array of lists |
|
If `x` is a single point, returns a list of the indices of the |
|
neighbors of `x`. If `x` is an array of points, returns an object |
|
array of shape tuple containing lists of neighbors. |
|
|
|
Notes |
|
----- |
|
If you have many points whose neighbors you want to find, you may save |
|
substantial amounts of time by putting them in a KDTree and using |
|
query_ball_tree. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from scipy import spatial |
|
>>> x, y = np.mgrid[0:5, 0:5] |
|
>>> points = np.c_[x.ravel(), y.ravel()] |
|
>>> tree = spatial.KDTree(points) |
|
>>> sorted(tree.query_ball_point([2, 0], 1)) |
|
[5, 10, 11, 15] |
|
|
|
Query multiple points and plot the results: |
|
|
|
>>> import matplotlib.pyplot as plt |
|
>>> points = np.asarray(points) |
|
>>> plt.plot(points[:,0], points[:,1], '.') |
|
>>> for results in tree.query_ball_point(([2, 0], [3, 3]), 1): |
|
... nearby_points = points[results] |
|
... plt.plot(nearby_points[:,0], nearby_points[:,1], 'o') |
|
>>> plt.margins(0.1, 0.1) |
|
>>> plt.show() |
|
|
|
""" |
|
x = np.asarray(x) |
|
if x.dtype.kind == 'c': |
|
raise TypeError("KDTree does not work with complex data") |
|
return super().query_ball_point( |
|
x, r, p, eps, workers, return_sorted, return_length) |
|
|
|
def query_ball_tree(self, other, r, p=2., eps=0): |
|
""" |
|
Find all pairs of points between `self` and `other` whose distance is |
|
at most r. |
|
|
|
Parameters |
|
---------- |
|
other : KDTree instance |
|
The tree containing points to search against. |
|
r : float |
|
The maximum distance, has to be positive. |
|
p : float, optional |
|
Which Minkowski norm to use. `p` has to meet the condition |
|
``1 <= p <= infinity``. |
|
eps : float, optional |
|
Approximate search. Branches of the tree are not explored |
|
if their nearest points are further than ``r/(1+eps)``, and |
|
branches are added in bulk if their furthest points are nearer |
|
than ``r * (1+eps)``. `eps` has to be non-negative. |
|
|
|
Returns |
|
------- |
|
results : list of lists |
|
For each element ``self.data[i]`` of this tree, ``results[i]`` is a |
|
list of the indices of its neighbors in ``other.data``. |
|
|
|
Examples |
|
-------- |
|
You can search all pairs of points between two kd-trees within a distance: |
|
|
|
>>> import matplotlib.pyplot as plt |
|
>>> import numpy as np |
|
>>> from scipy.spatial import KDTree |
|
>>> rng = np.random.default_rng() |
|
>>> points1 = rng.random((15, 2)) |
|
>>> points2 = rng.random((15, 2)) |
|
>>> plt.figure(figsize=(6, 6)) |
|
>>> plt.plot(points1[:, 0], points1[:, 1], "xk", markersize=14) |
|
>>> plt.plot(points2[:, 0], points2[:, 1], "og", markersize=14) |
|
>>> kd_tree1 = KDTree(points1) |
|
>>> kd_tree2 = KDTree(points2) |
|
>>> indexes = kd_tree1.query_ball_tree(kd_tree2, r=0.2) |
|
>>> for i in range(len(indexes)): |
|
... for j in indexes[i]: |
|
... plt.plot([points1[i, 0], points2[j, 0]], |
|
... [points1[i, 1], points2[j, 1]], "-r") |
|
>>> plt.show() |
|
|
|
""" |
|
return super().query_ball_tree(other, r, p, eps) |
|
|
|
def query_pairs(self, r, p=2., eps=0, output_type='set'): |
|
"""Find all pairs of points in `self` whose distance is at most r. |
|
|
|
Parameters |
|
---------- |
|
r : positive float |
|
The maximum distance. |
|
p : float, optional |
|
Which Minkowski norm to use. `p` has to meet the condition |
|
``1 <= p <= infinity``. |
|
eps : float, optional |
|
Approximate search. Branches of the tree are not explored |
|
if their nearest points are further than ``r/(1+eps)``, and |
|
branches are added in bulk if their furthest points are nearer |
|
than ``r * (1+eps)``. `eps` has to be non-negative. |
|
output_type : string, optional |
|
Choose the output container, 'set' or 'ndarray'. Default: 'set' |
|
|
|
.. versionadded:: 1.6.0 |
|
|
|
Returns |
|
------- |
|
results : set or ndarray |
|
Set of pairs ``(i,j)``, with ``i < j``, for which the corresponding |
|
positions are close. If output_type is 'ndarray', an ndarry is |
|
returned instead of a set. |
|
|
|
Examples |
|
-------- |
|
You can search all pairs of points in a kd-tree within a distance: |
|
|
|
>>> import matplotlib.pyplot as plt |
|
>>> import numpy as np |
|
>>> from scipy.spatial import KDTree |
|
>>> rng = np.random.default_rng() |
|
>>> points = rng.random((20, 2)) |
|
>>> plt.figure(figsize=(6, 6)) |
|
>>> plt.plot(points[:, 0], points[:, 1], "xk", markersize=14) |
|
>>> kd_tree = KDTree(points) |
|
>>> pairs = kd_tree.query_pairs(r=0.2) |
|
>>> for (i, j) in pairs: |
|
... plt.plot([points[i, 0], points[j, 0]], |
|
... [points[i, 1], points[j, 1]], "-r") |
|
>>> plt.show() |
|
|
|
""" |
|
return super().query_pairs(r, p, eps, output_type) |
|
|
|
def count_neighbors(self, other, r, p=2., weights=None, cumulative=True): |
|
"""Count how many nearby pairs can be formed. |
|
|
|
Count the number of pairs ``(x1,x2)`` can be formed, with ``x1`` drawn |
|
from ``self`` and ``x2`` drawn from ``other``, and where |
|
``distance(x1, x2, p) <= r``. |
|
|
|
Data points on ``self`` and ``other`` are optionally weighted by the |
|
``weights`` argument. (See below) |
|
|
|
This is adapted from the "two-point correlation" algorithm described by |
|
Gray and Moore [1]_. See notes for further discussion. |
|
|
|
Parameters |
|
---------- |
|
other : KDTree |
|
The other tree to draw points from, can be the same tree as self. |
|
r : float or one-dimensional array of floats |
|
The radius to produce a count for. Multiple radii are searched with |
|
a single tree traversal. |
|
If the count is non-cumulative(``cumulative=False``), ``r`` defines |
|
the edges of the bins, and must be non-decreasing. |
|
p : float, optional |
|
1<=p<=infinity. |
|
Which Minkowski p-norm to use. |
|
Default 2.0. |
|
A finite large p may cause a ValueError if overflow can occur. |
|
weights : tuple, array_like, or None, optional |
|
If None, the pair-counting is unweighted. |
|
If given as a tuple, weights[0] is the weights of points in |
|
``self``, and weights[1] is the weights of points in ``other``; |
|
either can be None to indicate the points are unweighted. |
|
If given as an array_like, weights is the weights of points in |
|
``self`` and ``other``. For this to make sense, ``self`` and |
|
``other`` must be the same tree. If ``self`` and ``other`` are two |
|
different trees, a ``ValueError`` is raised. |
|
Default: None |
|
|
|
.. versionadded:: 1.6.0 |
|
cumulative : bool, optional |
|
Whether the returned counts are cumulative. When cumulative is set |
|
to ``False`` the algorithm is optimized to work with a large number |
|
of bins (>10) specified by ``r``. When ``cumulative`` is set to |
|
True, the algorithm is optimized to work with a small number of |
|
``r``. Default: True |
|
|
|
.. versionadded:: 1.6.0 |
|
|
|
Returns |
|
------- |
|
result : scalar or 1-D array |
|
The number of pairs. For unweighted counts, the result is integer. |
|
For weighted counts, the result is float. |
|
If cumulative is False, ``result[i]`` contains the counts with |
|
``(-inf if i == 0 else r[i-1]) < R <= r[i]`` |
|
|
|
Notes |
|
----- |
|
Pair-counting is the basic operation used to calculate the two point |
|
correlation functions from a data set composed of position of objects. |
|
|
|
Two point correlation function measures the clustering of objects and |
|
is widely used in cosmology to quantify the large scale structure |
|
in our Universe, but it may be useful for data analysis in other fields |
|
where self-similar assembly of objects also occur. |
|
|
|
The Landy-Szalay estimator for the two point correlation function of |
|
``D`` measures the clustering signal in ``D``. [2]_ |
|
|
|
For example, given the position of two sets of objects, |
|
|
|
- objects ``D`` (data) contains the clustering signal, and |
|
|
|
- objects ``R`` (random) that contains no signal, |
|
|
|
.. math:: |
|
|
|
\\xi(r) = \\frac{<D, D> - 2 f <D, R> + f^2<R, R>}{f^2<R, R>}, |
|
|
|
where the brackets represents counting pairs between two data sets |
|
in a finite bin around ``r`` (distance), corresponding to setting |
|
`cumulative=False`, and ``f = float(len(D)) / float(len(R))`` is the |
|
ratio between number of objects from data and random. |
|
|
|
The algorithm implemented here is loosely based on the dual-tree |
|
algorithm described in [1]_. We switch between two different |
|
pair-cumulation scheme depending on the setting of ``cumulative``. |
|
The computing time of the method we use when for |
|
``cumulative == False`` does not scale with the total number of bins. |
|
The algorithm for ``cumulative == True`` scales linearly with the |
|
number of bins, though it is slightly faster when only |
|
1 or 2 bins are used. [5]_. |
|
|
|
As an extension to the naive pair-counting, |
|
weighted pair-counting counts the product of weights instead |
|
of number of pairs. |
|
Weighted pair-counting is used to estimate marked correlation functions |
|
([3]_, section 2.2), |
|
or to properly calculate the average of data per distance bin |
|
(e.g. [4]_, section 2.1 on redshift). |
|
|
|
.. [1] Gray and Moore, |
|
"N-body problems in statistical learning", |
|
Mining the sky, 2000, |
|
https://arxiv.org/abs/astro-ph/0012333 |
|
|
|
.. [2] Landy and Szalay, |
|
"Bias and variance of angular correlation functions", |
|
The Astrophysical Journal, 1993, |
|
http://adsabs.harvard.edu/abs/1993ApJ...412...64L |
|
|
|
.. [3] Sheth, Connolly and Skibba, |
|
"Marked correlations in galaxy formation models", |
|
Arxiv e-print, 2005, |
|
https://arxiv.org/abs/astro-ph/0511773 |
|
|
|
.. [4] Hawkins, et al., |
|
"The 2dF Galaxy Redshift Survey: correlation functions, |
|
peculiar velocities and the matter density of the Universe", |
|
Monthly Notices of the Royal Astronomical Society, 2002, |
|
http://adsabs.harvard.edu/abs/2003MNRAS.346...78H |
|
|
|
.. [5] https://github.com/scipy/scipy/pull/5647#issuecomment-168474926 |
|
|
|
Examples |
|
-------- |
|
You can count neighbors number between two kd-trees within a distance: |
|
|
|
>>> import numpy as np |
|
>>> from scipy.spatial import KDTree |
|
>>> rng = np.random.default_rng() |
|
>>> points1 = rng.random((5, 2)) |
|
>>> points2 = rng.random((5, 2)) |
|
>>> kd_tree1 = KDTree(points1) |
|
>>> kd_tree2 = KDTree(points2) |
|
>>> kd_tree1.count_neighbors(kd_tree2, 0.2) |
|
1 |
|
|
|
This number is same as the total pair number calculated by |
|
`query_ball_tree`: |
|
|
|
>>> indexes = kd_tree1.query_ball_tree(kd_tree2, r=0.2) |
|
>>> sum([len(i) for i in indexes]) |
|
1 |
|
|
|
""" |
|
return super().count_neighbors(other, r, p, weights, cumulative) |
|
|
|
def sparse_distance_matrix( |
|
self, other, max_distance, p=2., output_type='dok_matrix'): |
|
"""Compute a sparse distance matrix. |
|
|
|
Computes a distance matrix between two KDTrees, leaving as zero |
|
any distance greater than max_distance. |
|
|
|
Parameters |
|
---------- |
|
other : KDTree |
|
|
|
max_distance : positive float |
|
|
|
p : float, 1<=p<=infinity |
|
Which Minkowski p-norm to use. |
|
A finite large p may cause a ValueError if overflow can occur. |
|
|
|
output_type : string, optional |
|
Which container to use for output data. Options: 'dok_matrix', |
|
'coo_matrix', 'dict', or 'ndarray'. Default: 'dok_matrix'. |
|
|
|
.. versionadded:: 1.6.0 |
|
|
|
Returns |
|
------- |
|
result : dok_matrix, coo_matrix, dict or ndarray |
|
Sparse matrix representing the results in "dictionary of keys" |
|
format. If a dict is returned the keys are (i,j) tuples of indices. |
|
If output_type is 'ndarray' a record array with fields 'i', 'j', |
|
and 'v' is returned, |
|
|
|
Examples |
|
-------- |
|
You can compute a sparse distance matrix between two kd-trees: |
|
|
|
>>> import numpy as np |
|
>>> from scipy.spatial import KDTree |
|
>>> rng = np.random.default_rng() |
|
>>> points1 = rng.random((5, 2)) |
|
>>> points2 = rng.random((5, 2)) |
|
>>> kd_tree1 = KDTree(points1) |
|
>>> kd_tree2 = KDTree(points2) |
|
>>> sdm = kd_tree1.sparse_distance_matrix(kd_tree2, 0.3) |
|
>>> sdm.toarray() |
|
array([[0. , 0. , 0.12295571, 0. , 0. ], |
|
[0. , 0. , 0. , 0. , 0. ], |
|
[0.28942611, 0. , 0. , 0.2333084 , 0. ], |
|
[0. , 0. , 0. , 0. , 0. ], |
|
[0.24617575, 0.29571802, 0.26836782, 0. , 0. ]]) |
|
|
|
You can check distances above the `max_distance` are zeros: |
|
|
|
>>> from scipy.spatial import distance_matrix |
|
>>> distance_matrix(points1, points2) |
|
array([[0.56906522, 0.39923701, 0.12295571, 0.8658745 , 0.79428925], |
|
[0.37327919, 0.7225693 , 0.87665969, 0.32580855, 0.75679479], |
|
[0.28942611, 0.30088013, 0.6395831 , 0.2333084 , 0.33630734], |
|
[0.31994999, 0.72658602, 0.71124834, 0.55396483, 0.90785663], |
|
[0.24617575, 0.29571802, 0.26836782, 0.57714465, 0.6473269 ]]) |
|
|
|
""" |
|
return super().sparse_distance_matrix( |
|
other, max_distance, p, output_type) |
|
|
|
|
|
def distance_matrix(x, y, p=2, threshold=1000000): |
|
"""Compute the distance matrix. |
|
|
|
Returns the matrix of all pair-wise distances. |
|
|
|
Parameters |
|
---------- |
|
x : (M, K) array_like |
|
Matrix of M vectors in K dimensions. |
|
y : (N, K) array_like |
|
Matrix of N vectors in K dimensions. |
|
p : float, 1 <= p <= infinity |
|
Which Minkowski p-norm to use. |
|
threshold : positive int |
|
If ``M * N * K`` > `threshold`, algorithm uses a Python loop instead |
|
of large temporary arrays. |
|
|
|
Returns |
|
------- |
|
result : (M, N) ndarray |
|
Matrix containing the distance from every vector in `x` to every vector |
|
in `y`. |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.spatial import distance_matrix |
|
>>> distance_matrix([[0,0],[0,1]], [[1,0],[1,1]]) |
|
array([[ 1. , 1.41421356], |
|
[ 1.41421356, 1. ]]) |
|
|
|
""" |
|
|
|
x = np.asarray(x) |
|
m, k = x.shape |
|
y = np.asarray(y) |
|
n, kk = y.shape |
|
|
|
if k != kk: |
|
raise ValueError(f"x contains {k}-dimensional vectors but y contains " |
|
f"{kk}-dimensional vectors") |
|
|
|
if m*n*k <= threshold: |
|
return minkowski_distance(x[:,np.newaxis,:],y[np.newaxis,:,:],p) |
|
else: |
|
result = np.empty((m,n),dtype=float) |
|
if m < n: |
|
for i in range(m): |
|
result[i,:] = minkowski_distance(x[i],y,p) |
|
else: |
|
for j in range(n): |
|
result[:,j] = minkowski_distance(x,y[j],p) |
|
return result |
|
|