|
|
|
|
|
import numpy as np |
|
cimport cython |
|
|
|
from ..metrics._dist_metrics cimport DistanceMetric64 |
|
from ..utils._fast_dict cimport IntFloatDict |
|
from ..utils._typedefs cimport float64_t, intp_t, uint8_t |
|
|
|
|
|
from cython.operator cimport dereference as deref, preincrement as inc |
|
from libcpp.map cimport map as cpp_map |
|
from libc.math cimport fmax, INFINITY |
|
|
|
|
|
|
|
|
|
|
|
def compute_ward_dist( |
|
const float64_t[::1] m_1, |
|
const float64_t[:, ::1] m_2, |
|
const intp_t[::1] coord_row, |
|
const intp_t[::1] coord_col, |
|
float64_t[::1] res |
|
): |
|
cdef intp_t size_max = coord_row.shape[0] |
|
cdef intp_t n_features = m_2.shape[1] |
|
cdef intp_t i, j, row, col |
|
cdef float64_t pa, n |
|
|
|
for i in range(size_max): |
|
row = coord_row[i] |
|
col = coord_col[i] |
|
n = (m_1[row] * m_1[col]) / (m_1[row] + m_1[col]) |
|
pa = 0. |
|
for j in range(n_features): |
|
pa += (m_2[row, j] / m_1[row] - m_2[col, j] / m_1[col]) ** 2 |
|
res[i] = pa * n |
|
|
|
|
|
|
|
|
|
|
|
def _hc_get_descendent(intp_t node, children, intp_t n_leaves): |
|
""" |
|
Function returning all the descendent leaves of a set of nodes in the tree. |
|
|
|
Parameters |
|
---------- |
|
node : integer |
|
The node for which we want the descendents. |
|
|
|
children : list of pairs, length n_nodes |
|
The children of each non-leaf node. Values less than `n_samples` refer |
|
to leaves of the tree. A greater value `i` indicates a node with |
|
children `children[i - n_samples]`. |
|
|
|
n_leaves : integer |
|
Number of leaves. |
|
|
|
Returns |
|
------- |
|
descendent : list of int |
|
""" |
|
ind = [node] |
|
if node < n_leaves: |
|
return ind |
|
descendent = [] |
|
|
|
|
|
|
|
|
|
cdef intp_t i, n_indices = 1 |
|
|
|
while n_indices: |
|
i = ind.pop() |
|
if i < n_leaves: |
|
descendent.append(i) |
|
n_indices -= 1 |
|
else: |
|
ind.extend(children[i - n_leaves]) |
|
n_indices += 1 |
|
return descendent |
|
|
|
|
|
def hc_get_heads(intp_t[:] parents, copy=True): |
|
"""Returns the heads of the forest, as defined by parents. |
|
|
|
Parameters |
|
---------- |
|
parents : array of integers |
|
The parent structure defining the forest (ensemble of trees) |
|
copy : boolean |
|
If copy is False, the input 'parents' array is modified inplace |
|
|
|
Returns |
|
------- |
|
heads : array of integers of same shape as parents |
|
The indices in the 'parents' of the tree heads |
|
|
|
""" |
|
cdef intp_t parent, node0, node, size |
|
if copy: |
|
parents = np.copy(parents) |
|
size = parents.size |
|
|
|
|
|
for node0 in range(size - 1, -1, -1): |
|
node = node0 |
|
parent = parents[node] |
|
while parent != node: |
|
parents[node0] = parent |
|
node = parent |
|
parent = parents[node] |
|
return parents |
|
|
|
|
|
def _get_parents( |
|
nodes, |
|
heads, |
|
const intp_t[:] parents, |
|
uint8_t[::1] not_visited |
|
): |
|
"""Returns the heads of the given nodes, as defined by parents. |
|
|
|
Modifies 'heads' and 'not_visited' in-place. |
|
|
|
Parameters |
|
---------- |
|
nodes : list of integers |
|
The nodes to start from |
|
heads : list of integers |
|
A list to hold the results (modified inplace) |
|
parents : array of integers |
|
The parent structure defining the tree |
|
not_visited |
|
The tree nodes to consider (modified inplace) |
|
|
|
""" |
|
cdef intp_t parent, node |
|
|
|
for node in nodes: |
|
parent = parents[node] |
|
while parent != node: |
|
node = parent |
|
parent = parents[node] |
|
if not_visited[node]: |
|
not_visited[node] = 0 |
|
heads.append(node) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def max_merge( |
|
IntFloatDict a, |
|
IntFloatDict b, |
|
const intp_t[:] mask, |
|
intp_t n_a, |
|
intp_t n_b |
|
): |
|
"""Merge two IntFloatDicts with the max strategy: when the same key is |
|
present in the two dicts, the max of the two values is used. |
|
|
|
Parameters |
|
========== |
|
a, b : IntFloatDict object |
|
The IntFloatDicts to merge |
|
mask : ndarray array of dtype integer and of dimension 1 |
|
a mask for keys to ignore: if not mask[key] the corresponding key |
|
is skipped in the output dictionary |
|
n_a, n_b : float |
|
n_a and n_b are weights for a and b for the merge strategy. |
|
They are not used in the case of a max merge. |
|
|
|
Returns |
|
======= |
|
out : IntFloatDict object |
|
The IntFloatDict resulting from the merge |
|
""" |
|
cdef IntFloatDict out_obj = IntFloatDict.__new__(IntFloatDict) |
|
cdef cpp_map[intp_t, float64_t].iterator a_it = a.my_map.begin() |
|
cdef cpp_map[intp_t, float64_t].iterator a_end = a.my_map.end() |
|
cdef intp_t key |
|
cdef float64_t value |
|
|
|
while a_it != a_end: |
|
key = deref(a_it).first |
|
if mask[key]: |
|
out_obj.my_map[key] = deref(a_it).second |
|
inc(a_it) |
|
|
|
|
|
cdef cpp_map[intp_t, float64_t].iterator out_it = out_obj.my_map.begin() |
|
cdef cpp_map[intp_t, float64_t].iterator out_end = out_obj.my_map.end() |
|
cdef cpp_map[intp_t, float64_t].iterator b_it = b.my_map.begin() |
|
cdef cpp_map[intp_t, float64_t].iterator b_end = b.my_map.end() |
|
while b_it != b_end: |
|
key = deref(b_it).first |
|
value = deref(b_it).second |
|
if mask[key]: |
|
out_it = out_obj.my_map.find(key) |
|
if out_it == out_end: |
|
|
|
out_obj.my_map[key] = value |
|
else: |
|
deref(out_it).second = fmax(deref(out_it).second, value) |
|
inc(b_it) |
|
return out_obj |
|
|
|
|
|
def average_merge( |
|
IntFloatDict a, |
|
IntFloatDict b, |
|
const intp_t[:] mask, |
|
intp_t n_a, |
|
intp_t n_b |
|
): |
|
"""Merge two IntFloatDicts with the average strategy: when the |
|
same key is present in the two dicts, the weighted average of the two |
|
values is used. |
|
|
|
Parameters |
|
========== |
|
a, b : IntFloatDict object |
|
The IntFloatDicts to merge |
|
mask : ndarray array of dtype integer and of dimension 1 |
|
a mask for keys to ignore: if not mask[key] the corresponding key |
|
is skipped in the output dictionary |
|
n_a, n_b : float |
|
n_a and n_b are weights for a and b for the merge strategy. |
|
They are used for a weighted mean. |
|
|
|
Returns |
|
======= |
|
out : IntFloatDict object |
|
The IntFloatDict resulting from the merge |
|
""" |
|
cdef IntFloatDict out_obj = IntFloatDict.__new__(IntFloatDict) |
|
cdef cpp_map[intp_t, float64_t].iterator a_it = a.my_map.begin() |
|
cdef cpp_map[intp_t, float64_t].iterator a_end = a.my_map.end() |
|
cdef intp_t key |
|
cdef float64_t value |
|
cdef float64_t n_out = <float64_t> (n_a + n_b) |
|
|
|
while a_it != a_end: |
|
key = deref(a_it).first |
|
if mask[key]: |
|
out_obj.my_map[key] = deref(a_it).second |
|
inc(a_it) |
|
|
|
|
|
cdef cpp_map[intp_t, float64_t].iterator out_it = out_obj.my_map.begin() |
|
cdef cpp_map[intp_t, float64_t].iterator out_end = out_obj.my_map.end() |
|
cdef cpp_map[intp_t, float64_t].iterator b_it = b.my_map.begin() |
|
cdef cpp_map[intp_t, float64_t].iterator b_end = b.my_map.end() |
|
while b_it != b_end: |
|
key = deref(b_it).first |
|
value = deref(b_it).second |
|
if mask[key]: |
|
out_it = out_obj.my_map.find(key) |
|
if out_it == out_end: |
|
|
|
out_obj.my_map[key] = value |
|
else: |
|
deref(out_it).second = (n_a * deref(out_it).second |
|
+ n_b * value) / n_out |
|
inc(b_it) |
|
return out_obj |
|
|
|
|
|
|
|
|
|
|
|
cdef class WeightedEdge: |
|
cdef public intp_t a |
|
cdef public intp_t b |
|
cdef public float64_t weight |
|
|
|
def __init__(self, float64_t weight, intp_t a, intp_t b): |
|
self.weight = weight |
|
self.a = a |
|
self.b = b |
|
|
|
def __richcmp__(self, WeightedEdge other, int op): |
|
"""Cython-specific comparison method. |
|
|
|
op is the comparison code:: |
|
< 0 |
|
== 2 |
|
> 4 |
|
<= 1 |
|
!= 3 |
|
>= 5 |
|
""" |
|
if op == 0: |
|
return self.weight < other.weight |
|
elif op == 1: |
|
return self.weight <= other.weight |
|
elif op == 2: |
|
return self.weight == other.weight |
|
elif op == 3: |
|
return self.weight != other.weight |
|
elif op == 4: |
|
return self.weight > other.weight |
|
elif op == 5: |
|
return self.weight >= other.weight |
|
|
|
def __repr__(self): |
|
return "%s(weight=%f, a=%i, b=%i)" % (self.__class__.__name__, |
|
self.weight, |
|
self.a, self.b) |
|
|
|
|
|
|
|
|
|
|
|
cdef class UnionFind(object): |
|
|
|
def __init__(self, N): |
|
self.parent = np.full(2 * N - 1, -1., dtype=np.intp, order='C') |
|
self.next_label = N |
|
self.size = np.hstack((np.ones(N, dtype=np.intp), |
|
np.zeros(N - 1, dtype=np.intp))) |
|
|
|
cdef void union(self, intp_t m, intp_t n) noexcept: |
|
self.parent[m] = self.next_label |
|
self.parent[n] = self.next_label |
|
self.size[self.next_label] = self.size[m] + self.size[n] |
|
self.next_label += 1 |
|
return |
|
|
|
@cython.wraparound(True) |
|
cdef intp_t fast_find(self, intp_t n) noexcept: |
|
cdef intp_t p |
|
p = n |
|
|
|
while self.parent[n] != -1: |
|
n = self.parent[n] |
|
|
|
while self.parent[p] != n: |
|
p, self.parent[p] = self.parent[p], n |
|
return n |
|
|
|
|
|
def _single_linkage_label(const float64_t[:, :] L): |
|
""" |
|
Convert an linkage array or MST to a tree by labelling clusters at merges. |
|
This is done by using a Union find structure to keep track of merges |
|
efficiently. This is the private version of the function that assumes that |
|
``L`` has been properly validated. See ``single_linkage_label`` for the |
|
user facing version of this function. |
|
|
|
Parameters |
|
---------- |
|
L: array of shape (n_samples - 1, 3) |
|
The linkage array or MST where each row specifies two samples |
|
to be merged and a distance or weight at which the merge occurs. This |
|
array is assumed to be sorted by the distance/weight. |
|
|
|
Returns |
|
------- |
|
A tree in the format used by scipy.cluster.hierarchy. |
|
""" |
|
|
|
cdef float64_t[:, ::1] result_arr |
|
|
|
cdef intp_t left, left_cluster, right, right_cluster, index |
|
cdef float64_t delta |
|
|
|
result_arr = np.zeros((L.shape[0], 4), dtype=np.float64) |
|
U = UnionFind(L.shape[0] + 1) |
|
|
|
for index in range(L.shape[0]): |
|
|
|
left = <intp_t> L[index, 0] |
|
right = <intp_t> L[index, 1] |
|
delta = L[index, 2] |
|
|
|
left_cluster = U.fast_find(left) |
|
right_cluster = U.fast_find(right) |
|
|
|
result_arr[index][0] = left_cluster |
|
result_arr[index][1] = right_cluster |
|
result_arr[index][2] = delta |
|
result_arr[index][3] = U.size[left_cluster] + U.size[right_cluster] |
|
|
|
U.union(left_cluster, right_cluster) |
|
|
|
return np.asarray(result_arr) |
|
|
|
|
|
@cython.wraparound(True) |
|
def single_linkage_label(L): |
|
""" |
|
Convert an linkage array or MST to a tree by labelling clusters at merges. |
|
This is done by using a Union find structure to keep track of merges |
|
efficiently. |
|
|
|
Parameters |
|
---------- |
|
L: array of shape (n_samples - 1, 3) |
|
The linkage array or MST where each row specifies two samples |
|
to be merged and a distance or weight at which the merge occurs. This |
|
array is assumed to be sorted by the distance/weight. |
|
|
|
Returns |
|
------- |
|
A tree in the format used by scipy.cluster.hierarchy. |
|
""" |
|
|
|
if L[:, :2].min() < 0 or L[:, :2].max() >= 2 * L.shape[0] + 1: |
|
raise ValueError("Input MST array is not a validly formatted MST array") |
|
|
|
is_sorted = lambda x: np.all(x[:-1] <= x[1:]) |
|
if not is_sorted(L[:, 2]): |
|
raise ValueError("Input MST array must be sorted by weight") |
|
|
|
return _single_linkage_label(L) |
|
|
|
|
|
|
|
def mst_linkage_core( |
|
const float64_t [:, ::1] raw_data, |
|
DistanceMetric64 dist_metric): |
|
""" |
|
Compute the necessary elements of a minimum spanning |
|
tree for computation of single linkage clustering. This |
|
represents the MST-LINKAGE-CORE algorithm (Figure 6) from |
|
:arxiv:`Daniel Mullner, "Modern hierarchical, agglomerative clustering |
|
algorithms" <1109.2378>`. |
|
|
|
In contrast to the scipy implementation is never computes |
|
a full distance matrix, generating distances only as they |
|
are needed and releasing them when no longer needed. |
|
|
|
Parameters |
|
---------- |
|
raw_data: array of shape (n_samples, n_features) |
|
The array of feature data to be clustered. Must be C-aligned |
|
|
|
dist_metric: DistanceMetric64 |
|
A DistanceMetric64 object conforming to the API from |
|
``sklearn.metrics._dist_metrics.pxd`` that will be |
|
used to compute distances. |
|
|
|
Returns |
|
------- |
|
mst_core_data: array of shape (n_samples, 3) |
|
An array providing information from which one |
|
can either compute an MST, or the linkage hierarchy |
|
very efficiently. See :arxiv:`Daniel Mullner, "Modern hierarchical, |
|
agglomerative clustering algorithms" <1109.2378>` algorithm |
|
MST-LINKAGE-CORE for more details. |
|
""" |
|
cdef: |
|
intp_t n_samples = raw_data.shape[0] |
|
uint8_t[:] in_tree = np.zeros(n_samples, dtype=bool) |
|
float64_t[:, ::1] result = np.zeros((n_samples - 1, 3)) |
|
|
|
intp_t current_node = 0 |
|
intp_t new_node |
|
intp_t i |
|
intp_t j |
|
intp_t num_features = raw_data.shape[1] |
|
|
|
float64_t right_value |
|
float64_t left_value |
|
float64_t new_distance |
|
|
|
float64_t[:] current_distances = np.full(n_samples, INFINITY) |
|
|
|
for i in range(n_samples - 1): |
|
|
|
in_tree[current_node] = 1 |
|
|
|
new_distance = INFINITY |
|
new_node = 0 |
|
|
|
for j in range(n_samples): |
|
if in_tree[j]: |
|
continue |
|
|
|
right_value = current_distances[j] |
|
left_value = dist_metric.dist(&raw_data[current_node, 0], |
|
&raw_data[j, 0], |
|
num_features) |
|
|
|
if left_value < right_value: |
|
current_distances[j] = left_value |
|
|
|
if current_distances[j] < new_distance: |
|
new_distance = current_distances[j] |
|
new_node = j |
|
|
|
result[i, 0] = current_node |
|
result[i, 1] = new_node |
|
result[i, 2] = new_distance |
|
current_node = new_node |
|
|
|
return np.array(result) |
|
|