# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# @nolint
# not linting this file because it imports * from swigfaiss, which
# causes a ton of useless warnings.
import numpy as np
from faiss.loader import *
import faiss
import collections.abc
###########################################
# Wrapper for a few functions
###########################################
def kmin(array, k):
"""return k smallest values (and their indices) of the lines of a
float32 array"""
array = np.ascontiguousarray(array, dtype='float32')
m, n = array.shape
I = np.zeros((m, k), dtype='int64')
D = np.zeros((m, k), dtype='float32')
ha = faiss.float_maxheap_array_t()
ha.ids = swig_ptr(I)
ha.val = swig_ptr(D)
ha.nh = m
ha.k = k
ha.heapify()
ha.addn(n, swig_ptr(array))
ha.reorder()
return D, I
def kmax(array, k):
"""return k largest values (and their indices) of the lines of a
float32 array"""
array = np.ascontiguousarray(array, dtype='float32')
m, n = array.shape
I = np.zeros((m, k), dtype='int64')
D = np.zeros((m, k), dtype='float32')
ha = faiss.float_minheap_array_t()
ha.ids = swig_ptr(I)
ha.val = swig_ptr(D)
ha.nh = m
ha.k = k
ha.heapify()
ha.addn(n, swig_ptr(array))
ha.reorder()
return D, I
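
# Usage sketch for kmin/kmax (comments only, nothing runs at import time):
#
#   x = rand(20 * 8).reshape(20, 8)
#   Dmin, Imin = kmin(x, 3)   # 3 smallest values per row, sorted ascending
#   Dmax, Imax = kmax(x, 3)   # 3 largest values per row, sorted descending
#   # Imin[i, j] is the column index of the j-th smallest value in row i
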
def pairwise_distances(xq, xb, metric=METRIC_L2, metric_arg=0):
"""compute the whole pairwise distance matrix between two sets of
vectors"""
xq = np.ascontiguousarray(xq, dtype='float32')
xb = np.ascontiguousarray(xb, dtype='float32')
nq, d = xq.shape
nb, d2 = xb.shape
assert d == d2
dis = np.empty((nq, nb), dtype='float32')
if metric == METRIC_L2:
pairwise_L2sqr(
d, nq, swig_ptr(xq),
nb, swig_ptr(xb),
swig_ptr(dis))
elif metric == METRIC_INNER_PRODUCT:
dis[:] = xq @ xb.T
else:
pairwise_extra_distances(
d, nq, swig_ptr(xq),
nb, swig_ptr(xb),
metric, metric_arg,
swig_ptr(dis))
return dis
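
# Usage sketch (for METRIC_L2 the returned matrix holds *squared*
# distances, matching pairwise_L2sqr):
#
#   xb = rand(100 * 16).reshape(100, 16)
#   xq = rand(10 * 16).reshape(10, 16)
#   dis = pairwise_distances(xq, xb)                         # (10, 100)
#   dis_ip = pairwise_distances(xq, xb, METRIC_INNER_PRODUCT)
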
def rand(n, seed=12345):
    """return n random float32 values sampled uniformly in [0, 1)"""
    res = np.empty(n, dtype='float32')
float_rand(swig_ptr(res), res.size, seed)
return res
def randint(n, seed=12345, vmax=None):
    """return n random int64 values; if vmax is given, values are in [0, vmax)"""
    res = np.empty(n, dtype='int64')
if vmax is None:
int64_rand(swig_ptr(res), res.size, seed)
else:
int64_rand_max(swig_ptr(res), res.size, vmax, seed)
return res
lrand = randint
def randn(n, seed=12345):
    """return n random float32 values sampled from N(0, 1)"""
    res = np.empty(n, dtype='float32')
float_randn(swig_ptr(res), res.size, seed)
return res
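
# These wrappers mirror the numpy generators but use faiss' own RNG, so
# outputs are reproducible for a given seed independently of numpy. Sketch:
#
#   v = rand(1000)                 # float32, uniform
#   g = randn(1000, seed=42)       # float32, Gaussian
#   ids = randint(1000, vmax=50)   # int64 values in [0, 50)
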
def checksum(a):
""" compute a checksum for quick-and-dirty comparisons of arrays """
a = a.view('uint8')
if a.ndim == 1:
return bvec_checksum(a.size, swig_ptr(a))
n, d = a.shape
cs = np.zeros(n, dtype='uint64')
bvecs_checksum(n, d, swig_ptr(a), swig_ptr(cs))
return cs
rand_smooth_vectors_c = rand_smooth_vectors
def rand_smooth_vectors(n, d, seed=1234):
    """generate an (n, d) float32 array of smooth random vectors (useful
    as synthetic test data)"""
    res = np.empty((n, d), dtype='float32')
rand_smooth_vectors_c(n, d, swig_ptr(res), seed)
return res
def eval_intersection(I1, I2):
""" size of intersection between each line of two result tables"""
I1 = np.ascontiguousarray(I1, dtype='int64')
I2 = np.ascontiguousarray(I2, dtype='int64')
n = I1.shape[0]
assert I2.shape[0] == n
k1, k2 = I1.shape[1], I2.shape[1]
ninter = 0
for i in range(n):
ninter += ranklist_intersection_size(
k1, swig_ptr(I1[i]), k2, swig_ptr(I2[i]))
return ninter
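
# Usage sketch: a recall-style measure of how many ground-truth neighbors
# were recovered (I_found and I_gt are placeholder (n, k) int64 tables):
#
#   ninter = eval_intersection(I_found, I_gt)
#   recall = ninter / float(I_gt.size)
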
def normalize_L2(x):
    """L2-normalize the rows of x in place; x must be a C-contiguous
    float32 array"""
    fvec_renorm_L2(x.shape[1], x.shape[0], swig_ptr(x))
bucket_sort_c = bucket_sort
def bucket_sort(tab, nbucket=None, nt=0):
"""Perform a bucket sort on a table of integers.
Parameters
----------
tab : array_like
elements to sort, max value nbucket - 1
nbucket : integer
number of buckets, None if unknown
nt : integer
number of threads to use (0 = use unthreaded codepath)
Returns
-------
lims : array_like
        cumulative sum of bucket sizes (size nbucket + 1)
perm : array_like
perm[lims[i] : lims[i + 1]] contains the indices of bucket #i (size tab.size)
"""
tab = np.ascontiguousarray(tab, dtype="int64")
if nbucket is None:
nbucket = int(tab.max() + 1)
lims = np.empty(nbucket + 1, dtype='int64')
perm = np.empty(tab.size, dtype='int64')
bucket_sort_c(
tab.size, faiss.swig_ptr(tab.view('uint64')),
nbucket, faiss.swig_ptr(lims), faiss.swig_ptr(perm),
nt
)
return lims, perm
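
# Usage sketch: recover the positions where each value occurs in tab:
#
#   tab = randint(100, vmax=10)
#   lims, perm = bucket_sort(tab, nbucket=10)
#   idx3 = perm[lims[3]:lims[4]]   # positions of value 3 in tab
#   assert np.all(tab[idx3] == 3)
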
matrix_bucket_sort_inplace_c = matrix_bucket_sort_inplace
def matrix_bucket_sort_inplace(tab, nbucket=None, nt=0):
"""Perform a bucket sort on a matrix, recording the original
row of each element.
Parameters
----------
tab : array_like
array of size (N, ncol) that contains the bucket ids, maximum
value nbucket - 1.
        On output, the elements are shuffled such that the flat array
tab.ravel()[lims[i] : lims[i + 1]] contains the row numbers
of each bucket entry.
nbucket : integer
number of buckets (the maximum value in tab should be nbucket - 1)
nt : integer
number of threads to use (0 = use unthreaded codepath)
Returns
-------
lims : array_like
        cumulative sum of bucket sizes (size nbucket + 1)
"""
assert tab.dtype == 'int32' or tab.dtype == 'int64'
nrow, ncol = tab.shape
if nbucket is None:
nbucket = int(tab.max() + 1)
lims = np.empty(nbucket + 1, dtype='int64')
matrix_bucket_sort_inplace_c(
nrow, ncol, faiss.swig_ptr(tab),
nbucket, faiss.swig_ptr(lims),
nt
)
return lims
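
# Usage sketch: tab is overwritten, so pass a copy if the original matrix
# is still needed:
#
#   t = randint(50 * 4, vmax=8).reshape(50, 4)
#   lims = matrix_bucket_sort_inplace(t, nbucket=8)
#   # t.ravel()[lims[i]:lims[i + 1]] now lists the rows in which bucket i
#   # occurred in the original matrix
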
###########################################
# ResultHeap
###########################################
class ResultHeap:
"""Accumulate query results from a sliced dataset. The final result will
be in self.D, self.I."""
def __init__(self, nq, k, keep_max=False):
"""
nq: number of query vectors,
k: number of results per query
keep_max: keep the top-k maximum values instead of the minima
"""
self.I = np.zeros((nq, k), dtype='int64')
self.D = np.zeros((nq, k), dtype='float32')
self.nq, self.k = nq, k
if keep_max:
heaps = float_minheap_array_t()
else:
heaps = float_maxheap_array_t()
heaps.k = k
heaps.nh = nq
heaps.val = swig_ptr(self.D)
heaps.ids = swig_ptr(self.I)
heaps.heapify()
self.heaps = heaps
def add_result(self, D, I):
"""
Add results for all heaps
D, I should be of size (nh, nres)
D, I do not need to be in a particular order (heap or sorted)
"""
nq, kd = D.shape
D = np.ascontiguousarray(D, dtype='float32')
I = np.ascontiguousarray(I, dtype='int64')
assert I.shape == (nq, kd)
assert nq == self.nq
self.heaps.addn_with_ids(
kd, swig_ptr(D),
swig_ptr(I), kd)
def add_result_subset(self, subset, D, I):
"""
Add results for a subset of heaps.
D, I should hold resutls for all the subset
as a special case, if I is 1D, then all ids are assumed to be the same
"""
nsubset, kd = D.shape
assert nsubset == len(subset)
assert (
I.ndim == 2 and D.shape == I.shape or
I.ndim == 1 and I.shape == (kd, )
)
D = np.ascontiguousarray(D, dtype='float32')
I = np.ascontiguousarray(I, dtype='int64')
subset = np.ascontiguousarray(subset, dtype='int64')
id_stride = 0 if I.ndim == 1 else kd
self.heaps.addn_query_subset_with_ids(
nsubset, swig_ptr(subset),
kd, swig_ptr(D), swig_ptr(I), id_stride
)
def finalize(self):
self.heaps.reorder()
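
# Usage sketch: search a large database slice by slice, offsetting result
# ids by the start of each slice; the merged result matches a single
# search over the full database:
#
#   rh = ResultHeap(nq, k)
#   for i0 in range(0, nb, bs):
#       Di, Ii = knn(xq, xb[i0:i0 + bs], k)
#       rh.add_result(Di, Ii + i0)
#   rh.finalize()
#   D, I = rh.D, rh.I
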
def merge_knn_results(Dall, Iall, keep_max=False):
"""
    Merge a set of sorted knn results obtained from different shards of a dataset.
    Dall and Iall are of size (nshard, nq, k); each row Dall[i, j, :] should be sorted.
    Returns D, I of size (nq, k) as the merged result set.
"""
assert Iall.shape == Dall.shape
nshard, n, k = Dall.shape
Dnew = np.empty((n, k), dtype=Dall.dtype)
Inew = np.empty((n, k), dtype=Iall.dtype)
func = merge_knn_results_CMax if keep_max else merge_knn_results_CMin
func(
n, k, nshard,
swig_ptr(Dall), swig_ptr(Iall),
swig_ptr(Dnew), swig_ptr(Inew)
)
return Dnew, Inew
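
# Usage sketch: D0/I0 and D1/I1 are placeholder (nq, k) results from two
# shards, already sorted the way an index search returns them:
#
#   D, I = merge_knn_results(np.stack([D0, D1]), np.stack([I0, I1]))
#   # use keep_max=True when merging inner-product (largest-first) results
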
######################################################
# Efficient ID to ID map
######################################################
class MapInt64ToInt64:
def __init__(self, capacity):
self.log2_capacity = int(np.log2(capacity))
assert capacity == 2 ** self.log2_capacity, "need power of 2 capacity"
self.capacity = capacity
self.tab = np.empty((capacity, 2), dtype='int64')
faiss.hashtable_int64_to_int64_init(self.log2_capacity, swig_ptr(self.tab))
def add(self, keys, vals):
n, = keys.shape
assert vals.shape == (n,)
faiss.hashtable_int64_to_int64_add(
self.log2_capacity, swig_ptr(self.tab),
n, swig_ptr(keys), swig_ptr(vals))
def lookup(self, keys):
n, = keys.shape
vals = np.empty((n,), dtype='int64')
faiss.hashtable_int64_to_int64_lookup(
self.log2_capacity, swig_ptr(self.tab),
n, swig_ptr(keys), swig_ptr(vals))
return vals
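
# Usage sketch (capacity must be a power of 2 and should comfortably
# exceed the number of keys; behavior for keys that were never added
# follows the underlying C++ hashtable):
#
#   m = MapInt64ToInt64(2 ** 14)
#   keys = np.arange(1000, dtype='int64') * 7 + 3   # distinct keys
#   vals = randint(1000)
#   m.add(keys, vals)
#   assert np.all(m.lookup(keys) == vals)
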
######################################################
# KNN function
######################################################
def knn(xq, xb, k, metric=METRIC_L2):
"""
    Compute the k nearest neighbors of a set of query vectors without constructing an index
Parameters
----------
    xq : array_like
        Query vectors, shape (nq, d), where d is the same as for xb.
        `dtype` must be float32.
    xb : array_like
        Database vectors, shape (nb, d), where d is the same as for xq.
        `dtype` must be float32.
k : int
Number of nearest neighbors.
    metric : MetricType, optional
        distance measure to use (either METRIC_L2 or METRIC_INNER_PRODUCT)
Returns
-------
    D : array_like
        Distances of the nearest neighbors, shape (nq, k); squared
        distances for METRIC_L2.
I : array_like
Labels of the nearest neighbors, shape (nq, k)
"""
xq = np.ascontiguousarray(xq, dtype='float32')
xb = np.ascontiguousarray(xb, dtype='float32')
nq, d = xq.shape
nb, d2 = xb.shape
assert d == d2
I = np.empty((nq, k), dtype='int64')
D = np.empty((nq, k), dtype='float32')
if metric == METRIC_L2:
knn_L2sqr(
swig_ptr(xq), swig_ptr(xb),
d, nq, nb, k, swig_ptr(D), swig_ptr(I)
)
elif metric == METRIC_INNER_PRODUCT:
knn_inner_product(
swig_ptr(xq), swig_ptr(xb),
d, nq, nb, k, swig_ptr(D), swig_ptr(I)
)
else:
raise NotImplementedError("only L2 and INNER_PRODUCT are supported")
return D, I
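
# Usage sketch: brute-force search equivalent to an IndexFlat search,
# without building an index (D holds squared distances for METRIC_L2):
#
#   xb = rand(1000 * 32).reshape(1000, 32)
#   xq = rand(10 * 32).reshape(10, 32)
#   D, I = knn(xq, xb, 5)
#   D, I = knn(xq, xb, 5, metric=METRIC_INNER_PRODUCT)
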
def knn_hamming(xq, xb, k, variant="hc"):
"""
Compute the k nearest neighbors of a set of vectors without constructing an index.
Parameters
----------
xq : array_like
Query vectors, shape (nq, d) where d is the number of bits / 8
`dtype` must be uint8.
xb : array_like
Database vectors, shape (nb, d) where d is the number of bits / 8
`dtype` must be uint8.
k : int
Number of nearest neighbors.
variant : string
Function variant to use, either "mc" (counter) or "hc" (heap)
Returns
-------
D : array_like
Distances of the nearest neighbors, shape (nq, k)
I : array_like
Labels of the nearest neighbors, shape (nq, k)
"""
    xq = np.ascontiguousarray(xq, dtype='uint8')
    xb = np.ascontiguousarray(xb, dtype='uint8')
    nq, d = xq.shape
    nb, d2 = xb.shape
    assert d == d2
D = np.empty((nq, k), dtype='int32')
I = np.empty((nq, k), dtype='int64')
if variant == "hc":
heap = faiss.int_maxheap_array_t()
heap.k = k
heap.nh = nq
heap.ids = faiss.swig_ptr(I)
heap.val = faiss.swig_ptr(D)
faiss.hammings_knn_hc(
heap, faiss.swig_ptr(xq), faiss.swig_ptr(xb), nb,
d, 1
)
elif variant == "mc":
faiss.hammings_knn_mc(
faiss.swig_ptr(xq), faiss.swig_ptr(xb), nq, nb, k, d,
faiss.swig_ptr(D), faiss.swig_ptr(I)
)
else:
raise NotImplementedError
return D, I
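
# Usage sketch on binary codes (each row is d bytes = 8 * d bits):
#
#   xb = np.random.randint(256, size=(1000, 8)).astype('uint8')
#   xq = np.random.randint(256, size=(10, 8)).astype('uint8')
#   D, I = knn_hamming(xq, xb, 5)   # D holds int32 Hamming distances
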
###########################################
# Kmeans object
###########################################
class Kmeans:
"""Object that performs k-means clustering and manages the centroids.
The `Kmeans` class is essentially a wrapper around the C++ `Clustering` object.
Parameters
----------
d : int
dimension of the vectors to cluster
k : int
number of clusters
gpu: bool or int, optional
False: don't use GPU
True: use all GPUs
number: use this many GPUs
progressive_dim_steps:
use a progressive dimension clustering (with that number of steps)
    Subsequent parameters are fields of the Clustering object. The most important are:
niter: int, optional
clustering iterations
nredo: int, optional
redo clustering this many times and keep best
verbose: bool, optional
spherical: bool, optional
do we want normalized centroids?
int_centroids: bool, optional
round centroids coordinates to integer
seed: int, optional
seed for the random number generator
"""
def __init__(self, d, k, **kwargs):
"""d: input dimension, k: nb of centroids. Additional
parameters are passed on the ClusteringParameters object,
including niter=25, verbose=False, spherical = False
"""
self.d = d
self.reset(k)
self.gpu = False
if "progressive_dim_steps" in kwargs:
self.cp = ProgressiveDimClusteringParameters()
else:
self.cp = ClusteringParameters()
for k, v in kwargs.items():
if k == 'gpu':
                # `is True` rather than `==` so that gpu=1 means one GPU,
                # not all of them (since 1 == True in Python)
                if v is True or v == -1:
v = get_num_gpus()
self.gpu = v
else:
# if this raises an exception, it means that it is a non-existent field
getattr(self.cp, k)
setattr(self.cp, k, v)
self.set_index()
def set_index(self):
d = self.d
if self.cp.__class__ == ClusteringParameters:
if self.cp.spherical:
self.index = IndexFlatIP(d)
else:
self.index = IndexFlatL2(d)
if self.gpu:
self.index = faiss.index_cpu_to_all_gpus(self.index, ngpu=self.gpu)
else:
if self.gpu:
fac = GpuProgressiveDimIndexFactory(ngpu=self.gpu)
else:
fac = ProgressiveDimIndexFactory()
self.fac = fac
def reset(self, k=None):
""" prepare k-means object to perform a new clustering, possibly
with another number of centroids """
if k is not None:
self.k = int(k)
self.centroids = None
self.obj = None
self.iteration_stats = None
def train(self, x, weights=None, init_centroids=None):
""" Perform k-means clustering.
On output of the function call:
- the centroids are in the centroids field of size (`k`, `d`).
- the objective value at each iteration is in the array obj (size `niter`)
- detailed optimization statistics are in the array iteration_stats.
Parameters
----------
x : array_like
Training vectors, shape (n, d), `dtype` must be float32 and n should
be larger than the number of clusters `k`.
weights : array_like
weight associated to each vector, shape `n`
        init_centroids : array_like
            initial set of centroids, shape (k, d)
Returns
-------
final_obj: float
final optimization objective
"""
x = np.ascontiguousarray(x, dtype='float32')
n, d = x.shape
assert d == self.d
if self.cp.__class__ == ClusteringParameters:
# regular clustering
clus = Clustering(d, self.k, self.cp)
if init_centroids is not None:
nc, d2 = init_centroids.shape
assert d2 == d
faiss.copy_array_to_vector(init_centroids.ravel(), clus.centroids)
clus.train(x, self.index, weights)
else:
# not supported for progressive dim
assert weights is None
assert init_centroids is None
assert not self.cp.spherical
clus = ProgressiveDimClustering(d, self.k, self.cp)
clus.train(n, swig_ptr(x), self.fac)
centroids = faiss.vector_float_to_array(clus.centroids)
self.centroids = centroids.reshape(self.k, d)
stats = clus.iteration_stats
stats = [stats.at(i) for i in range(stats.size())]
self.obj = np.array([st.obj for st in stats])
# copy all the iteration_stats objects to a python array
stat_fields = 'obj time time_search imbalance_factor nsplit'.split()
self.iteration_stats = [
{field: getattr(st, field) for field in stat_fields}
for st in stats
]
return self.obj[-1] if self.obj.size > 0 else 0.0
def assign(self, x):
x = np.ascontiguousarray(x, dtype='float32')
assert self.centroids is not None, "should train before assigning"
self.index.reset()
self.index.add(self.centroids)
D, I = self.index.search(x, 1)
return D.ravel(), I.ravel()
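
# Usage sketch: cluster random training data; keyword arguments are
# passed through to the C++ ClusteringParameters:
#
#   x = rand(10000 * 20).reshape(10000, 20)
#   km = Kmeans(20, 64, niter=10, verbose=True)
#   km.train(x)            # km.centroids has shape (64, 20)
#   D, I = km.assign(x)    # nearest centroid id for each input vector
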
###########################################
# Packing and unpacking bitstrings
###########################################
def is_sequence(x):
return isinstance(x, collections.abc.Sequence)
pack_bitstrings_c = pack_bitstrings
def pack_bitstrings(a, nbit):
"""
    Pack a set of integers (i, j), where i=0:n and j=0:M, into
    n bitstrings.
    Output is a uint8 array of size (n, code_size), where code_size is
    such that at most 7 bits per code are wasted.
    If nbit is an integer: all entries take nbit bits.
    If nbit is an array: entry (i, j) takes nbit[j] bits.
"""
n, M = a.shape
a = np.ascontiguousarray(a, dtype='int32')
if is_sequence(nbit):
nbit = np.ascontiguousarray(nbit, dtype='int32')
assert nbit.shape == (M,)
code_size = int((nbit.sum() + 7) // 8)
b = np.empty((n, code_size), dtype='uint8')
pack_bitstrings_c(
n, M, swig_ptr(nbit), swig_ptr(a), swig_ptr(b), code_size)
else:
code_size = (M * nbit + 7) // 8
b = np.empty((n, code_size), dtype='uint8')
pack_bitstrings_c(n, M, nbit, swig_ptr(a), swig_ptr(b), code_size)
return b
unpack_bitstrings_c = unpack_bitstrings
def unpack_bitstrings(b, M_or_nbits, nbit=None):
"""
    Unpack a set of integers (i, j), where i=0:n and j=0:M, from
    n bitstrings (encoded as uint8s).
    Input is a uint8 array of size (n, code_size), where code_size is
    such that at most 7 bits per code are wasted.
Two forms:
- when called with (array, M, nbit): there are M entries of size
nbit per row
- when called with (array, nbits): element (i, j) is encoded in
nbits[j] bits
"""
n, code_size = b.shape
if nbit is None:
nbit = np.ascontiguousarray(M_or_nbits, dtype='int32')
M = len(nbit)
min_code_size = int((nbit.sum() + 7) // 8)
assert code_size >= min_code_size
a = np.empty((n, M), dtype='int32')
unpack_bitstrings_c(
n, M, swig_ptr(nbit),
swig_ptr(b), code_size, swig_ptr(a))
else:
M = M_or_nbits
min_code_size = (M * nbit + 7) // 8
assert code_size >= min_code_size
a = np.empty((n, M), dtype='int32')
unpack_bitstrings_c(
n, M, nbit, swig_ptr(b), code_size, swig_ptr(a))
return a
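
# Usage sketch: pack/unpack round-trip with M entries of nbit bits each:
#
#   M, nbit = 8, 6
#   a = randint(100 * M, vmax=1 << nbit).reshape(100, M)
#   codes = pack_bitstrings(a, nbit)        # uint8, shape (100, 6)
#   a2 = unpack_bitstrings(codes, M, nbit)  # int32
#   assert np.all(a == a2)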