# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# @nolint

# not linting this file because it imports * from swigfaiss, which
# causes a ton of useless warnings.

import numpy as np

from faiss.loader import *

import faiss
import collections.abc

###########################################
# Wrapper for a few functions
###########################################


def kmin(array, k):
    """return k smallest values (and their indices) of the lines of a
    float32 array"""
    array = np.ascontiguousarray(array, dtype='float32')
    m, n = array.shape
    I = np.zeros((m, k), dtype='int64')
    D = np.zeros((m, k), dtype='float32')
    ha = faiss.float_maxheap_array_t()
    ha.ids = swig_ptr(I)
    ha.val = swig_ptr(D)
    ha.nh = m
    ha.k = k
    ha.heapify()
    ha.addn(n, swig_ptr(array))
    ha.reorder()
    return D, I


def kmax(array, k):
    """return k largest values (and their indices) of the lines of a
    float32 array"""
    array = np.ascontiguousarray(array, dtype='float32')
    m, n = array.shape
    I = np.zeros((m, k), dtype='int64')
    D = np.zeros((m, k), dtype='float32')
    ha = faiss.float_minheap_array_t()
    ha.ids = swig_ptr(I)
    ha.val = swig_ptr(D)
    ha.nh = m
    ha.k = k
    ha.heapify()
    ha.addn(n, swig_ptr(array))
    ha.reorder()
    return D, I


def pairwise_distances(xq, xb, metric=METRIC_L2, metric_arg=0):
    """compute the whole pairwise distance matrix between two sets of
    vectors"""
    xq = np.ascontiguousarray(xq, dtype='float32')
    xb = np.ascontiguousarray(xb, dtype='float32')
    nq, d = xq.shape
    nb, d2 = xb.shape
    assert d == d2
    dis = np.empty((nq, nb), dtype='float32')
    if metric == METRIC_L2:
        pairwise_L2sqr(
            d, nq, swig_ptr(xq),
            nb, swig_ptr(xb),
            swig_ptr(dis))
    elif metric == METRIC_INNER_PRODUCT:
        dis[:] = xq @ xb.T
    else:
        pairwise_extra_distances(
            d, nq, swig_ptr(xq),
            nb, swig_ptr(xb),
            metric, metric_arg,
            swig_ptr(dis))
    return dis


def rand(n, seed=12345):
    res = np.empty(n, dtype='float32')
    float_rand(swig_ptr(res), res.size, seed)
    return res


def randint(n, seed=12345, vmax=None):
    res = np.empty(n, dtype='int64')
    if vmax is None:
        int64_rand(swig_ptr(res), res.size, seed)
    else:
        int64_rand_max(swig_ptr(res), res.size, vmax, seed)
    return res


lrand = randint


def randn(n, seed=12345):
    res = np.empty(n, dtype='float32')
    float_randn(swig_ptr(res), res.size, seed)
    return res


def checksum(a):
    """ compute a checksum for quick-and-dirty comparisons of arrays """
    a = a.view('uint8')
    if a.ndim == 1:
        return bvec_checksum(a.size, swig_ptr(a))
    n, d = a.shape
    cs = np.zeros(n, dtype='uint64')
    bvecs_checksum(n, d, swig_ptr(a), swig_ptr(cs))
    return cs


rand_smooth_vectors_c = rand_smooth_vectors

def rand_smooth_vectors(n, d, seed=1234):
    res = np.empty((n, d), dtype='float32')
    rand_smooth_vectors_c(n, d, swig_ptr(res), seed)
    return res


def eval_intersection(I1, I2):
    """ total size of the intersection between corresponding lines of
    two result tables """
    I1 = np.ascontiguousarray(I1, dtype='int64')
    I2 = np.ascontiguousarray(I2, dtype='int64')
    n = I1.shape[0]
    assert I2.shape[0] == n
    k1, k2 = I1.shape[1], I2.shape[1]
    ninter = 0
    for i in range(n):
        ninter += ranklist_intersection_size(
            k1, swig_ptr(I1[i]), k2, swig_ptr(I2[i]))
    return ninter


def normalize_L2(x):
    """L2-normalize the rows of x in place (x must be a contiguous
    float32 array)."""
    fvec_renorm_L2(x.shape[1], x.shape[0], swig_ptr(x))
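
# Minimal usage sketch (illustrative only; the _demo_* helpers in this file
# are hypothetical additions, not part of the faiss API, and are never
# called at import time). Shows how kmin/kmax relate to pairwise_distances:
# kmin over a distance matrix is a brute-force k-nearest-neighbor search.

def _demo_kmin_kmax():
    x = rand(4 * 10, seed=42).reshape(4, 10)  # 4 rows of 10 random floats
    Dmin, Imin = kmin(x, 3)     # 3 smallest values per row + column indices
    Dmax, Imax = kmax(x, 3)     # 3 largest values per row + column indices
    dis = pairwise_distances(x, x)   # squared L2 distances by default
    D, I = kmin(dis, 3)              # I[i, 0] == i: each row matches itself
    return D, I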
bucket_sort_c = bucket_sort

def bucket_sort(tab, nbucket=None, nt=0):
    """Perform a bucket sort on a table of integers.

    Parameters
    ----------
    tab : array_like
        elements to sort, max value nbucket - 1
    nbucket : integer
        number of buckets, None if unknown
    nt : integer
        number of threads to use (0 = use unthreaded codepath)

    Returns
    -------
    lims : array_like
        cumulative sum of bucket sizes (size nbucket + 1)
    perm : array_like
        perm[lims[i] : lims[i + 1]] contains the indices of bucket #i
        (size tab.size)
    """
    tab = np.ascontiguousarray(tab, dtype="int64")
    if nbucket is None:
        nbucket = int(tab.max() + 1)
    lims = np.empty(nbucket + 1, dtype='int64')
    perm = np.empty(tab.size, dtype='int64')
    bucket_sort_c(
        tab.size, faiss.swig_ptr(tab.view('uint64')),
        nbucket, faiss.swig_ptr(lims), faiss.swig_ptr(perm), nt
    )
    return lims, perm


matrix_bucket_sort_inplace_c = matrix_bucket_sort_inplace

def matrix_bucket_sort_inplace(tab, nbucket=None, nt=0):
    """Perform a bucket sort on a matrix, recording the original row of
    each element.

    Parameters
    ----------
    tab : array_like
        array of size (N, ncol) that contains the bucket ids, maximum
        value nbucket - 1. On output, the elements are shuffled such
        that the flat array tab.ravel()[lims[i] : lims[i + 1]] contains
        the row numbers of each bucket entry.
    nbucket : integer
        number of buckets (the maximum value in tab should be nbucket - 1)
    nt : integer
        number of threads to use (0 = use unthreaded codepath)

    Returns
    -------
    lims : array_like
        cumulative sum of bucket sizes (size nbucket + 1)
    """
    assert tab.dtype == 'int32' or tab.dtype == 'int64'
    nrow, ncol = tab.shape
    if nbucket is None:
        nbucket = int(tab.max() + 1)
    lims = np.empty(nbucket + 1, dtype='int64')
    matrix_bucket_sort_inplace_c(
        nrow, ncol, faiss.swig_ptr(tab),
        nbucket, faiss.swig_ptr(lims), nt
    )
    return lims
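
# Minimal usage sketch (illustrative only; _demo_bucket_sort is a
# hypothetical helper, not part of the faiss API): bucket_sort groups
# the positions of equal values together.

def _demo_bucket_sort():
    tab = np.array([2, 0, 2, 1], dtype='int64')
    lims, perm = bucket_sort(tab, nbucket=3)
    # lims == [0, 1, 2, 4]: buckets 0, 1, 2 have sizes 1, 1, 2
    # perm == [1, 3, 0, 2]: perm[lims[i]:lims[i + 1]] lists the positions
    # in tab that hold value i
    return lims, perm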
###########################################
# ResultHeap
###########################################

class ResultHeap:
    """Accumulate query results from a sliced dataset. The final result
    will be in self.D, self.I."""

    def __init__(self, nq, k, keep_max=False):
        """
        nq: number of query vectors,
        k: number of results per query
        keep_max: keep the top-k maximum values instead of the minima
        """
        self.I = np.zeros((nq, k), dtype='int64')
        self.D = np.zeros((nq, k), dtype='float32')
        self.nq, self.k = nq, k
        if keep_max:
            heaps = float_minheap_array_t()
        else:
            heaps = float_maxheap_array_t()
        heaps.k = k
        heaps.nh = nq
        heaps.val = swig_ptr(self.D)
        heaps.ids = swig_ptr(self.I)
        heaps.heapify()
        self.heaps = heaps

    def add_result(self, D, I):
        """
        Add results for all heaps.
        D, I should be of size (nh, nres).
        D, I do not need to be in a particular order (heap or sorted).
        """
        nq, kd = D.shape
        D = np.ascontiguousarray(D, dtype='float32')
        I = np.ascontiguousarray(I, dtype='int64')
        assert I.shape == (nq, kd)
        assert nq == self.nq
        self.heaps.addn_with_ids(
            kd, swig_ptr(D),
            swig_ptr(I), kd)

    def add_result_subset(self, subset, D, I):
        """
        Add results for a subset of heaps.
        D, I should hold results for the whole subset.
        As a special case, if I is 1D, all ids are assumed to be the same.
        """
        nsubset, kd = D.shape
        assert nsubset == len(subset)
        assert (
            I.ndim == 2 and D.shape == I.shape or
            I.ndim == 1 and I.shape == (kd, )
        )
        D = np.ascontiguousarray(D, dtype='float32')
        I = np.ascontiguousarray(I, dtype='int64')
        subset = np.ascontiguousarray(subset, dtype='int64')
        id_stride = 0 if I.ndim == 1 else kd
        self.heaps.addn_query_subset_with_ids(
            nsubset, swig_ptr(subset),
            kd, swig_ptr(D), swig_ptr(I), id_stride
        )

    def finalize(self):
        self.heaps.reorder()


def merge_knn_results(Dall, Iall, keep_max=False):
    """
    Merge a set of sorted knn-results obtained from different shards of
    a dataset.
    Dall and Iall are of size (nshard, nq, k); each D[i, j] should be sorted.
    Returns D, I of size (nq, k) as the merged result set.
    """
    assert Iall.shape == Dall.shape
    nshard, n, k = Dall.shape
    Dnew = np.empty((n, k), dtype=Dall.dtype)
    Inew = np.empty((n, k), dtype=Iall.dtype)
    func = merge_knn_results_CMax if keep_max else merge_knn_results_CMin
    func(
        n, k, nshard,
        swig_ptr(Dall), swig_ptr(Iall),
        swig_ptr(Dnew), swig_ptr(Inew)
    )
    return Dnew, Inew


######################################################
# Efficient ID to ID map
######################################################

class MapInt64ToInt64:
    """Hash table mapping int64 keys to int64 values, with a fixed
    power-of-2 capacity."""

    def __init__(self, capacity):
        self.log2_capacity = int(np.log2(capacity))
        assert capacity == 2 ** self.log2_capacity, "need power of 2 capacity"
        self.capacity = capacity
        self.tab = np.empty((capacity, 2), dtype='int64')
        faiss.hashtable_int64_to_int64_init(
            self.log2_capacity, swig_ptr(self.tab))

    def add(self, keys, vals):
        n, = keys.shape
        assert vals.shape == (n,)
        faiss.hashtable_int64_to_int64_add(
            self.log2_capacity, swig_ptr(self.tab),
            n, swig_ptr(keys), swig_ptr(vals))

    def lookup(self, keys):
        n, = keys.shape
        vals = np.empty((n,), dtype='int64')
        faiss.hashtable_int64_to_int64_lookup(
            self.log2_capacity, swig_ptr(self.tab),
            n, swig_ptr(keys), swig_ptr(vals))
        return vals


######################################################
# KNN function
######################################################

def knn(xq, xb, k, metric=METRIC_L2):
    """
    Compute the k nearest neighbors of a set of query vectors without
    constructing an index.

    Parameters
    ----------
    xq : array_like
        Query vectors, shape (nq, d) where the dimension d is the same
        as xb. `dtype` must be float32.
    xb : array_like
        Database vectors, shape (nb, d) where the dimension d is the
        same as xq. `dtype` must be float32.
    k : int
        Number of nearest neighbors.
    metric : MetricType, optional
        distance measure to use (either METRIC_L2 or METRIC_INNER_PRODUCT)

    Returns
    -------
    D : array_like
        Distances of the nearest neighbors, shape (nq, k)
    I : array_like
        Labels of the nearest neighbors, shape (nq, k)
    """
    xq = np.ascontiguousarray(xq, dtype='float32')
    xb = np.ascontiguousarray(xb, dtype='float32')
    nq, d = xq.shape
    nb, d2 = xb.shape
    assert d == d2

    I = np.empty((nq, k), dtype='int64')
    D = np.empty((nq, k), dtype='float32')

    if metric == METRIC_L2:
        knn_L2sqr(
            swig_ptr(xq), swig_ptr(xb),
            d, nq, nb, k, swig_ptr(D), swig_ptr(I)
        )
    elif metric == METRIC_INNER_PRODUCT:
        knn_inner_product(
            swig_ptr(xq), swig_ptr(xb),
            d, nq, nb, k, swig_ptr(D), swig_ptr(I)
        )
    else:
        raise NotImplementedError("only L2 and INNER_PRODUCT are supported")
    return D, I
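
# Minimal usage sketch (illustrative only; _demo_knn_sliced is a
# hypothetical helper, not part of the faiss API): combining knn() on
# database slices with a ResultHeap, the sliced-dataset pattern the
# ResultHeap docstring refers to. Assumes xq, xb are float32 arrays and
# that every slice holds at least k vectors.

def _demo_knn_sliced(xq, xb, k, slice_size=1000):
    rh = ResultHeap(xq.shape[0], k)
    for i0 in range(0, xb.shape[0], slice_size):
        D, I = knn(xq, xb[i0:i0 + slice_size], k)
        rh.add_result(D, I + i0)   # shift slice-local ids to global ids
    rh.finalize()                  # sort each heap into ascending order
    return rh.D, rh.I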
def knn_hamming(xq, xb, k, variant="hc"):
    """
    Compute the k nearest neighbors of a set of vectors without
    constructing an index.

    Parameters
    ----------
    xq : array_like
        Query vectors, shape (nq, d) where d is the number of bits / 8.
        `dtype` must be uint8.
    xb : array_like
        Database vectors, shape (nb, d) where d is the number of bits / 8.
        `dtype` must be uint8.
    k : int
        Number of nearest neighbors.
    variant : string
        Function variant to use, either "mc" (counter) or "hc" (heap)

    Returns
    -------
    D : array_like
        Distances of the nearest neighbors, shape (nq, k)
    I : array_like
        Labels of the nearest neighbors, shape (nq, k)
    """
    nq, d = xq.shape
    nb, d2 = xb.shape
    assert d == d2

    D = np.empty((nq, k), dtype='int32')
    I = np.empty((nq, k), dtype='int64')

    if variant == "hc":
        heap = faiss.int_maxheap_array_t()
        heap.k = k
        heap.nh = nq
        heap.ids = faiss.swig_ptr(I)
        heap.val = faiss.swig_ptr(D)
        faiss.hammings_knn_hc(
            heap, faiss.swig_ptr(xq), faiss.swig_ptr(xb), nb, d, 1
        )
    elif variant == "mc":
        faiss.hammings_knn_mc(
            faiss.swig_ptr(xq), faiss.swig_ptr(xb), nq, nb, k, d,
            faiss.swig_ptr(D), faiss.swig_ptr(I)
        )
    else:
        raise NotImplementedError(f"unknown variant {variant!r}")
    return D, I
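
# Minimal usage sketch (illustrative only; _demo_knn_hamming is a
# hypothetical helper, not part of the faiss API): searching 64-bit
# binary codes, stored as 8 uint8 bytes per vector.

def _demo_knn_hamming():
    rs = np.random.RandomState(123)
    xb = rs.randint(256, size=(1000, 8)).astype('uint8')
    xq = rs.randint(256, size=(10, 8)).astype('uint8')
    D, I = knn_hamming(xq, xb, 5)   # D holds int32 Hamming distances
    return D, I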
###########################################
# Kmeans object
###########################################

class Kmeans:
    """Object that performs k-means clustering and manages the centroids.
    The `Kmeans` class is essentially a wrapper around the C++
    `Clustering` object.

    Parameters
    ----------
    d : int
       dimension of the vectors to cluster
    k : int
       number of clusters
    gpu: bool or int, optional
       False: don't use GPU
       True: use all GPUs
       number: use this many GPUs
    progressive_dim_steps:
       use a progressive dimension clustering (with that number of steps)

    Subsequent parameters are fields of the Clustering object. The most
    important are:

    niter: int, optional
       clustering iterations
    nredo: int, optional
       redo clustering this many times and keep the best result
    verbose: bool, optional
    spherical: bool, optional
       do we want normalized centroids?
    int_centroids: bool, optional
       round centroids coordinates to integer
    seed: int, optional
       seed for the random number generator
    """

    def __init__(self, d, k, **kwargs):
        """d: input dimension, k: nb of centroids. Additional parameters are
        passed on to the ClusteringParameters object, including
        niter=25, verbose=False, spherical=False
        """
        self.d = d
        self.reset(k)
        self.gpu = False
        if "progressive_dim_steps" in kwargs:
            self.cp = ProgressiveDimClusteringParameters()
        else:
            self.cp = ClusteringParameters()
        for k, v in kwargs.items():
            if k == 'gpu':
                # gpu=True or gpu=-1 means use all GPUs; an int means
                # use that many ("is True" avoids catching gpu=1 here)
                if v is True or v == -1:
                    v = get_num_gpus()
                self.gpu = v
            else:
                # if this raises an exception, the field does not exist
                # on the ClusteringParameters object
                getattr(self.cp, k)
                setattr(self.cp, k, v)
        self.set_index()

    def set_index(self):
        d = self.d
        if self.cp.__class__ == ClusteringParameters:
            if self.cp.spherical:
                self.index = IndexFlatIP(d)
            else:
                self.index = IndexFlatL2(d)
            if self.gpu:
                self.index = faiss.index_cpu_to_all_gpus(
                    self.index, ngpu=self.gpu)
        else:
            if self.gpu:
                fac = GpuProgressiveDimIndexFactory(ngpu=self.gpu)
            else:
                fac = ProgressiveDimIndexFactory()
            self.fac = fac

    def reset(self, k=None):
        """ prepare k-means object to perform a new clustering, possibly
        with another number of centroids """
        if k is not None:
            self.k = int(k)
        self.centroids = None
        self.obj = None
        self.iteration_stats = None

    def train(self, x, weights=None, init_centroids=None):
        """ Perform k-means clustering.
        On output of the function call:

        - the centroids are in the centroids field of size (`k`, `d`).

        - the objective value at each iteration is in the array obj (size
          `niter`)

        - detailed optimization statistics are in the array iteration_stats.

        Parameters
        ----------
        x : array_like
            Training vectors, shape (n, d), `dtype` must be float32 and
            n should be larger than the number of clusters `k`.
        weights : array_like
            weight associated to each vector, shape `n`
        init_centroids : array_like
            initial set of centroids, shape (k, d)

        Returns
        -------
        final_obj: float
            final optimization objective
        """
        x = np.ascontiguousarray(x, dtype='float32')
        n, d = x.shape
        assert d == self.d

        if self.cp.__class__ == ClusteringParameters:
            # regular clustering
            clus = Clustering(d, self.k, self.cp)
            if init_centroids is not None:
                nc, d2 = init_centroids.shape
                assert d2 == d
                faiss.copy_array_to_vector(
                    init_centroids.ravel(), clus.centroids)
            clus.train(x, self.index, weights)
        else:
            # progressive dim clustering does not support weights,
            # initial centroids or spherical mode
            assert weights is None
            assert init_centroids is None
            assert not self.cp.spherical
            clus = ProgressiveDimClustering(d, self.k, self.cp)
            clus.train(n, swig_ptr(x), self.fac)

        centroids = faiss.vector_float_to_array(clus.centroids)

        self.centroids = centroids.reshape(self.k, d)
        stats = clus.iteration_stats
        stats = [stats.at(i) for i in range(stats.size())]
        self.obj = np.array([st.obj for st in stats])
        # copy all the iteration_stats objects to a python array
        stat_fields = 'obj time time_search imbalance_factor nsplit'.split()
        self.iteration_stats = [
            {field: getattr(st, field) for field in stat_fields}
            for st in stats
        ]
        return self.obj[-1] if self.obj.size > 0 else 0.0

    def assign(self, x):
        x = np.ascontiguousarray(x, dtype='float32')
        assert self.centroids is not None, "should train before assigning"
        self.index.reset()
        self.index.add(self.centroids)
        D, I = self.index.search(x, 1)
        return D.ravel(), I.ravel()
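
# Minimal usage sketch (illustrative only; _demo_kmeans is a hypothetical
# helper, not part of the faiss API): niter and verbose are forwarded to
# the ClusteringParameters object.

def _demo_kmeans():
    x = randn(1000 * 32, seed=1234).reshape(1000, 32)
    km = Kmeans(32, 8, niter=10, verbose=False)
    obj = km.train(x)        # final k-means objective value
    D, I = km.assign(x)      # nearest centroid id for every input point
    return km.centroids, obj, D, I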
###########################################
# Packing and unpacking bitstrings
###########################################

def is_sequence(x):
    # numpy arrays are not collections.abc.Sequence, so test for them
    # explicitly as well
    return isinstance(x, (collections.abc.Sequence, np.ndarray))


pack_bitstrings_c = pack_bitstrings

def pack_bitstrings(a, nbit):
    """
    Pack a set of integers (i, j) where i=0:n and j=0:M into
    n bitstrings.
    Output is an uint8 array of size (n, code_size), where code_size is
    such that at most 7 bits per code are wasted.

    If nbit is an integer: all entries take nbit bits.
    If nbit is an array: entry (i, j) takes nbit[j] bits.
    """
    n, M = a.shape
    a = np.ascontiguousarray(a, dtype='int32')
    if is_sequence(nbit):
        nbit = np.ascontiguousarray(nbit, dtype='int32')
        assert nbit.shape == (M,)
        code_size = int((nbit.sum() + 7) // 8)
        b = np.empty((n, code_size), dtype='uint8')
        pack_bitstrings_c(
            n, M, swig_ptr(nbit), swig_ptr(a), swig_ptr(b), code_size)
    else:
        code_size = (M * nbit + 7) // 8
        b = np.empty((n, code_size), dtype='uint8')
        pack_bitstrings_c(n, M, nbit, swig_ptr(a), swig_ptr(b), code_size)
    return b


unpack_bitstrings_c = unpack_bitstrings

def unpack_bitstrings(b, M_or_nbits, nbit=None):
    """
    Unpack a set of integers (i, j) where i=0:n and j=0:M from
    n bitstrings (encoded as uint8s).
    Input is an uint8 array of size (n, code_size), where code_size is
    such that at most 7 bits per code are wasted.

    Two forms:
    - when called with (array, M, nbit): there are M entries of size
      nbit per row
    - when called with (array, nbits): element (i, j) is encoded in
      nbits[j] bits
    """
    n, code_size = b.shape
    if nbit is None:
        nbit = np.ascontiguousarray(M_or_nbits, dtype='int32')
        M = len(nbit)
        min_code_size = int((nbit.sum() + 7) // 8)
        assert code_size >= min_code_size
        a = np.empty((n, M), dtype='int32')
        unpack_bitstrings_c(
            n, M, swig_ptr(nbit), swig_ptr(b), code_size, swig_ptr(a))
    else:
        M = M_or_nbits
        min_code_size = (M * nbit + 7) // 8
        assert code_size >= min_code_size
        a = np.empty((n, M), dtype='int32')
        unpack_bitstrings_c(
            n, M, nbit, swig_ptr(b), code_size, swig_ptr(a))
    return a
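
# Minimal usage sketch (illustrative only; _demo_pack_unpack is a
# hypothetical helper, not part of the faiss API): a round-trip with
# M = 3 codes of 4 bits each, packed into 2 bytes per row.

def _demo_pack_unpack():
    a = np.array([[3, 5, 1], [7, 0, 15]], dtype='int32')
    b = pack_bitstrings(a, 4)        # (2, 2) uint8: 12 bits -> 2 bytes
    a2 = unpack_bitstrings(b, 3, 4)  # (array, M, nbit) calling form
    assert np.all(a == a2)
    return b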