Spaces:
Build error
Build error
use std::collections::{HashMap, HashSet}; | |
use std::sync::atomic::AtomicBool; | |
use common::types::TelemetryDetail; | |
use itertools::Itertools; | |
use rand::prelude::StdRng; | |
use rand::{Rng, SeedableRng}; | |
use segment::data_types::named_vectors::NamedVectors; | |
use segment::data_types::query_context::{QueryContext, VectorQueryContext}; | |
use segment::data_types::vectors::{QueryVector, VectorElementType, VectorInternal}; | |
use segment::entry::entry_point::SegmentEntry; | |
use segment::fixtures::payload_fixtures::random_vector; | |
use segment::index::sparse_index::sparse_index_config::{SparseIndexConfig, SparseIndexType}; | |
use segment::index::sparse_index::sparse_vector_index::SparseVectorIndexOpenArgs; | |
use segment::index::VectorIndex; | |
use segment::segment_constructor::{build_segment, create_sparse_vector_index_test}; | |
use segment::types::{ | |
Condition, Distance, ExtendedPointId, Filter, HasIdCondition, Indexes, PointIdType, | |
SegmentConfig, SeqNumberType, SparseVectorDataConfig, VectorDataConfig, VectorStorageDatatype, | |
VectorStorageType, DEFAULT_SPARSE_FULL_SCAN_THRESHOLD, | |
}; | |
use segment::vector_storage::query::{ContextPair, DiscoveryQuery}; | |
use sparse::common::sparse_vector::SparseVector; | |
use tempfile::Builder; | |
/// Upper bound for the number of positive/negative context pairs in a random discovery query.
///
/// `random_discovery_query` draws the pair count from `1..MAX_EXAMPLE_PAIRS`, so this bound is
/// exclusive: the generated queries carry between 1 and `MAX_EXAMPLE_PAIRS - 1` pairs.
const MAX_EXAMPLE_PAIRS: usize = 3;

/// Vector name under which the vector is registered in both the sparse and the dense
/// segment configuration used by these tests.
const SPARSE_VECTOR_NAME: &str = "sparse_test";
fn convert_to_sparse_vector(vector: &[VectorElementType]) -> SparseVector { | |
let mut sparse_vector = SparseVector::default(); | |
for (idx, value) in vector.iter().enumerate() { | |
sparse_vector.indices.push(idx as u32); | |
sparse_vector.values.push(*value); | |
} | |
sparse_vector | |
} | |
fn random_named_vector<R: Rng + ?Sized>(rnd: &mut R, dim: usize) -> (NamedVectors, NamedVectors) { | |
let dense_vector = random_vector(rnd, dim); | |
let sparse_vector = convert_to_sparse_vector(&dense_vector); | |
let mut sparse_result = NamedVectors::default(); | |
sparse_result.insert(SPARSE_VECTOR_NAME.to_owned(), sparse_vector.into()); | |
let mut dense_result = NamedVectors::default(); | |
dense_result.insert(SPARSE_VECTOR_NAME.to_owned(), dense_vector.into()); | |
(sparse_result, dense_result) | |
} | |
fn random_discovery_query<R: Rng + ?Sized>(rnd: &mut R, dim: usize) -> (QueryVector, QueryVector) { | |
let num_pairs: usize = rnd.gen_range(1..MAX_EXAMPLE_PAIRS); | |
let dense_target = random_vector(rnd, dim); | |
let sparse_target = convert_to_sparse_vector(&dense_target); | |
let dense_pairs = (0..num_pairs) | |
.map(|_| { | |
let positive = random_vector(rnd, dim); | |
let negative = random_vector(rnd, dim); | |
(positive, negative) | |
}) | |
.collect_vec(); | |
let sparse_pairs = (0..num_pairs) | |
.map(|i| { | |
let positive = convert_to_sparse_vector(&dense_pairs[i].0); | |
let negative = convert_to_sparse_vector(&dense_pairs[i].1); | |
(positive, negative) | |
}) | |
.collect_vec(); | |
let dense_query = DiscoveryQuery::new( | |
dense_target.into(), | |
dense_pairs | |
.into_iter() | |
.map(|(positive, negative)| ContextPair { | |
positive: positive.into(), | |
negative: negative.into(), | |
}) | |
.collect(), | |
) | |
.into(); | |
let sparse_query = DiscoveryQuery::new( | |
sparse_target.into(), | |
sparse_pairs | |
.into_iter() | |
.map(|(positive, negative)| ContextPair { | |
positive: positive.into(), | |
negative: negative.into(), | |
}) | |
.collect(), | |
) | |
.into(); | |
(sparse_query, dense_query) | |
} | |
fn random_nearest_query<R: Rng + ?Sized>(rnd: &mut R, dim: usize) -> (QueryVector, QueryVector) { | |
let dense_target = random_vector(rnd, dim); | |
let sparse_target = convert_to_sparse_vector(&dense_target); | |
(sparse_target.into(), dense_target.into()) | |
} | |
/// Cross-checks sparse index search results against a plain dense index.
///
/// The same 5,000 random 8-dimensional vectors are stored in a sparse segment (as fully
/// populated sparse vectors) and in a dense segment with a plain index and dot-product
/// distance. An immutable RAM sparse index is then built on top of the sparse segment's
/// storage, and for 100 random queries the test asserts that
///
/// * a discovery search returns the same `idx` sequence from both indexes,
/// * a nearest search returns the same `idx` sequence from both indexes,
/// * the nearest search on the sparse index increments the CPU hardware counter, and
/// * every nearest search is accounted for in the `unfiltered_sparse` telemetry.
#[test]
fn sparse_index_discover_test() {
    let stopped = AtomicBool::new(false);

    let dim = 8;
    let num_vectors: u64 = 5_000;
    let distance = Distance::Dot;

    // Fixed seed keeps the generated vectors and queries reproducible between runs.
    let mut rnd = StdRng::seed_from_u64(42);

    let dir = Builder::new().prefix("segment_dir").tempdir().unwrap();
    let index_dir = Builder::new().prefix("hnsw_dir").tempdir().unwrap();

    let sparse_config = SegmentConfig {
        vector_data: Default::default(),
        sparse_vector_data: HashMap::from([(
            SPARSE_VECTOR_NAME.to_owned(),
            SparseVectorDataConfig {
                index: SparseIndexConfig {
                    full_scan_threshold: Some(DEFAULT_SPARSE_FULL_SCAN_THRESHOLD),
                    index_type: SparseIndexType::MutableRam,
                    datatype: Some(VectorStorageDatatype::Float32),
                },
            },
        )]),
        payload_storage_type: Default::default(),
    };

    // Dense counterpart registered under the same vector name, searched through a plain index.
    let dense_config = SegmentConfig {
        vector_data: HashMap::from([(
            SPARSE_VECTOR_NAME.to_owned(),
            VectorDataConfig {
                size: dim,
                distance,
                storage_type: VectorStorageType::Memory,
                index: Indexes::Plain {},
                quantization_config: None,
                multivector_config: None,
                datatype: None,
            },
        )]),
        payload_storage_type: Default::default(),
        sparse_vector_data: Default::default(),
    };

    let mut sparse_segment = build_segment(dir.path(), &sparse_config, true).unwrap();
    let mut dense_segment = build_segment(dir.path(), &dense_config, true).unwrap();

    // Insert the same random vector into both segments under the same point id.
    for n in 0..num_vectors {
        let (sparse_vector, dense_vector) = random_named_vector(&mut rnd, dim);
        let idx = n.into();
        sparse_segment
            .upsert_point(n as SeqNumberType, idx, sparse_vector)
            .unwrap();
        dense_segment
            .upsert_point(n as SeqNumberType, idx, dense_vector)
            .unwrap();
    }

    // Build a separate immutable RAM sparse index over the sparse segment's components.
    let payload_index_ptr = sparse_segment.payload_index.clone();
    let vector_storage = &sparse_segment.vector_data[SPARSE_VECTOR_NAME].vector_storage;
    let sparse_index = create_sparse_vector_index_test(SparseVectorIndexOpenArgs {
        config: SparseIndexConfig {
            full_scan_threshold: Some(DEFAULT_SPARSE_FULL_SCAN_THRESHOLD),
            index_type: SparseIndexType::ImmutableRam,
            datatype: Some(VectorStorageDatatype::Float32),
        },
        id_tracker: sparse_segment.id_tracker.clone(),
        vector_storage: vector_storage.clone(),
        payload_index: payload_index_ptr,
        path: index_dir.path(),
        stopped: &stopped,
        tick_progress: || (),
    })
    .unwrap();

    let top = 3;
    let attempts = 100;

    for i in 0..attempts {
        // do discovery search
        let (sparse_query, dense_query) = random_discovery_query(&mut rnd, dim);

        let vec_context = VectorQueryContext::default();

        let sparse_discovery_result = sparse_index
            .search(&[&sparse_query], None, top, None, &vec_context)
            .unwrap();

        let dense_discovery_result = dense_segment.vector_data[SPARSE_VECTOR_NAME]
            .vector_index
            .borrow()
            .search(&[&dense_query], None, top, None, &vec_context)
            .unwrap();

        // check id only because scores can be epsilon-size different
        assert_eq!(
            sparse_discovery_result[0]
                .iter()
                .map(|r| r.idx)
                .collect_vec(),
            dense_discovery_result[0]
                .iter()
                .map(|r| r.idx)
                .collect_vec(),
        );

        // do regular nearest search
        let (sparse_query, dense_query) = random_nearest_query(&mut rnd, dim);

        let query_context = QueryContext::default();
        let segment_query_context = query_context.get_segment_query_context();
        let vector_context = segment_query_context.get_vector_context(SPARSE_VECTOR_NAME);

        let sparse_search_result = sparse_index
            .search(&[&sparse_query], None, top, None, &vector_context)
            .unwrap();

        // The sparse search must have been recorded by the CPU hardware counter.
        assert!(
            vector_context
                .hardware_counter()
                .unwrap()
                .cpu_counter()
                .get()
                > 0
        );

        let dense_search_result = dense_segment.vector_data[SPARSE_VECTOR_NAME]
            .vector_index
            .borrow()
            .search(&[&dense_query], None, top, None, &vector_context)
            .unwrap();

        // The measured values are not needed beyond the assertion above.
        segment_query_context
            .take_hardware_counter()
            .discard_results();

        // check that nearest search uses sparse index
        let telemetry = sparse_index.get_telemetry_data(TelemetryDetail::default());
        assert_eq!(telemetry.unfiltered_sparse.count, i + 1);

        // check id only because scores can be epsilon-size different
        assert_eq!(
            sparse_search_result[0].iter().map(|r| r.idx).collect_vec(),
            dense_search_result[0].iter().map(|r| r.idx).collect_vec(),
        );
    }
}
/// Verifies that a filtered search on the sparse index is recorded by the hardware counter.
///
/// A sparse segment is filled with 5,000 random 8-dimensional vectors and an immutable RAM
/// sparse index is built over its storage. The test asserts that the CPU counter of a fresh
/// vector query context reads 0, runs one nearest search restricted by a `HasId` filter, and
/// asserts that the counter is greater than 0 afterwards.
#[test]
fn sparse_index_hardware_measurement_test() {
    let stopped = AtomicBool::new(false);

    let dim = 8;
    let num_vectors: u64 = 5_000;

    // Fixed seed keeps the generated vectors reproducible between runs.
    let mut rnd = StdRng::seed_from_u64(42);

    let dir = Builder::new().prefix("segment_dir").tempdir().unwrap();
    let index_dir = Builder::new().prefix("hnsw_dir").tempdir().unwrap();

    let sparse_config = SegmentConfig {
        vector_data: Default::default(),
        sparse_vector_data: HashMap::from([(
            SPARSE_VECTOR_NAME.to_owned(),
            SparseVectorDataConfig {
                index: SparseIndexConfig {
                    full_scan_threshold: Some(DEFAULT_SPARSE_FULL_SCAN_THRESHOLD),
                    index_type: SparseIndexType::MutableRam,
                    datatype: Some(VectorStorageDatatype::Float32),
                },
            },
        )]),
        payload_storage_type: Default::default(),
    };

    let mut sparse_segment = build_segment(dir.path(), &sparse_config, true).unwrap();

    // Only the sparse half of each generated pair is needed here.
    for n in 0..num_vectors {
        let (sparse_vector, _) = random_named_vector(&mut rnd, dim);
        let idx = n.into();
        sparse_segment
            .upsert_point(n as SeqNumberType, idx, sparse_vector)
            .unwrap();
    }

    // Build a separate immutable RAM sparse index over the sparse segment's components.
    let payload_index_ptr = sparse_segment.payload_index.clone();
    let vector_storage = &sparse_segment.vector_data[SPARSE_VECTOR_NAME].vector_storage;
    let sparse_index = create_sparse_vector_index_test(SparseVectorIndexOpenArgs {
        config: SparseIndexConfig {
            full_scan_threshold: Some(DEFAULT_SPARSE_FULL_SCAN_THRESHOLD),
            index_type: SparseIndexType::ImmutableRam,
            datatype: Some(VectorStorageDatatype::Float32),
        },
        id_tracker: sparse_segment.id_tracker.clone(),
        vector_storage: vector_storage.clone(),
        payload_index: payload_index_ptr,
        path: index_dir.path(),
        stopped: &stopped,
        tick_progress: || (),
    })
    .unwrap();

    let query_vec = QueryVector::Nearest(VectorInternal::Sparse(
        SparseVector::new(vec![0, 1, 2], vec![42.0, 42.42, 42.4242]).unwrap(),
    ));

    let query_context = QueryContext::default();
    let segment_query_context = query_context.get_segment_query_context();
    let vector_context = segment_query_context.get_vector_context(SPARSE_VECTOR_NAME);

    // Before any search: a counter is attached and nothing has been measured yet.
    assert!(vector_context.hardware_counter().is_some());
    assert_eq!(
        vector_context
            .hardware_counter()
            .unwrap()
            .cpu_counter()
            .get(),
        0
    );

    // Some filter so we do plain sparse search
    let ids: HashSet<PointIdType> = (0..3).map(ExtendedPointId::NumId).collect();
    let filter = Filter::new_must(Condition::HasId(HasIdCondition::from(ids)));

    sparse_index
        .search(&[&query_vec], Some(&filter), 1, None, &vector_context)
        .unwrap();

    // After the search the CPU counter must have advanced.
    assert!(
        vector_context
            .hardware_counter()
            .unwrap()
            .cpu_counter()
            .get()
            > 0
    );

    // The measured values are not needed beyond the assertion above.
    segment_query_context
        .take_hardware_counter()
        .discard_results();
}