colibri.qdrant / lib /segment /tests /integration /segment_builder_test.rs
Gouzi Mohaled
Ajout du dossier lib
84d2a97
use std::collections::HashMap;
use std::str::FromStr;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::time::{Duration, Instant};
use common::cpu::CpuPermit;
use itertools::Itertools;
use segment::common::operation_error::OperationError;
use segment::data_types::named_vectors::NamedVectors;
use segment::data_types::vectors::{only_default_vector, VectorRef, DEFAULT_VECTOR_NAME};
use segment::entry::entry_point::SegmentEntry;
use segment::index::hnsw_index::num_rayon_threads;
use segment::json_path::JsonPath;
use segment::segment::Segment;
use segment::segment_constructor::segment_builder::SegmentBuilder;
use segment::types::{
Indexes, PayloadContainer, PayloadKeyType, SegmentConfig, VectorDataConfig, VectorStorageType,
};
use serde_json::Value;
use sparse::common::sparse_vector::SparseVector;
use tempfile::Builder;
use crate::fixtures::segment::{
build_segment_1, build_segment_2, build_segment_sparse_1, build_segment_sparse_2,
empty_segment, PAYLOAD_KEY,
};
#[test]
fn test_building_new_segment() {
let dir = Builder::new().prefix("segment_dir").tempdir().unwrap();
let temp_dir = Builder::new().prefix("segment_temp_dir").tempdir().unwrap();
let stopped = AtomicBool::new(false);
let segment1 = build_segment_1(dir.path());
let mut segment2 = build_segment_2(dir.path());
let mut builder =
SegmentBuilder::new(dir.path(), temp_dir.path(), &segment1.segment_config).unwrap();
// Include overlapping with segment1 to check the
segment2
.upsert_point(100, 3.into(), only_default_vector(&[0., 0., 0., 0.]))
.unwrap();
builder
.update(&[&segment1, &segment2, &segment2], &stopped)
.unwrap();
// Check what happens if segment building fails here
let segment_count = dir.path().read_dir().unwrap().count();
assert_eq!(segment_count, 2);
let temp_segment_count = temp_dir.path().read_dir().unwrap().count();
assert_eq!(temp_segment_count, 1);
// Now we finalize building
let permit_cpu_count = num_rayon_threads(0);
let permit = CpuPermit::dummy(permit_cpu_count as u32);
let merged_segment: Segment = builder.build(permit, &stopped).unwrap();
let new_segment_count = dir.path().read_dir().unwrap().count();
assert_eq!(new_segment_count, 3);
assert_eq!(
merged_segment.iter_points().count(),
merged_segment.available_point_count(),
);
assert_eq!(
merged_segment.available_point_count(),
segment1
.iter_points()
.chain(segment2.iter_points())
.unique()
.count(),
);
assert_eq!(merged_segment.point_version(3.into()), Some(100));
}
#[test]
fn test_building_new_defragmented_segment() {
let dir = Builder::new().prefix("segment_dir").tempdir().unwrap();
let temp_dir = Builder::new().prefix("segment_temp_dir").tempdir().unwrap();
let stopped = AtomicBool::new(false);
let defragment_key = JsonPath::from_str(PAYLOAD_KEY).unwrap();
let mut segment1 = build_segment_1(dir.path());
segment1
.create_field_index(7, &defragment_key, None)
.unwrap();
let mut segment2 = build_segment_2(dir.path());
segment2
.create_field_index(17, &defragment_key, None)
.unwrap();
let mut builder =
SegmentBuilder::new(dir.path(), temp_dir.path(), &segment1.segment_config).unwrap();
// Include overlapping with segment1 to check the
segment2
.upsert_point(100, 3.into(), only_default_vector(&[0., 0., 0., 0.]))
.unwrap();
builder.set_defragment_keys(vec![defragment_key.clone()]);
builder.update(&[&segment1, &segment2], &stopped).unwrap();
// Check what happens if segment building fails here
let segment_count = dir.path().read_dir().unwrap().count();
assert_eq!(segment_count, 2);
let temp_segment_count = temp_dir.path().read_dir().unwrap().count();
assert_eq!(temp_segment_count, 1);
// Now we finalize building
let permit_cpu_count = num_rayon_threads(0);
let permit = CpuPermit::dummy(permit_cpu_count as u32);
let merged_segment: Segment = builder.build(permit, &stopped).unwrap();
let new_segment_count = dir.path().read_dir().unwrap().count();
assert_eq!(new_segment_count, 3);
assert_eq!(
merged_segment.iter_points().count(),
merged_segment.available_point_count(),
);
assert_eq!(
merged_segment.available_point_count(),
segment1
.iter_points()
.chain(segment2.iter_points())
.unique()
.count(),
);
assert_eq!(merged_segment.point_version(3.into()), Some(100));
if let Err(err) = check_points_defragmented(&merged_segment, &defragment_key) {
panic!("{err}");
}
}
/// Iterates over the internal point ids of the merged segment and checks that the
/// points are grouped by the payload value.
fn check_points_defragmented(
segment: &Segment,
defragment_key: &PayloadKeyType,
) -> Result<(), &'static str> {
let id_tracker = segment.id_tracker.borrow();
// Previously seen group/value.
let mut previous_value: Option<Value> = None;
// keeps track of groups/values that have already been seen while iterating
let mut seen_values: Vec<Value> = vec![];
for internal_id in id_tracker.iter_internal() {
let external_id = id_tracker.external_id(internal_id).unwrap();
let payload = segment.payload(external_id).unwrap();
let values = payload.get_value(defragment_key);
if values.is_empty() {
if !seen_values.is_empty() {
return Err(
"In a defragmented segment, points without a payload value should come first!",
);
}
continue;
}
let value = values[0].clone();
let Some(prev) = previous_value.as_ref() else {
previous_value = Some(value);
continue;
};
if *prev == value {
continue;
}
if seen_values.contains(&value) {
return Err("Segment not defragmented");
}
seen_values.push(value.clone());
previous_value = Some(value);
}
Ok(())
}
#[test]
fn test_building_new_sparse_segment() {
let dir = Builder::new().prefix("segment_dir").tempdir().unwrap();
let temp_dir = Builder::new().prefix("segment_temp_dir").tempdir().unwrap();
let stopped = AtomicBool::new(false);
let segment1 = build_segment_sparse_1(dir.path());
let mut segment2 = build_segment_sparse_2(dir.path());
let mut builder =
SegmentBuilder::new(dir.path(), temp_dir.path(), &segment1.segment_config).unwrap();
// Include overlapping with segment1 to check the
let vec = SparseVector::new(vec![0, 1, 2, 3], vec![0.0, 0.0, 0.0, 0.0]).unwrap();
segment2
.upsert_point(
100,
3.into(),
NamedVectors::from_ref("sparse", VectorRef::Sparse(&vec)),
)
.unwrap();
builder
.update(&[&segment1, &segment2, &segment2], &stopped)
.unwrap();
// Check what happens if segment building fails here
let segment_count = dir.path().read_dir().unwrap().count();
assert_eq!(segment_count, 2);
let temp_segment_count = temp_dir.path().read_dir().unwrap().count();
assert_eq!(temp_segment_count, 1);
// Now we finalize building
let permit_cpu_count = num_rayon_threads(0);
let permit = CpuPermit::dummy(permit_cpu_count as u32);
let merged_segment: Segment = builder.build(permit, &stopped).unwrap();
let new_segment_count = dir.path().read_dir().unwrap().count();
assert_eq!(new_segment_count, 3);
assert_eq!(
merged_segment.iter_points().count(),
merged_segment.available_point_count(),
);
assert_eq!(
merged_segment.available_point_count(),
segment1
.iter_points()
.chain(segment2.iter_points())
.unique()
.count(),
);
assert_eq!(merged_segment.point_version(3.into()), Some(100));
}
fn estimate_build_time(segment: &Segment, stop_delay_millis: Option<u64>) -> (u64, bool) {
let stopped = Arc::new(AtomicBool::new(false));
let dir = Builder::new().prefix("segment_dir1").tempdir().unwrap();
let temp_dir = Builder::new().prefix("segment_temp_dir").tempdir().unwrap();
let segment_config = SegmentConfig {
vector_data: HashMap::from([(
DEFAULT_VECTOR_NAME.to_owned(),
VectorDataConfig {
size: segment.segment_config.vector_data[DEFAULT_VECTOR_NAME].size,
distance: segment.segment_config.vector_data[DEFAULT_VECTOR_NAME].distance,
storage_type: VectorStorageType::Memory,
index: Indexes::Hnsw(Default::default()),
quantization_config: None,
multivector_config: None,
datatype: None,
},
)]),
sparse_vector_data: Default::default(),
payload_storage_type: Default::default(),
};
let mut builder = SegmentBuilder::new(dir.path(), temp_dir.path(), &segment_config).unwrap();
builder.update(&[segment], &stopped).unwrap();
let now = Instant::now();
if let Some(stop_delay_millis) = stop_delay_millis {
let stopped_t = stopped.clone();
std::thread::Builder::new()
.name("build_estimator_timeout".to_string())
.spawn(move || {
std::thread::sleep(Duration::from_millis(stop_delay_millis));
stopped_t.store(true, Ordering::Release);
})
.unwrap();
}
let permit_cpu_count = num_rayon_threads(0);
let permit = CpuPermit::dummy(permit_cpu_count as u32);
let res = builder.build(permit, &stopped);
let is_cancelled = match res {
Ok(_) => false,
Err(OperationError::Cancelled { .. }) => true,
Err(err) => {
eprintln!("Was expecting cancellation signal but got unexpected error: {err:?}");
false
}
};
(now.elapsed().as_millis() as u64, is_cancelled)
}
#[test]
fn test_building_cancellation() {
let baseline_dir = Builder::new()
.prefix("segment_dir_baseline")
.tempdir()
.unwrap();
let dir = Builder::new().prefix("segment_dir").tempdir().unwrap();
let dir_2 = Builder::new().prefix("segment_dir_2").tempdir().unwrap();
let mut baseline_segment = empty_segment(baseline_dir.path());
let mut segment = empty_segment(dir.path());
let mut segment_2 = empty_segment(dir_2.path());
for idx in 0..2000 {
baseline_segment
.upsert_point(1, idx.into(), only_default_vector(&[0., 0., 0., 0.]))
.unwrap();
segment
.upsert_point(1, idx.into(), only_default_vector(&[0., 0., 0., 0.]))
.unwrap();
segment_2
.upsert_point(1, idx.into(), only_default_vector(&[0., 0., 0., 0.]))
.unwrap();
}
// Get normal build time
let (time_baseline, was_cancelled_baseline) = estimate_build_time(&baseline_segment, None);
assert!(!was_cancelled_baseline);
eprintln!("baseline time: {time_baseline}");
// Checks that optimization with longer cancellation delay will also finish fast
let early_stop_delay = time_baseline / 20;
let (time_fast, was_cancelled_early) = estimate_build_time(&segment, Some(early_stop_delay));
let late_stop_delay = time_baseline / 5;
let (time_long, was_cancelled_later) = estimate_build_time(&segment_2, Some(late_stop_delay));
let acceptable_stopping_delay = 600; // millis
assert!(was_cancelled_early);
assert!(
time_fast < early_stop_delay + acceptable_stopping_delay,
"time_early: {time_fast}, early_stop_delay: {early_stop_delay}"
);
assert!(was_cancelled_later);
assert!(
time_long < late_stop_delay + acceptable_stopping_delay,
"time_later: {time_long}, late_stop_delay: {late_stop_delay}"
);
assert!(
time_fast < time_long,
"time_early: {time_fast}, time_later: {time_long}, was_cancelled_later: {was_cancelled_later}",
);
}