colibri.qdrant / lib /segment /tests /integration /segment_on_disk_snapshot.rs
Gouzi Mohaled
Ajout du dossier lib
84d2a97
use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::sync::atomic::AtomicBool;
use common::cpu::CpuPermit;
use common::tar_ext;
use rstest::rstest;
use segment::data_types::index::{IntegerIndexParams, KeywordIndexParams};
use segment::data_types::vectors::{only_default_vector, DEFAULT_VECTOR_NAME};
use segment::entry::entry_point::SegmentEntry;
use segment::index::hnsw_index::num_rayon_threads;
use segment::json_path::JsonPath;
use segment::segment::Segment;
use segment::segment_constructor::segment_builder::SegmentBuilder;
use segment::segment_constructor::{build_segment, load_segment};
use segment::types::{
Distance, HnswConfig, Indexes, PayloadFieldSchema, PayloadSchemaParams, PayloadStorageType,
SegmentConfig, SnapshotFormat, VectorDataConfig, VectorStorageType,
};
use tempfile::Builder;
/// Snapshots a segment whose components are all stored on disk, restores the
/// snapshot and verifies that the restored segment matches the original.
///
/// The test proceeds in these steps:
/// 1. Build a small source segment (memory vector storage, plain index) with
///    two points, a payload on each point and two on-disk payload field indexes.
/// 2. Rebuild it through [`SegmentBuilder`] using mmap vector storage, an
///    on-disk HNSW index and on-disk payload storage.
/// 3. Take a snapshot of the rebuilt segment into a parent tar archive using
///    the snapshot `format` under test.
/// 4. Unpack the parent archive, check its layout, restore the segment snapshot
///    in place and load the restored segment.
/// 5. Compare point counts, vectors and payloads of both segments.
#[rstest]
#[case::regular(SnapshotFormat::Regular)]
#[case::streamable(SnapshotFormat::Streamable)]
fn test_on_disk_segment_snapshot(#[case] format: SnapshotFormat) {
    let _ = env_logger::builder().is_test(true).try_init();

    let data = r#"
    {
        "names": ["John Doe", "Bill Murray"],
        "ages": [43, 51],
        "metadata": {
            "height": 50,
            "width": 60
        }
    }"#;

    // Directory holding the source segment that is later fed to the builder.
    let source_segment_dir = Builder::new().prefix("segment_dir").tempdir().unwrap();

    let building_config = SegmentConfig {
        vector_data: HashMap::from([(
            DEFAULT_VECTOR_NAME.to_owned(),
            VectorDataConfig {
                size: 2,
                distance: Distance::Dot,
                storage_type: VectorStorageType::Memory,
                index: Indexes::Plain {},
                quantization_config: None,
                multivector_config: None,
                datatype: None,
            },
        )]),
        sparse_vector_data: Default::default(),
        payload_storage_type: Default::default(),
    };

    let mut source_segment =
        build_segment(source_segment_dir.path(), &building_config, true).unwrap();

    source_segment
        .upsert_point(0, 0.into(), only_default_vector(&[1.0, 1.0]))
        .unwrap();
    source_segment
        .upsert_point(1, 1.into(), only_default_vector(&[2.0, 2.0]))
        .unwrap();

    // Give *both* points a payload so that the payload round-trip check at the
    // end of the test covers every point, not just point 0.
    source_segment
        .set_full_payload(2, 0.into(), &serde_json::from_str(data).unwrap())
        .unwrap();
    source_segment
        .set_full_payload(3, 1.into(), &serde_json::from_str(data).unwrap())
        .unwrap();

    source_segment
        .create_field_index(
            4,
            &JsonPath::new("names"),
            Some(&PayloadFieldSchema::FieldParams(
                PayloadSchemaParams::Keyword(KeywordIndexParams {
                    r#type: segment::data_types::index::KeywordIndexType::Keyword,
                    is_tenant: None,
                    on_disk: Some(true),
                }),
            )),
        )
        .unwrap();
    source_segment
        .create_field_index(
            5,
            &JsonPath::new("ages"),
            Some(&PayloadFieldSchema::FieldParams(
                PayloadSchemaParams::Integer(IntegerIndexParams {
                    r#type: segment::data_types::index::IntegerIndexType::Integer,
                    lookup: Some(true),
                    range: Some(true),
                    is_principal: None,
                    on_disk: Some(true),
                }),
            )),
        )
        .unwrap();

    // Target configuration: every component is requested to live on disk.
    let segment_config = SegmentConfig {
        vector_data: HashMap::from([(
            DEFAULT_VECTOR_NAME.to_owned(),
            VectorDataConfig {
                size: 2,
                distance: Distance::Dot,
                storage_type: VectorStorageType::Mmap, // mmap vectors
                index: Indexes::Hnsw(HnswConfig {
                    m: 4,
                    ef_construct: 16,
                    full_scan_threshold: 8,
                    max_indexing_threads: 2,
                    on_disk: Some(true), // mmap index
                    payload_m: None,
                }),
                quantization_config: None,
                multivector_config: None,
                datatype: None,
            },
        )]),
        sparse_vector_data: Default::default(),
        payload_storage_type: PayloadStorageType::OnDisk, // on-disk payload
    };

    let segment_base_dir = Builder::new().prefix("segment_dir").tempdir().unwrap();
    let segment_builder_dir = Builder::new().prefix("segment_dir").tempdir().unwrap();

    let mut segment_builder = SegmentBuilder::new(
        segment_base_dir.path(),
        segment_builder_dir.path(),
        &segment_config,
    )
    .unwrap();
    segment_builder
        .update(&[&source_segment], &false.into())
        .unwrap();
    let segment = segment_builder
        .build(CpuPermit::dummy(num_rayon_threads(0) as u32), &false.into())
        .unwrap();

    let temp_dir = Builder::new().prefix("temp_dir").tempdir().unwrap();

    // The segment snapshot is a part of a parent collection/shard snapshot.
    let parent_snapshot_tar = Builder::new()
        .prefix("parent_snapshot")
        .suffix(".tar")
        .tempfile()
        .unwrap();

    // The snapshot entry is expected to be named after the segment directory.
    let segment_id = segment
        .current_path
        .file_stem()
        .and_then(|f| f.to_str())
        .unwrap();

    // snapshotting!
    let tar = tar_ext::BuilderExt::new_seekable_owned(File::create(&parent_snapshot_tar).unwrap());
    segment
        .take_snapshot(temp_dir.path(), &tar, format, &mut HashSet::new())
        .unwrap();
    tar.blocking_finish().unwrap();

    let parent_snapshot_unpacked = Builder::new().prefix("parent_snapshot").tempdir().unwrap();
    tar::Archive::new(File::open(&parent_snapshot_tar).unwrap())
        .unpack(parent_snapshot_unpacked.path())
        .unwrap();

    // Should be exactly one entry in the snapshot.
    let entry = single_dir_entry(parent_snapshot_unpacked.path());

    match format {
        SnapshotFormat::Ancient => unreachable!("The old days are gone"),
        SnapshotFormat::Regular => {
            assert_eq!(entry.file_name(), format!("{segment_id}.tar").as_str());
            assert!(entry.path().is_file());
        }
        SnapshotFormat::Streamable => {
            assert_eq!(entry.file_name(), segment_id);
            assert!(entry.path().is_dir());
        }
    }

    // restore snapshot
    Segment::restore_snapshot_in_place(&entry.path()).unwrap();

    // After restoring there should still be exactly one entry.
    let entry = single_dir_entry(parent_snapshot_unpacked.path());

    // It should be unpacked entry, not tar archive.
    assert!(entry.path().is_dir());
    assert_eq!(entry.file_name(), segment_id);

    let restored_segment = load_segment(&entry.path(), &AtomicBool::new(false))
        .unwrap()
        .unwrap();

    // validate restored snapshot is the same as original segment
    assert_eq!(
        segment.total_point_count(),
        restored_segment.total_point_count(),
    );
    assert_eq!(
        segment.available_point_count(),
        restored_segment.available_point_count(),
    );
    assert_eq!(
        segment.deleted_point_count(),
        restored_segment.deleted_point_count(),
    );

    for id in segment.iter_points() {
        let vectors = segment.all_vectors(id).unwrap();
        let restored_vectors = restored_segment.all_vectors(id).unwrap();
        assert_eq!(vectors, restored_vectors);

        let payload = segment.payload(id).unwrap();
        let restored_payload = restored_segment.payload(id).unwrap();
        assert_eq!(payload, restored_payload);
    }
}

/// Returns the only entry of `dir`.
///
/// Panics if the directory cannot be read, is empty, or contains more than
/// one entry.
fn single_dir_entry(dir: &std::path::Path) -> std::fs::DirEntry {
    let mut entries = dir.read_dir().unwrap();
    let entry = entries.next().unwrap().unwrap();
    assert!(entries.next().is_none());
    entry
}