Spaces:
Build error
Build error
use std::collections::{HashMap, HashSet}; | |
use std::fs::File; | |
use std::sync::atomic::AtomicBool; | |
use common::cpu::CpuPermit; | |
use common::tar_ext; | |
use rstest::rstest; | |
use segment::data_types::index::{IntegerIndexParams, KeywordIndexParams}; | |
use segment::data_types::vectors::{only_default_vector, DEFAULT_VECTOR_NAME}; | |
use segment::entry::entry_point::SegmentEntry; | |
use segment::index::hnsw_index::num_rayon_threads; | |
use segment::json_path::JsonPath; | |
use segment::segment::Segment; | |
use segment::segment_constructor::segment_builder::SegmentBuilder; | |
use segment::segment_constructor::{build_segment, load_segment}; | |
use segment::types::{ | |
Distance, HnswConfig, Indexes, PayloadFieldSchema, PayloadSchemaParams, PayloadStorageType, | |
SegmentConfig, SnapshotFormat, VectorDataConfig, VectorStorageType, | |
}; | |
use tempfile::Builder; | |
/// This test tests snapshotting and restoring a segment with all on-disk components. | |
fn test_on_disk_segment_snapshot( format: SnapshotFormat) { | |
let _ = env_logger::builder().is_test(true).try_init(); | |
let data = r#" | |
{ | |
"names": ["John Doe", "Bill Murray"], | |
"ages": [43, 51], | |
"metadata": { | |
"height": 50, | |
"width": 60 | |
} | |
}"#; | |
let segment_builder_dir = Builder::new().prefix("segment_dir").tempdir().unwrap(); | |
let building_config = SegmentConfig { | |
vector_data: HashMap::from([( | |
DEFAULT_VECTOR_NAME.to_owned(), | |
VectorDataConfig { | |
size: 2, | |
distance: Distance::Dot, | |
storage_type: VectorStorageType::Memory, | |
index: Indexes::Plain {}, | |
quantization_config: None, | |
multivector_config: None, | |
datatype: None, | |
}, | |
)]), | |
sparse_vector_data: Default::default(), | |
payload_storage_type: Default::default(), | |
}; | |
let mut segment = build_segment(segment_builder_dir.path(), &building_config, true).unwrap(); | |
segment | |
.upsert_point(0, 0.into(), only_default_vector(&[1.0, 1.0])) | |
.unwrap(); | |
segment | |
.upsert_point(1, 1.into(), only_default_vector(&[2.0, 2.0])) | |
.unwrap(); | |
segment | |
.set_full_payload(2, 0.into(), &serde_json::from_str(data).unwrap()) | |
.unwrap(); | |
segment | |
.set_full_payload(3, 0.into(), &serde_json::from_str(data).unwrap()) | |
.unwrap(); | |
segment | |
.create_field_index( | |
4, | |
&JsonPath::new("names"), | |
Some(&PayloadFieldSchema::FieldParams( | |
PayloadSchemaParams::Keyword(KeywordIndexParams { | |
r#type: segment::data_types::index::KeywordIndexType::Keyword, | |
is_tenant: None, | |
on_disk: Some(true), | |
}), | |
)), | |
) | |
.unwrap(); | |
segment | |
.create_field_index( | |
5, | |
&JsonPath::new("ages"), | |
Some(&PayloadFieldSchema::FieldParams( | |
PayloadSchemaParams::Integer(IntegerIndexParams { | |
r#type: segment::data_types::index::IntegerIndexType::Integer, | |
lookup: Some(true), | |
range: Some(true), | |
is_principal: None, | |
on_disk: Some(true), | |
}), | |
)), | |
) | |
.unwrap(); | |
let segment_config = SegmentConfig { | |
vector_data: HashMap::from([( | |
DEFAULT_VECTOR_NAME.to_owned(), | |
VectorDataConfig { | |
size: 2, | |
distance: Distance::Dot, | |
storage_type: VectorStorageType::Mmap, // mmap vectors | |
index: Indexes::Hnsw(HnswConfig { | |
m: 4, | |
ef_construct: 16, | |
full_scan_threshold: 8, | |
max_indexing_threads: 2, | |
on_disk: Some(true), // mmap index | |
payload_m: None, | |
}), | |
quantization_config: None, | |
multivector_config: None, | |
datatype: None, | |
}, | |
)]), | |
sparse_vector_data: Default::default(), | |
payload_storage_type: PayloadStorageType::OnDisk, // on-disk payload | |
}; | |
let segment_base_dir = Builder::new().prefix("segment_dir").tempdir().unwrap(); | |
let segment_builder_dir = Builder::new().prefix("segment_dir").tempdir().unwrap(); | |
let mut segment_builder = SegmentBuilder::new( | |
segment_base_dir.path(), | |
segment_builder_dir.path(), | |
&segment_config, | |
) | |
.unwrap(); | |
segment_builder.update(&[&segment], &false.into()).unwrap(); | |
let segment = segment_builder | |
.build(CpuPermit::dummy(num_rayon_threads(0) as u32), &false.into()) | |
.unwrap(); | |
let temp_dir = Builder::new().prefix("temp_dir").tempdir().unwrap(); | |
// The segment snapshot is a part of a parent collection/shard snapshot. | |
let parent_snapshot_tar = Builder::new() | |
.prefix("parent_snapshot") | |
.suffix(".tar") | |
.tempfile() | |
.unwrap(); | |
let segment_id = segment | |
.current_path | |
.file_stem() | |
.and_then(|f| f.to_str()) | |
.unwrap(); | |
// snapshotting! | |
let tar = tar_ext::BuilderExt::new_seekable_owned(File::create(&parent_snapshot_tar).unwrap()); | |
segment | |
.take_snapshot(temp_dir.path(), &tar, format, &mut HashSet::new()) | |
.unwrap(); | |
tar.blocking_finish().unwrap(); | |
let parent_snapshot_unpacked = Builder::new().prefix("parent_snapshot").tempdir().unwrap(); | |
tar::Archive::new(File::open(&parent_snapshot_tar).unwrap()) | |
.unpack(parent_snapshot_unpacked.path()) | |
.unwrap(); | |
// Should be exactly one entry in the snapshot. | |
let mut entries = parent_snapshot_unpacked.path().read_dir().unwrap(); | |
let entry = entries.next().unwrap().unwrap(); | |
assert!(entries.next().is_none()); | |
match format { | |
SnapshotFormat::Ancient => unreachable!("The old days are gone"), | |
SnapshotFormat::Regular => { | |
assert_eq!(entry.file_name(), format!("{segment_id}.tar").as_str()); | |
assert!(entry.path().is_file()); | |
} | |
SnapshotFormat::Streamable => { | |
assert_eq!(entry.file_name(), segment_id); | |
assert!(entry.path().is_dir()); | |
} | |
} | |
// restore snapshot | |
Segment::restore_snapshot_in_place(&entry.path()).unwrap(); | |
// Should be exactly one entry in the snapshot. | |
let mut entries = parent_snapshot_unpacked.path().read_dir().unwrap(); | |
let entry = entries.next().unwrap().unwrap(); | |
assert!(entries.next().is_none()); | |
// It should be unpacked entry, not tar archive. | |
assert!(entry.path().is_dir()); | |
assert_eq!(entry.file_name(), segment_id); | |
let restored_segment = load_segment(&entry.path(), &AtomicBool::new(false)) | |
.unwrap() | |
.unwrap(); | |
// validate restored snapshot is the same as original segment | |
assert_eq!( | |
segment.total_point_count(), | |
restored_segment.total_point_count(), | |
); | |
assert_eq!( | |
segment.available_point_count(), | |
restored_segment.available_point_count(), | |
); | |
assert_eq!( | |
segment.deleted_point_count(), | |
restored_segment.deleted_point_count(), | |
); | |
for id in segment.iter_points() { | |
let vectors = segment.all_vectors(id).unwrap(); | |
let restored_vectors = restored_segment.all_vectors(id).unwrap(); | |
assert_eq!(vectors, restored_vectors); | |
let payload = segment.payload(id).unwrap(); | |
let restored_payload = restored_segment.payload(id).unwrap(); | |
assert_eq!(payload, restored_payload); | |
} | |
} | |