use std::cmp;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::path::{Path, PathBuf};
use std::sync::atomic::AtomicBool;
use std::sync::Arc;

use ahash::AHasher;
use atomic_refcell::AtomicRefCell;
use bitvec::macros::internal::funty::Integral;
use common::cpu::CpuPermit;
use common::types::PointOffsetType;
use io::storage_version::StorageVersion;
use tempfile::TempDir;
use uuid::Uuid;

use super::{
    create_mutable_id_tracker, create_payload_storage, create_sparse_vector_index,
    create_sparse_vector_storage, create_vector_index, get_payload_index_path,
    get_vector_index_path, get_vector_storage_path, new_segment_path, open_segment_db,
    open_vector_storage,
};
use crate::common::error_logging::LogError;
use crate::common::operation_error::{check_process_stopped, OperationError, OperationResult};
use crate::entry::entry_point::SegmentEntry;
use crate::id_tracker::immutable_id_tracker::ImmutableIdTracker;
use crate::id_tracker::in_memory_id_tracker::InMemoryIdTracker;
use crate::id_tracker::{IdTracker, IdTrackerEnum};
use crate::index::field_index::FieldIndex;
use crate::index::sparse_index::sparse_vector_index::SparseVectorIndexOpenArgs;
use crate::index::struct_payload_index::StructPayloadIndex;
use crate::index::PayloadIndex;
use crate::payload_storage::payload_storage_enum::PayloadStorageEnum;
use crate::payload_storage::PayloadStorage;
use crate::segment::{Segment, SegmentVersion};
use crate::segment_constructor::load_segment;
use crate::types::{
    ExtendedPointId, PayloadFieldSchema, PayloadKeyType, SegmentConfig, SegmentState,
    SeqNumberType,
};
use crate::vector_storage::quantized::quantized_vectors::QuantizedVectors;
use crate::vector_storage::{VectorStorage, VectorStorageEnum};

/// Structure for constructing a segment out of several other segments
pub struct SegmentBuilder {
    version: SeqNumberType,
    id_tracker: IdTrackerEnum,
    payload_storage: PayloadStorageEnum,
    vector_storages: HashMap<String, VectorStorageEnum>,
    segment_config: SegmentConfig,

    // The path where the fully created segment will be moved
    destination_path: PathBuf,
    // The temporary segment directory
    temp_dir: TempDir,
    indexed_fields: HashMap<PayloadKeyType, PayloadFieldSchema>,

    // Payload keys to defragment data by
    defragment_keys: Vec<PayloadKeyType>,
}
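// Typical lifecycle of a builder, as an illustrative sketch (variable names and
// surrounding setup are assumptions, not part of this file):
//
//   let mut builder = SegmentBuilder::new(&segments_dir, &temp_dir, &config)?;
//   builder.update(&[&seg_a, &seg_b], &stopped)?;   // merge source segments
//   let segment = builder.build(permit, &stopped)?; // index, flush, move into place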
impl SegmentBuilder {
    pub fn new(
        segment_path: &Path,
        temp_dir: &Path,
        segment_config: &SegmentConfig,
    ) -> OperationResult<Self> {
        // When we build a new segment, it is empty at first,
        // so we can ignore the `stopped` flag
        let stopped = AtomicBool::new(false);

        let temp_dir = create_temp_dir(temp_dir)?;

        let database = open_segment_db(temp_dir.path(), segment_config)?;

        let id_tracker = if segment_config.is_appendable() {
            IdTrackerEnum::MutableIdTracker(create_mutable_id_tracker(database.clone())?)
        } else {
            IdTrackerEnum::InMemoryIdTracker(InMemoryIdTracker::new())
        };

        let payload_storage =
            create_payload_storage(database.clone(), segment_config, segment_path)?;

        let mut vector_storages = HashMap::new();

        for (vector_name, vector_config) in &segment_config.vector_data {
            let vector_storage_path = get_vector_storage_path(temp_dir.path(), vector_name);
            let vector_storage = open_vector_storage(
                &database,
                vector_config,
                &stopped,
                &vector_storage_path,
                vector_name,
            )?;

            vector_storages.insert(vector_name.to_owned(), vector_storage);
        }

        #[allow(clippy::for_kv_map)]
        for (vector_name, _sparse_vector_config) in &segment_config.sparse_vector_data {
            // `_sparse_vector_config` should be used once we are able to initialize storage
            // with different datatypes
            let vector_storage =
                create_sparse_vector_storage(database.clone(), vector_name, &stopped)?;
            vector_storages.insert(vector_name.to_owned(), vector_storage);
        }

        let destination_path = new_segment_path(segment_path);

        Ok(SegmentBuilder {
            version: Default::default(), // default version is 0
            id_tracker,
            payload_storage,
            vector_storages,
            segment_config: segment_config.clone(),

            destination_path,
            temp_dir,
            indexed_fields: Default::default(),
            defragment_keys: vec![],
        })
    }

    pub fn set_defragment_keys(&mut self, keys: Vec<PayloadKeyType>) {
        self.defragment_keys = keys;
    }

    pub fn remove_indexed_field(&mut self, field: &PayloadKeyType) {
        self.indexed_fields.remove(field);
    }

    pub fn add_indexed_field(&mut self, field: PayloadKeyType, schema: PayloadFieldSchema) {
        self.indexed_fields.insert(field, schema);
    }

    /// Get ordering value from the payload index
    ///
    /// The ordering value is used to sort points so that points with the same payload stay
    /// together. Under the assumption that such points are also queried together, this reduces
    /// the number of random disk reads.
    ///
    /// Note: This value doesn't guarantee strict ordering in ambiguous cases.
    /// It should only be used for optimization purposes, not for correctness.
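    ///
    /// Illustrative sketch (assumed index contents, not a doctest): two points that store the
    /// same value under the defragmentation key produce the same ordering value, so the sort in
    /// `update()` places them next to each other.
    ///
    /// ```ignore
    /// // point 0 and point 1 both store the integer 42 in the indexed field:
    /// let a = SegmentBuilder::_get_ordering_value(0, &indices);
    /// let b = SegmentBuilder::_get_ordering_value(1, &indices);
    /// assert_eq!(a, b); // identical payload values -> identical ordering
    /// ```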
    fn _get_ordering_value(internal_id: PointOffsetType, indices: &[FieldIndex]) -> u64 {
        let mut ordering = 0;
        for payload_index in indices {
            match payload_index {
                FieldIndex::IntMapIndex(index) => {
                    if let Some(numbers) = index.get_values(internal_id) {
                        for number in numbers {
                            ordering = ordering.wrapping_add(*number as u64);
                        }
                    }
                    break;
                }
                FieldIndex::KeywordIndex(index) => {
                    if let Some(keywords) = index.get_values(internal_id) {
                        for keyword in keywords {
                            let mut hasher = AHasher::default();
                            keyword.hash(&mut hasher);
                            ordering = ordering.wrapping_add(hasher.finish());
                        }
                    }
                    break;
                }
                FieldIndex::IntIndex(index) => {
                    if let Some(numbers) = index.get_values(internal_id) {
                        for number in numbers {
                            ordering = ordering.wrapping_add(number as u64);
                        }
                    }
                    break;
                }
                FieldIndex::FloatIndex(index) => {
                    if let Some(numbers) = index.get_values(internal_id) {
                        for number in numbers {
                            // Bit-level conversion of f64 to u64 preserves ordering
                            // (for positive numbers)
                            //
                            // 0.001 -> 4562254508917369340
                            // 0.01  -> 4576918229304087675
                            // 0.05  -> 4587366580439587226
                            // 0.1   -> 4591870180066957722
                            // 1     -> 4607182418800017408
                            // 2     -> 4611686018427387904
                            // 10    -> 4621819117588971520
                            ordering = ordering.wrapping_add(number.to_bits());
                        }
                    }
                    break;
                }
                FieldIndex::DatetimeIndex(index) => {
                    if let Some(dates) = index.get_values(internal_id) {
                        for date in dates {
                            ordering = ordering.wrapping_add(date as u64);
                        }
                    }
                    break;
                }
                FieldIndex::UuidMapIndex(index) => {
                    if let Some(ids) = index.get_values(internal_id) {
                        uuid_hash(&mut ordering, ids.copied());
                    }
                    break;
                }
                FieldIndex::UuidIndex(index) => {
                    if let Some(ids) = index.get_values(internal_id) {
                        uuid_hash(&mut ordering, ids);
                    }
                    break;
                }
                FieldIndex::GeoIndex(_) => {}
                FieldIndex::FullTextIndex(_) => {}
                FieldIndex::BinaryIndex(_) => {}
            }
        }
        ordering
    }

    /// Update the current segment builder with all (not deleted) vectors and payload from
    /// `segments`. Also defragments if the `defragment_keys` are set.
    /// However, only points passed in the same call get defragmented and grouped together.
    /// Therefore this function should only be called once, unless this behavior is desired.
    ///
    /// # Result
    ///
    /// * `bool` - if `true`, the data was successfully added; if `false`, the process was
    ///   interrupted
    ///
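    /// # Example (illustrative sketch; segments and key are assumptions)
    ///
    /// ```ignore
    /// builder.set_defragment_keys(vec![defragment_key]);
    /// // Pass all source segments in a single call, so that their points are
    /// // defragmented and grouped together:
    /// builder.update(&[&seg_a, &seg_b], &stopped)?;
    /// ```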
    pub fn update(&mut self, segments: &[&Segment], stopped: &AtomicBool) -> OperationResult<bool> {
        if segments.is_empty() {
            return Ok(true);
        }

        // For each external id, keep only the latest version seen across all source segments
        let mut merged_points: HashMap<ExtendedPointId, PositionedPointMetadata> = HashMap::new();

        for (segment_index, segment) in segments.iter().enumerate() {
            for external_id in segment.iter_points() {
                let version = segment.point_version(external_id).unwrap_or(0);
                merged_points
                    .entry(external_id)
                    .and_modify(|entry| {
                        if entry.version < version {
                            entry.segment_index = segment_index;
                            entry.version = version;
                        }
                    })
                    .or_insert_with(|| {
                        let internal_id = segment.get_internal_id(external_id).unwrap();
                        PositionedPointMetadata {
                            segment_index,
                            internal_id,
                            external_id,
                            version,
                            ordering: 0,
                        }
                    });
            }
        }

        let payloads: Vec<_> = segments.iter().map(|i| i.payload_index.borrow()).collect();

        let mut points_to_insert: Vec<_> = merged_points.into_values().collect();

        for defragment_key in &self.defragment_keys {
            for point_data in &mut points_to_insert {
                let Some(payload_indices) = payloads[point_data.segment_index]
                    .field_indexes
                    .get(defragment_key)
                else {
                    continue;
                };

                point_data.ordering = point_data.ordering.wrapping_add(Self::_get_ordering_value(
                    point_data.internal_id,
                    payload_indices,
                ));
            }
        }

        if !self.defragment_keys.is_empty() {
            points_to_insert.sort_unstable_by_key(|i| i.ordering);
        }

        let src_segment_max_version = segments.iter().map(|i| i.version()).max().unwrap();
        self.version = cmp::max(self.version, src_segment_max_version);

        let vector_storages: Vec<_> = segments.iter().map(|i| &i.vector_data).collect();

        let mut new_internal_range = None;
        for (vector_name, vector_storage) in &mut self.vector_storages {
            check_process_stopped(stopped)?;

            let other_vector_storages = vector_storages
                .iter()
                .map(|i| {
                    let other_vector_storage = i.get(vector_name).ok_or_else(|| {
                        OperationError::service_error(format!(
                            "Cannot update from other segment because of missing vector name {vector_name}"
                        ))
                    })?;
                    Ok(other_vector_storage.vector_storage.borrow())
                })
                .collect::<Result<Vec<_>, OperationError>>()?;

            let mut iter = points_to_insert.iter().map(|point_data| {
                let other_vector_storage = &other_vector_storages[point_data.segment_index];
                let vec = other_vector_storage.get_vector(point_data.internal_id);
                let vector_deleted = other_vector_storage.is_deleted_vector(point_data.internal_id);
                (vec, vector_deleted)
            });

            let internal_range = vector_storage.update_from(&mut iter, stopped)?;

            match &new_internal_range {
                Some(new_internal_range) => {
                    if new_internal_range != &internal_range {
                        return Err(OperationError::service_error(
                            "Internal ids range mismatch between self segment vectors and other segment vectors",
                        ));
                    }
                }
                None => new_internal_range = Some(internal_range),
            }
        }

        if let Some(new_internal_range) = new_internal_range {
            let internal_id_iter = new_internal_range.zip(points_to_insert.iter());

            for (new_internal_id, point_data) in internal_id_iter {
                check_process_stopped(stopped)?;

                let old_internal_id = point_data.internal_id;

                let other_payload =
                    payloads[point_data.segment_index].get_payload(old_internal_id)?;

                match self.id_tracker.internal_id(point_data.external_id) {
                    Some(existing_internal_id) => {
                        debug_assert!(
                            false,
                            "This code should not be reachable, because points were resolved with `merged_points`"
                        );

                        let existing_external_version = self
                            .id_tracker
                            .internal_version(existing_internal_id)
                            .unwrap();

                        let remove_id = if existing_external_version < point_data.version {
                            // Other version is the newest, remove the existing one and replace
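                            // Unlink the old mapping, link the external id to the new
                            // internal id, carry the version over, and clear the stale
                            // payload; the loser's vectors are deleted below.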
                            self.id_tracker.drop(point_data.external_id)?;
                            self.id_tracker
                                .set_link(point_data.external_id, new_internal_id)?;
                            self.id_tracker
                                .set_internal_version(new_internal_id, point_data.version)?;
                            self.payload_storage.clear(existing_internal_id)?;

                            existing_internal_id
                        } else {
                            // Old version is still good, do not move anything else
                            // Mark the newly added vector as removed
                            new_internal_id
                        };

                        for vector_storage in self.vector_storages.values_mut() {
                            vector_storage.delete_vector(remove_id)?;
                        }
                    }
                    None => {
                        self.id_tracker
                            .set_link(point_data.external_id, new_internal_id)?;
                        self.id_tracker
                            .set_internal_version(new_internal_id, point_data.version)?;
                    }
                }

                // Propagate payload to the new segment
                if !other_payload.is_empty() {
                    self.payload_storage.set(new_internal_id, &other_payload)?;
                }
            }
        }

        for payload in payloads {
            for (field, payload_schema) in payload.indexed_fields() {
                self.indexed_fields.insert(field, payload_schema);
            }
        }

        Ok(true)
    }

    pub fn build(self, permit: CpuPermit, stopped: &AtomicBool) -> Result<Segment, OperationError> {
        let (temp_dir, destination_path) = {
            let SegmentBuilder {
                version,
                id_tracker,
                payload_storage,
                mut vector_storages,
                segment_config,
                destination_path,
                temp_dir,
                indexed_fields,
                defragment_keys: _,
            } = self;

            let appendable_flag = segment_config.is_appendable();

            payload_storage.flusher()()?;
            let payload_storage_arc = Arc::new(AtomicRefCell::new(payload_storage));

            let id_tracker = match id_tracker {
                IdTrackerEnum::InMemoryIdTracker(in_memory_id_tracker) => {
                    // Convert the in-memory tracker into an immutable on-disk tracker
                    let (versions, mappings) = in_memory_id_tracker.into_internal();
                    let immutable_id_tracker =
                        ImmutableIdTracker::new(temp_dir.path(), &versions, mappings)?;
                    IdTrackerEnum::ImmutableIdTracker(immutable_id_tracker)
                }
                IdTrackerEnum::MutableIdTracker(_) => id_tracker,
                IdTrackerEnum::ImmutableIdTracker(_) => {
                    unreachable!("ImmutableIdTracker should not be used for building a segment")
                }
            };

            id_tracker.mapping_flusher()()?;
            id_tracker.versions_flusher()()?;
            let id_tracker_arc = Arc::new(AtomicRefCell::new(id_tracker));

            // Wrap the permit in an Arc to share it with each vector store
            let permit = Arc::new(permit);

            let mut quantized_vectors = Self::update_quantization(
                &segment_config,
                &vector_storages,
                temp_dir.path(),
                &permit,
                stopped,
            )?;

            let mut vector_storages_arc = HashMap::new();

            for vector_name in segment_config.vector_data.keys() {
                let Some(vector_storage) = vector_storages.remove(vector_name) else {
                    return Err(OperationError::service_error(format!(
                        "Vector storage for vector name {vector_name} not found on segment build"
                    )));
                };

                vector_storage.flusher()()?;

                let vector_storage_arc = Arc::new(AtomicRefCell::new(vector_storage));

                vector_storages_arc.insert(vector_name.to_owned(), vector_storage_arc);
            }

            for vector_name in segment_config.sparse_vector_data.keys() {
                let Some(vector_storage) = vector_storages.remove(vector_name) else {
                    return Err(OperationError::service_error(format!(
                        "Vector storage for vector name {vector_name} not found on sparse segment build"
                    )));
                };

                vector_storage.flusher()()?;

                let vector_storage_arc = Arc::new(AtomicRefCell::new(vector_storage));

                vector_storages_arc.insert(vector_name.to_owned(), vector_storage_arc);
            }

            let payload_index_path = get_payload_index_path(temp_dir.path());

            let mut payload_index = StructPayloadIndex::open(
                payload_storage_arc,
                id_tracker_arc.clone(),
                vector_storages_arc.clone(),
                &payload_index_path,
                appendable_flag,
            )?;
            for (field, payload_schema) in indexed_fields {
                payload_index.set_indexed(&field, payload_schema)?;
                check_process_stopped(stopped)?;
            }

            payload_index.flusher()()?;
            let payload_index_arc = Arc::new(AtomicRefCell::new(payload_index));
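            // Build the dense and sparse vector indexes against the flushed
            // storages and the freshly built payload index.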
            for (vector_name, vector_config) in &segment_config.vector_data {
                let vector_storage_arc = vector_storages_arc.remove(vector_name).unwrap();
                let vector_index_path = get_vector_index_path(temp_dir.path(), vector_name);
                let quantized_vectors = quantized_vectors.remove(vector_name);
                let quantized_vectors_arc = Arc::new(AtomicRefCell::new(quantized_vectors));
                create_vector_index(
                    vector_config,
                    &vector_index_path,
                    id_tracker_arc.clone(),
                    vector_storage_arc,
                    payload_index_arc.clone(),
                    quantized_vectors_arc,
                    Some(permit.clone()),
                    stopped,
                )?;
            }

            for (vector_name, sparse_vector_config) in &segment_config.sparse_vector_data {
                let vector_index_path = get_vector_index_path(temp_dir.path(), vector_name);

                let vector_storage_arc = vector_storages_arc.remove(vector_name).unwrap();

                create_sparse_vector_index(SparseVectorIndexOpenArgs {
                    config: sparse_vector_config.index,
                    id_tracker: id_tracker_arc.clone(),
                    vector_storage: vector_storage_arc.clone(),
                    payload_index: payload_index_arc.clone(),
                    path: &vector_index_path,
                    stopped,
                    tick_progress: || (),
                })?;
            }

            // We're done with CPU-intensive tasks, release the CPU permit
            debug_assert_eq!(
                Arc::strong_count(&permit),
                1,
                "Must release CPU permit Arc everywhere",
            );
            drop(permit);

            // Finalize the newly created segment by saving its config and version
            Segment::save_state(
                &SegmentState {
                    version: Some(version),
                    config: segment_config,
                },
                temp_dir.path(),
            )?;

            // After the version is saved, the segment can be loaded on restart
            SegmentVersion::save(temp_dir.path())?;
            // All temp data is evicted from RAM
            (temp_dir, destination_path)
        };

        // Move the fully constructed segment into the collection directory and load it back to RAM
        std::fs::rename(temp_dir.into_path(), &destination_path)
            .describe("Moving segment data after optimization")?;

        let loaded_segment = load_segment(&destination_path, stopped)?.ok_or_else(|| {
            OperationError::service_error(format!(
                "Segment loading error: {}",
                destination_path.display()
            ))
        })?;
        Ok(loaded_segment)
    }

    fn update_quantization(
        segment_config: &SegmentConfig,
        vector_storages: &HashMap<String, VectorStorageEnum>,
        temp_path: &Path,
        permit: &CpuPermit,
        stopped: &AtomicBool,
    ) -> OperationResult<HashMap<String, QuantizedVectors>> {
        let config = segment_config.clone();

        let mut quantized_vectors_map = HashMap::new();

        for (vector_name, vector_storage) in vector_storages {
            let Some(vector_config) = config.vector_data.get(vector_name) else {
                continue;
            };
            let is_appendable = vector_config.is_appendable();

            // Don't build quantization for appendable vectors
            if is_appendable {
                continue;
            }

            let max_threads = permit.num_cpus as usize;

            if let Some(quantization) = config.quantization_config(vector_name) {
                let segment_path = temp_path;

                check_process_stopped(stopped)?;

                let vector_storage_path = get_vector_storage_path(segment_path, vector_name);

                let quantized_vectors = QuantizedVectors::create(
                    vector_storage,
                    quantization,
                    &vector_storage_path,
                    max_threads,
                    stopped,
                )?;

                quantized_vectors_map.insert(vector_name.to_owned(), quantized_vectors);
            }
        }
        Ok(quantized_vectors_map)
    }
}
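/// Fold a sequence of UUIDs, given as `u128` values, into `hash`.
///
/// Timestamped UUID versions contribute their Gregorian timestamp so that points sort roughly
/// by creation time; all other versions contribute the raw halves of the `u128`, which still
/// groups equal UUIDs together.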
fn uuid_hash<I>(hash: &mut u64, ids: I)
where
    I: Iterator<Item = u128>,
{
    for id in ids {
        let uuid = Uuid::from_u128(id);

        // Not all UUID versions hold timestamp data. The most common version, v4 for example,
        // is completely random and can't be sorted. To still allow defragmentation, we assume
        // that usually the same version gets used for a payload key and implement an
        // alternative sorting criterion that just takes the UUID's bytes to group equal UUIDs
        // together.
        if let Some(timestamp) = uuid.get_timestamp() {
            *hash = hash.wrapping_add(timestamp.to_gregorian().0);
        } else {
            // First part of the u128
            *hash = hash.wrapping_add((id >> 64) as u64);
            // Second part of the u128
            *hash = hash.wrapping_add(id as u64);
        }
    }
}

fn create_temp_dir(parent_path: &Path) -> Result<TempDir, OperationError> {
    // Ensure the parent path exists
    std::fs::create_dir_all(parent_path)
        .and_then(|_| TempDir::with_prefix_in("segment_builder_", parent_path))
        .map_err(|err| {
            OperationError::service_error(format!(
                "Could not create temp directory in `{}`: {}",
                parent_path.display(),
                err
            ))
        })
}

/// Internal point ID and metadata of a point.
struct PositionedPointMetadata {
    segment_index: usize,
    internal_id: PointOffsetType,
    external_id: ExtendedPointId,
    version: SeqNumberType,
    ordering: u64,
}
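// A minimal sketch of a regression test for the property the defragmentation sort relies on:
// equal UUIDs fold to equal hash values. The module and test names are additions for
// illustration, and `Uuid::new_v4()` assumes the `v4` feature of the `uuid` crate is enabled.
#[cfg(test)]
mod defragment_hash_tests {
    use super::*;

    #[test]
    fn uuid_hash_groups_equal_uuids() {
        let id = Uuid::new_v4().as_u128();

        let mut a = 0u64;
        let mut b = 0u64;
        uuid_hash(&mut a, std::iter::once(id));
        uuid_hash(&mut b, std::iter::once(id));

        // Identical inputs must produce identical ordering hashes
        assert_eq!(a, b);
    }
}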