use std::borrow::Cow; use std::collections::HashSet; use std::path::{Path, PathBuf}; use std::sync::Arc; use parking_lot::Mutex; use segment::common::operation_time_statistics::OperationDurationsAggregator; use segment::index::sparse_index::sparse_index_config::SparseIndexType; use segment::types::{HnswConfig, Indexes, QuantizationConfig, SegmentType}; use crate::collection_manager::holders::segment_holder::{LockedSegmentHolder, SegmentId}; use crate::collection_manager::optimizers::segment_optimizer::{ OptimizerThresholds, SegmentOptimizer, }; use crate::config::CollectionParams; use crate::operations::config_diff::DiffConfig; /// Looks for segments having a mismatch between configured and actual parameters /// /// For example, a user may change the HNSW parameters for a collection. A segment that was already /// indexed with different parameters now has a mismatch. This segment should be optimized (and /// indexed) again in order to update the effective configuration. pub struct ConfigMismatchOptimizer { thresholds_config: OptimizerThresholds, segments_path: PathBuf, collection_temp_dir: PathBuf, collection_params: CollectionParams, hnsw_config: HnswConfig, quantization_config: Option, telemetry_durations_aggregator: Arc>, } impl ConfigMismatchOptimizer { pub fn new( thresholds_config: OptimizerThresholds, segments_path: PathBuf, collection_temp_dir: PathBuf, collection_params: CollectionParams, hnsw_config: HnswConfig, quantization_config: Option, ) -> Self { ConfigMismatchOptimizer { thresholds_config, segments_path, collection_temp_dir, collection_params, hnsw_config, quantization_config, telemetry_durations_aggregator: OperationDurationsAggregator::new(), } } /// Check if current configuration requires vectors to be stored on disk fn check_if_vectors_on_disk(&self, vector_name: &str) -> Option { self.collection_params .vectors .get_params(vector_name) .and_then(|vector_params| vector_params.on_disk) } /// Check if current configuration requires sparse vectors index to be stored on disk fn check_if_sparse_vectors_index_on_disk(&self, vector_name: &str) -> Option { self.collection_params .sparse_vectors .as_ref() .and_then(|vector_params| vector_params.get(vector_name)) .and_then(|params| params.index) .and_then(|index| index.on_disk) } /// Calculates and HNSW config that should be used for a given vector /// with current configuration. /// /// Takes vector-specific HNSW config (if any) and merges it with the collection-wide config. fn get_required_hnsw_config(&self, vector_name: &str) -> Cow { let target_hnsw_collection = &self.hnsw_config; // Select vector specific target HNSW config let target_hnsw_vector = self .collection_params .vectors .get_params(vector_name) .and_then(|vector_params| vector_params.hnsw_config) .map(|vector_hnsw| vector_hnsw.update(target_hnsw_collection)) .and_then(|hnsw| match hnsw { Ok(hnsw) => Some(hnsw), Err(err) => { log::warn!( "Failed to merge collection and vector HNSW config, ignoring: {err}" ); None } }); match target_hnsw_vector { Some(target_hnsw) => Cow::Owned(target_hnsw), None => Cow::Borrowed(target_hnsw_collection), } } fn worst_segment( &self, segments: LockedSegmentHolder, excluded_ids: &HashSet, ) -> Vec { let segments_read_guard = segments.read(); let candidates: Vec<_> = segments_read_guard .iter() // Excluded externally, might already be scheduled for optimization .filter(|(idx, _)| !excluded_ids.contains(idx)) .filter_map(|(idx, segment)| { let segment_entry = segment.get(); let read_segment = segment_entry.read(); let vector_size = read_segment .max_available_vectors_size_in_bytes() .unwrap_or_default(); let segment_config = read_segment.config(); if read_segment.segment_type() == SegmentType::Special { return None; // Never optimize already optimized segment } if self.collection_params.on_disk_payload != segment_config.payload_storage_type.is_on_disk() { return Some((*idx, vector_size)); // Skip segments with payload mismatch } // Determine whether dense data in segment has mismatch let dense_has_mismatch = segment_config .vector_data .iter() .any(|(vector_name, vector_data)| { // Check HNSW mismatch match &vector_data.index { Indexes::Plain {} => {} Indexes::Hnsw(effective_hnsw) => { // Select segment if we have an HNSW mismatch that requires rebuild let target_hnsw = self.get_required_hnsw_config(vector_name); if effective_hnsw.mismatch_requires_rebuild(&target_hnsw) { return true; } } } if let Some(is_required_on_disk) = self.check_if_vectors_on_disk(vector_name) { if is_required_on_disk != vector_data.storage_type.is_on_disk() { return true; } } // Check quantization mismatch let target_quantization_collection = self.quantization_config.as_ref(); let target_quantization_vector = self .collection_params .vectors .get_params(vector_name) .and_then(|vector_params| { vector_params.quantization_config.clone() }); let target_quantization = target_quantization_vector .as_ref() .or(target_quantization_collection); let quantization_mismatch = vector_data .quantization_config .as_ref() .zip(target_quantization) // Rebuild if current parameters differ from target parameters .map(|(current, target)| current.mismatch_requires_rebuild(target)) // Or rebuild if we now change the enabled state on an indexed segment .unwrap_or_else(|| { vector_data.index.is_indexed() && (vector_data.quantization_config.is_some() != target_quantization.is_some()) }); quantization_mismatch }); // Determine whether dense data in segment has mismatch let sparse_has_mismatch = segment_config .sparse_vector_data .iter() .any(|(vector_name, vector_data)| { let Some(is_required_on_disk) = self.check_if_sparse_vectors_index_on_disk(vector_name) else { return false; // Do nothing if not specified }; match vector_data.index.index_type { SparseIndexType::MutableRam => false, // Do nothing for mutable RAM SparseIndexType::ImmutableRam => is_required_on_disk, // Rebuild if we require on disk SparseIndexType::Mmap => !is_required_on_disk, // Rebuild if we require in RAM } }); (sparse_has_mismatch || dense_has_mismatch).then_some((*idx, vector_size)) }) .collect(); // Select segment with largest vector size candidates .into_iter() .max_by_key(|(_, vector_size)| *vector_size) .map(|(segment_id, _)| segment_id) .into_iter() .collect() } } impl SegmentOptimizer for ConfigMismatchOptimizer { fn name(&self) -> &str { "config mismatch" } fn segments_path(&self) -> &Path { self.segments_path.as_path() } fn temp_path(&self) -> &Path { self.collection_temp_dir.as_path() } fn collection_params(&self) -> CollectionParams { self.collection_params.clone() } fn hnsw_config(&self) -> &HnswConfig { &self.hnsw_config } fn quantization_config(&self) -> Option { self.quantization_config.clone() } fn threshold_config(&self) -> &OptimizerThresholds { &self.thresholds_config } fn check_condition( &self, segments: LockedSegmentHolder, excluded_ids: &HashSet, ) -> Vec { self.worst_segment(segments, excluded_ids) } fn get_telemetry_counter(&self) -> &Mutex { &self.telemetry_durations_aggregator } } #[cfg(test)] mod tests { use std::collections::BTreeMap; use std::sync::Arc; use common::cpu::CpuPermit; use parking_lot::RwLock; use segment::entry::entry_point::SegmentEntry; use segment::index::hnsw_index::num_rayon_threads; use segment::types::{ CompressionRatio, Distance, ProductQuantization, ProductQuantizationConfig, ScalarQuantizationConfig, ScalarType, }; use tempfile::Builder; use super::*; use crate::collection_manager::fixtures::{random_multi_vec_segment, random_segment}; use crate::collection_manager::holders::segment_holder::{LockedSegment, SegmentHolder}; use crate::collection_manager::optimizers::indexing_optimizer::IndexingOptimizer; use crate::operations::config_diff::HnswConfigDiff; use crate::operations::types::VectorsConfig; use crate::operations::vector_params_builder::VectorParamsBuilder; /// This test the config mismatch optimizer for a changed HNSW config /// /// It tests whether: /// - the condition check for HNSW mismatches works /// - optimized segments (and vector storages) use the updated configuration /// /// In short, this is what happens in this test: /// - create randomized segment as base /// - use indexing optimizer to build index for our segment /// - test config mismatch condition: should not trigger yet /// - change collection HNSW config /// - test config mismatch condition: should trigger due to HNSW change /// - optimize segment with config mismatch optimizer /// - assert segment uses changed configuration #[test] fn test_hnsw_config_mismatch() { // Collection configuration let (point_count, dim) = (1000, 10); let thresholds_config = OptimizerThresholds { max_segment_size_kb: usize::MAX, memmap_threshold_kb: usize::MAX, indexing_threshold_kb: 10, }; let collection_params = CollectionParams { vectors: VectorsConfig::Single( VectorParamsBuilder::new(dim as u64, Distance::Dot).build(), ), ..CollectionParams::empty() }; // Base segment let temp_dir = Builder::new().prefix("segment_temp_dir").tempdir().unwrap(); let dir = Builder::new().prefix("segment_dir").tempdir().unwrap(); let mut holder = SegmentHolder::default(); let segment = random_segment(dir.path(), 100, point_count, dim as usize); let segment_id = holder.add_new(segment); let locked_holder: Arc> = Arc::new(RwLock::new(holder)); let hnsw_config = HnswConfig { m: 16, ef_construct: 100, full_scan_threshold: 10, max_indexing_threads: 0, on_disk: None, payload_m: None, }; // Optimizers used in test let index_optimizer = IndexingOptimizer::new( 2, thresholds_config, dir.path().to_owned(), temp_dir.path().to_owned(), collection_params.clone(), hnsw_config.clone(), Default::default(), ); let mut config_mismatch_optimizer = ConfigMismatchOptimizer::new( thresholds_config, dir.path().to_owned(), temp_dir.path().to_owned(), collection_params, hnsw_config.clone(), Default::default(), ); let permit_cpu_count = num_rayon_threads(hnsw_config.max_indexing_threads); let permit = CpuPermit::dummy(permit_cpu_count as u32); // Use indexing optimizer to build index for HNSW mismatch test let changed = index_optimizer .optimize( locked_holder.clone(), vec![segment_id], permit, &false.into(), ) .unwrap(); assert!(changed > 0, "optimizer should have rebuilt this segment"); assert!( locked_holder.read().get(segment_id).is_none(), "optimized segment should be gone", ); assert_eq!(locked_holder.read().len(), 2, "index must be built"); // Mismatch optimizer should not optimize yet, HNSW config is not changed yet let suggested_to_optimize = config_mismatch_optimizer.check_condition(locked_holder.clone(), &Default::default()); assert_eq!(suggested_to_optimize.len(), 0); // Create changed HNSW config with other m/ef_construct value, update it in the optimizer let mut changed_hnsw_config = hnsw_config; changed_hnsw_config.m /= 2; changed_hnsw_config.ef_construct /= 5; config_mismatch_optimizer.hnsw_config = changed_hnsw_config.clone(); // Run mismatch optimizer again, make sure it optimizes now let permit = CpuPermit::dummy(permit_cpu_count as u32); let suggested_to_optimize = config_mismatch_optimizer.check_condition(locked_holder.clone(), &Default::default()); assert_eq!(suggested_to_optimize.len(), 1); let changed = config_mismatch_optimizer .optimize( locked_holder.clone(), suggested_to_optimize, permit, &false.into(), ) .unwrap(); assert!(changed > 0, "optimizer should have rebuilt this segment"); // Ensure new segment has changed HNSW config locked_holder .read() .iter() .map(|(_, segment)| match segment { LockedSegment::Original(s) => s.read(), LockedSegment::Proxy(_) => unreachable!(), }) .filter(|segment| segment.total_point_count() > 0) .for_each(|segment| { assert_eq!( segment.config().vector_data[""].index, Indexes::Hnsw(changed_hnsw_config.clone()), "segment must be optimized with changed HNSW config", ); }); } /// This test the config mismatch optimizer for a changed vector specific HNSW config /// /// Similar to `test_hnsw_config_mismatch` but for multi vector segment with a vector specific /// change. /// /// It tests whether: /// - the condition check for HNSW mismatches works for a vector specific change /// - optimized segments (and vector storages) use the updated configuration /// /// In short, this is what happens in this test: /// - create randomized multi segment as base /// - use indexing optimizer to build index for our segment /// - test config mismatch condition: should not trigger yet /// - change HNSW config for vector2 /// - test config mismatch condition: should trigger due to HNSW change /// - optimize segment with config mismatch optimizer /// - assert segment uses changed configuration #[test] fn test_hnsw_config_mismatch_vector_specific() { // Collection configuration let (point_count, vector1_dim, vector2_dim) = (1000, 10, 20); let thresholds_config = OptimizerThresholds { max_segment_size_kb: usize::MAX, memmap_threshold_kb: usize::MAX, indexing_threshold_kb: 10, }; let hnsw_config_vector1 = HnswConfigDiff { m: Some(10), ef_construct: Some(40), on_disk: Some(true), ..Default::default() }; let collection_params = CollectionParams { vectors: VectorsConfig::Multi(BTreeMap::from([ ( "vector1".into(), VectorParamsBuilder::new(vector1_dim as u64, Distance::Dot) .with_hnsw_config(hnsw_config_vector1) .build(), ), ( "vector2".into(), VectorParamsBuilder::new(vector2_dim as u64, Distance::Dot).build(), ), ])), ..CollectionParams::empty() }; // Base segment let temp_dir = Builder::new().prefix("segment_temp_dir").tempdir().unwrap(); let dir = Builder::new().prefix("segment_dir").tempdir().unwrap(); let mut holder = SegmentHolder::default(); let segment = random_multi_vec_segment( dir.path(), 100, point_count, vector1_dim as usize, vector2_dim as usize, ); let segment_id = holder.add_new(segment); let locked_holder: Arc> = Arc::new(RwLock::new(holder)); let hnsw_config_collection = HnswConfig { m: 16, ef_construct: 100, full_scan_threshold: 10, max_indexing_threads: 0, on_disk: None, payload_m: None, }; let permit_cpu_count = num_rayon_threads(hnsw_config_collection.max_indexing_threads); let permit = CpuPermit::dummy(permit_cpu_count as u32); // Optimizers used in test let index_optimizer = IndexingOptimizer::new( 2, thresholds_config, dir.path().to_owned(), temp_dir.path().to_owned(), collection_params.clone(), hnsw_config_collection.clone(), Default::default(), ); let mut config_mismatch_optimizer = ConfigMismatchOptimizer::new( thresholds_config, dir.path().to_owned(), temp_dir.path().to_owned(), collection_params, hnsw_config_collection.clone(), Default::default(), ); // Use indexing optimizer to build index for HNSW mismatch test let changed = index_optimizer .optimize( locked_holder.clone(), vec![segment_id], permit, &false.into(), ) .unwrap(); assert!(changed > 0, "optimizer should have rebuilt this segment"); assert!( locked_holder.read().get(segment_id).is_none(), "optimized segment should be gone", ); assert_eq!(locked_holder.read().len(), 2, "index must be built"); // Mismatch optimizer should not optimize yet, HNSW config is not changed yet let suggested_to_optimize = config_mismatch_optimizer.check_condition(locked_holder.clone(), &Default::default()); assert_eq!(suggested_to_optimize.len(), 0); // Create changed HNSW config for vector2, update it in the optimizer let mut hnsw_config_vector2 = hnsw_config_vector1; hnsw_config_vector2.m = hnsw_config_vector1.m.map(|m| m / 2); hnsw_config_vector2.ef_construct = None; match config_mismatch_optimizer.collection_params.vectors { VectorsConfig::Single(_) => unreachable!(), VectorsConfig::Multi(ref mut map) => { map.get_mut("vector2") .unwrap() .hnsw_config .replace(hnsw_config_vector2); } } // Run mismatch optimizer again, make sure it optimizes now let permit = CpuPermit::dummy(permit_cpu_count as u32); let suggested_to_optimize = config_mismatch_optimizer.check_condition(locked_holder.clone(), &Default::default()); assert_eq!(suggested_to_optimize.len(), 1); let changed = config_mismatch_optimizer .optimize( locked_holder.clone(), suggested_to_optimize, permit, &false.into(), ) .unwrap(); assert!(changed > 0, "optimizer should have rebuilt this segment"); // Ensure new segment has changed HNSW config locked_holder .read() .iter() .map(|(_, segment)| match segment { LockedSegment::Original(s) => s.read(), LockedSegment::Proxy(_) => unreachable!(), }) .filter(|segment| segment.total_point_count() > 0) .for_each(|segment| { assert_eq!( segment.config().vector_data["vector1"].index, Indexes::Hnsw(hnsw_config_vector1.update(&hnsw_config_collection).unwrap()), "HNSW config of vector1 is not what we expect", ); assert_eq!( segment.config().vector_data["vector2"].index, Indexes::Hnsw(hnsw_config_vector2.update(&hnsw_config_collection).unwrap()), "HNSW config of vector2 is not what we expect", ); }); } /// This test the config mismatch optimizer for a changed vector specific HNSW config /// /// Similar to `test_hnsw_config_mismatch` but for multi vector segment with a vector specific /// change. /// /// It tests whether: /// - the condition check for HNSW mismatches works for a vector specific change /// - optimized segments (and vector storages) use the updated configuration /// /// In short, this is what happens in this test: /// - create randomized multi segment as base /// - use indexing optimizer to build index for our segment /// - test config mismatch condition: should not trigger yet /// - change HNSW config for vector2 /// - test config mismatch condition: should trigger due to HNSW change /// - optimize segment with config mismatch optimizer /// - assert segment uses changed configuration #[test] fn test_quantization_config_mismatch_vector_specific() { // Collection configuration let (point_count, vector1_dim, vector2_dim) = (1000, 10, 20); let thresholds_config = OptimizerThresholds { max_segment_size_kb: usize::MAX, memmap_threshold_kb: usize::MAX, indexing_threshold_kb: 10, }; let quantization_config_vector1 = QuantizationConfig::Scalar(segment::types::ScalarQuantization { scalar: ScalarQuantizationConfig { r#type: ScalarType::Int8, quantile: Some(0.99), always_ram: Some(true), }, }); let collection_params = CollectionParams { vectors: VectorsConfig::Multi(BTreeMap::from([ ( "vector1".into(), VectorParamsBuilder::new(vector1_dim as u64, Distance::Dot) .with_quantization_config(quantization_config_vector1.clone()) .build(), ), ( "vector2".into(), VectorParamsBuilder::new(vector2_dim as u64, Distance::Dot).build(), ), ])), ..CollectionParams::empty() }; // Base segment let temp_dir = Builder::new().prefix("segment_temp_dir").tempdir().unwrap(); let dir = Builder::new().prefix("segment_dir").tempdir().unwrap(); let mut holder = SegmentHolder::default(); let segment = random_multi_vec_segment( dir.path(), 100, point_count, vector1_dim as usize, vector2_dim as usize, ); let segment_id = holder.add_new(segment); let locked_holder: Arc> = Arc::new(RwLock::new(holder)); let quantization_config_collection = QuantizationConfig::Scalar(segment::types::ScalarQuantization { scalar: ScalarQuantizationConfig { r#type: ScalarType::Int8, quantile: Some(0.91), always_ram: None, }, }); // Optimizers used in test let index_optimizer = IndexingOptimizer::new( 2, thresholds_config, dir.path().to_owned(), temp_dir.path().to_owned(), collection_params.clone(), Default::default(), Some(quantization_config_collection.clone()), ); let mut config_mismatch_optimizer = ConfigMismatchOptimizer::new( thresholds_config, dir.path().to_owned(), temp_dir.path().to_owned(), collection_params, Default::default(), Some(quantization_config_collection), ); let permit_cpu_count = num_rayon_threads(0); let permit = CpuPermit::dummy(permit_cpu_count as u32); // Use indexing optimizer to build index for quantization mismatch test let changed = index_optimizer .optimize( locked_holder.clone(), vec![segment_id], permit, &false.into(), ) .unwrap(); assert!(changed > 0, "optimizer should have rebuilt this segment"); assert!( locked_holder.read().get(segment_id).is_none(), "optimized segment should be gone", ); assert_eq!(locked_holder.read().len(), 2, "index must be built"); // Mismatch optimizer should not optimize yet, quantization config is not changed yet let suggested_to_optimize = config_mismatch_optimizer.check_condition(locked_holder.clone(), &Default::default()); assert_eq!(suggested_to_optimize.len(), 0); // Create changed quantization config for vector2, update it in the optimizer let quantization_config_vector2 = QuantizationConfig::Product(ProductQuantization { product: ProductQuantizationConfig { compression: CompressionRatio::X32, always_ram: Some(true), }, }); match config_mismatch_optimizer.collection_params.vectors { VectorsConfig::Single(_) => unreachable!(), VectorsConfig::Multi(ref mut map) => { map.get_mut("vector2") .unwrap() .quantization_config .replace(quantization_config_vector2.clone()); } } // Run mismatch optimizer again, make sure it optimizes now let permit = CpuPermit::dummy(permit_cpu_count as u32); let suggested_to_optimize = config_mismatch_optimizer.check_condition(locked_holder.clone(), &Default::default()); assert_eq!(suggested_to_optimize.len(), 1); let changed = config_mismatch_optimizer .optimize( locked_holder.clone(), suggested_to_optimize, permit, &false.into(), ) .unwrap(); assert!(changed > 0, "optimizer should have rebuilt this segment"); // Ensure new segment has changed quantization config locked_holder .read() .iter() .map(|(_, segment)| match segment { LockedSegment::Original(s) => s.read(), LockedSegment::Proxy(_) => unreachable!(), }) .filter(|segment| segment.total_point_count() > 0) .for_each(|segment| { assert_eq!( segment.config().vector_data["vector1"].quantization_config, Some(quantization_config_vector1.clone()), "Quantization config of vector1 is not what we expect", ); assert_eq!( segment.config().vector_data["vector2"].quantization_config, Some(quantization_config_vector2.clone()), "Quantization config of vector2 is not what we expect", ); }); } }