pub mod clock_map;
pub mod disk_usage_watcher;
pub(super) mod facet;
pub(super) mod query;
pub(super) mod scroll;
pub(super) mod search;
pub(super) mod shard_ops;

use std::any::Any;
use std::collections::{BTreeSet, HashMap};
use std::mem::size_of;
use std::ops::Deref;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::Arc;
use std::thread;
use std::time::{Duration, Instant};

use arc_swap::ArcSwap;
use common::cpu::CpuBudget;
use common::types::TelemetryDetail;
use common::{panic, tar_ext};
use indicatif::{ProgressBar, ProgressStyle};
use itertools::Itertools;
use parking_lot::{Mutex as ParkingMutex, RwLock};
use segment::data_types::vectors::VectorElementType;
use segment::entry::entry_point::SegmentEntry as _;
use segment::index::field_index::CardinalityEstimation;
use segment::segment::Segment;
use segment::segment_constructor::{build_segment, load_segment};
use segment::types::{
    CompressionRatio, Filter, PayloadIndexInfo, PayloadKeyType, PointIdType, QuantizationConfig,
    SegmentConfig, SegmentType, SnapshotFormat,
};
use segment::utils::mem::Mem;
use segment::vector_storage::common::get_async_scorer;
use tokio::fs::{create_dir_all, remove_dir_all, remove_file};
use tokio::runtime::Handle;
use tokio::sync::mpsc::Sender;
use tokio::sync::{mpsc, oneshot, Mutex, RwLock as TokioRwLock};
use wal::{Wal, WalOptions};

use self::clock_map::{ClockMap, RecoveryPoint};
use self::disk_usage_watcher::DiskUsageWatcher;
use super::update_tracker::UpdateTracker;
use crate::collection::payload_index_schema::PayloadIndexSchema;
use crate::collection_manager::collection_updater::CollectionUpdater;
use crate::collection_manager::holders::segment_holder::{
    LockedSegment, LockedSegmentHolder, SegmentHolder,
};
use crate::collection_manager::optimizers::TrackerLog;
use crate::collection_manager::segments_searcher::SegmentsSearcher;
use crate::common::file_utils::{move_dir, move_file};
use crate::config::CollectionConfigInternal;
use crate::operations::shared_storage_config::SharedStorageConfig;
use crate::operations::types::{
    check_sparse_compatible_with_segment_config, CollectionError, CollectionResult,
    OptimizersStatus, ShardInfoInternal, ShardStatus,
};
use crate::operations::OperationWithClockTag;
use crate::optimizers_builder::{build_optimizers, clear_temp_segments, OptimizersConfig};
use crate::save_on_disk::SaveOnDisk;
use crate::shards::shard::ShardId;
use crate::shards::shard_config::ShardConfig;
use crate::shards::telemetry::{LocalShardTelemetry, OptimizerTelemetry};
use crate::shards::CollectionId;
use crate::update_handler::{Optimizer, UpdateHandler, UpdateSignal};
use crate::wal::SerdeWal;
use crate::wal_delta::{LockedWal, RecoverableWal};

/// If rendering WAL load progression in basic text form, report progression every 60 seconds.
const WAL_LOAD_REPORT_EVERY: Duration = Duration::from_secs(60);

const WAL_PATH: &str = "wal";
const SEGMENTS_PATH: &str = "segments";

/// LocalShard
///
/// LocalShard is an entity that can be moved between peers and contains some part of one
/// collection's data.
///
/// Holds all objects required for collection functioning
pub struct LocalShard {
    pub(super) segments: LockedSegmentHolder,
    pub(super) collection_config: Arc<TokioRwLock<CollectionConfigInternal>>,
    pub(super) shared_storage_config: Arc<SharedStorageConfig>,
    pub(crate) payload_index_schema: Arc<SaveOnDisk<PayloadIndexSchema>>,
    pub(super) wal: RecoverableWal,
    pub(super) update_handler: Arc<Mutex<UpdateHandler>>,
    pub(super) update_sender: ArcSwap<Sender<UpdateSignal>>,
    pub(super) update_tracker: UpdateTracker,
    pub(super) path: PathBuf,
    pub(super) optimizers: Arc<Vec<Arc<Optimizer>>>,
    pub(super) optimizers_log: Arc<ParkingMutex<TrackerLog>>,
    pub(super) total_optimized_points: Arc<AtomicUsize>,
    update_runtime: Handle,
    pub(super) search_runtime: Handle,
    disk_usage_watcher: DiskUsageWatcher,
}

/// Shard holds information about segments and WAL.
impl LocalShard {
    pub async fn move_data(from: &Path, to: &Path) -> CollectionResult<()> {
        let wal_from = Self::wal_path(from);
        let wal_to = Self::wal_path(to);
        let segments_from = Self::segments_path(from);
        let segments_to = Self::segments_path(to);

        move_dir(wal_from, wal_to).await?;
        move_dir(segments_from, segments_to).await?;

        LocalShardClocks::move_data(from, to).await?;

        Ok(())
    }

    /// Checks if the path has local shard data present
    pub fn check_data(shard_path: &Path) -> bool {
        let wal_path = Self::wal_path(shard_path);
        let segments_path = Self::segments_path(shard_path);
        wal_path.exists() && segments_path.exists()
    }

    /// Clear local shard related data.
    ///
    /// Do NOT remove config file.
    pub async fn clear(shard_path: &Path) -> CollectionResult<()> {
        // Delete WAL
        let wal_path = Self::wal_path(shard_path);
        if wal_path.exists() {
            remove_dir_all(wal_path).await?;
        }

        // Delete segments
        let segments_path = Self::segments_path(shard_path);
        if segments_path.exists() {
            remove_dir_all(segments_path).await?;
        }

        LocalShardClocks::delete_data(shard_path).await?;

        Ok(())
    }

    #[allow(clippy::too_many_arguments)]
    pub async fn new(
        segment_holder: SegmentHolder,
        collection_config: Arc<TokioRwLock<CollectionConfigInternal>>,
        shared_storage_config: Arc<SharedStorageConfig>,
        payload_index_schema: Arc<SaveOnDisk<PayloadIndexSchema>>,
        wal: SerdeWal<OperationWithClockTag>,
        optimizers: Arc<Vec<Arc<Optimizer>>>,
        optimizer_cpu_budget: CpuBudget,
        shard_path: &Path,
        clocks: LocalShardClocks,
        update_runtime: Handle,
        search_runtime: Handle,
    ) -> Self {
        let segment_holder = Arc::new(RwLock::new(segment_holder));
        let config = collection_config.read().await;
        let locked_wal = Arc::new(ParkingMutex::new(wal));
        let optimizers_log = Arc::new(ParkingMutex::new(Default::default()));
        let total_optimized_points = Arc::new(AtomicUsize::new(0));

        // default to 2x the WAL capacity
        let disk_buffer_threshold_mb =
            2 * (collection_config.read().await.wal_config.wal_capacity_mb);

        let disk_usage_watcher = disk_usage_watcher::DiskUsageWatcher::new(
            shard_path.to_owned(),
            disk_buffer_threshold_mb,
        )
        .await;

        let mut update_handler = UpdateHandler::new(
            shared_storage_config.clone(),
            payload_index_schema.clone(),
            optimizers.clone(),
            optimizers_log.clone(),
            total_optimized_points.clone(),
            optimizer_cpu_budget.clone(),
            update_runtime.clone(),
            segment_holder.clone(),
            locked_wal.clone(),
            config.optimizer_config.flush_interval_sec,
            config.optimizer_config.max_optimization_threads,
            clocks.clone(),
            shard_path.into(),
        );

        let (update_sender, update_receiver) =
            mpsc::channel(shared_storage_config.update_queue_size);
        update_handler.run_workers(update_receiver);

        let update_tracker = segment_holder.read().update_tracker();

        drop(config); // release `shared_config` from borrow checker

        Self {
            segments: segment_holder,
            collection_config,
            shared_storage_config,
            payload_index_schema,
            wal: RecoverableWal::new(locked_wal, clocks.newest_clocks, clocks.oldest_clocks),
            update_handler: Arc::new(Mutex::new(update_handler)),
            update_sender: ArcSwap::from_pointee(update_sender),
            update_tracker,
            path: shard_path.to_owned(),
            update_runtime,
            search_runtime,
            optimizers,
            optimizers_log,
            total_optimized_points,
            disk_usage_watcher,
        }
    }
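    // Rough sketch of the update flow this constructor wires together (the
    // entry point lives in `shard_ops`):
    //
    //   update request -> WAL append -> update_sender (mpsc)
    //                                     -> UpdateHandler workers -> segments
    //
    // `update_sender` lives behind an `ArcSwap` so the channel can be replaced
    // atomically when the workers are restarted (see `on_optimizer_config_update`).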
    pub(super) fn segments(&self) -> &RwLock<SegmentHolder> {
        self.segments.deref()
    }

    /// Recovers shard from disk.
    #[allow(clippy::too_many_arguments)]
    pub async fn load(
        id: ShardId,
        collection_id: CollectionId,
        shard_path: &Path,
        collection_config: Arc<TokioRwLock<CollectionConfigInternal>>,
        effective_optimizers_config: OptimizersConfig,
        shared_storage_config: Arc<SharedStorageConfig>,
        payload_index_schema: Arc<SaveOnDisk<PayloadIndexSchema>>,
        update_runtime: Handle,
        search_runtime: Handle,
        optimizer_cpu_budget: CpuBudget,
    ) -> CollectionResult<LocalShard> {
        let collection_config_read = collection_config.read().await;

        let wal_path = Self::wal_path(shard_path);
        let segments_path = Self::segments_path(shard_path);

        let wal: SerdeWal<OperationWithClockTag> = SerdeWal::new(
            wal_path.to_str().unwrap(),
            (&collection_config_read.wal_config).into(),
        )
        .map_err(|e| CollectionError::service_error(format!("Wal error: {e}")))?;

        let segment_dirs = std::fs::read_dir(&segments_path).map_err(|err| {
            CollectionError::service_error(format!(
                "Can't read segments directory due to {}\nat {}",
                err,
                segments_path.to_str().unwrap()
            ))
        })?;

        let mut load_handlers = vec![];

        // This semaphore is used to limit the number of threads that load segments concurrently.
        // Uncomment it if you need to debug segment loading.
        // let semaphore = Arc::new(parking_lot::Mutex::new(()));

        for entry in segment_dirs {
            let segments_path = entry.unwrap().path();
            let payload_index_schema = payload_index_schema.clone();
            // let semaphore_clone = semaphore.clone();
            load_handlers.push(
                thread::Builder::new()
                    .name(format!("shard-load-{collection_id}-{id}"))
                    .spawn(move || {
                        // let _guard = semaphore_clone.lock();
                        let mut res = load_segment(&segments_path, &AtomicBool::new(false))?;
                        if let Some(segment) = &mut res {
                            segment.check_consistency_and_repair()?;
                            segment.update_all_field_indices(
                                &payload_index_schema.read().schema.clone(),
                            )?;
                        } else {
                            std::fs::remove_dir_all(&segments_path).map_err(|err| {
                                CollectionError::service_error(format!(
                                    "Can't remove leftover segment {}, due to {err}",
                                    segments_path.to_str().unwrap(),
                                ))
                            })?;
                        }
                        Ok::<_, CollectionError>(res)
                    })?,
            );
        }

        let mut segment_holder = SegmentHolder::default();

        for handler in load_handlers {
            let segment = handler.join().map_err(|err| {
                CollectionError::service_error(format!(
                    "Can't join segment load thread: {:?}",
                    err.type_id()
                ))
            })??;

            let Some(segment) = segment else {
                continue;
            };

            collection_config_read
                .params
                .vectors
                .check_compatible_with_segment_config(&segment.config().vector_data, true)?;
            collection_config_read
                .params
                .sparse_vectors
                .as_ref()
                .map(|sparse_vectors| {
                    check_sparse_compatible_with_segment_config(
                        sparse_vectors,
                        &segment.config().sparse_vector_data,
                        true,
                    )
                })
                .unwrap_or(Ok(()))?;

            segment_holder.add_new(segment);
        }

        let res = segment_holder.deduplicate_points().await?;
        if res > 0 {
            log::debug!("Deduplicated {} points", res);
        }

        clear_temp_segments(shard_path);

        let optimizers = build_optimizers(
            shard_path,
            &collection_config_read.params,
            &effective_optimizers_config,
            &collection_config_read.hnsw_config,
            &collection_config_read.quantization_config,
        );

        drop(collection_config_read); // release `shared_config` from borrow checker

        let clocks = LocalShardClocks::load(shard_path)?;
        // Always make sure we have at least one appendable segment, needed for update operations
        if !segment_holder.has_appendable_segment() {
            debug_assert!(
                false,
                "Shard has no appendable segments, this should never happen",
            );
            log::warn!(
                "Shard has no appendable segments, this should never happen. \
                 Creating new appendable segment now"
            );
            let segments_path = LocalShard::segments_path(shard_path);
            let collection_params = collection_config.read().await.params.clone();
            let payload_index_schema = payload_index_schema.read();
            segment_holder.create_appendable_segment(
                &segments_path,
                &collection_params,
                &payload_index_schema,
            )?;
        }

        let local_shard = LocalShard::new(
            segment_holder,
            collection_config,
            shared_storage_config,
            payload_index_schema,
            wal,
            optimizers,
            optimizer_cpu_budget,
            shard_path,
            clocks,
            update_runtime,
            search_runtime,
        )
        .await;

        // Apply outstanding operations from WAL
        local_shard.load_from_wal(collection_id).await?;

        let available_memory_bytes = Mem::new().available_memory_bytes() as usize;
        let vectors_size_bytes = local_shard.estimate_vector_data_size().await;

        // Simple heuristic to exclude mmap prefaulting for collections that won't benefit from it.
        //
        // We assume that mmap prefaulting is beneficial if we can put a significant part of data
        // into RAM in advance. However, if we can see that the data is too big to fit into RAM,
        // it is better to avoid prefaulting, because it will only cause extra disk IO.
        //
        // This heuristic is not perfect, but it excludes cases when we don't have enough RAM
        // even to store half of the vector data.
        let do_mmap_prefault = available_memory_bytes * 2 > vectors_size_bytes;
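        // Worked example: with 8 GiB of RAM available and 12 GiB of vector data,
        // 2 * 8 GiB = 16 GiB > 12 GiB, so pages are prefaulted; with only 4 GiB
        // available, 2 * 4 GiB = 8 GiB < 12 GiB, so prefaulting is skipped.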
        if do_mmap_prefault {
            for (_, segment) in local_shard.segments.read().iter() {
                if let LockedSegment::Original(segment) = segment {
                    segment.read().prefault_mmap_pages();
                }
            }
        }

        Ok(local_shard)
    }

    pub fn shard_path(&self) -> PathBuf {
        self.path.clone()
    }

    pub fn wal_path(shard_path: &Path) -> PathBuf {
        shard_path.join(WAL_PATH)
    }

    pub fn segments_path(shard_path: &Path) -> PathBuf {
        shard_path.join(SEGMENTS_PATH)
    }

    #[allow(clippy::too_many_arguments)]
    pub async fn build_local(
        id: ShardId,
        collection_id: CollectionId,
        shard_path: &Path,
        collection_config: Arc<TokioRwLock<CollectionConfigInternal>>,
        shared_storage_config: Arc<SharedStorageConfig>,
        payload_index_schema: Arc<SaveOnDisk<PayloadIndexSchema>>,
        update_runtime: Handle,
        search_runtime: Handle,
        optimizer_cpu_budget: CpuBudget,
        effective_optimizers_config: OptimizersConfig,
    ) -> CollectionResult<LocalShard> {
        // initialize local shard config file
        let local_shard_config = ShardConfig::new_replica_set();
        let shard = Self::build(
            id,
            collection_id,
            shard_path,
            collection_config,
            shared_storage_config,
            payload_index_schema,
            update_runtime,
            search_runtime,
            optimizer_cpu_budget,
            effective_optimizers_config,
        )
        .await?;
        local_shard_config.save(shard_path)?;
        Ok(shard)
    }

    /// Creates new empty shard with given configuration, initializing all storages, optimizers and directories.
    #[allow(clippy::too_many_arguments)]
    pub async fn build(
        id: ShardId,
        collection_id: CollectionId,
        shard_path: &Path,
        collection_config: Arc<TokioRwLock<CollectionConfigInternal>>,
        shared_storage_config: Arc<SharedStorageConfig>,
        payload_index_schema: Arc<SaveOnDisk<PayloadIndexSchema>>,
        update_runtime: Handle,
        search_runtime: Handle,
        optimizer_cpu_budget: CpuBudget,
        effective_optimizers_config: OptimizersConfig,
    ) -> CollectionResult<LocalShard> {
        let config = collection_config.read().await;

        let wal_path = Self::wal_path(shard_path);
        create_dir_all(&wal_path).await.map_err(|err| {
            CollectionError::service_error(format!(
                "Can't create shard wal directory. Error: {err}"
            ))
        })?;

        let segments_path = Self::segments_path(shard_path);
        create_dir_all(&segments_path).await.map_err(|err| {
            CollectionError::service_error(format!(
                "Can't create shard segments directory. Error: {err}"
            ))
        })?;

        let mut segment_holder = SegmentHolder::default();
        let mut build_handlers = vec![];

        let vector_params = config.params.to_base_vector_data()?;
        let sparse_vector_params = config.params.to_sparse_vector_data()?;
        let segment_number = config.optimizer_config.get_number_segments();

        for _sid in 0..segment_number {
            let path_clone = segments_path.clone();
            let segment_config = SegmentConfig {
                vector_data: vector_params.clone(),
                sparse_vector_data: sparse_vector_params.clone(),
                payload_storage_type: config.params.payload_storage_type(),
            };
            let segment = thread::Builder::new()
                .name(format!("shard-build-{collection_id}-{id}"))
                .spawn(move || build_segment(&path_clone, &segment_config, true))
                .unwrap();
            build_handlers.push(segment);
        }

        let join_results = build_handlers
            .into_iter()
            .map(|handler| handler.join())
            .collect_vec();

        for join_result in join_results {
            let segment = join_result.map_err(|err| {
                let message = panic::downcast_str(&err).unwrap_or("");
                let separator = if !message.is_empty() { "with:\n" } else { "" };

                CollectionError::service_error(format!(
                    "Segment DB create panicked{separator}{message}",
                ))
            })??;
            segment_holder.add_new(segment);
        }

        let wal: SerdeWal<OperationWithClockTag> =
            SerdeWal::new(wal_path.to_str().unwrap(), (&config.wal_config).into())?;

        let optimizers = build_optimizers(
            shard_path,
            &config.params,
            &effective_optimizers_config,
            &config.hnsw_config,
            &config.quantization_config,
        );

        drop(config); // release `shared_config` from borrow checker

        let collection = LocalShard::new(
            segment_holder,
            collection_config,
            shared_storage_config,
            payload_index_schema,
            wal,
            optimizers,
            optimizer_cpu_budget,
            shard_path,
            LocalShardClocks::default(),
            update_runtime,
            search_runtime,
        )
        .await;

        Ok(collection)
    }

    pub async fn stop_flush_worker(&self) {
        let mut update_handler = self.update_handler.lock().await;
        update_handler.stop_flush_worker()
    }

    pub async fn wait_update_workers_stop(&self) -> CollectionResult<()> {
        let mut update_handler = self.update_handler.lock().await;
        update_handler.wait_workers_stops().await
    }

    /// Loads latest collection operations from WAL
    pub async fn load_from_wal(&self, collection_id: CollectionId) -> CollectionResult<()> {
        let mut newest_clocks = self.wal.newest_clocks.lock().await;
        let wal = self.wal.wal.lock();
        let bar = ProgressBar::new(wal.len(false));

        let progress_style = ProgressStyle::default_bar()
            .template("{msg} [{elapsed_precise}] {wide_bar} {pos}/{len} (eta:{eta})")
            .expect("Failed to create progress style");
        bar.set_style(progress_style);
        bar.set_message(format!("Recovering collection {collection_id}"));

        let segments = self.segments();

        // Fall back to basic text output if the progress bar is hidden (e.g. not a tty)
        let show_progress_bar = !bar.is_hidden();
        let mut last_progress_report = Instant::now();
        if !show_progress_bar {
            log::info!(
                "Recovering collection {collection_id}: 0/{} (0%)",
                wal.len(false),
            );
        }

        // When `Segment`s are flushed, WAL is truncated up to the index of the last operation
        // that has been applied and flushed.
        //
        // `SerdeWal` wrapper persists/keeps track of this index (in addition to any handling
        // in the `wal` crate itself).
        //
        // `SerdeWal::read_all` starts reading WAL from the first "un-truncated" index,
        // so no additional handling is required to "skip" any potentially applied entries.
        //
        // Note that it's not guaranteed that some operation won't be re-applied to the storage.
        // (`SerdeWal::read_all` may even start reading WAL from some already truncated
        // index *occasionally*), but the storage can handle it.
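        // Illustrative example with hypothetical numbers: if the WAL holds
        // operations 100..200 and segments were flushed up to operation 150,
        // `read_all` resumes from roughly 151. Should it occasionally start
        // earlier (say, at 145), re-applying 145..150 is harmless, because
        // segments track per-point versions and skip operations that are older
        // than the data they already contain.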
        for (op_num, update) in wal.read_all(false) {
            if let Some(clock_tag) = update.clock_tag {
                newest_clocks.advance_clock(clock_tag);
            }

            // Propagate `CollectionError::ServiceError`, but skip other error types.
            match &CollectionUpdater::update(segments, op_num, update.operation) {
                Err(err @ CollectionError::ServiceError { error, backtrace }) => {
                    let path = self.path.display();

                    log::error!(
                        "Can't apply WAL operation: {error}, \
                         collection: {collection_id}, \
                         shard: {path}, \
                         op_num: {op_num}"
                    );

                    if let Some(backtrace) = &backtrace {
                        log::error!("Backtrace: {}", backtrace);
                    }

                    return Err(err.clone());
                }
                Err(err @ CollectionError::OutOfMemory { .. }) => {
                    log::error!("{err}");
                    return Err(err.clone());
                }
                Err(err @ CollectionError::NotFound { .. }) => log::warn!("{err}"),
                Err(err) => log::error!("{err}"),
                Ok(_) => (),
            }

            // Update progress bar or show text progress every WAL_LOAD_REPORT_EVERY
            bar.inc(1);
            if !show_progress_bar && last_progress_report.elapsed() >= WAL_LOAD_REPORT_EVERY {
                let progress = bar.position();
                log::info!(
                    "{progress}/{} ({}%)",
                    wal.len(false),
                    (progress as f32 / wal.len(false) as f32 * 100.0) as usize,
                );
                last_progress_report = Instant::now();
            }
        }

        {
            let segments = self.segments.read();

            // It is possible that, if WAL flush was not enforced before restart, we are left
            // with some unversioned points after recovery. To maintain consistency, we can
            // either remove them or try to recover.
            for (_idx, segment) in segments.iter() {
                match segment {
                    LockedSegment::Original(raw_segment) => {
                        raw_segment.write().cleanup_versions()?;
                    }
                    LockedSegment::Proxy(_) => {
                        debug_assert!(false, "Proxy segment found in load_from_wal");
                    }
                }
            }

            // Force a flush after re-applying WAL operations, to ensure we maintain on-disk data
            // consistency, if we happened to only apply *past* operations to a segment with a
            // newer version.
            segments.flush_all(true, true)?;
        }

        bar.finish();
        if !show_progress_bar {
            log::info!(
                "Recovered collection {collection_id}: {0}/{0} (100%)",
                wal.len(false),
            );
        }

        // The storage is expected to be consistent after WAL recovery
        #[cfg(feature = "data-consistency-check")]
        self.check_data_consistency()?;

        Ok(())
    }

    /// Check data consistency for all segments
    ///
    /// Returns an error at the first inconsistent segment
    pub fn check_data_consistency(&self) -> CollectionResult<()> {
        log::info!("Checking data consistency for shard {:?}", self.path);
        let segments = self.segments.read();
        for (_idx, segment) in segments.iter() {
            match segment {
                LockedSegment::Original(raw_segment) => {
                    let segment_guard = raw_segment.read();
                    if let Err(err) = segment_guard.check_data_consistency() {
                        log::error!(
                            "Segment {:?} is inconsistent: {}",
                            segment_guard.current_path, err
                        );
                        return Err(err.into());
                    }
                }
                LockedSegment::Proxy(_) => {
                    return Err(CollectionError::service_error(
                        "Proxy segment found in check_data_consistency",
                    ));
                }
            }
        }
        Ok(())
    }

    pub async fn on_optimizer_config_update(&self) -> CollectionResult<()> {
        let config = self.collection_config.read().await;
        let mut update_handler = self.update_handler.lock().await;

        let (update_sender, update_receiver) =
            mpsc::channel(self.shared_storage_config.update_queue_size);
        // makes sure that the Stop signal is the last one in this channel
        let old_sender = self.update_sender.swap(Arc::new(update_sender));
        old_sender.send(UpdateSignal::Stop).await?;
        update_handler.stop_flush_worker();

        update_handler.wait_workers_stops().await?;
        let new_optimizers = build_optimizers(
            &self.path,
            &config.params,
            &config.optimizer_config,
            &config.hnsw_config,
            &config.quantization_config,
        );
        update_handler.optimizers = new_optimizers;
        update_handler.flush_interval_sec = config.optimizer_config.flush_interval_sec;
        update_handler.max_optimization_threads = config.optimizer_config.max_optimization_threads;
        update_handler.run_workers(update_receiver);

        self.update_sender.load().send(UpdateSignal::Nop).await?;

        Ok(())
    }

    pub fn trigger_optimizers(&self) {
        // Send a trigger signal and ignore errors because all error cases are acceptable:
        // - If receiver is already dead - we do not care
        // - If channel is full - optimization will be triggered by some other signal
        let _ = self.update_sender.load().try_send(UpdateSignal::Nop);
    }

    /// Finishes ongoing update tasks
    pub async fn stop_gracefully(&self) {
        if let Err(err) = self.update_sender.load().send(UpdateSignal::Stop).await {
            log::warn!("Error sending stop signal to update handler: {}", err);
        }

        self.stop_flush_worker().await;

        if let Err(err) = self.wait_update_workers_stop().await {
            log::warn!("Update workers failed with: {}", err);
        }
    }

    pub fn restore_snapshot(snapshot_path: &Path) -> CollectionResult<()> {
        // Read dir first as the directory contents would change during restore.
        let entries = std::fs::read_dir(LocalShard::segments_path(snapshot_path))?
            .collect::<Result<Vec<_>, _>>()?;

        for entry in entries {
            Segment::restore_snapshot_in_place(&entry.path())?;
        }

        Ok(())
    }
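    // Rough layout of the archive produced by `create_snapshot` below (clock
    // files are only included if they exist on disk):
    //
    //   ./
    //   ├── segments/             <- one snapshotted directory per segment
    //   ├── wal/                  <- real WAL, or an empty one if `save_wal` is false
    //   ├── newest_clocks.json
    //   └── oldest_clocks.json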
    /// Create snapshot for local shard into `tar` archive
    pub async fn create_snapshot(
        &self,
        temp_path: &Path,
        tar: &tar_ext::BuilderExt,
        format: SnapshotFormat,
        save_wal: bool,
    ) -> CollectionResult<()> {
        let segments = self.segments.clone();
        let wal = self.wal.wal.clone();

        if !save_wal {
            // If we are not saving WAL, we still need to make sure that all updates submitted
            // by this point have made it to the segments. So we use the Plunger to achieve that.
            // It will notify us when all submitted updates so far have been processed.
            let (tx, rx) = oneshot::channel();
            let plunger = UpdateSignal::Plunger(tx);
            self.update_sender.load().send(plunger).await?;
            rx.await?;
        }

        let segments_path = Self::segments_path(&self.path);
        let collection_params = self.collection_config.read().await.params.clone();
        let temp_path = temp_path.to_owned();
        let payload_index_schema = self.payload_index_schema.clone();
        let tar_c = tar.clone();
        tokio::task::spawn_blocking(move || {
            // Do not change segments while snapshotting
            SegmentHolder::snapshot_all_segments(
                segments.clone(),
                &segments_path,
                Some(&collection_params),
                &payload_index_schema.read().clone(),
                &temp_path,
                &tar_c.descend(Path::new(SEGMENTS_PATH))?,
                format,
            )?;

            if save_wal {
                // snapshot all shard's WAL
                Self::snapshot_wal(wal, &tar_c)
            } else {
                Self::snapshot_empty_wal(wal, &temp_path, &tar_c)
            }
        })
        .await??;

        LocalShardClocks::archive_data(&self.path, tar).await?;

        Ok(())
    }

    /// Create empty WAL which is compatible with currently stored data
    pub fn snapshot_empty_wal(
        wal: LockedWal,
        temp_path: &Path,
        tar: &tar_ext::BuilderExt,
    ) -> CollectionResult<()> {
        let (segment_capacity, latest_op_num) = {
            let wal_guard = wal.lock();
            (wal_guard.segment_capacity(), wal_guard.last_index())
        };

        let temp_dir = tempfile::tempdir_in(temp_path).map_err(|err| {
            CollectionError::service_error(format!(
                "Can not create temporary directory for WAL: {err}",
            ))
        })?;

        Wal::generate_empty_wal_starting_at_index(
            temp_dir.path(),
            &WalOptions {
                segment_capacity,
                segment_queue_len: 0,
            },
            latest_op_num,
        )
        .map_err(|err| {
            CollectionError::service_error(format!("Error while creating empty WAL: {err}"))
        })?;

        tar.blocking_append_dir_all(temp_dir.path(), Path::new(WAL_PATH))
            .map_err(|err| {
                CollectionError::service_error(format!("Error while archiving WAL: {err}"))
            })
    }
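    // Illustrative example: if `last_index()` is 4213 at snapshot time, the
    // generated WAL is empty but starts at index 4213, so operation numbering
    // on a restored node stays aligned with the snapshotted segments and
    // nothing is replayed on load.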
    /// snapshot WAL
    pub fn snapshot_wal(wal: LockedWal, tar: &tar_ext::BuilderExt) -> CollectionResult<()> {
        // lock wal during snapshot
        let mut wal_guard = wal.lock();
        wal_guard.flush()?;
        let source_wal_path = wal_guard.path();

        let tar = tar.descend(Path::new(WAL_PATH))?;
        for entry in std::fs::read_dir(source_wal_path).map_err(|err| {
            CollectionError::service_error(format!("Can't read WAL directory: {err}",))
        })? {
            let entry = entry.map_err(|err| {
                CollectionError::service_error(format!("Can't read WAL directory: {err}",))
            })?;

            if entry.file_name() == ".wal" {
                // This sentinel file is used for WAL locking. Trying to archive
                // or open it will cause the following error on Windows:
                // > The process cannot access the file because another process
                // > has locked a portion of the file. (os error 33)
                // https://github.com/qdrant/wal/blob/7c9202d0874/src/lib.rs#L125-L145
                continue;
            }

            tar.blocking_append_file(&entry.path(), Path::new(&entry.file_name()))
                .map_err(|err| {
                    CollectionError::service_error(format!("Error while archiving WAL: {err}"))
                })?;
        }
        Ok(())
    }

    pub fn estimate_cardinality<'a>(
        &'a self,
        filter: Option<&'a Filter>,
    ) -> CollectionResult<CardinalityEstimation> {
        let segments = self.segments().read();
        let cardinality = segments
            .iter()
            .map(|(_id, segment)| segment.get().read().estimate_point_count(filter))
            .fold(CardinalityEstimation::exact(0), |acc, x| {
                CardinalityEstimation {
                    primary_clauses: vec![],
                    min: acc.min + x.min,
                    exp: acc.exp + x.exp,
                    max: acc.max + x.max,
                }
            });
        Ok(cardinality)
    }

    pub async fn read_filtered<'a>(
        &'a self,
        filter: Option<&'a Filter>,
        runtime_handle: &Handle,
    ) -> CollectionResult<BTreeSet<PointIdType>> {
        let segments = self.segments.clone();
        SegmentsSearcher::read_filtered(segments, filter, runtime_handle).await
    }

    pub fn get_telemetry_data(&self, detail: TelemetryDetail) -> LocalShardTelemetry {
        let segments_read_guard = self.segments.read();
        let segments: Vec<_> = segments_read_guard
            .iter()
            .map(|(_id, segment)| segment.get().read().get_telemetry_data(detail))
            .collect();

        let optimizer_status = match &segments_read_guard.optimizer_errors {
            None => OptimizersStatus::Ok,
            Some(error) => OptimizersStatus::Error(error.to_string()),
        };
        drop(segments_read_guard);
        let optimizations = self
            .optimizers
            .iter()
            .map(|optimizer| {
                optimizer
                    .get_telemetry_counter()
                    .lock()
                    .get_statistics(detail)
            })
            .fold(Default::default(), |acc, x| acc + x);

        let total_optimized_points = self.total_optimized_points.load(Ordering::Relaxed);

        LocalShardTelemetry {
            variant_name: None,
            status: None,
            total_optimized_points,
            segments,
            optimizations: OptimizerTelemetry {
                status: optimizer_status,
                optimizations,
                log: self.optimizers_log.lock().to_telemetry(),
            },
            async_scorer: Some(get_async_scorer()),
        }
    }

    /// Returns estimated size of vector data in bytes
    async fn estimate_vector_data_size(&self) -> usize {
        let info = self.local_shard_info().await;

        let vector_size: usize = info
            .config
            .params
            .vectors
            .params_iter()
            .map(|(_, value)| {
                let vector_size = value.size.get() as usize;

                let quantization_config = value
                    .quantization_config
                    .as_ref()
                    .or(info.config.quantization_config.as_ref());

                let quantized_size_bytes = match quantization_config {
                    None => 0,
                    Some(QuantizationConfig::Scalar(_)) => vector_size,
                    Some(QuantizationConfig::Product(pq)) => match pq.product.compression {
                        CompressionRatio::X4 => vector_size,
                        CompressionRatio::X8 => vector_size / 2,
                        CompressionRatio::X16 => vector_size / 4,
                        CompressionRatio::X32 => vector_size / 8,
                        CompressionRatio::X64 => vector_size / 16,
                    },
                    Some(QuantizationConfig::Binary(_)) => vector_size / 8,
                };

                vector_size * size_of::<VectorElementType>() + quantized_size_bytes
            })
            .sum();

        vector_size * info.points_count
    }
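    // Worked example for `estimate_vector_data_size` above (hypothetical
    // collection): a 1024-dimensional dense vector costs
    // 1024 * size_of::<VectorElementType>() = 4096 bytes raw; with product
    // quantization at x16 compression, the quantized copy adds 1024 / 4 = 256
    // bytes, for 4352 bytes per vector, which is then multiplied by the
    // shard's point count.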
    pub async fn local_shard_status(&self) -> (ShardStatus, OptimizersStatus) {
        {
            let segments = self.segments().read();

            // Red status on failed operation or optimizer error
            if !segments.failed_operation.is_empty() || segments.optimizer_errors.is_some() {
                let optimizer_status = segments
                    .optimizer_errors
                    .as_ref()
                    .map_or(OptimizersStatus::Ok, |err| {
                        OptimizersStatus::Error(err.to_string())
                    });
                return (ShardStatus::Red, optimizer_status);
            }

            // Yellow status if we have a special segment, indicates a proxy segment used during optimization
            // TODO: snapshotting also creates temp proxy segments. should differentiate.
            let has_special_segment = segments
                .iter()
                .map(|(_, segment)| segment.get().read().info().segment_type)
                .any(|segment_type| segment_type == SegmentType::Special);
            if has_special_segment {
                return (ShardStatus::Yellow, OptimizersStatus::Ok);
            }
        }

        // Yellow or grey status if there are pending optimizations
        // Grey if optimizers were not triggered yet after restart,
        // we don't automatically trigger them to prevent a crash loop
        let (has_triggered_any_optimizers, has_suboptimal_optimizers) = self
            .update_handler
            .lock()
            .await
            .check_optimizer_conditions();
        if has_suboptimal_optimizers {
            let status = if has_triggered_any_optimizers {
                ShardStatus::Yellow
            } else {
                ShardStatus::Grey
            };
            return (status, OptimizersStatus::Ok);
        }

        // Green status because everything is fine
        (ShardStatus::Green, OptimizersStatus::Ok)
    }

    pub async fn local_shard_info(&self) -> ShardInfoInternal {
        let collection_config = self.collection_config.read().await.clone();
        let mut vectors_count = 0;
        let mut indexed_vectors_count = 0;
        let mut points_count = 0;
        let mut segments_count = 0;
        let mut schema: HashMap<PayloadKeyType, PayloadIndexInfo> = Default::default();

        {
            let segments = self.segments().read();
            for (_idx, segment) in segments.iter() {
                segments_count += 1;

                let segment_info = segment.get().read().info();

                vectors_count += segment_info.num_vectors;
                indexed_vectors_count += segment_info.num_indexed_vectors;
                points_count += segment_info.num_points;
                for (key, val) in segment_info.index_schema {
                    schema
                        .entry(key)
                        .and_modify(|entry| entry.points += val.points)
                        .or_insert(val);
                }
            }
        }

        let (status, optimizer_status) = self.local_shard_status().await;

        ShardInfoInternal {
            status,
            optimizer_status,
            vectors_count,
            indexed_vectors_count,
            points_count,
            segments_count,
            config: collection_config,
            payload_schema: schema,
        }
    }

    pub fn update_tracker(&self) -> &UpdateTracker {
        &self.update_tracker
    }

    /// Get the recovery point for the current shard
    ///
    /// This is sourced from the last seen clocks from other nodes that we know about.
    pub async fn recovery_point(&self) -> RecoveryPoint {
        self.wal.recovery_point().await
    }

    /// Update the cutoff point on the current shard
    ///
    /// This also updates the highest seen clocks.
    pub async fn update_cutoff(&self, cutoff: &RecoveryPoint) {
        self.wal.update_cutoff(cutoff).await
    }
}

impl Drop for LocalShard {
    fn drop(&mut self) {
        thread::scope(|s| {
            let handle = thread::Builder::new()
                .name("drop-shard".to_string())
                .spawn_scoped(s, || {
                    // Needs dedicated thread to avoid `Cannot start a runtime from within a runtime` error.
                    self.update_runtime
                        .block_on(async { self.stop_gracefully().await })
                });
            handle.expect("Failed to create thread for shard drop");
        })
    }
}
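// How the two clock maps are used (illustrative; the protocol details live in
// `wal_delta`): `newest_clocks` tracks the highest clock tag seen per peer and
// advances as operations are applied, while `oldest_clocks` marks the cutoff
// below which WAL delta recovery is no longer possible, so a peer whose
// recovery point falls below the cutoff needs a full shard transfer instead.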
const NEWEST_CLOCKS_PATH: &str = "newest_clocks.json";
const OLDEST_CLOCKS_PATH: &str = "oldest_clocks.json";

/// Convenience struct for combining clock maps belonging to a shard
///
/// Holds a clock map for tracking the highest clocks and the cutoff clocks.
#[derive(Clone, Debug, Default)]
pub struct LocalShardClocks {
    newest_clocks: Arc<Mutex<ClockMap>>,
    oldest_clocks: Arc<Mutex<ClockMap>>,
}

impl LocalShardClocks {
    fn new(newest_clocks: ClockMap, oldest_clocks: ClockMap) -> Self {
        Self {
            newest_clocks: Arc::new(Mutex::new(newest_clocks)),
            oldest_clocks: Arc::new(Mutex::new(oldest_clocks)),
        }
    }

    // Load clock maps from disk
    pub fn load(shard_path: &Path) -> CollectionResult<Self> {
        let newest_clocks = ClockMap::load_or_default(&Self::newest_clocks_path(shard_path))?;
        let oldest_clocks = ClockMap::load_or_default(&Self::oldest_clocks_path(shard_path))?;
        Ok(Self::new(newest_clocks, oldest_clocks))
    }

    /// Persist clock maps to disk
    pub async fn store_if_changed(&self, shard_path: &Path) -> CollectionResult<()> {
        self.oldest_clocks
            .lock()
            .await
            .store_if_changed(&Self::oldest_clocks_path(shard_path))?;

        self.newest_clocks
            .lock()
            .await
            .store_if_changed(&Self::newest_clocks_path(shard_path))?;

        Ok(())
    }

    /// Put clock data from the disk into an archive.
    pub async fn archive_data(from: &Path, tar: &tar_ext::BuilderExt) -> CollectionResult<()> {
        let newest_clocks_from = Self::newest_clocks_path(from);
        let oldest_clocks_from = Self::oldest_clocks_path(from);

        if newest_clocks_from.exists() {
            tar.append_file(&newest_clocks_from, Path::new(NEWEST_CLOCKS_PATH))
                .await?;
        }

        if oldest_clocks_from.exists() {
            tar.append_file(&oldest_clocks_from, Path::new(OLDEST_CLOCKS_PATH))
                .await?;
        }

        Ok(())
    }

    /// Move clock data on disk from one shard path to another.
    pub async fn move_data(from: &Path, to: &Path) -> CollectionResult<()> {
        let newest_clocks_from = Self::newest_clocks_path(from);
        let oldest_clocks_from = Self::oldest_clocks_path(from);

        if newest_clocks_from.exists() {
            let newest_clocks_to = Self::newest_clocks_path(to);
            move_file(newest_clocks_from, newest_clocks_to).await?;
        }

        if oldest_clocks_from.exists() {
            let oldest_clocks_to = Self::oldest_clocks_path(to);
            move_file(oldest_clocks_from, oldest_clocks_to).await?;
        }

        Ok(())
    }

    /// Delete clock data from disk at the given shard path.
    pub async fn delete_data(shard_path: &Path) -> CollectionResult<()> {
        let newest_clocks_path = Self::newest_clocks_path(shard_path);
        let oldest_clocks_path = Self::oldest_clocks_path(shard_path);

        if newest_clocks_path.exists() {
            remove_file(newest_clocks_path).await?;
        }

        if oldest_clocks_path.exists() {
            remove_file(oldest_clocks_path).await?;
        }

        Ok(())
    }

    fn newest_clocks_path(shard_path: &Path) -> PathBuf {
        shard_path.join(NEWEST_CLOCKS_PATH)
    }

    fn oldest_clocks_path(shard_path: &Path) -> PathBuf {
        shard_path.join(OLDEST_CLOCKS_PATH)
    }
}