//! `LocalShard` — a movable partition of one collection's data, holding its
//! segments, write-ahead log, clock maps and update/optimization machinery.
pub mod clock_map; | |
pub mod disk_usage_watcher; | |
pub(super) mod facet; | |
pub(super) mod query; | |
pub(super) mod scroll; | |
pub(super) mod search; | |
pub(super) mod shard_ops; | |
use std::collections::{BTreeSet, HashMap}; | |
use std::mem::size_of; | |
use std::ops::Deref; | |
use std::path::{Path, PathBuf}; | |
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; | |
use std::sync::Arc; | |
use std::thread; | |
use std::time::{Duration, Instant}; | |
use arc_swap::ArcSwap; | |
use common::cpu::CpuBudget; | |
use common::types::TelemetryDetail; | |
use common::{panic, tar_ext}; | |
use indicatif::{ProgressBar, ProgressStyle}; | |
use itertools::Itertools; | |
use parking_lot::{Mutex as ParkingMutex, RwLock}; | |
use segment::data_types::vectors::VectorElementType; | |
use segment::entry::entry_point::SegmentEntry as _; | |
use segment::index::field_index::CardinalityEstimation; | |
use segment::segment::Segment; | |
use segment::segment_constructor::{build_segment, load_segment}; | |
use segment::types::{ | |
CompressionRatio, Filter, PayloadIndexInfo, PayloadKeyType, PointIdType, QuantizationConfig, | |
SegmentConfig, SegmentType, SnapshotFormat, | |
}; | |
use segment::utils::mem::Mem; | |
use segment::vector_storage::common::get_async_scorer; | |
use tokio::fs::{create_dir_all, remove_dir_all, remove_file}; | |
use tokio::runtime::Handle; | |
use tokio::sync::mpsc::Sender; | |
use tokio::sync::{mpsc, oneshot, Mutex, RwLock as TokioRwLock}; | |
use wal::{Wal, WalOptions}; | |
use self::clock_map::{ClockMap, RecoveryPoint}; | |
use self::disk_usage_watcher::DiskUsageWatcher; | |
use super::update_tracker::UpdateTracker; | |
use crate::collection::payload_index_schema::PayloadIndexSchema; | |
use crate::collection_manager::collection_updater::CollectionUpdater; | |
use crate::collection_manager::holders::segment_holder::{ | |
LockedSegment, LockedSegmentHolder, SegmentHolder, | |
}; | |
use crate::collection_manager::optimizers::TrackerLog; | |
use crate::collection_manager::segments_searcher::SegmentsSearcher; | |
use crate::common::file_utils::{move_dir, move_file}; | |
use crate::config::CollectionConfigInternal; | |
use crate::operations::shared_storage_config::SharedStorageConfig; | |
use crate::operations::types::{ | |
check_sparse_compatible_with_segment_config, CollectionError, CollectionResult, | |
OptimizersStatus, ShardInfoInternal, ShardStatus, | |
}; | |
use crate::operations::OperationWithClockTag; | |
use crate::optimizers_builder::{build_optimizers, clear_temp_segments, OptimizersConfig}; | |
use crate::save_on_disk::SaveOnDisk; | |
use crate::shards::shard::ShardId; | |
use crate::shards::shard_config::ShardConfig; | |
use crate::shards::telemetry::{LocalShardTelemetry, OptimizerTelemetry}; | |
use crate::shards::CollectionId; | |
use crate::update_handler::{Optimizer, UpdateHandler, UpdateSignal}; | |
use crate::wal::SerdeWal; | |
use crate::wal_delta::{LockedWal, RecoverableWal}; | |
/// If rendering WAL load progression in basic text form, report progression every 60 seconds.
const WAL_LOAD_REPORT_EVERY: Duration = Duration::from_secs(60);

/// Subdirectory of the shard directory holding the write-ahead log.
const WAL_PATH: &str = "wal";

/// Subdirectory of the shard directory holding all segment data.
const SEGMENTS_PATH: &str = "segments";
/// LocalShard
///
/// LocalShard is an entity that can be moved between peers and contains some part of one collections data.
///
/// Holds all object, required for collection functioning
pub struct LocalShard {
    /// Segments making up this shard, shared with update workers and searchers
    pub(super) segments: LockedSegmentHolder,
    /// Collection-level configuration (params, WAL, optimizer, HNSW, quantization settings)
    pub(super) collection_config: Arc<TokioRwLock<CollectionConfigInternal>>,
    /// Storage-wide settings shared across collections
    pub(super) shared_storage_config: Arc<SharedStorageConfig>,
    /// Persisted payload index schema for this collection
    pub(crate) payload_index_schema: Arc<SaveOnDisk<PayloadIndexSchema>>,
    /// Write-ahead log together with its recovery clock maps
    pub(super) wal: RecoverableWal,
    /// Handler that applies updates and drives optimizations
    pub(super) update_handler: Arc<Mutex<UpdateHandler>>,
    /// Channel for submitting update signals; swapped atomically when workers restart
    pub(super) update_sender: ArcSwap<Sender<UpdateSignal>>,
    /// Tracks in-flight update operations
    pub(super) update_tracker: UpdateTracker,
    /// Directory of this shard on disk
    pub(super) path: PathBuf,
    /// Configured optimizers, shared with the update handler
    pub(super) optimizers: Arc<Vec<Arc<Optimizer>>>,
    /// Log of optimizer runs, reported via telemetry
    pub(super) optimizers_log: Arc<ParkingMutex<TrackerLog>>,
    /// Total number of points optimized so far (telemetry counter)
    pub(super) total_optimized_points: Arc<AtomicUsize>,
    /// Runtime on which update workers are executed
    update_runtime: Handle,
    /// Runtime on which searches are executed
    pub(super) search_runtime: Handle,
    /// Watches free disk space against the WAL-derived buffer threshold
    disk_usage_watcher: DiskUsageWatcher,
}
/// Shard holds information about segments and WAL. | |
impl LocalShard { | |
pub async fn move_data(from: &Path, to: &Path) -> CollectionResult<()> { | |
let wal_from = Self::wal_path(from); | |
let wal_to = Self::wal_path(to); | |
let segments_from = Self::segments_path(from); | |
let segments_to = Self::segments_path(to); | |
move_dir(wal_from, wal_to).await?; | |
move_dir(segments_from, segments_to).await?; | |
LocalShardClocks::move_data(from, to).await?; | |
Ok(()) | |
} | |
/// Checks if path have local shard data present | |
pub fn check_data(shard_path: &Path) -> bool { | |
let wal_path = Self::wal_path(shard_path); | |
let segments_path = Self::segments_path(shard_path); | |
wal_path.exists() && segments_path.exists() | |
} | |
/// Clear local shard related data. | |
/// | |
/// Do NOT remove config file. | |
pub async fn clear(shard_path: &Path) -> CollectionResult<()> { | |
// Delete WAL | |
let wal_path = Self::wal_path(shard_path); | |
if wal_path.exists() { | |
remove_dir_all(wal_path).await?; | |
} | |
// Delete segments | |
let segments_path = Self::segments_path(shard_path); | |
if segments_path.exists() { | |
remove_dir_all(segments_path).await?; | |
} | |
LocalShardClocks::delete_data(shard_path).await?; | |
Ok(()) | |
} | |
pub async fn new( | |
segment_holder: SegmentHolder, | |
collection_config: Arc<TokioRwLock<CollectionConfigInternal>>, | |
shared_storage_config: Arc<SharedStorageConfig>, | |
payload_index_schema: Arc<SaveOnDisk<PayloadIndexSchema>>, | |
wal: SerdeWal<OperationWithClockTag>, | |
optimizers: Arc<Vec<Arc<Optimizer>>>, | |
optimizer_cpu_budget: CpuBudget, | |
shard_path: &Path, | |
clocks: LocalShardClocks, | |
update_runtime: Handle, | |
search_runtime: Handle, | |
) -> Self { | |
let segment_holder = Arc::new(RwLock::new(segment_holder)); | |
let config = collection_config.read().await; | |
let locked_wal = Arc::new(ParkingMutex::new(wal)); | |
let optimizers_log = Arc::new(ParkingMutex::new(Default::default())); | |
let total_optimized_points = Arc::new(AtomicUsize::new(0)); | |
// default to 2x the WAL capacity | |
let disk_buffer_threshold_mb = | |
2 * (collection_config.read().await.wal_config.wal_capacity_mb); | |
let disk_usage_watcher = disk_usage_watcher::DiskUsageWatcher::new( | |
shard_path.to_owned(), | |
disk_buffer_threshold_mb, | |
) | |
.await; | |
let mut update_handler = UpdateHandler::new( | |
shared_storage_config.clone(), | |
payload_index_schema.clone(), | |
optimizers.clone(), | |
optimizers_log.clone(), | |
total_optimized_points.clone(), | |
optimizer_cpu_budget.clone(), | |
update_runtime.clone(), | |
segment_holder.clone(), | |
locked_wal.clone(), | |
config.optimizer_config.flush_interval_sec, | |
config.optimizer_config.max_optimization_threads, | |
clocks.clone(), | |
shard_path.into(), | |
); | |
let (update_sender, update_receiver) = | |
mpsc::channel(shared_storage_config.update_queue_size); | |
update_handler.run_workers(update_receiver); | |
let update_tracker = segment_holder.read().update_tracker(); | |
drop(config); // release `shared_config` from borrow checker | |
Self { | |
segments: segment_holder, | |
collection_config, | |
shared_storage_config, | |
payload_index_schema, | |
wal: RecoverableWal::new(locked_wal, clocks.newest_clocks, clocks.oldest_clocks), | |
update_handler: Arc::new(Mutex::new(update_handler)), | |
update_sender: ArcSwap::from_pointee(update_sender), | |
update_tracker, | |
path: shard_path.to_owned(), | |
update_runtime, | |
search_runtime, | |
optimizers, | |
optimizers_log, | |
total_optimized_points, | |
disk_usage_watcher, | |
} | |
} | |
pub(super) fn segments(&self) -> &RwLock<SegmentHolder> { | |
self.segments.deref() | |
} | |
/// Recovers shard from disk. | |
pub async fn load( | |
id: ShardId, | |
collection_id: CollectionId, | |
shard_path: &Path, | |
collection_config: Arc<TokioRwLock<CollectionConfigInternal>>, | |
effective_optimizers_config: OptimizersConfig, | |
shared_storage_config: Arc<SharedStorageConfig>, | |
payload_index_schema: Arc<SaveOnDisk<PayloadIndexSchema>>, | |
update_runtime: Handle, | |
search_runtime: Handle, | |
optimizer_cpu_budget: CpuBudget, | |
) -> CollectionResult<LocalShard> { | |
let collection_config_read = collection_config.read().await; | |
let wal_path = Self::wal_path(shard_path); | |
let segments_path = Self::segments_path(shard_path); | |
let wal: SerdeWal<OperationWithClockTag> = SerdeWal::new( | |
wal_path.to_str().unwrap(), | |
(&collection_config_read.wal_config).into(), | |
) | |
.map_err(|e| CollectionError::service_error(format!("Wal error: {e}")))?; | |
let segment_dirs = std::fs::read_dir(&segments_path).map_err(|err| { | |
CollectionError::service_error(format!( | |
"Can't read segments directory due to {}\nat {}", | |
err, | |
segments_path.to_str().unwrap() | |
)) | |
})?; | |
let mut load_handlers = vec![]; | |
// This semaphore is used to limit the number of threads that load segments concurrently. | |
// Uncomment it if you need to debug segment loading. | |
// let semaphore = Arc::new(parking_lot::Mutex::new(())); | |
for entry in segment_dirs { | |
let segments_path = entry.unwrap().path(); | |
let payload_index_schema = payload_index_schema.clone(); | |
// let semaphore_clone = semaphore.clone(); | |
load_handlers.push( | |
thread::Builder::new() | |
.name(format!("shard-load-{collection_id}-{id}")) | |
.spawn(move || { | |
// let _guard = semaphore_clone.lock(); | |
let mut res = load_segment(&segments_path, &AtomicBool::new(false))?; | |
if let Some(segment) = &mut res { | |
segment.check_consistency_and_repair()?; | |
segment.update_all_field_indices( | |
&payload_index_schema.read().schema.clone(), | |
)?; | |
} else { | |
std::fs::remove_dir_all(&segments_path).map_err(|err| { | |
CollectionError::service_error(format!( | |
"Can't remove leftover segment {}, due to {err}", | |
segments_path.to_str().unwrap(), | |
)) | |
})?; | |
} | |
Ok::<_, CollectionError>(res) | |
})?, | |
); | |
} | |
let mut segment_holder = SegmentHolder::default(); | |
for handler in load_handlers { | |
let segment = handler.join().map_err(|err| { | |
CollectionError::service_error(format!( | |
"Can't join segment load thread: {:?}", | |
err.type_id() | |
)) | |
})??; | |
let Some(segment) = segment else { | |
continue; | |
}; | |
collection_config_read | |
.params | |
.vectors | |
.check_compatible_with_segment_config(&segment.config().vector_data, true)?; | |
collection_config_read | |
.params | |
.sparse_vectors | |
.as_ref() | |
.map(|sparse_vectors| { | |
check_sparse_compatible_with_segment_config( | |
sparse_vectors, | |
&segment.config().sparse_vector_data, | |
true, | |
) | |
}) | |
.unwrap_or(Ok(()))?; | |
segment_holder.add_new(segment); | |
} | |
let res = segment_holder.deduplicate_points().await?; | |
if res > 0 { | |
log::debug!("Deduplicated {} points", res); | |
} | |
clear_temp_segments(shard_path); | |
let optimizers = build_optimizers( | |
shard_path, | |
&collection_config_read.params, | |
&effective_optimizers_config, | |
&collection_config_read.hnsw_config, | |
&collection_config_read.quantization_config, | |
); | |
drop(collection_config_read); // release `shared_config` from borrow checker | |
let clocks = LocalShardClocks::load(shard_path)?; | |
// Always make sure we have any appendable segments, needed for update operations | |
if !segment_holder.has_appendable_segment() { | |
debug_assert!( | |
false, | |
"Shard has no appendable segments, this should never happen", | |
); | |
log::warn!("Shard has no appendable segments, this should never happen. Creating new appendable segment now"); | |
let segments_path = LocalShard::segments_path(shard_path); | |
let collection_params = collection_config.read().await.params.clone(); | |
let payload_index_schema = payload_index_schema.read(); | |
segment_holder.create_appendable_segment( | |
&segments_path, | |
&collection_params, | |
&payload_index_schema, | |
)?; | |
} | |
let local_shard = LocalShard::new( | |
segment_holder, | |
collection_config, | |
shared_storage_config, | |
payload_index_schema, | |
wal, | |
optimizers, | |
optimizer_cpu_budget, | |
shard_path, | |
clocks, | |
update_runtime, | |
search_runtime, | |
) | |
.await; | |
// Apply outstanding operations from WAL | |
local_shard.load_from_wal(collection_id).await?; | |
let available_memory_bytes = Mem::new().available_memory_bytes() as usize; | |
let vectors_size_bytes = local_shard.estimate_vector_data_size().await; | |
// Simple heuristic to exclude mmap prefaulting for collections that won't benefit from it. | |
// | |
// We assume that mmap prefaulting is beneficial if we can put significant part of data | |
// into RAM in advance. However, if we can see that the data is too big to fit into RAM, | |
// it is better to avoid prefaulting, because it will only cause extra disk IO. | |
// | |
// This heuristic is not perfect, but it exclude cases when we don't have enough RAM | |
// even to store half of the vector data. | |
let do_mmap_prefault = available_memory_bytes * 2 > vectors_size_bytes; | |
if do_mmap_prefault { | |
for (_, segment) in local_shard.segments.read().iter() { | |
if let LockedSegment::Original(segment) = segment { | |
segment.read().prefault_mmap_pages(); | |
} | |
} | |
} | |
Ok(local_shard) | |
} | |
pub fn shard_path(&self) -> PathBuf { | |
self.path.clone() | |
} | |
pub fn wal_path(shard_path: &Path) -> PathBuf { | |
shard_path.join(WAL_PATH) | |
} | |
pub fn segments_path(shard_path: &Path) -> PathBuf { | |
shard_path.join(SEGMENTS_PATH) | |
} | |
pub async fn build_local( | |
id: ShardId, | |
collection_id: CollectionId, | |
shard_path: &Path, | |
collection_config: Arc<TokioRwLock<CollectionConfigInternal>>, | |
shared_storage_config: Arc<SharedStorageConfig>, | |
payload_index_schema: Arc<SaveOnDisk<PayloadIndexSchema>>, | |
update_runtime: Handle, | |
search_runtime: Handle, | |
optimizer_cpu_budget: CpuBudget, | |
effective_optimizers_config: OptimizersConfig, | |
) -> CollectionResult<LocalShard> { | |
// initialize local shard config file | |
let local_shard_config = ShardConfig::new_replica_set(); | |
let shard = Self::build( | |
id, | |
collection_id, | |
shard_path, | |
collection_config, | |
shared_storage_config, | |
payload_index_schema, | |
update_runtime, | |
search_runtime, | |
optimizer_cpu_budget, | |
effective_optimizers_config, | |
) | |
.await?; | |
local_shard_config.save(shard_path)?; | |
Ok(shard) | |
} | |
    /// Creates new empty shard with given configuration, initializing all storages, optimizers and directories.
    ///
    /// Creates the `wal` and `segments` directories, builds the initial empty
    /// segments on dedicated threads, opens a fresh WAL and assembles the shard
    /// via [`LocalShard::new`] with default (empty) clock maps.
    pub async fn build(
        id: ShardId,
        collection_id: CollectionId,
        shard_path: &Path,
        collection_config: Arc<TokioRwLock<CollectionConfigInternal>>,
        shared_storage_config: Arc<SharedStorageConfig>,
        payload_index_schema: Arc<SaveOnDisk<PayloadIndexSchema>>,
        update_runtime: Handle,
        search_runtime: Handle,
        optimizer_cpu_budget: CpuBudget,
        effective_optimizers_config: OptimizersConfig,
    ) -> CollectionResult<LocalShard> {
        let config = collection_config.read().await;

        // Create the WAL directory for this shard
        let wal_path = Self::wal_path(shard_path);
        create_dir_all(&wal_path).await.map_err(|err| {
            CollectionError::service_error(format!(
                "Can't create shard wal directory. Error: {err}"
            ))
        })?;

        // Create the segments directory for this shard
        let segments_path = Self::segments_path(shard_path);
        create_dir_all(&segments_path).await.map_err(|err| {
            CollectionError::service_error(format!(
                "Can't create shard segments directory. Error: {err}"
            ))
        })?;

        let mut segment_holder = SegmentHolder::default();
        let mut build_handlers = vec![];

        let vector_params = config.params.to_base_vector_data()?;
        let sparse_vector_params = config.params.to_sparse_vector_data()?;
        let segment_number = config.optimizer_config.get_number_segments();

        // Build the initial empty segments in parallel, one thread per segment
        for _sid in 0..segment_number {
            let path_clone = segments_path.clone();
            let segment_config = SegmentConfig {
                vector_data: vector_params.clone(),
                sparse_vector_data: sparse_vector_params.clone(),
                payload_storage_type: config.params.payload_storage_type(),
            };
            let segment = thread::Builder::new()
                .name(format!("shard-build-{collection_id}-{id}"))
                .spawn(move || build_segment(&path_clone, &segment_config, true))
                .unwrap();
            build_handlers.push(segment);
        }

        // Join builder threads, converting panics into service errors with the
        // panic message attached where available
        let join_results = build_handlers
            .into_iter()
            .map(|handler| handler.join())
            .collect_vec();

        for join_result in join_results {
            let segment = join_result.map_err(|err| {
                let message = panic::downcast_str(&err).unwrap_or("");
                let separator = if !message.is_empty() { "with:\n" } else { "" };
                CollectionError::service_error(format!(
                    "Segment DB create panicked{separator}{message}",
                ))
            })??;
            segment_holder.add_new(segment);
        }

        // Open a fresh WAL for the new shard
        let wal: SerdeWal<OperationWithClockTag> =
            SerdeWal::new(wal_path.to_str().unwrap(), (&config.wal_config).into())?;

        let optimizers = build_optimizers(
            shard_path,
            &config.params,
            &effective_optimizers_config,
            &config.hnsw_config,
            &config.quantization_config,
        );

        drop(config); // release `shared_config` from borrow checker

        let collection = LocalShard::new(
            segment_holder,
            collection_config,
            shared_storage_config,
            payload_index_schema,
            wal,
            optimizers,
            optimizer_cpu_budget,
            shard_path,
            LocalShardClocks::default(),
            update_runtime,
            search_runtime,
        )
        .await;

        Ok(collection)
    }
pub async fn stop_flush_worker(&self) { | |
let mut update_handler = self.update_handler.lock().await; | |
update_handler.stop_flush_worker() | |
} | |
pub async fn wait_update_workers_stop(&self) -> CollectionResult<()> { | |
let mut update_handler = self.update_handler.lock().await; | |
update_handler.wait_workers_stops().await | |
} | |
    /// Loads latest collection operations from WAL
    ///
    /// Replays every not-yet-truncated WAL entry into the segments, advancing
    /// the newest clock map for each tagged operation, then removes
    /// un-versioned points and forces a flush so the on-disk state is
    /// consistent. Progress is shown with a progress bar, or with periodic log
    /// lines when the bar is hidden (e.g. output is not a tty).
    pub async fn load_from_wal(&self, collection_id: CollectionId) -> CollectionResult<()> {
        let mut newest_clocks = self.wal.newest_clocks.lock().await;
        let wal = self.wal.wal.lock();
        let bar = ProgressBar::new(wal.len(false));

        let progress_style = ProgressStyle::default_bar()
            .template("{msg} [{elapsed_precise}] {wide_bar} {pos}/{len} (eta:{eta})")
            .expect("Failed to create progress style");
        bar.set_style(progress_style);
        bar.set_message(format!("Recovering collection {collection_id}"));
        let segments = self.segments();

        // Fall back to basic text output if the progress bar is hidden (e.g. not a tty)
        let show_progress_bar = !bar.is_hidden();
        let mut last_progress_report = Instant::now();
        if !show_progress_bar {
            log::info!(
                "Recovering collection {collection_id}: 0/{} (0%)",
                wal.len(false),
            );
        }

        // When `Segment`s are flushed, WAL is truncated up to the index of the last operation
        // that has been applied and flushed.
        //
        // `SerdeWal` wrapper persists/keeps track of this index (in addition to any handling
        // in the `wal` crate itself).
        //
        // `SerdeWal::read_all` starts reading WAL from the first "un-truncated" index,
        // so no additional handling required to "skip" any potentially applied entries.
        //
        // Note, that it's not guaranteed that some operation won't be re-applied to the storage.
        // (`SerdeWal::read_all` may even start reading WAL from some already truncated
        // index *occasionally*), but the storage can handle it.
        for (op_num, update) in wal.read_all(false) {
            // Advance the newest clock map for operations carrying a clock tag
            if let Some(clock_tag) = update.clock_tag {
                newest_clocks.advance_clock(clock_tag);
            }

            // Propagate `CollectionError::ServiceError`, but skip other error types.
            match &CollectionUpdater::update(segments, op_num, update.operation) {
                Err(err @ CollectionError::ServiceError { error, backtrace }) => {
                    let path = self.path.display();

                    log::error!(
                        "Can't apply WAL operation: {error}, \
                         collection: {collection_id}, \
                         shard: {path}, \
                         op_num: {op_num}"
                    );

                    if let Some(backtrace) = &backtrace {
                        log::error!("Backtrace: {}", backtrace);
                    }

                    return Err(err.clone());
                }
                Err(err @ CollectionError::OutOfMemory { .. }) => {
                    log::error!("{err}");
                    return Err(err.clone());
                }
                Err(err @ CollectionError::NotFound { .. }) => log::warn!("{err}"),
                Err(err) => log::error!("{err}"),
                Ok(_) => (),
            }

            // Update progress bar or show text progress every WAL_LOAD_REPORT_EVERY
            bar.inc(1);
            if !show_progress_bar && last_progress_report.elapsed() >= WAL_LOAD_REPORT_EVERY {
                let progress = bar.position();
                log::info!(
                    "{progress}/{} ({}%)",
                    wal.len(false),
                    (progress as f32 / wal.len(false) as f32 * 100.0) as usize,
                );
                last_progress_report = Instant::now();
            }
        }

        {
            let segments = self.segments.read();

            // It is possible, that after recovery, if WAL flush was not enforced.
            // We could be left with some un-versioned points.
            // To maintain consistency, we can either remove them or try to recover.
            for (_idx, segment) in segments.iter() {
                match segment {
                    LockedSegment::Original(raw_segment) => {
                        raw_segment.write().cleanup_versions()?;
                    }
                    LockedSegment::Proxy(_) => {
                        debug_assert!(false, "Proxy segment found in load_from_wal");
                    }
                }
            }

            // Force a flush after re-applying WAL operations, to ensure we maintain on-disk data
            // consistency, if we happened to only apply *past* operations to a segment with newer
            // version.
            segments.flush_all(true, true)?;
        }

        bar.finish();
        if !show_progress_bar {
            log::info!(
                "Recovered collection {collection_id}: {0}/{0} (100%)",
                wal.len(false),
            );
        }

        // The storage is expected to be consistent after WAL recovery
        self.check_data_consistency()?;

        Ok(())
    }
/// Check data consistency for all segments | |
/// | |
/// Returns an error at the first inconsistent segment | |
pub fn check_data_consistency(&self) -> CollectionResult<()> { | |
log::info!("Checking data consistency for shard {:?}", self.path); | |
let segments = self.segments.read(); | |
for (_idx, segment) in segments.iter() { | |
match segment { | |
LockedSegment::Original(raw_segment) => { | |
let segment_guard = raw_segment.read(); | |
if let Err(err) = segment_guard.check_data_consistency() { | |
log::error!( | |
"Segment {:?} is inconsistent: {}", | |
segment_guard.current_path, | |
err | |
); | |
return Err(err.into()); | |
} | |
} | |
LockedSegment::Proxy(_) => { | |
return Err(CollectionError::service_error( | |
"Proxy segment found in check_data_consistency", | |
)); | |
} | |
} | |
} | |
Ok(()) | |
} | |
    /// Rebuild optimizers and restart update workers after an optimizer config change.
    ///
    /// Swaps in a fresh update channel, sends `Stop` as the last message on the
    /// old channel, waits for the current workers to finish, then installs the
    /// newly built optimizers and restarts the workers.
    pub async fn on_optimizer_config_update(&self) -> CollectionResult<()> {
        let config = self.collection_config.read().await;
        let mut update_handler = self.update_handler.lock().await;

        let (update_sender, update_receiver) =
            mpsc::channel(self.shared_storage_config.update_queue_size);
        // makes sure that the Stop signal is the last one in this channel
        let old_sender = self.update_sender.swap(Arc::new(update_sender));
        old_sender.send(UpdateSignal::Stop).await?;
        update_handler.stop_flush_worker();

        update_handler.wait_workers_stops().await?;
        let new_optimizers = build_optimizers(
            &self.path,
            &config.params,
            &config.optimizer_config,
            &config.hnsw_config,
            &config.quantization_config,
        );
        update_handler.optimizers = new_optimizers;
        update_handler.flush_interval_sec = config.optimizer_config.flush_interval_sec;
        update_handler.max_optimization_threads = config.optimizer_config.max_optimization_threads;
        update_handler.run_workers(update_receiver);

        // Nudge the freshly started workers so pending work is picked up
        self.update_sender.load().send(UpdateSignal::Nop).await?;

        Ok(())
    }
pub fn trigger_optimizers(&self) { | |
// Send a trigger signal and ignore errors because all error cases are acceptable: | |
// - If receiver is already dead - we do not care | |
// - If channel is full - optimization will be triggered by some other signal | |
let _ = self.update_sender.load().try_send(UpdateSignal::Nop); | |
} | |
/// Finishes ongoing update tasks | |
pub async fn stop_gracefully(&self) { | |
if let Err(err) = self.update_sender.load().send(UpdateSignal::Stop).await { | |
log::warn!("Error sending stop signal to update handler: {}", err); | |
} | |
self.stop_flush_worker().await; | |
if let Err(err) = self.wait_update_workers_stop().await { | |
log::warn!("Update workers failed with: {}", err); | |
} | |
} | |
pub fn restore_snapshot(snapshot_path: &Path) -> CollectionResult<()> { | |
// Read dir first as the directory contents would change during restore. | |
let entries = std::fs::read_dir(LocalShard::segments_path(snapshot_path))? | |
.collect::<Result<Vec<_>, _>>()?; | |
for entry in entries { | |
Segment::restore_snapshot_in_place(&entry.path())?; | |
} | |
Ok(()) | |
} | |
    /// Create snapshot for local shard into `target_path`
    ///
    /// Archives all segments (and optionally the WAL) into `tar`, using
    /// `temp_path` for intermediate files. When `save_wal` is false, all
    /// submitted updates are first drained into the segments and an empty but
    /// compatible WAL is archived instead.
    pub async fn create_snapshot(
        &self,
        temp_path: &Path,
        tar: &tar_ext::BuilderExt,
        format: SnapshotFormat,
        save_wal: bool,
    ) -> CollectionResult<()> {
        let segments = self.segments.clone();
        let wal = self.wal.wal.clone();

        if !save_wal {
            // If we are not saving WAL, we still need to make sure that all submitted by this point
            // updates have made it to the segments. So we use the Plunger to achieve that.
            // It will notify us when all submitted updates so far have been processed.
            let (tx, rx) = oneshot::channel();
            let plunger = UpdateSignal::Plunger(tx);
            self.update_sender.load().send(plunger).await?;
            rx.await?;
        }

        let segments_path = Self::segments_path(&self.path);
        let collection_params = self.collection_config.read().await.params.clone();
        let temp_path = temp_path.to_owned();
        let payload_index_schema = self.payload_index_schema.clone();
        let tar_c = tar.clone();

        // Snapshotting is blocking work — run it off the async runtime
        tokio::task::spawn_blocking(move || {
            // Do not change segments while snapshotting
            SegmentHolder::snapshot_all_segments(
                segments.clone(),
                &segments_path,
                Some(&collection_params),
                &payload_index_schema.read().clone(),
                &temp_path,
                &tar_c.descend(Path::new(SEGMENTS_PATH))?,
                format,
            )?;

            if save_wal {
                // snapshot all shard's WAL
                Self::snapshot_wal(wal, &tar_c)
            } else {
                Self::snapshot_empty_wal(wal, &temp_path, &tar_c)
            }
        })
        .await??;

        // Archive the clock maps alongside the snapshot data
        LocalShardClocks::archive_data(&self.path, tar).await?;

        Ok(())
    }
    /// Create empty WAL which is compatible with currently stored data
    ///
    /// Generates a fresh WAL in a temporary directory starting at the current
    /// last operation index (so existing segment data lines up with it), then
    /// archives it into `tar` under the WAL directory.
    pub fn snapshot_empty_wal(
        wal: LockedWal,
        temp_path: &Path,
        tar: &tar_ext::BuilderExt,
    ) -> CollectionResult<()> {
        // Capture WAL parameters under the lock, then release it
        let (segment_capacity, latest_op_num) = {
            let wal_guard = wal.lock();
            (wal_guard.segment_capacity(), wal_guard.last_index())
        };

        let temp_dir = tempfile::tempdir_in(temp_path).map_err(|err| {
            CollectionError::service_error(format!(
                "Can not create temporary directory for WAL: {err}",
            ))
        })?;

        Wal::generate_empty_wal_starting_at_index(
            temp_dir.path(),
            &WalOptions {
                segment_capacity,
                segment_queue_len: 0,
            },
            latest_op_num,
        )
        .map_err(|err| {
            CollectionError::service_error(format!("Error while create empty WAL: {err}"))
        })?;

        tar.blocking_append_dir_all(temp_dir.path(), Path::new(WAL_PATH))
            .map_err(|err| {
                CollectionError::service_error(format!("Error while archiving WAL: {err}"))
            })
    }
    /// snapshot WAL
    ///
    /// Flushes the WAL and archives every file of its directory into `tar`
    /// under the WAL directory, skipping the `.wal` lock sentinel file.
    pub fn snapshot_wal(wal: LockedWal, tar: &tar_ext::BuilderExt) -> CollectionResult<()> {
        // lock wal during snapshot
        let mut wal_guard = wal.lock();
        wal_guard.flush()?;
        let source_wal_path = wal_guard.path();

        let tar = tar.descend(Path::new(WAL_PATH))?;
        for entry in std::fs::read_dir(source_wal_path).map_err(|err| {
            CollectionError::service_error(format!("Can't read WAL directory: {err}",))
        })? {
            let entry = entry.map_err(|err| {
                CollectionError::service_error(format!("Can't read WAL directory: {err}",))
            })?;

            if entry.file_name() == ".wal" {
                // This sentinel file is used for WAL locking. Trying to archive
                // or open it will cause the following error on Windows:
                // > The process cannot access the file because another process
                // > has locked a portion of the file. (os error 33)
                // https://github.com/qdrant/wal/blob/7c9202d0874/src/lib.rs#L125-L145
                continue;
            }

            tar.blocking_append_file(&entry.path(), Path::new(&entry.file_name()))
                .map_err(|err| {
                    CollectionError::service_error(format!("Error while archiving WAL: {err}"))
                })?;
        }
        Ok(())
    }
pub fn estimate_cardinality<'a>( | |
&'a self, | |
filter: Option<&'a Filter>, | |
) -> CollectionResult<CardinalityEstimation> { | |
let segments = self.segments().read(); | |
let cardinality = segments | |
.iter() | |
.map(|(_id, segment)| segment.get().read().estimate_point_count(filter)) | |
.fold(CardinalityEstimation::exact(0), |acc, x| { | |
CardinalityEstimation { | |
primary_clauses: vec![], | |
min: acc.min + x.min, | |
exp: acc.exp + x.exp, | |
max: acc.max + x.max, | |
} | |
}); | |
Ok(cardinality) | |
} | |
pub async fn read_filtered<'a>( | |
&'a self, | |
filter: Option<&'a Filter>, | |
runtime_handle: &Handle, | |
) -> CollectionResult<BTreeSet<PointIdType>> { | |
let segments = self.segments.clone(); | |
SegmentsSearcher::read_filtered(segments, filter, runtime_handle).await | |
} | |
    /// Collect telemetry for this shard: per-segment statistics, optimizer
    /// status and aggregated optimization counters, at the requested `detail`.
    pub fn get_telemetry_data(&self, detail: TelemetryDetail) -> LocalShardTelemetry {
        let segments_read_guard = self.segments.read();
        let segments: Vec<_> = segments_read_guard
            .iter()
            .map(|(_id, segment)| segment.get().read().get_telemetry_data(detail))
            .collect();

        let optimizer_status = match &segments_read_guard.optimizer_errors {
            None => OptimizersStatus::Ok,
            Some(error) => OptimizersStatus::Error(error.to_string()),
        };
        // Release the segments lock before gathering optimizer telemetry
        drop(segments_read_guard);
        // Sum per-optimizer statistics into a single aggregate
        let optimizations = self
            .optimizers
            .iter()
            .map(|optimizer| {
                optimizer
                    .get_telemetry_counter()
                    .lock()
                    .get_statistics(detail)
            })
            .fold(Default::default(), |acc, x| acc + x);
        let total_optimized_points = self.total_optimized_points.load(Ordering::Relaxed);

        LocalShardTelemetry {
            variant_name: None,
            status: None,
            total_optimized_points,
            segments,
            optimizations: OptimizerTelemetry {
                status: optimizer_status,
                optimizations,
                log: self.optimizers_log.lock().to_telemetry(),
            },
            async_scorer: Some(get_async_scorer()),
        }
    }
/// Returns estimated size of vector data in bytes | |
async fn estimate_vector_data_size(&self) -> usize { | |
let info = self.local_shard_info().await; | |
let vector_size: usize = info | |
.config | |
.params | |
.vectors | |
.params_iter() | |
.map(|(_, value)| { | |
let vector_size = value.size.get() as usize; | |
let quantization_config = value | |
.quantization_config | |
.as_ref() | |
.or(info.config.quantization_config.as_ref()); | |
let quantized_size_bytes = match quantization_config { | |
None => 0, | |
Some(QuantizationConfig::Scalar(_)) => vector_size, | |
Some(QuantizationConfig::Product(pq)) => match pq.product.compression { | |
CompressionRatio::X4 => vector_size, | |
CompressionRatio::X8 => vector_size / 2, | |
CompressionRatio::X16 => vector_size / 4, | |
CompressionRatio::X32 => vector_size / 8, | |
CompressionRatio::X64 => vector_size / 16, | |
}, | |
Some(QuantizationConfig::Binary(_)) => vector_size / 8, | |
}; | |
vector_size * size_of::<VectorElementType>() + quantized_size_bytes | |
}) | |
.sum(); | |
vector_size * info.points_count | |
} | |
    /// Derive the shard status (and optimizer status) from segment and optimizer state.
    ///
    /// - Red: failed operations or optimizer errors are present.
    /// - Yellow: a special (proxy) segment exists, or pending optimizations were already triggered.
    /// - Grey: pending optimizations exist but were never triggered since restart.
    /// - Green: everything is fine.
    pub async fn local_shard_status(&self) -> (ShardStatus, OptimizersStatus) {
        {
            let segments = self.segments().read();

            // Red status on failed operation or optimizer error
            if !segments.failed_operation.is_empty() || segments.optimizer_errors.is_some() {
                let optimizer_status = segments
                    .optimizer_errors
                    .as_ref()
                    .map_or(OptimizersStatus::Ok, |err| {
                        OptimizersStatus::Error(err.to_string())
                    });
                return (ShardStatus::Red, optimizer_status);
            }

            // Yellow status if we have a special segment, indicates a proxy segment used during optimization
            // TODO: snapshotting also creates temp proxy segments. should differentiate.
            let has_special_segment = segments
                .iter()
                .map(|(_, segment)| segment.get().read().info().segment_type)
                .any(|segment_type| segment_type == SegmentType::Special);
            if has_special_segment {
                return (ShardStatus::Yellow, OptimizersStatus::Ok);
            }
        }

        // Yellow or grey status if there are pending optimizations
        // Grey if optimizers were not triggered yet after restart,
        // we don't automatically trigger them to prevent a crash loop
        let (has_triggered_any_optimizers, has_suboptimal_optimizers) = self
            .update_handler
            .lock()
            .await
            .check_optimizer_conditions();
        if has_suboptimal_optimizers {
            let status = if has_triggered_any_optimizers {
                ShardStatus::Yellow
            } else {
                ShardStatus::Grey
            };
            return (status, OptimizersStatus::Ok);
        }

        // Green status because everything is fine
        (ShardStatus::Green, OptimizersStatus::Ok)
    }
pub async fn local_shard_info(&self) -> ShardInfoInternal { | |
let collection_config = self.collection_config.read().await.clone(); | |
let mut vectors_count = 0; | |
let mut indexed_vectors_count = 0; | |
let mut points_count = 0; | |
let mut segments_count = 0; | |
let mut schema: HashMap<PayloadKeyType, PayloadIndexInfo> = Default::default(); | |
{ | |
let segments = self.segments().read(); | |
for (_idx, segment) in segments.iter() { | |
segments_count += 1; | |
let segment_info = segment.get().read().info(); | |
vectors_count += segment_info.num_vectors; | |
indexed_vectors_count += segment_info.num_indexed_vectors; | |
points_count += segment_info.num_points; | |
for (key, val) in segment_info.index_schema { | |
schema | |
.entry(key) | |
.and_modify(|entry| entry.points += val.points) | |
.or_insert(val); | |
} | |
} | |
} | |
let (status, optimizer_status) = self.local_shard_status().await; | |
ShardInfoInternal { | |
status, | |
optimizer_status, | |
vectors_count, | |
indexed_vectors_count, | |
points_count, | |
segments_count, | |
config: collection_config, | |
payload_schema: schema, | |
} | |
} | |
/// Get a reference to this shard's update tracker.
pub fn update_tracker(&self) -> &UpdateTracker {
    &self.update_tracker
}
/// Get the recovery point for the current shard
///
/// This is sourced from the last seen clocks from other nodes that we know about.
/// Delegates to the clock-aware WAL wrapper.
pub async fn recovery_point(&self) -> RecoveryPoint {
    self.wal.recovery_point().await
}
/// Update the cutoff point on the current shard
///
/// This also updates the highest seen clocks.
/// Delegates to the clock-aware WAL wrapper.
pub async fn update_cutoff(&self, cutoff: &RecoveryPoint) {
    self.wal.update_cutoff(cutoff).await
}
} | |
impl Drop for LocalShard { | |
fn drop(&mut self) { | |
thread::scope(|s| { | |
let handle = thread::Builder::new() | |
.name("drop-shard".to_string()) | |
.spawn_scoped(s, || { | |
// Needs dedicated thread to avoid `Cannot start a runtime from within a runtime` error. | |
self.update_runtime | |
.block_on(async { self.stop_gracefully().await }) | |
}); | |
handle.expect("Failed to create thread for shard drop"); | |
}) | |
} | |
} | |
/// File name under the shard directory for the persisted newest clock map.
const NEWEST_CLOCKS_PATH: &str = "newest_clocks.json";
/// File name under the shard directory for the persisted oldest clock map.
const OLDEST_CLOCKS_PATH: &str = "oldest_clocks.json";
/// Convenience struct for combining clock maps belonging to a shard
///
/// Holds a clock map for tracking the highest clocks and the cutoff clocks.
pub struct LocalShardClocks {
    // Highest clock values seen so far (presumably per peer — see `ClockMap`)
    newest_clocks: Arc<Mutex<ClockMap>>,
    // Cutoff clock values (presumably per peer — see `ClockMap`)
    oldest_clocks: Arc<Mutex<ClockMap>>,
}
impl LocalShardClocks { | |
fn new(newest_clocks: ClockMap, oldest_clocks: ClockMap) -> Self { | |
Self { | |
newest_clocks: Arc::new(Mutex::new(newest_clocks)), | |
oldest_clocks: Arc::new(Mutex::new(oldest_clocks)), | |
} | |
} | |
// Load clock maps from disk | |
pub fn load(shard_path: &Path) -> CollectionResult<Self> { | |
let newest_clocks = ClockMap::load_or_default(&Self::newest_clocks_path(shard_path))?; | |
let oldest_clocks = ClockMap::load_or_default(&Self::oldest_clocks_path(shard_path))?; | |
Ok(Self::new(newest_clocks, oldest_clocks)) | |
} | |
/// Persist clock maps to disk | |
pub async fn store_if_changed(&self, shard_path: &Path) -> CollectionResult<()> { | |
self.oldest_clocks | |
.lock() | |
.await | |
.store_if_changed(&Self::oldest_clocks_path(shard_path))?; | |
self.newest_clocks | |
.lock() | |
.await | |
.store_if_changed(&Self::newest_clocks_path(shard_path))?; | |
Ok(()) | |
} | |
/// Put clock data from the disk into an archive. | |
pub async fn archive_data(from: &Path, tar: &tar_ext::BuilderExt) -> CollectionResult<()> { | |
let newest_clocks_from = Self::newest_clocks_path(from); | |
let oldest_clocks_from = Self::oldest_clocks_path(from); | |
if newest_clocks_from.exists() { | |
tar.append_file(&newest_clocks_from, Path::new(NEWEST_CLOCKS_PATH)) | |
.await?; | |
} | |
if oldest_clocks_from.exists() { | |
tar.append_file(&oldest_clocks_from, Path::new(OLDEST_CLOCKS_PATH)) | |
.await?; | |
} | |
Ok(()) | |
} | |
/// Move clock data on disk from one shard path to another. | |
pub async fn move_data(from: &Path, to: &Path) -> CollectionResult<()> { | |
let newest_clocks_from = Self::newest_clocks_path(from); | |
let oldest_clocks_from = Self::oldest_clocks_path(from); | |
if newest_clocks_from.exists() { | |
let newest_clocks_to = Self::newest_clocks_path(to); | |
move_file(newest_clocks_from, newest_clocks_to).await?; | |
} | |
if oldest_clocks_from.exists() { | |
let oldest_clocks_to = Self::oldest_clocks_path(to); | |
move_file(oldest_clocks_from, oldest_clocks_to).await?; | |
} | |
Ok(()) | |
} | |
/// Delete clock data from disk at the given shard path. | |
pub async fn delete_data(shard_path: &Path) -> CollectionResult<()> { | |
let newest_clocks_path = Self::newest_clocks_path(shard_path); | |
let oldest_clocks_path = Self::oldest_clocks_path(shard_path); | |
if newest_clocks_path.exists() { | |
remove_file(newest_clocks_path).await?; | |
} | |
if oldest_clocks_path.exists() { | |
remove_file(oldest_clocks_path).await?; | |
} | |
Ok(()) | |
} | |
fn newest_clocks_path(shard_path: &Path) -> PathBuf { | |
shard_path.join(NEWEST_CLOCKS_PATH) | |
} | |
fn oldest_clocks_path(shard_path: &Path) -> PathBuf { | |
shard_path.join(OLDEST_CLOCKS_PATH) | |
} | |
} | |