// lib/collection/src/shards/queue_proxy_shard.rs
use std::path::Path;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use std::time::Duration;
use async_trait::async_trait;
use common::counter::hardware_accumulator::HwMeasurementAcc;
use common::tar_ext;
use common::types::TelemetryDetail;
use parking_lot::Mutex as ParkingMutex;
use segment::data_types::facets::{FacetParams, FacetResponse};
use segment::data_types::order_by::OrderBy;
use segment::types::{
ExtendedPointId, Filter, ScoredPoint, SnapshotFormat, WithPayload, WithPayloadInterface,
WithVector,
};
use tokio::runtime::Handle;
use tokio::sync::Mutex;
use super::remote_shard::RemoteShard;
use super::transfer::driver::MAX_RETRY_COUNT;
use super::transfer::transfer_tasks_pool::TransferTaskProgress;
use super::update_tracker::UpdateTracker;
use crate::operations::point_ops::WriteOrdering;
use crate::operations::types::{
CollectionError, CollectionInfo, CollectionResult, CoreSearchRequestBatch,
CountRequestInternal, CountResult, PointRequestInternal, RecordInternal, UpdateResult,
};
use crate::operations::universal_query::shard_query::{ShardQueryRequest, ShardQueryResponse};
use crate::operations::OperationWithClockTag;
use crate::shards::local_shard::LocalShard;
use crate::shards::shard_trait::ShardOperation;
use crate::shards::telemetry::LocalShardTelemetry;
/// Number of operations in batch when syncing
const BATCH_SIZE: usize = 10;
/// Number of times to retry transferring updates batch
const BATCH_RETRIES: usize = MAX_RETRY_COUNT;
/// QueueProxyShard shard
///
/// QueueProxyShard is a wrapper type for a LocalShard.
///
/// It can be used to provide all read and write operations while the wrapped shard is being
/// snapshotted and transferred to another node. It keeps track of all collection updates since its
/// creation, and allows transferring these updates to a remote shard at a given time to ensure
/// consistency.
///
/// This keeps track of all updates through the WAL of the wrapped shard. It therefore doesn't have
/// any memory overhead while updates are accumulated. The type is called 'queue' even though it
/// doesn't use a real queue; the name simply makes its purpose easy to understand.
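///
/// A minimal lifecycle sketch, assuming a `local` and `remote` shard plus the transfer
/// bookkeeping handles (`wal_keep_from`, `progress`) already exist (illustrative only, not a
/// verbatim excerpt from the transfer driver):
///
/// ```ignore
/// // From this point on, all new updates stay available in the WAL for replay.
/// let proxy = QueueProxyShard::new(local, remote, wal_keep_from, progress);
///
/// // ... snapshot the wrapped shard and recover it on the remote node ...
///
/// // Replay everything the remote missed since the proxy was created.
/// proxy.transfer_all_missed_updates().await?;
///
/// // The proxy must be finalized before it is dropped (see the `Drop` guard below), e.g.
/// // with `forget_updates_and_finalize()` to unwrap the inner shards again.
/// ```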
pub struct QueueProxyShard {
/// Inner queue proxy shard.
///
    /// This is always `Some` until `finalize()` is called. The `Option` wrapper allows taking the
    /// inner state out when finalizing; fields cannot be moved out of this type directly because
    /// it implements `Drop`.
inner: Option<Inner>,
}
impl QueueProxyShard {
/// Queue proxy the given local shard and point to the remote shard.
///
/// This starts queueing all new updates on the local shard at the point of creation.
pub fn new(
wrapped_shard: LocalShard,
remote_shard: RemoteShard,
wal_keep_from: Arc<AtomicU64>,
progress: Arc<ParkingMutex<TransferTaskProgress>>,
) -> Self {
Self {
inner: Some(Inner::new(
wrapped_shard,
remote_shard,
wal_keep_from,
progress,
)),
}
}
/// Queue proxy the given local shard and point to the remote shard, from a specific WAL version.
///
    /// This queues all (existing) updates from a specific WAL `version` onwards. In other
/// words, this will ensure we transfer updates we already have and all new updates from a
/// specific point in our WAL. The `version` may be in the past, but must always be within
/// range of the current WAL.
///
/// # Errors
///
/// This fails if the given `version` is not in bounds of our current WAL. If the given
/// `version` is too old or too new, queue proxy creation is rejected.
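    ///
    /// A small sketch of handling that rejection (illustrative only; `local`, `remote`,
    /// `wal_keep_from`, `version` and `progress` are assumed to exist):
    ///
    /// ```ignore
    /// let proxy = match QueueProxyShard::new_from_version(local, remote, wal_keep_from, version, progress) {
    ///     Ok(proxy) => proxy,
    ///     // On rejection the wrapped shard is handed back, so it can keep serving requests
    ///     // or be queue-proxied again from the current WAL head via `QueueProxyShard::new`.
    ///     Err((_local, err)) => return Err(err),
    /// };
    /// ```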
pub fn new_from_version(
wrapped_shard: LocalShard,
remote_shard: RemoteShard,
wal_keep_from: Arc<AtomicU64>,
version: u64,
progress: Arc<ParkingMutex<TransferTaskProgress>>,
) -> Result<Self, (LocalShard, CollectionError)> {
// Lock WAL until we've successfully created the queue proxy shard
let wal = wrapped_shard.wal.wal.clone();
let wal_lock = wal.lock();
// If start version is not in current WAL bounds [first_idx, last_idx + 1], we cannot reliably transfer WAL
// Allow it to be one higher than the last index to only send new updates
let (first_idx, last_idx) = (wal_lock.first_closed_index(), wal_lock.last_index());
if !(first_idx..=last_idx + 1).contains(&version) {
        return Err((
            wrapped_shard,
            CollectionError::service_error(format!(
                "Cannot create queue proxy shard from version {version} because it is out of WAL bounds ({first_idx}..={last_idx})"
            )),
        ));
}
Ok(Self {
inner: Some(Inner::new_from_version(
wrapped_shard,
remote_shard,
wal_keep_from,
version,
progress,
)),
})
}
/// Get inner queue proxy shard. Will panic if the queue proxy has been finalized.
fn inner_unchecked(&self) -> &Inner {
self.inner.as_ref().expect("Queue proxy has been finalized")
}
pub async fn create_snapshot(
&self,
temp_path: &Path,
tar: &tar_ext::BuilderExt,
format: SnapshotFormat,
save_wal: bool,
) -> CollectionResult<()> {
self.inner_unchecked()
.wrapped_shard
.create_snapshot(temp_path, tar, format, save_wal)
.await
}
/// Transfer all updates that the remote missed from WAL
///
/// # Cancel safety
///
/// This method is cancel safe.
///
/// If cancelled - none, some or all operations may be transmitted to the remote.
///
/// The internal field keeping track of the last transfer and maximum acknowledged WAL version
    /// likely won't be updated. In the worst case this might cause operations to be sent twice.
/// This should be fine as operations are idempotent.
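    ///
    /// Re-running after a cancelled or failed attempt is therefore safe (illustrative only):
    ///
    /// ```ignore
    /// // Some operations may already have reached the remote; calling again simply resends
    /// // from the last recorded position, and the remote applies duplicates idempotently.
    /// proxy.transfer_all_missed_updates().await?;
    /// ```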
pub async fn transfer_all_missed_updates(&self) -> CollectionResult<()> {
self.inner_unchecked().transfer_all_missed_updates().await
}
pub async fn on_optimizer_config_update(&self) -> CollectionResult<()> {
self.inner_unchecked()
.wrapped_shard
.on_optimizer_config_update()
.await
}
pub fn trigger_optimizers(&self) {
self.inner_unchecked().wrapped_shard.trigger_optimizers();
}
pub fn get_telemetry_data(&self, detail: TelemetryDetail) -> LocalShardTelemetry {
self.inner_unchecked()
.wrapped_shard
.get_telemetry_data(detail)
}
pub fn update_tracker(&self) -> &UpdateTracker {
self.inner_unchecked().wrapped_shard.update_tracker()
}
/// Check if the queue proxy shard is already finalized
#[cfg(debug_assertions)]
fn is_finalized(&self) -> bool {
self.inner.is_none()
}
/// Forget all updates and finalize.
///
/// Forget all missed updates since creation of this queue proxy shard and finalize. This
    /// unwraps the inner wrapped shard and remote shard.
///
/// It also releases the max acknowledged WAL version.
///
/// # Warning
///
/// This intentionally forgets and drops updates pending to be transferred to the remote shard.
/// The remote shard is therefore left in an inconsistent state, which should be resolved
/// separately.
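    ///
    /// Typical abort path (illustrative only):
    ///
    /// ```ignore
    /// // Give up on the queued updates: unwrap the shards and release the WAL barrier.
    /// // The remote shard must be brought back in sync through some other mechanism.
    /// let (local_shard, remote_shard) = proxy.forget_updates_and_finalize();
    /// ```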
pub fn forget_updates_and_finalize(mut self) -> (LocalShard, RemoteShard) {
        // Unwrap the queue proxy shard and release the maximum acknowledged WAL version
let queue_proxy = self
.inner
.take()
.expect("Queue proxy has already been finalized");
queue_proxy.set_wal_keep_from(None);
(queue_proxy.wrapped_shard, queue_proxy.remote_shard)
}
}
#[async_trait]
impl ShardOperation for QueueProxyShard {
/// Update `wrapped_shard` while keeping track of operations
///
/// # Cancel safety
///
/// This method is cancel safe.
async fn update(
&self,
operation: OperationWithClockTag,
wait: bool,
) -> CollectionResult<UpdateResult> {
// `Inner::update` is cancel safe, so this is also cancel safe.
self.inner_unchecked().update(operation, wait).await
}
/// Forward read-only `scroll_by` to `wrapped_shard`
async fn scroll_by(
&self,
offset: Option<ExtendedPointId>,
limit: usize,
with_payload_interface: &WithPayloadInterface,
with_vector: &WithVector,
filter: Option<&Filter>,
search_runtime_handle: &Handle,
order_by: Option<&OrderBy>,
timeout: Option<Duration>,
) -> CollectionResult<Vec<RecordInternal>> {
self.inner_unchecked()
.scroll_by(
offset,
limit,
with_payload_interface,
with_vector,
filter,
search_runtime_handle,
order_by,
timeout,
)
.await
}
/// Forward read-only `info` to `wrapped_shard`
async fn info(&self) -> CollectionResult<CollectionInfo> {
self.inner_unchecked().info().await
}
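    /// Forward read-only `core_search` to `wrapped_shard`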
async fn core_search(
&self,
request: Arc<CoreSearchRequestBatch>,
search_runtime_handle: &Handle,
timeout: Option<Duration>,
hw_measurement_acc: &HwMeasurementAcc,
) -> CollectionResult<Vec<Vec<ScoredPoint>>> {
self.inner_unchecked()
.core_search(request, search_runtime_handle, timeout, hw_measurement_acc)
.await
}
/// Forward read-only `count` to `wrapped_shard`
async fn count(
&self,
request: Arc<CountRequestInternal>,
search_runtime_handle: &Handle,
timeout: Option<Duration>,
hw_measurement_acc: &HwMeasurementAcc,
) -> CollectionResult<CountResult> {
self.inner_unchecked()
.count(request, search_runtime_handle, timeout, hw_measurement_acc)
.await
}
/// Forward read-only `retrieve` to `wrapped_shard`
async fn retrieve(
&self,
request: Arc<PointRequestInternal>,
with_payload: &WithPayload,
with_vector: &WithVector,
search_runtime_handle: &Handle,
timeout: Option<Duration>,
) -> CollectionResult<Vec<RecordInternal>> {
self.inner_unchecked()
.retrieve(
request,
with_payload,
with_vector,
search_runtime_handle,
timeout,
)
.await
}
/// Forward read-only `query` to `wrapped_shard`
async fn query_batch(
&self,
requests: Arc<Vec<ShardQueryRequest>>,
search_runtime_handle: &Handle,
timeout: Option<Duration>,
hw_measurement_acc: &HwMeasurementAcc,
) -> CollectionResult<Vec<ShardQueryResponse>> {
self.inner_unchecked()
.wrapped_shard
.query_batch(requests, search_runtime_handle, timeout, hw_measurement_acc)
.await
}
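    /// Forward read-only `facet` to `wrapped_shard`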
async fn facet(
&self,
request: Arc<FacetParams>,
search_runtime_handle: &Handle,
timeout: Option<Duration>,
) -> CollectionResult<FacetResponse> {
self.inner_unchecked()
.wrapped_shard
.facet(request, search_runtime_handle, timeout)
.await
}
}
// Safeguard in debug mode to ensure that `finalize()` is called before dropping
#[cfg(debug_assertions)]
impl Drop for QueueProxyShard {
fn drop(&mut self) {
if !self.is_finalized() && !std::thread::panicking() {
panic!("To drop a queue proxy shard, finalize() must be used");
}
}
}
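/// Inner state of the queue proxy shard, holding the wrapped local shard, the remote shard to
/// transfer updates to, and the WAL transfer bookkeeping.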
struct Inner {
/// Wrapped local shard to operate on.
pub(super) wrapped_shard: LocalShard,
/// Wrapped remote shard, to transfer operations to.
pub(super) remote_shard: RemoteShard,
/// WAL record at which we started the transfer.
started_at: u64,
/// ID of the WAL operation we should transfer next. We consider everything before it to be
/// transferred.
transfer_from: AtomicU64,
/// Lock required to protect transfer-in-progress updates.
/// It should block data updating operations while the batch is being transferred.
update_lock: Mutex<()>,
    /// Always keep this WAL version and later ones, preventing their acknowledgment/truncation in the WAL.
/// We keep it here for access in `set_wal_keep_from()` without needing async locks.
/// See `set_wal_keep_from()` and `UpdateHandler::wal_keep_from` for more details.
/// Defaults to `u64::MAX` to allow acknowledging all confirmed versions.
wal_keep_from: Arc<AtomicU64>,
    /// Progress tracker.
progress: Arc<ParkingMutex<TransferTaskProgress>>,
}
impl Inner {
pub fn new(
wrapped_shard: LocalShard,
remote_shard: RemoteShard,
wal_keep_from: Arc<AtomicU64>,
progress: Arc<ParkingMutex<TransferTaskProgress>>,
) -> Self {
let start_from = wrapped_shard.wal.wal.lock().last_index() + 1;
Self::new_from_version(
wrapped_shard,
remote_shard,
wal_keep_from,
start_from,
progress,
)
}
pub fn new_from_version(
wrapped_shard: LocalShard,
remote_shard: RemoteShard,
wal_keep_from: Arc<AtomicU64>,
version: u64,
progress: Arc<ParkingMutex<TransferTaskProgress>>,
) -> Self {
let shard = Self {
wrapped_shard,
remote_shard,
transfer_from: version.into(),
started_at: version,
update_lock: Default::default(),
wal_keep_from,
progress,
};
        // Keep all WAL entries from `version` so they are not truncated while we still need to transfer them
shard.set_wal_keep_from(Some(version));
shard
}
/// Transfer all updates that the remote missed from WAL
///
/// # Cancel safety
///
/// This method is cancel safe.
///
/// If cancelled - none, some or all operations of that batch may be transmitted to the remote.
///
/// The internal field keeping track of the last transfer and maximum acknowledged WAL version
    /// likely won't be updated. In the worst case this might cause operations to be sent twice.
/// This should be fine as operations are idempotent.
pub async fn transfer_all_missed_updates(&self) -> CollectionResult<()> {
while !self.transfer_wal_batch().await? {}
// Set the WAL version to keep to the next item we should transfer
let transfer_from = self.transfer_from.load(Ordering::Relaxed);
self.set_wal_keep_from(Some(transfer_from));
Ok(())
}
    /// Grab and transfer a single new batch of updates from the WAL
///
/// Returns `true` if this was the last batch and we're now done. `false` if more batches must
/// be sent.
///
/// # Cancel safety
///
/// This method is cancel safe.
///
/// If cancelled - none, some or all operations may be transmitted to the remote.
///
/// The internal field keeping track of the last transfer likely won't be updated. In the worst
    /// case this might cause operations to be sent twice. This should be fine as operations are
/// idempotent.
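    ///
    /// A worked example of the bookkeeping below, with hypothetical numbers:
    ///
    /// ```ignore
    /// // started_at = 10, transfer_from = 14, wal.last_index() = 20
    /// let items_left = (20 + 1) - 14;           // 7 operations still pending
    /// let items_total = (14 - 10) + items_left; // 11 operations in the whole transfer
    /// // pending_count (7) <= BATCH_SIZE (10), so this would already be the last batch
    /// ```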
async fn transfer_wal_batch(&self) -> CollectionResult<bool> {
let mut update_lock = Some(self.update_lock.lock().await);
let transfer_from = self.transfer_from.load(Ordering::Relaxed);
        // Lock WAL, count pending items to transfer, grab batch
let (pending_count, total, batch) = {
let wal = self.wrapped_shard.wal.wal.lock();
let items_left = (wal.last_index() + 1).saturating_sub(transfer_from);
let items_total = (transfer_from - self.started_at) + items_left;
let batch = wal.read(transfer_from).take(BATCH_SIZE).collect::<Vec<_>>();
debug_assert!(
batch.len() <= items_left as usize,
"batch cannot be larger than items_left",
);
(items_left, items_total, batch)
};
log::trace!(
"Queue proxy transferring batch of {} updates to peer {}",
batch.len(),
self.remote_shard.peer_id,
);
// Normally, we immediately release the update lock to allow new updates.
// On the last batch we keep the lock to prevent accumulating more updates on the WAL,
// so we can finalize the transfer after this batch, before accepting new updates.
let last_batch = pending_count <= BATCH_SIZE as u64 || batch.is_empty();
if !last_batch {
drop(update_lock.take());
}
// Set initial progress on the first batch
let is_first = transfer_from == self.started_at;
if is_first {
self.update_progress(0, total as usize);
}
// Transfer batch with retries and store last transferred ID
let last_idx = batch.last().map(|(idx, _)| *idx);
for remaining_attempts in (0..BATCH_RETRIES).rev() {
match transfer_operations_batch(&batch, &self.remote_shard).await {
Ok(()) => {
if let Some(idx) = last_idx {
self.transfer_from.store(idx + 1, Ordering::Relaxed);
let transferred = (idx + 1 - self.started_at) as usize;
self.update_progress(transferred, total as usize);
}
break;
}
Err(err) if remaining_attempts > 0 => {
log::error!(
"Failed to transfer batch of updates to peer {}, retrying: {err}",
self.remote_shard.peer_id,
);
continue;
}
Err(err) => return Err(err),
}
}
Ok(last_batch)
}
    /// Set or release the WAL version from which entries must be kept, preventing their acknowledgment/truncation.
///
    /// Because this proxy shard relies on the WAL to obtain past operations, the WAL must not be
    /// truncated before all these update operations have been transferred.
    /// Using this function we tell the WAL not to acknowledge and truncate from a specific point onwards.
///
/// Providing `None` will release this limitation.
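    ///
    /// A rough sketch of how a consumer of `wal_keep_from` might clamp WAL acknowledgment
    /// (an assumption for illustration, not the actual `UpdateHandler` code):
    ///
    /// ```ignore
    /// let keep_from = wal_keep_from.load(Ordering::Relaxed);
    /// // Never acknowledge the entry at `keep_from` or anything after it;
    /// // the default of `u64::MAX` effectively disables the limit.
    /// let ack_up_to = confirmed_version.min(keep_from.saturating_sub(1));
    /// ```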
fn set_wal_keep_from(&self, version: Option<u64>) {
let version = version.unwrap_or(u64::MAX);
self.wal_keep_from.store(version, Ordering::Relaxed);
}
fn update_progress(&self, transferred: usize, total: usize) {
let mut progress = self.progress.lock();
progress.points_transferred = transferred;
progress.points_total = total;
}
}
#[async_trait]
impl ShardOperation for Inner {
/// Update `wrapped_shard` while keeping track of operations
///
/// # Cancel safety
///
/// This method is cancel safe.
async fn update(
&self,
operation: OperationWithClockTag,
wait: bool,
) -> CollectionResult<UpdateResult> {
// `LocalShard::update` is cancel safe, so this is also cancel safe.
let _update_lock = self.update_lock.lock().await;
let local_shard = &self.wrapped_shard;
        // The shard update runs under the update lock, because we need a way to block shard
        // updates during transfer restart and finalization.
local_shard.update(operation.clone(), wait).await
}
/// Forward read-only `scroll_by` to `wrapped_shard`
async fn scroll_by(
&self,
offset: Option<ExtendedPointId>,
limit: usize,
with_payload_interface: &WithPayloadInterface,
with_vector: &WithVector,
filter: Option<&Filter>,
search_runtime_handle: &Handle,
order_by: Option<&OrderBy>,
timeout: Option<Duration>,
) -> CollectionResult<Vec<RecordInternal>> {
let local_shard = &self.wrapped_shard;
local_shard
.scroll_by(
offset,
limit,
with_payload_interface,
with_vector,
filter,
search_runtime_handle,
order_by,
timeout,
)
.await
}
/// Forward read-only `info` to `wrapped_shard`
async fn info(&self) -> CollectionResult<CollectionInfo> {
let local_shard = &self.wrapped_shard;
local_shard.info().await
}
    /// Forward read-only `core_search` to `wrapped_shard`
async fn core_search(
&self,
request: Arc<CoreSearchRequestBatch>,
search_runtime_handle: &Handle,
timeout: Option<Duration>,
hw_measurement_acc: &HwMeasurementAcc,
) -> CollectionResult<Vec<Vec<ScoredPoint>>> {
let local_shard = &self.wrapped_shard;
local_shard
.core_search(request, search_runtime_handle, timeout, hw_measurement_acc)
.await
}
/// Forward read-only `count` to `wrapped_shard`
async fn count(
&self,
request: Arc<CountRequestInternal>,
search_runtime_handle: &Handle,
timeout: Option<Duration>,
hw_measurement_acc: &HwMeasurementAcc,
) -> CollectionResult<CountResult> {
let local_shard = &self.wrapped_shard;
local_shard
.count(request, search_runtime_handle, timeout, hw_measurement_acc)
.await
}
/// Forward read-only `retrieve` to `wrapped_shard`
async fn retrieve(
&self,
request: Arc<PointRequestInternal>,
with_payload: &WithPayload,
with_vector: &WithVector,
search_runtime_handle: &Handle,
timeout: Option<Duration>,
) -> CollectionResult<Vec<RecordInternal>> {
let local_shard = &self.wrapped_shard;
local_shard
.retrieve(
request,
with_payload,
with_vector,
search_runtime_handle,
timeout,
)
.await
}
/// Forward read-only `query` to `wrapped_shard`
async fn query_batch(
&self,
        requests: Arc<Vec<ShardQueryRequest>>,
search_runtime_handle: &Handle,
timeout: Option<Duration>,
hw_measurement_acc: &HwMeasurementAcc,
) -> CollectionResult<Vec<ShardQueryResponse>> {
let local_shard = &self.wrapped_shard;
local_shard
            .query_batch(requests, search_runtime_handle, timeout, hw_measurement_acc)
.await
}
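    /// Forward read-only `facet` to `wrapped_shard`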
async fn facet(
&self,
request: Arc<FacetParams>,
search_runtime_handle: &Handle,
timeout: Option<Duration>,
) -> CollectionResult<FacetResponse> {
let local_shard = &self.wrapped_shard;
local_shard
.facet(request, search_runtime_handle, timeout)
.await
}
}
/// Transfer batch of operations without retries
///
/// # Cancel safety
///
/// This method is cancel safe.
///
/// If cancelled - none, some or all operations of the batch may be transmitted to the remote.
async fn transfer_operations_batch(
batch: &[(u64, OperationWithClockTag)],
remote_shard: &RemoteShard,
) -> CollectionResult<()> {
// TODO: naive transfer approach, transfer batch of points instead
for (_idx, operation) in batch {
let mut operation = operation.clone();
// Set force flag because operations from WAL may be unordered if another node is sending
// new operations at the same time
if let Some(clock_tag) = &mut operation.clock_tag {
clock_tag.force = true;
}
remote_shard
.forward_update(operation, true, WriteOrdering::Weak)
.await?;
}
Ok(())
}