// Commit 84d2a97 — Gouzi Mohaled: "Ajout du dossier lib" (adds the lib folder).
// NOTE(review): these lines were raw commit metadata pasted above the source;
// kept here as a comment so the file remains valid Rust.
use std::ops::Deref as _;
use std::sync::Arc;
use parking_lot::Mutex;
use segment::types::PointIdType;
use super::ShardReplicaSet;
use crate::hash_ring::HashRingRouter;
use crate::operations::types::{CollectionError, CollectionResult};
use crate::shards::forward_proxy_shard::ForwardProxyShard;
use crate::shards::local_shard::clock_map::RecoveryPoint;
use crate::shards::queue_proxy_shard::QueueProxyShard;
use crate::shards::remote_shard::RemoteShard;
use crate::shards::shard::Shard;
use crate::shards::transfer::transfer_tasks_pool::TransferTaskProgress;
impl ShardReplicaSet {
    /// Convert `Local` shard into `ForwardProxy`.
    ///
    /// Idempotent if the shard is already a forward proxy to the same remote peer.
    ///
    /// # Errors
    ///
    /// Returns a service error if the local shard is missing, is a dummy shard, or is already
    /// (queue) proxified to a different peer.
    ///
    /// # Cancel safety
    ///
    /// This method is cancel safe.
    pub async fn proxify_local(
        &self,
        remote_shard: RemoteShard,
        resharding_hash_ring: Option<HashRingRouter>,
    ) -> CollectionResult<()> {
        // Hold the write lock across the whole state transition so no other task can observe an
        // intermediate state.
        let mut local = self.local.write().await;

        match local.deref() {
            // Expected state, continue
            Some(Shard::Local(_)) => {}
            // If a forward proxy to same remote, return early
            Some(Shard::ForwardProxy(proxy))
                if proxy.remote_shard.peer_id == remote_shard.peer_id =>
            {
                return Ok(())
            }
            // Unexpected states, error
            Some(Shard::ForwardProxy(proxy)) => {
                return Err(CollectionError::service_error(format!(
                    "Cannot proxify local shard {} to peer {} because it is already proxified to peer {}",
                    self.shard_id, remote_shard.peer_id, proxy.remote_shard.peer_id
                )));
            }
            Some(Shard::QueueProxy(_)) => {
                return Err(CollectionError::service_error(format!(
                    "Cannot proxify local shard {} to peer {} because it is already queue proxified",
                    self.shard_id, remote_shard.peer_id,
                )));
            }
            Some(Shard::Proxy(_)) => {
                // NOTE(review): this message says "queue proxify" but we are in `proxify_local`;
                // looks like a copy-paste from `queue_proxify_local` — verify intended wording.
                return Err(CollectionError::service_error(format!(
                    "Cannot queue proxify local shard {} to peer {} because it already is a proxy",
                    self.shard_id, remote_shard.peer_id,
                )));
            }
            Some(Shard::Dummy(_)) => {
                return Err(CollectionError::service_error(format!(
                    "Cannot proxify local dummy shard {} to peer {}",
                    self.shard_id, remote_shard.peer_id,
                )));
            }
            None => {
                return Err(CollectionError::service_error(format!(
                    "Cannot proxify local shard {} on peer {} because it is not active",
                    self.shard_id,
                    self.this_peer_id()
                )));
            }
        };

        // Explicit `match` instead of `if-let` to catch `unreachable` condition if top `match` is
        // changed
        let Some(Shard::Local(local_shard)) = local.take() else {
            unreachable!()
        };

        // No `await` between `local.take()` and `local.insert(...)` — this keeps the transition
        // atomic with respect to cancellation.
        let proxy_shard = ForwardProxyShard::new(
            self.shard_id,
            local_shard,
            remote_shard,
            resharding_hash_ring,
        );
        let _ = local.insert(Shard::ForwardProxy(proxy_shard));

        Ok(())
    }

    /// Queue proxy our local shard, pointing to the remote shard.
    ///
    /// A `from_version` may be provided to start queueing the WAL from a specific version. The
    /// point may be in the past, but can never be outside the range of what we currently have in
    /// WAL. If `None` is provided, it'll queue from the latest available WAL version at this time.
    ///
    /// For snapshot transfer we queue from the latest version, so we can send all new updates once
    /// the remote shard has been recovered. For WAL delta transfer we queue from a specific
    /// version based on our recovery point.
    ///
    /// # Errors
    ///
    /// Returns a service error if the local shard is in an unexpected state, or if constructing
    /// the queue proxy from `from_version` fails (in which case the local shard is restored).
    ///
    /// # Cancel safety
    ///
    /// This method is cancel safe.
    pub async fn queue_proxify_local(
        &self,
        remote_shard: RemoteShard,
        from_version: Option<u64>,
        progress: Arc<Mutex<TransferTaskProgress>>,
    ) -> CollectionResult<()> {
        let mut local = self.local.write().await;

        match local.deref() {
            // Expected state, continue
            Some(Shard::Local(_)) => {}
            // If a forward proxy to same remote, continue and change into queue proxy
            Some(Shard::ForwardProxy(proxy))
                if proxy.remote_shard.peer_id == remote_shard.peer_id => {}
            // Unexpected states, error
            Some(Shard::QueueProxy(_)) => {
                return Err(CollectionError::service_error(format!(
                    "Cannot queue proxify local shard {} to peer {} because it is already queue proxified",
                    self.shard_id, remote_shard.peer_id,
                )));
            }
            Some(Shard::ForwardProxy(proxy)) => {
                return Err(CollectionError::service_error(format!(
                    "Cannot queue proxify local shard {} to peer {} because it is already proxified to peer {}",
                    self.shard_id, remote_shard.peer_id, proxy.remote_shard.peer_id
                )));
            }
            Some(Shard::Proxy(_)) => {
                return Err(CollectionError::service_error(format!(
                    "Cannot queue proxify local shard {} to peer {} because it already is a proxy",
                    self.shard_id, remote_shard.peer_id,
                )));
            }
            Some(Shard::Dummy(_)) => {
                return Err(CollectionError::service_error(format!(
                    "Cannot proxify local dummy shard {} to peer {}",
                    self.shard_id, remote_shard.peer_id,
                )));
            }
            None => {
                return Err(CollectionError::service_error(format!(
                    "Cannot queue proxify local shard {} on peer {} because it is not active",
                    self.shard_id,
                    self.this_peer_id()
                )));
            }
        };

        // Get `max_ack_version` without "taking" local shard (to maintain cancel safety)
        let local_shard = match local.deref() {
            Some(Shard::Local(local)) => local,
            Some(Shard::ForwardProxy(proxy)) => &proxy.wrapped_shard,
            _ => unreachable!(),
        };
        // This `await` happens *before* `local.take()`, so cancellation here leaves the shard
        // untouched.
        let wal_keep_from = local_shard
            .update_handler
            .lock()
            .await
            .wal_keep_from
            .clone();

        // Proxify local shard
        //
        // Making `await` calls between `local.take()` and `local.insert(...)` is *not* cancel safe!
        let local_shard = match local.take() {
            Some(Shard::Local(local)) => local,
            Some(Shard::ForwardProxy(proxy)) => proxy.wrapped_shard,
            _ => unreachable!(),
        };

        // Try to queue proxify with or without version
        let proxy_shard = match from_version {
            None => Ok(QueueProxyShard::new(
                local_shard,
                remote_shard,
                wal_keep_from,
                progress,
            )),
            // `new_from_version` is fallible: the requested version may be outside the WAL range.
            // On failure it hands the local shard back so we can restore it below.
            Some(from_version) => QueueProxyShard::new_from_version(
                local_shard,
                remote_shard,
                wal_keep_from,
                from_version,
                progress,
            ),
        };

        // Insert queue proxy shard on success or revert to local shard on failure
        match proxy_shard {
            // All good, insert queue proxy shard
            Ok(proxy_shard) => {
                let _ = local.insert(Shard::QueueProxy(proxy_shard));
                Ok(())
            }
            Err((local_shard, err)) => {
                log::warn!("Failed to queue proxify shard, reverting to local shard: {err}");
                let _ = local.insert(Shard::Local(local_shard));
                Err(err)
            }
        }
    }

    /// Un-proxify local shard wrapped as `ForwardProxy` or `QueueProxy`.
    ///
    /// If the shard is already `Local` this is a no-op.
    ///
    /// # Errors
    ///
    /// Returns a service error if the shard is missing or has an unexpected type, or if flushing
    /// the remaining queue proxy updates to the remote fails (the shard is still reverted to
    /// `Local` in that case, dropping the queued updates).
    ///
    /// # Cancel safety
    ///
    /// This method is cancel safe.
    pub async fn un_proxify_local(&self) -> CollectionResult<()> {
        let mut local = self.local.write().await;

        match local.deref() {
            // Expected states, continue
            Some(Shard::Local(_)) => return Ok(()),
            Some(Shard::ForwardProxy(_) | Shard::QueueProxy(_)) => {}
            // Unexpected states, error
            Some(shard @ (Shard::Proxy(_) | Shard::Dummy(_))) => {
                return Err(CollectionError::service_error(format!(
                    "Cannot un-proxify local shard {} because it has unexpected type - {}",
                    self.shard_id,
                    shard.variant_name(),
                )));
            }
            None => {
                return Err(CollectionError::service_error(format!(
                    "Cannot un-proxify local shard {} on peer {} because it is not active",
                    self.shard_id,
                    self.this_peer_id(),
                )));
            }
        };

        // Perform async finalization without "taking" local shard (to maintain cancel safety)
        //
        // Explicit `match` instead of `if-let` on `Shard::QueueProxy` to catch `unreachable`
        // condition if top `match` is changed
        let result = match local.deref() {
            Some(Shard::ForwardProxy(_)) => Ok(()),
            Some(Shard::QueueProxy(proxy)) => {
                // We should not unproxify a queue proxy shard directly because it can fail if it
                // fails to send all updates to the remote shard.
                // Instead we should transform it into a forward proxy shard before unproxify is
                // called to handle errors at an earlier time.
                // Also, we're holding a write lock here which could block other accessors for a
                // long time if transferring updates takes a long time.
                // See `Self::queue_proxy_into_forward_proxy()` for more details.
                log::warn!(
                    "Directly unproxifying queue proxy shard, this should not happen normally"
                );
                let result = proxy.transfer_all_missed_updates().await;
                if let Err(err) = &result {
                    log::error!(
                        "Failed to un-proxify local shard because transferring remaining queue \
                         items to remote failed: {err}"
                    );
                }
                result
            }
            _ => unreachable!(),
        };

        // Un-proxify local shard
        //
        // Making `await` calls between `local.take()` and `local.insert(...)` is *not* cancel safe!
        let local_shard = match local.take() {
            Some(Shard::ForwardProxy(proxy)) => proxy.wrapped_shard,
            Some(Shard::QueueProxy(proxy)) => {
                // Any updates still queued at this point are intentionally dropped; the transfer
                // attempt above already reported the error via `result`.
                let (local_shard, _) = proxy.forget_updates_and_finalize();
                local_shard
            }
            _ => unreachable!(),
        };
        let _ = local.insert(Shard::Local(local_shard));

        // Propagate the transfer result even though the shard was reverted to local.
        result
    }

    /// Revert usage of a `QueueProxy` shard and forget all updates, then un-proxify to local
    ///
    /// This can be used to intentionally forget all updates that are collected by the queue proxy
    /// shard and revert back to a local shard. This is useful if a shard transfer operation using
    /// a queue proxy must be aborted.
    ///
    /// Does nothing if the local shard is not a queue proxy shard.
    /// This method cannot fail.
    ///
    /// # Warning
    ///
    /// This intentionally forgets and drops updates pending to be transferred to the remote shard.
    /// The remote shard may therefore be left in an inconsistent state, which should be
    /// resolved separately.
    ///
    /// # Cancel safety
    ///
    /// This method is cancel safe.
    ///
    /// If cancelled - the queue proxy may not be reverted to a local proxy.
    pub async fn revert_queue_proxy_local(&self) {
        let mut local = self.local.write().await;

        // Take out queue proxy shard or return
        if !matches!(local.deref(), Some(Shard::QueueProxy(_))) {
            return;
        };

        log::debug!("Forgetting queue proxy updates and reverting to local shard");

        // Making `await` calls between `local.take()` and `local.insert(...)` is *not* cancel safe!
        let Some(Shard::QueueProxy(queue_proxy)) = local.take() else {
            unreachable!();
        };
        let (local_shard, _) = queue_proxy.forget_updates_and_finalize();
        let _ = local.insert(Shard::Local(local_shard));
    }

    /// Custom operation for transferring data from one shard to another during transfer
    ///
    /// # Errors
    ///
    /// Returns a service error if the local shard is not a forward proxy.
    ///
    /// # Cancel safety
    ///
    /// This method is cancel safe.
    pub async fn transfer_batch(
        &self,
        offset: Option<PointIdType>,
        batch_size: usize,
        hashring_filter: Option<&HashRingRouter>,
        merge_points: bool,
    ) -> CollectionResult<Option<PointIdType>> {
        // Read lock is sufficient — the proxy forwards the batch without changing shard state.
        let local = self.local.read().await;
        let Some(Shard::ForwardProxy(proxy)) = local.deref() else {
            return Err(CollectionError::service_error(format!(
                "Cannot transfer batch from shard {} because it is not proxified",
                self.shard_id
            )));
        };
        proxy
            .transfer_batch(
                offset,
                batch_size,
                hashring_filter,
                merge_points,
                &self.search_runtime,
            )
            .await
    }

    /// Custom operation for transferring indexes from one shard to another during transfer
    ///
    /// # Errors
    ///
    /// Returns a service error if the local shard is not a forward proxy.
    ///
    /// # Cancel safety
    ///
    /// This method is cancel safe.
    pub async fn transfer_indexes(&self) -> CollectionResult<()> {
        let local = self.local.read().await;
        let Some(Shard::ForwardProxy(proxy)) = local.deref() else {
            return Err(CollectionError::service_error(format!(
                "Cannot transfer indexes from shard {} because it is not proxified",
                self.shard_id,
            )));
        };
        log::trace!(
            "Transferring indexes to shard {}",
            proxy.remote_shard.peer_id,
        );
        proxy.transfer_indexes().await
    }

    /// Send all queue proxy updates to remote
    ///
    /// This method allows to transfer queued updates at any point, before the shard is
    /// unproxified for example. This allows for proper error handling at the time this method is
    /// called. Because the shard is transformed into a forward proxy after this operation it will
    /// not error again when the shard is eventually unproxified again.
    ///
    /// Does nothing if the local shard is not a queue proxy.
    ///
    /// # Errors
    ///
    /// Returns an error if transferring all updates to the remote failed.
    ///
    /// # Cancel safety
    ///
    /// This function is cancel safe.
    ///
    /// If cancelled - transforming the queue proxy into a forward proxy may not actually complete.
    /// None, some or all queued operations may be transmitted to the remote.
    pub async fn queue_proxy_flush(&self) -> CollectionResult<()> {
        // Only a read lock — flushing does not change the shard variant.
        let local = self.local.read().await;
        let Some(Shard::QueueProxy(proxy)) = local.deref() else {
            return Ok(());
        };
        proxy.transfer_all_missed_updates().await?;
        Ok(())
    }

    /// Send all queue proxy updates to remote and transform into forward proxy
    ///
    /// When a queue or forward proxy shard needs to be unproxified into a local shard again we
    /// typically don't have room to handle errors. A queue proxy shard may error if it fails to
    /// send updates to the remote shard, while a forward proxy does not fail at all when
    /// transforming.
    ///
    /// This method allows to transfer queued updates before the shard is unproxified. This allows
    /// for proper error handling at the time this method is called. Because the shard is
    /// transformed into a forward proxy after this operation it will not error again when the
    /// shard is eventually unproxified again.
    ///
    /// If the local shard is a queue proxy:
    /// - Transfers all missed updates to remote
    /// - Transforms queue proxy into forward proxy
    ///
    /// Does nothing if the local shard is not a queue proxy.
    ///
    /// # Errors
    ///
    /// Returns an error if transferring all updates to the remote failed.
    ///
    /// # Cancel safety
    ///
    /// This function is cancel safe.
    ///
    /// If cancelled - transforming the queue proxy into a forward proxy may not actually complete.
    /// None, some or all queued operations may be transmitted to the remote.
    pub async fn queue_proxy_into_forward_proxy(&self) -> CollectionResult<()> {
        // First pass: transfer all missed updates with shared read lock
        //
        // This drains the bulk of the queue without blocking other readers; only the (small)
        // remainder is transferred under the write lock below.
        self.queue_proxy_flush().await?;

        // Second pass: transfer new updates
        let mut local = self.local.write().await;
        let Some(Shard::QueueProxy(proxy)) = local.deref() else {
            return Ok(());
        };
        proxy.transfer_all_missed_updates().await?;

        // Transform `QueueProxyShard` into `ForwardProxyShard`
        log::trace!("Transferred all queue proxy operations, transforming into forward proxy now");

        // Making `await` calls between `local.take()` and `local.insert(...)` is *not* cancel safe!
        let Some(Shard::QueueProxy(queue_proxy)) = local.take() else {
            unreachable!();
        };
        let (local_shard, remote_shard) = queue_proxy.forget_updates_and_finalize();
        let forward_proxy = ForwardProxyShard::new(self.shard_id, local_shard, remote_shard, None);
        let _ = local.insert(Shard::ForwardProxy(forward_proxy));

        Ok(())
    }

    /// Resolve the WAL delta for the given `recovery_point` against our local shard.
    ///
    /// Delegates to the local shard's `resolve_wal_delta`.
    ///
    /// # Errors
    ///
    /// Returns a service error if this replica set has no local shard.
    pub async fn resolve_wal_delta(
        &self,
        recovery_point: RecoveryPoint,
    ) -> CollectionResult<Option<u64>> {
        let local_shard_read = self.local.read().await;
        let Some(local_shard) = local_shard_read.deref() else {
            return Err(CollectionError::service_error(
                "Cannot resolve WAL delta, shard replica set does not have local shard",
            ));
        };
        local_shard.resolve_wal_delta(recovery_point).await
    }

    /// Get the WAL version of our local shard, delegating to the shard's `wal_version`.
    ///
    /// # Errors
    ///
    /// Returns a service error if this replica set has no local shard.
    pub async fn wal_version(&self) -> CollectionResult<Option<u64>> {
        let local_shard_read = self.local.read().await;
        let Some(local_shard) = local_shard_read.deref() else {
            return Err(CollectionError::service_error(
                "Cannot get WAL version, shard replica set does not have local shard",
            ));
        };
        local_shard.wal_version()
    }
}