use std::path::Path;
use std::sync::Arc;
use common::defaults;
use parking_lot::Mutex;
use semver::Version;
use tempfile::TempPath;
use super::transfer_tasks_pool::TransferTaskProgress;
use super::{ShardTransfer, ShardTransferConsensus};
use crate::operations::snapshot_ops::{get_checksum_path, SnapshotPriority};
use crate::operations::types::{CollectionError, CollectionResult};
use crate::shards::channel_service::ChannelService;
use crate::shards::remote_shard::RemoteShard;
use crate::shards::replica_set::ReplicaState;
use crate::shards::shard::ShardId;
use crate::shards::shard_holder::LockedShardHolder;
use crate::shards::CollectionId;
/// Orchestrate shard snapshot transfer
///
/// This is called on the sender and will arrange all that is needed for the shard snapshot
/// transfer process to a receiver.
///
/// The order of operations here is critical for correctness. Explicit synchronization across nodes
/// is used to ensure data consistency.
///
/// Before this function, this has happened:
///
/// - An empty shard is initialized on the remote
/// - The remote shard state is set to `PartialSnapshot`
/// In `PartialSnapshot` state, the remote shard will ignore all operations and other nodes will
/// prevent sending operations to it. This is critical to ensure the shard is not modified while
/// it is being recovered from the snapshot.
///
/// During this function, this happens in order:
///
/// - Queue proxy local shard
/// We queue all new operations to the shard for the remote. Once the remote is ready, we can
/// transfer all these operations to it.
/// - Create shard snapshot
/// Snapshot the shard after the queue proxy is initialized. This snapshot will be used to get
/// the shard into the same state on the remote.
/// - Recover shard snapshot on remote
/// Instruct the remote to download the snapshot from this node over HTTP, then recover it.
/// - Set shard state to `Partial`
/// After recovery, we set the shard state from `PartialSnapshot` to `Partial`. We propose an
/// operation to consensus for this. Our logic explicitly confirms that the remote reaches the
/// `Partial` state. That is critical for the remote to accept incoming operations, and it also
/// confirms that consensus has accepted our proposal. If this fails, it is retried up to three
/// times.
/// - Transfer queued updates to remote, transform into forward proxy
/// Once the remote is in `Partial` state we can transfer all accumulated updates in the queue
/// proxy to the remote. This ensures all operations reach the recovered shard on the remote to
/// make it consistent again. When all updates are transferred, we transform the queue proxy into
/// a forward proxy to start forwarding new updates to the remote right away.
/// We transfer the queue and transform into a forward proxy right now so that we can catch any
/// errors as early as possible. The forward proxy shard we end up with will not error again once
/// we un-proxify.
/// - Wait for Partial state in our replica set
/// Wait for the remote shard to be set to `Partial` in our local replica set. That way we
/// confirm consensus has also propagated on this node.
/// - Synchronize all nodes
/// After confirming consensus propagation on this node, synchronize all nodes to reach the same
/// consensus state before finalizing the transfer. That way, we ensure we have a consistent
/// replica set state across all nodes. All nodes will have the `Partial` state, which makes the
/// shard participate on all nodes.
///
/// After this function, the following will happen:
///
/// - The local shard is un-proxified
/// - The shard transfer is finished
/// - The remote shard state is set to `Active` through consensus
///
/// # Diagram
///
/// Here's a rough sequence diagram for the shard snapshot transfer process with the consensus,
/// sender and receiver actors:
///
/// β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
/// β”‚ Consensus β”‚ β”‚ Sender β”‚ β”‚ Receiver β”‚
/// β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
/// | | |
/// | start transfer | |
/// β”€β”€β”€β”€β–Ίβ”Œβ”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€|────────────────────────►|──┐
/// β”‚ β”‚ | | β”‚ shard state:
/// │ │ start transfer | init transfer | │ Dead→PartialSnapshot
/// β””β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Ίβ”¬β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Ίβ”Œβ”€β”β—„β”˜
/// | β”‚Xβ”‚ β”‚ β”‚
/// | β”‚Xβ”‚ β”‚ β”œβ”€β”
/// | β”‚Xβ”‚ ready β”‚ β”‚ β”‚ init local shard
/// | β”‚Xβ”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”˜β—„β”˜
/// | β”‚ β”‚ |
/// | β”‚ β”œβ”€β” |
/// | β”‚ β”‚ β”‚ qproxy + snapshot |
/// | β”‚ β”‚β—„β”˜ |
/// | β”‚ β”‚ |
/// | β”‚ β”‚ recover shard by URL |
/// | β”‚Xβ”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”
/// | β”‚Xβ”‚ β”‚ β”‚
/// | β”‚Xβ”‚ β”‚ β”‚
/// | β”Œβ”€β”β—„β”€Β·β”‚X│·─────────────────────── β”‚
/// | β”‚ β”‚ β”‚Xβ”‚ download snapshot β”‚ β”‚
/// | └─┴──·│X│·─────────────────────►│ β”œβ”€β”
/// | β”‚Xβ”‚ β”‚ β”‚ β”‚ apply snapshot
/// | β”‚Xβ”‚ done recovery β”‚ β”‚ β”‚ delete snapshot
/// | β”‚Xβ”‚β—„β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”˜β—„β”˜
/// | snapshot recovered β”‚ β”‚ |
/// β”Œβ”€β”β—„β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ β”‚ |
/// β”‚ β”‚ β”‚ β”‚ |
/// β”‚ β”‚ β”Œβ”€β”€Xβ”‚ |
/// β”‚ β”‚ wait consensus β”‚ β”‚Xβ”‚ |
/// β”‚ β”‚ or retry β”‚ β”‚Xβ”‚ |
/// β”‚ β”‚ β”‚ β”‚Xβ”‚ |
/// β”‚ β”‚ continue transfer β”‚ β”‚Xβ”‚ |
/// β”‚ β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€Β·β”‚ β”‚Xβ”‚Β·β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Ίβ”Œβ”€β”¬β”€β”
/// β”‚ β”‚ continue transfer β”‚ β”‚Xβ”‚ β”‚ β”‚ β”‚ shard state:
/// └─┴────────────────────►│Xβ”œβ”€β” β”‚ β”‚ β”‚ PartialSnapshotβ†’Partial
/// | β”‚ β”‚Xβ”‚ β”‚ shard state: β””β”€β”˜β—„β”˜
/// | │ │X│ │ PartialSnpst→Partial |
/// | β””β–Ίβ”‚Xβ”‚β—„β”˜ |
/// | β”‚ β”‚ |
/// | β”‚ β”‚ transfer queue ops |
/// | β”Œβ–Ίβ”‚Xβ”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Ίβ”Œβ”€β”¬β”€β”
/// | send batches β”‚ β”‚Xβ”‚ β”‚ β”‚ β”‚ apply operations
/// | └──Xβ”‚β—„β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”˜β—„β”˜
/// | β”‚ β”‚ |
/// | β”‚ β”œβ”€β” |
/// | │ │ │ qproxy→fwd proxy |
/// | β”‚ β”‚β—„β”˜ |
/// | β”‚ β”‚ |
/// | β”‚ β”‚ sync all nodes |
/// | β”‚Xβ”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Ίβ”Œβ”€β”¬β”€β”
/// | β”‚Xβ”‚ β”‚ β”‚ β”‚ wait consensus
/// | β”‚Xβ”‚ node synced β”‚ β”‚ β”‚ commit+term
/// | β”‚Xβ”‚β—„β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”˜β—„β”˜
/// | β”‚ β”‚ |
/// | β”‚ β”œβ”€β” |
/// | finish transfer β”‚ β”‚ β”‚ unproxify |
/// β”Œβ”€β”β—„β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”˜β—„β”˜ |
/// β”‚ β”‚ transfer finished | |
/// β”‚ β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€|β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Ίβ”Œβ”€β”¬β”€β”
/// β”‚ β”‚ transfer finished | β”‚ β”‚ β”‚ shard state:
/// β””β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Ίβ”Œβ”€β”¬β”€β” β”‚ β”‚ β”‚ Partialβ†’Active
/// | β”‚ β”‚ β”‚ shard state: β””β”€β”˜β—„β”˜
/// | │ │ │ Partial→Active |
/// | β””β”€β”˜β—„β”˜ |
/// | | |
///
/// # Cancel safety
///
/// This function is cancel safe.
///
/// If cancelled, the remote shard may only be partially recovered/transferred, and the local shard
/// may be left in an unexpected state. This must be resolved manually in case of cancellation.
#[allow(clippy::too_many_arguments)]
pub(super) async fn transfer_snapshot(
transfer_config: ShardTransfer,
shard_holder: Arc<LockedShardHolder>,
progress: Arc<Mutex<TransferTaskProgress>>,
shard_id: ShardId,
remote_shard: RemoteShard,
channel_service: &ChannelService,
consensus: &dyn ShardTransferConsensus,
snapshots_path: &Path,
collection_id: &CollectionId,
temp_dir: &Path,
) -> CollectionResult<()> {
let remote_peer_id = remote_shard.peer_id;
log::debug!(
"Starting shard {shard_id} transfer to peer {remote_peer_id} using snapshot transfer"
);
let shard_holder_read = shard_holder.read().await;
let local_rest_address = channel_service.current_rest_address(transfer_config.from)?;
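    // The local REST address is the base of the snapshot download URL handed to the remote below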
let transferring_shard = shard_holder_read.get_shard(shard_id);
let Some(replica_set) = transferring_shard else {
return Err(CollectionError::service_error(format!(
"Shard {shard_id} cannot be queue proxied because it does not exist"
)));
};
// Queue proxy local shard
replica_set
.queue_proxify_local(remote_shard.clone(), None, progress)
.await?;
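    // From this point on, new updates to this shard are queued locally so they can be replayed to
    // the remote once it has recovered from the snapshot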
debug_assert!(
replica_set.is_queue_proxy().await,
"Local shard must be a queue proxy",
);
// The ability to read streaming snapshot format is introduced in 1.12 (#5179).
let use_streaming_endpoint =
channel_service.peer_is_at_version(remote_peer_id, &Version::new(1, 12, 0));
let mut snapshot_temp_paths = Vec::new();
let mut shard_download_url = local_rest_address;
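    // Newer remotes stream the shard snapshot on demand from the streaming endpoint; older remotes
    // get a snapshot file created up front and exposed for download at its regular snapshot URL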
if use_streaming_endpoint {
log::trace!("Using streaming endpoint for shard snapshot transfer");
shard_download_url.set_path(&format!(
"/collections/{collection_id}/shards/{shard_id}/snapshot",
));
} else {
// Create shard snapshot
log::trace!("Creating snapshot of shard {shard_id} for shard snapshot transfer");
let snapshot_description = shard_holder_read
.create_shard_snapshot(snapshots_path, collection_id, shard_id, temp_dir)
.await?;
// TODO: If future is cancelled until `get_shard_snapshot_path` resolves, shard snapshot may not be cleaned up...
let snapshot_temp_path = shard_holder_read
.get_shard_snapshot_path(snapshots_path, shard_id, &snapshot_description.name)
.await
.map(TempPath::from_path)
.map_err(|err| {
CollectionError::service_error(format!(
"Failed to determine snapshot path, cannot continue with shard snapshot recovery: {err}",
))
})?;
let snapshot_checksum_temp_path =
TempPath::from_path(get_checksum_path(&snapshot_temp_path));
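        // Keep `TempPath` handles so the snapshot and its checksum file are deleted below, once
        // the remote has finished recovering from them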
snapshot_temp_paths.push(snapshot_temp_path);
snapshot_temp_paths.push(snapshot_checksum_temp_path);
shard_download_url.set_path(&format!(
"/collections/{collection_id}/shards/{shard_id}/snapshots/{}",
&snapshot_description.name,
));
};
// Recover shard snapshot on remote
log::trace!("Transferring and recovering shard {shard_id} snapshot on peer {remote_peer_id}");
remote_shard
.recover_shard_snapshot_from_url(
collection_id,
shard_id,
&shard_download_url,
SnapshotPriority::ShardTransfer,
// Provide API key here so the remote can access our snapshot
channel_service.api_key.as_deref(),
)
.await
.map_err(|err| {
CollectionError::service_error(format!(
"Failed to recover shard snapshot on remote: {err}"
))
})?;
for snapshot_temp_path in snapshot_temp_paths {
if let Err(err) = snapshot_temp_path.close() {
log::warn!(
"Failed to delete shard transfer snapshot after recovery, \
snapshot file may be left behind: {err}"
);
}
}
// Set shard state to Partial
log::trace!("Shard {shard_id} snapshot recovered on {remote_peer_id} for snapshot transfer, switching into next stage through consensus");
consensus
.recovered_switch_to_partial_confirm_remote(&transfer_config, collection_id, &remote_shard)
.await
.map_err(|err| {
CollectionError::service_error(format!(
"Can't switch shard {shard_id} to Partial state after snapshot transfer: {err}"
))
})?;
// Transfer queued updates to remote, transform into forward proxy
log::trace!("Transfer all queue proxy updates and transform into forward proxy");
replica_set.queue_proxy_into_forward_proxy().await?;
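    // From this point on, new updates to this shard are forwarded to the remote directly rather
    // than being queued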
// Wait for Partial state in our replica set
// Consensus sync is done right after this function
let partial_state = ReplicaState::Partial;
log::trace!("Wait for local shard to reach {partial_state:?} state");
replica_set
.wait_for_state(
transfer_config.to,
partial_state,
defaults::CONSENSUS_META_OP_WAIT,
)
.await
.map_err(|err| {
CollectionError::service_error(format!(
"Shard being transferred did not reach {partial_state:?} state in time: {err}",
))
})?;
log::debug!(
"Ending shard {shard_id} transfer to peer {remote_peer_id} using snapshot transfer"
);
Ok(())
}