use std::path::Path;
use std::sync::Arc;

use common::defaults;
use parking_lot::Mutex;
use semver::Version;
use tempfile::TempPath;

use super::transfer_tasks_pool::TransferTaskProgress;
use super::{ShardTransfer, ShardTransferConsensus};
use crate::operations::snapshot_ops::{get_checksum_path, SnapshotPriority};
use crate::operations::types::{CollectionError, CollectionResult};
use crate::shards::channel_service::ChannelService;
use crate::shards::remote_shard::RemoteShard;
use crate::shards::replica_set::ReplicaState;
use crate::shards::shard::ShardId;
use crate::shards::shard_holder::LockedShardHolder;
use crate::shards::CollectionId;
/// Orchestrate shard snapshot transfer
///
/// This is called on the sender and arranges everything needed to transfer a shard snapshot to a
/// receiver.
///
/// The order of operations here is critical for correctness. Explicit synchronization across
/// nodes is used to ensure data consistency.
///
/// Before this function, this has happened:
///
/// - An empty shard is initialized on the remote
/// - The remote shard state is set to `PartialSnapshot`
///   In `PartialSnapshot` state, the remote shard ignores all operations, and other nodes prevent
///   sending operations to it. This is critical: the shard must not be modified while it is being
///   recovered from the snapshot.
///
/// During this function, this happens in order:
///
/// - Queue proxy local shard
///   We queue all new operations to the shard for the remote. Once the remote is ready, we can
///   transfer all these operations to it.
/// - Create shard snapshot
///   Snapshot the shard after the queue proxy is initialized. This snapshot will be used to get
///   the shard into the same state on the remote.
/// - Recover shard snapshot on remote
///   Instruct the remote to download the snapshot from this node over HTTP, then recover it.
/// - Set shard state to `Partial`
///   After recovery, we set the shard state from `PartialSnapshot` to `Partial`. We propose an
///   operation to consensus for this. Our logic explicitly confirms that the remote reaches the
///   `Partial` state. That is critical for the remote to accept incoming operations, and it also
///   confirms that consensus has accepted our proposal. If this fails, it is retried up to three
///   times.
/// - Transfer queued updates to remote, transform into forward proxy
///   Once the remote is in `Partial` state we can transfer all accumulated updates in the queue
///   proxy to the remote. This ensures all operations reach the recovered shard on the remote to
///   make it consistent again. When all updates are transferred, we transform the queue proxy
///   into a forward proxy to start forwarding new updates to the remote right away.
///   We transfer the queue and transform into a forward proxy right now so that we can catch any
///   errors as early as possible. The forward proxy shard we end up with will not error again
///   once we un-proxify.
/// - Wait for Partial state in our replica set
///   Wait for the remote shard to be set to `Partial` in our local replica set. That way we
///   confirm consensus has also propagated on this node.
/// - Synchronize all nodes
///   After confirming consensus propagation on this node, synchronize all nodes to reach the same
///   consensus state before finalizing the transfer. That way, we ensure we have a consistent
///   replica set state across all nodes. All nodes will have the `Partial` state, which makes the
///   shard participate on all nodes.
///
/// After this function, the following will happen:
///
/// - The local shard is un-proxified
/// - The shard transfer is finished
/// - The remote shard state is set to `Active` through consensus
///
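/// Seen end to end (a compact view of the diagram below), the remote replica's state progresses
/// through:
///
///   Dead ──► PartialSnapshot ──► Partial ──► Active
///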
/// # Diagram
///
/// Here's a rough sequence diagram for the shard snapshot transfer process with the consensus,
/// sender and receiver actors:
///
/// ┌───────────┐         ┌───────────┐                    ┌───────────┐
/// │ Consensus │         │  Sender   │                    │ Receiver  │
/// └─────┬─────┘         └─────┬─────┘                    └─────┬─────┘
///       │                     │                                │
///       │   start transfer    │         start transfer        │  shard state:
///       ├────────────────────►├───────────────────────────────►│  Dead→PartialSnapshot
///       │                     │                                │
///       │                     │          init transfer        │
///       │                     ├───────────────────────────────►│  init local shard
///       │                     │             ready              │
///       │                     │◄───────────────────────────────┤
///       │                     │                                │
///       │                     │  (qproxy + snapshot)           │
///       │                     │                                │
///       │                     │      recover shard by URL      │
///       │                     ├───────────────────────────────►│
///       │                     │       download snapshot        │
///       │                     │◄ · · · · · · · · · · · · · · · ┤
///       │                     ├ · · · · · · · · · · · · · · · ►│  apply snapshot
///       │                     │         done recovery          │  delete snapshot
///       │                     │◄───────────────────────────────┤
///       │  snapshot recovered │                                │
///       │◄────────────────────┤                                │
///       │  wait consensus     │                                │
///       │  or retry           │                                │
///       │  continue transfer  │        continue transfer       │  shard state:
///       ├────────────────────►├───────────────────────────────►│  PartialSnapshot→Partial
///       │                     │  shard state:                  │
///       │                     │  PartialSnapshot→Partial       │
///       │                     │                                │
///       │                     │       transfer queue ops       │
///       │     send batches    ├───────────────────────────────►│  apply operations
///       │                     │◄───────────────────────────────┤
///       │                     │                                │
///       │                     │  (qproxy→fwd proxy)            │
///       │                     │                                │
///       │                     │         sync all nodes         │
///       │                     ├───────────────────────────────►│  wait consensus
///       │                     │          node synced           │  commit+term
///       │                     │◄───────────────────────────────┤
///       │                     │                                │
///       │   finish transfer   │  (unproxify)                   │
///       │◄────────────────────┤                                │
///       │  transfer finished  │        transfer finished       │  shard state:
///       ├────────────────────►├───────────────────────────────►│  Partial→Active
///       │                     │  shard state:                  │
///       │                     │  Partial→Active                │
///       │                     │                                │
///
/// # Cancel safety
///
/// This function is cancel safe.
///
/// If cancelled, the remote shard may only be partially recovered or transferred, and the local
/// shard may be left in an unexpected state. This must be resolved manually in case of
/// cancellation.
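///
/// # Example
///
/// An illustrative call site, assuming the transfer driver already holds everything this
/// function needs (the bindings below are placeholders, not actual driver code):
///
/// ```rust,ignore
/// transfer_snapshot(
///     transfer_config,
///     shard_holder,
///     progress,
///     shard_id,
///     remote_shard,
///     &channel_service,
///     &consensus,
///     &snapshots_path,
///     &collection_id,
///     &temp_dir,
/// )
/// .await?;
/// ```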
pub(super) async fn transfer_snapshot(
    transfer_config: ShardTransfer,
    shard_holder: Arc<LockedShardHolder>,
    progress: Arc<Mutex<TransferTaskProgress>>,
    shard_id: ShardId,
    remote_shard: RemoteShard,
    channel_service: &ChannelService,
    consensus: &dyn ShardTransferConsensus,
    snapshots_path: &Path,
    collection_id: &CollectionId,
    temp_dir: &Path,
) -> CollectionResult<()> {
    let remote_peer_id = remote_shard.peer_id;

    log::debug!(
        "Starting shard {shard_id} transfer to peer {remote_peer_id} using snapshot transfer"
    );

    let shard_holder_read = shard_holder.read().await;
    let local_rest_address = channel_service.current_rest_address(transfer_config.from)?;

    let transferring_shard = shard_holder_read.get_shard(shard_id);
    let Some(replica_set) = transferring_shard else {
        return Err(CollectionError::service_error(format!(
            "Shard {shard_id} cannot be queue proxied because it does not exist"
        )));
    };

    // Queue proxy local shard
    replica_set
        .queue_proxify_local(remote_shard.clone(), None, progress)
        .await?;

    debug_assert!(
        replica_set.is_queue_proxy().await,
        "Local shard must be a queue proxy",
    );
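
    // From this point on, updates hitting the local shard are also recorded by the queue
    // proxy, so they can be replayed to the remote once it has recovered the snapshot.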

    // The ability to read streaming snapshot format is introduced in 1.12 (#5179).
    let use_streaming_endpoint =
        channel_service.peer_is_at_version(remote_peer_id, &Version::new(1, 12, 0));

    let mut snapshot_temp_paths = Vec::new();
    let mut shard_download_url = local_rest_address;
    if use_streaming_endpoint {
        log::trace!("Using streaming endpoint for shard snapshot transfer");
        shard_download_url.set_path(&format!(
            "/collections/{collection_id}/shards/{shard_id}/snapshot",
        ));
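        // In this branch no snapshot file is created locally: `snapshot_temp_paths` stays
        // empty, so the cleanup loop below finds nothing to delete.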
    } else {
        // Create shard snapshot
        log::trace!("Creating snapshot of shard {shard_id} for shard snapshot transfer");
        let snapshot_description = shard_holder_read
            .create_shard_snapshot(snapshots_path, collection_id, shard_id, temp_dir)
            .await?;

        // TODO: If future is cancelled until `get_shard_snapshot_path` resolves, shard snapshot may not be cleaned up...
        let snapshot_temp_path = shard_holder_read
            .get_shard_snapshot_path(snapshots_path, shard_id, &snapshot_description.name)
            .await
            .map(TempPath::from_path)
            .map_err(|err| {
                CollectionError::service_error(format!(
                    "Failed to determine snapshot path, cannot continue with shard snapshot recovery: {err}",
                ))
            })?;
        let snapshot_checksum_temp_path =
            TempPath::from_path(get_checksum_path(&snapshot_temp_path));
        snapshot_temp_paths.push(snapshot_temp_path);
        snapshot_temp_paths.push(snapshot_checksum_temp_path);
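
        // Note: `TempPath` deletes the file it points to when dropped, so these snapshot
        // files are cleaned up automatically even if we bail out with an error below.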

        shard_download_url.set_path(&format!(
            "/collections/{collection_id}/shards/{shard_id}/snapshots/{}",
            &snapshot_description.name,
        ));
    };

    // Recover shard snapshot on remote
    log::trace!("Transferring and recovering shard {shard_id} snapshot on peer {remote_peer_id}");
    remote_shard
        .recover_shard_snapshot_from_url(
            collection_id,
            shard_id,
            &shard_download_url,
            SnapshotPriority::ShardTransfer,
            // Provide API key here so the remote can access our snapshot
            channel_service.api_key.as_deref(),
        )
        .await
        .map_err(|err| {
            CollectionError::service_error(format!(
                "Failed to recover shard snapshot on remote: {err}"
            ))
        })?;

    for snapshot_temp_path in snapshot_temp_paths {
        if let Err(err) = snapshot_temp_path.close() {
            log::warn!(
                "Failed to delete shard transfer snapshot after recovery, \
                 snapshot file may be left behind: {err}"
            );
        }
    }

    // Set shard state to Partial
    log::trace!("Shard {shard_id} snapshot recovered on {remote_peer_id} for snapshot transfer, switching into next stage through consensus");
    consensus
        .recovered_switch_to_partial_confirm_remote(&transfer_config, collection_id, &remote_shard)
        .await
        .map_err(|err| {
            CollectionError::service_error(format!(
                "Can't switch shard {shard_id} to Partial state after snapshot transfer: {err}"
            ))
        })?;

    // Transfer queued updates to remote, transform into forward proxy
    log::trace!("Transfer all queue proxy updates and transform into forward proxy");
    replica_set.queue_proxy_into_forward_proxy().await?;
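
    // From here on, new updates are forwarded to the remote immediately rather than queued,
    // so transfer errors surface now instead of after un-proxifying.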

    // Wait for Partial state in our replica set
    // Consensus sync is done right after this function
    let partial_state = ReplicaState::Partial;
    log::trace!("Wait for local shard to reach {partial_state:?} state");
    replica_set
        .wait_for_state(
            transfer_config.to,
            partial_state,
            defaults::CONSENSUS_META_OP_WAIT,
        )
        .await
        .map_err(|err| {
            CollectionError::service_error(format!(
                "Shard being transferred did not reach {partial_state:?} state in time: {err}",
            ))
        })?;

    log::debug!(
        "Ending shard {shard_id} transfer to peer {remote_peer_id} using snapshot transfer"
    );

    Ok(())
}