use std::ops::Deref as _;
use std::path::Path;

use common::tar_ext;
use segment::types::SnapshotFormat;

use super::{ReplicaSetState, ReplicaState, ShardReplicaSet, REPLICA_STATE_FILE};
use crate::operations::types::{CollectionError, CollectionResult};
use crate::save_on_disk::SaveOnDisk;
use crate::shards::dummy_shard::DummyShard;
use crate::shards::local_shard::LocalShard;
use crate::shards::shard::{PeerId, Shard};
use crate::shards::shard_config::ShardConfig;

impl ShardReplicaSet {
    pub async fn create_snapshot(
        &self,
        temp_path: &Path,
        tar: &tar_ext::BuilderExt,
        format: SnapshotFormat,
        save_wal: bool,
    ) -> CollectionResult<()> {
        let local_read = self.local.read().await;

        if let Some(local) = &*local_read {
            local
                .create_snapshot(temp_path, tar, format, save_wal)
                .await?;
        }

        self.replica_state
            .save_to_tar(tar, REPLICA_STATE_FILE)
            .await?;

        // Save shard config into the snapshot, so it is recognized as a replica set shard on restore
        let shard_config = ShardConfig::new_replica_set();
        shard_config.save_to_tar(tar).await?;
        Ok(())
    }

    pub fn restore_snapshot(
        snapshot_path: &Path,
        this_peer_id: PeerId,
        is_distributed: bool,
    ) -> CollectionResult<()> {
        let replica_state: SaveOnDisk<ReplicaSetState> =
            SaveOnDisk::load_or_init_default(snapshot_path.join(REPLICA_STATE_FILE))?;

        // If this shard snapshot has local data
        let is_snapshot_local = replica_state.read().is_local;

        if !is_distributed && !is_snapshot_local {
            return Err(CollectionError::service_error(format!(
                "Can't restore snapshot in local mode with missing data at shard: {}",
                snapshot_path.display()
            )));
        }

        replica_state.write(|state| {
            state.this_peer_id = this_peer_id;
            if is_distributed {
                state
                    .peers
                    .remove(&this_peer_id)
                    .and_then(|replica_state| state.peers.insert(this_peer_id, replica_state));
            } else {
                // In local mode we don't want any remote peers
                state.peers.clear();
                state.peers.insert(this_peer_id, ReplicaState::Active);
            }
        })?;

        if replica_state.read().is_local {
            LocalShard::restore_snapshot(snapshot_path)?;
        }
        Ok(())
    }

    /// # Cancel safety
    ///
    /// This method is *not* cancel safe.
    pub async fn restore_local_replica_from(
        &self,
        replica_path: &Path,
        cancel: cancel::CancellationToken,
    ) -> CollectionResult<bool> {
        // `local.take()` call and `restore` task have to be executed as a single transaction

        if !LocalShard::check_data(replica_path) {
            return Ok(false);
        }

        // TODO:
        // Check that shard snapshot is compatible with the collection
        // (see `VectorsConfig::check_compatible_with_segment_config`)

        let mut local = cancel::future::cancel_on_token(cancel.clone(), self.local.write()).await?;

        // Check `cancel` token one last time before starting non-cancellable section
        if cancel.is_cancelled() {
            return Err(cancel::Error::Cancelled.into());
        }

        // Drop `LocalShard` instance to free resources and clear shard data
        let clear = local.take().is_some();

        // Try to restore local replica from specified shard snapshot directory
        let restore = async {
            if clear {
                LocalShard::clear(&self.shard_path).await?;
            }

            LocalShard::move_data(replica_path, &self.shard_path).await?;

            LocalShard::load(
                self.shard_id,
                self.collection_id.clone(),
                &self.shard_path,
                self.collection_config.clone(),
                self.optimizers_config.clone(),
                self.shared_storage_config.clone(),
                self.payload_index_schema.clone(),
                self.update_runtime.clone(),
                self.search_runtime.clone(),
                self.optimizer_cpu_budget.clone(),
            )
            .await
        };

        match restore.await {
            Ok(new_local) => {
                local.replace(Shard::Local(new_local));
                Ok(true)
            }
            Err(restore_err) => {
                // Initialize "dummy" replica
                local.replace(Shard::Dummy(DummyShard::new(
                    "Failed to restore local replica",
                )));

                // TODO: Handle single-node mode!? (How!? 😰)
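                // Note: replacing the local shard with a dummy keeps the replica set
                // loadable, while the dummy itself rejects all operations with the
                // message above until the replica is recovered (e.g. re-initialized
                // by a shard transfer).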
                // Mark this peer as "locally disabled"...
                let has_other_active_peers = !self.active_remote_shards().is_empty();

                // ...if this peer is *not* the last active replica
                if has_other_active_peers {
                    let notify = self
                        .locally_disabled_peers
                        .write()
                        .disable_peer_and_notify_if_elapsed(self.this_peer_id(), None);

                    if notify {
                        self.notify_peer_failure_cb.deref()(
                            self.this_peer_id(),
                            self.shard_id,
                            None,
                        );
                    }
                }

                // Remove shard directory, so we don't leave empty directory/corrupted data
                match tokio::fs::remove_dir_all(&self.shard_path).await {
                    Ok(()) => Err(restore_err),
                    Err(cleanup_err) => {
                        log::error!(
                            "Failed to cleanup shard {} directory ({}) after restore failed: \
                             {cleanup_err}",
                            self.shard_id,
                            self.shard_path.display(),
                        );

                        // TODO: Contextualize `restore_err` with `cleanup_err` details!?
                        Err(restore_err)
                    }
                }
            }
        }
    }
}
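
// A minimal usage sketch (hypothetical caller; `replica_set` and `snapshot_dir`
// are assumed names, not part of this module). It shows how a caller might drive
// `restore_local_replica_from` under a fresh cancellation token, given
// `replica_set: &ShardReplicaSet` and `snapshot_dir: &Path`:
//
//     let cancel = cancel::CancellationToken::new();
//     let restored = replica_set
//         .restore_local_replica_from(snapshot_dir, cancel.clone())
//         .await?;
//     if !restored {
//         // `Ok(false)` means `snapshot_dir` contained no local shard data,
//         // so nothing was restored and the existing local shard is untouched
//     }
//
// Because the method is not cancel safe, the caller should not drop the future
// mid-flight once the non-cancellable section has started.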