use std::ops::Deref as _;
use std::path::Path;

use common::tar_ext;
use segment::types::SnapshotFormat;

use super::{ReplicaSetState, ReplicaState, ShardReplicaSet, REPLICA_STATE_FILE};
use crate::operations::types::{CollectionError, CollectionResult};
use crate::save_on_disk::SaveOnDisk;
use crate::shards::dummy_shard::DummyShard;
use crate::shards::local_shard::LocalShard;
use crate::shards::shard::{PeerId, Shard};
use crate::shards::shard_config::ShardConfig;

impl ShardReplicaSet {
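    /// Write a snapshot of this replica set into `tar`.
    ///
    /// Includes the local shard data (if a local shard exists), the replica set state
    /// and the shard config. `temp_path` is passed through to the local shard's
    /// snapshot routine.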
    pub async fn create_snapshot(
        &self,
        temp_path: &Path,
        tar: &tar_ext::BuilderExt,
        format: SnapshotFormat,
        save_wal: bool,
    ) -> CollectionResult<()> {
        let local_read = self.local.read().await;

        if let Some(local) = &*local_read {
            local
                .create_snapshot(temp_path, tar, format, save_wal)
                .await?
        }

        self.replica_state
            .save_to_tar(tar, REPLICA_STATE_FILE)
            .await?;

        let shard_config = ShardConfig::new_replica_set();
        shard_config.save_to_tar(tar).await?;
        Ok(())
    }

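    /// Restore a replica set from a shard snapshot unpacked at `snapshot_path`.
    ///
    /// Rewrites the stored replica set state to belong to `this_peer_id`. In
    /// non-distributed mode all remote peers are dropped and this replica is marked
    /// `Active`. If the snapshot contains local shard data, that data is restored too.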
    pub fn restore_snapshot(
        snapshot_path: &Path,
        this_peer_id: PeerId,
        is_distributed: bool,
    ) -> CollectionResult<()> {
        let replica_state: SaveOnDisk<ReplicaSetState> =
            SaveOnDisk::load_or_init_default(snapshot_path.join(REPLICA_STATE_FILE))?;

        // Whether this shard has local data
        let is_snapshot_local = replica_state.read().is_local;

        if !is_distributed && !is_snapshot_local {
            return Err(CollectionError::service_error(format!(
                "Can't restore snapshot in local mode with missing data at shard: {}",
                snapshot_path.display()
            )));
        }

        replica_state.write(|state| {
            state.this_peer_id = this_peer_id;

            if is_distributed {
                state
                    .peers
                    .remove(&this_peer_id)
                    .and_then(|replica_state| state.peers.insert(this_peer_id, replica_state));
            } else {
                // In local mode we don't want any remote peers
                state.peers.clear();
                state.peers.insert(this_peer_id, ReplicaState::Active);
            }
        })?;

        if replica_state.read().is_local {
            LocalShard::restore_snapshot(snapshot_path)?;
        }

        Ok(())
    }

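    /// Replace the local shard with the shard data found at `replica_path`.
    ///
    /// Returns `Ok(false)` if `replica_path` does not contain shard data. On failure the
    /// local shard is replaced with a dummy shard and the shard directory is removed.
    ///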
    /// # Cancel safety
    ///
    /// This method is *not* cancel safe.
    pub async fn restore_local_replica_from(
        &self,
        replica_path: &Path,
        cancel: cancel::CancellationToken,
    ) -> CollectionResult<bool> {
        // `local.take()` call and `restore` task have to be executed as a single transaction

        if !LocalShard::check_data(replica_path) {
            return Ok(false);
        }

        // TODO:
        //   Check that shard snapshot is compatible with the collection
        //   (see `VectorsConfig::check_compatible_with_segment_config`)

        let mut local = cancel::future::cancel_on_token(cancel.clone(), self.local.write()).await?;

        // Check `cancel` token one last time before starting non-cancellable section
        if cancel.is_cancelled() {
            return Err(cancel::Error::Cancelled.into());
        }

        // Drop `LocalShard` instance to free resources and clear shard data
        let clear = local.take().is_some();

        // Try to restore local replica from specified shard snapshot directory
        let restore = async {
            if clear {
                LocalShard::clear(&self.shard_path).await?;
            }

            LocalShard::move_data(replica_path, &self.shard_path).await?;

            LocalShard::load(
                self.shard_id,
                self.collection_id.clone(),
                &self.shard_path,
                self.collection_config.clone(),
                self.optimizers_config.clone(),
                self.shared_storage_config.clone(),
                self.payload_index_schema.clone(),
                self.update_runtime.clone(),
                self.search_runtime.clone(),
                self.optimizer_cpu_budget.clone(),
            )
            .await
        };
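
        // On success install the restored shard; on failure fall back to a dummy shard,
        // possibly mark this replica as locally disabled, and remove the shard directory
        // so we don't leave corrupted data behind.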
        match restore.await {
            Ok(new_local) => {
                local.replace(Shard::Local(new_local));
                Ok(true)
            }

            Err(restore_err) => {
                // Initialize "dummy" replica
                local.replace(Shard::Dummy(DummyShard::new(
                    "Failed to restore local replica",
                )));

                // TODO: Handle single-node mode!? (How!? 😰)

                // Mark this peer as "locally disabled"...
                let has_other_active_peers = !self.active_remote_shards().is_empty();

                // ...if this peer is *not* the last active replica
                if has_other_active_peers {
                    let notify = self
                        .locally_disabled_peers
                        .write()
                        .disable_peer_and_notify_if_elapsed(self.this_peer_id(), None);

                    if notify {
                        self.notify_peer_failure_cb.deref()(
                            self.this_peer_id(),
                            self.shard_id,
                            None,
                        );
                    }
                }

                // Remove shard directory, so we don't leave empty directory/corrupted data
                match tokio::fs::remove_dir_all(&self.shard_path).await {
                    Ok(()) => Err(restore_err),

                    Err(cleanup_err) => {
                        log::error!(
                            "Failed to cleanup shard {} directory ({}) after restore failed: \
                             {cleanup_err}",
                            self.shard_id,
                            self.shard_path.display(),
                        );

                        // TODO: Contextualize `restore_err` with `cleanup_err` details!?
                        Err(restore_err)
                    }
                }
            }
        }
    }
}