use std::collections::HashSet;
use std::future::{self, Future};
use std::sync::atomic::{self, AtomicBool};
use std::sync::Arc;
use std::time::Duration;
use std::{panic, thread};
use api::grpc::qdrant::qdrant_internal_client::QdrantInternalClient;
use api::grpc::qdrant::{GetConsensusCommitRequest, GetConsensusCommitResponse};
use api::grpc::transport_channel_pool::{self, TransportChannelPool};
use collection::shards::shard::ShardId;
use collection::shards::CollectionId;
use common::defaults;
use futures::stream::FuturesUnordered;
use futures::{FutureExt as _, StreamExt as _, TryStreamExt as _};
use itertools::Itertools;
use storage::content_manager::consensus_manager::ConsensusStateRef;
use storage::content_manager::toc::TableOfContent;
use storage::rbac::Access;
use tokio::{runtime, sync, time};
const READY_CHECK_TIMEOUT: Duration = Duration::from_millis(500);
const GET_CONSENSUS_COMMITS_RETRIES: usize = 2;
/// Structure used to process health checks, such as the `/readyz` endpoint.
pub struct HealthChecker {
// The state of the health checker.
// Once set to `true`, it should not change back to `false`.
// Initially set to `false`.
is_ready: Arc<AtomicBool>,
// The signal that notifies that the state has changed.
// Comes from the health checker task.
is_ready_signal: Arc<sync::Notify>,
// Signal to the health checker task that the API was called.
// Used to drive the health checker task and avoid constant polling.
check_ready_signal: Arc<sync::Notify>,
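// Cancels the health checker task when this structure is dropped.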
cancel: cancel::DropGuard,
}
impl HealthChecker {
pub fn spawn(
toc: Arc<TableOfContent>,
consensus_state: ConsensusStateRef,
runtime: &runtime::Handle,
wait_for_bootstrap: bool,
) -> Self {
let task = Task {
toc,
consensus_state,
is_ready: Default::default(),
is_ready_signal: Default::default(),
check_ready_signal: Default::default(),
cancel: Default::default(),
wait_for_bootstrap,
};
let health_checker = Self {
is_ready: task.is_ready.clone(),
is_ready_signal: task.is_ready_signal.clone(),
check_ready_signal: task.check_ready_signal.clone(),
cancel: task.cancel.clone().drop_guard(),
};
let task = runtime.spawn(task.exec());
drop(task); // drop the `JoinHandle` explicitly to make clippy happy
health_checker
}
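/// Check whether the node is ready, waiting up to `READY_CHECK_TIMEOUT` for
/// the health checker task to confirm readiness.
///
/// A minimal sketch of how a readiness endpoint might use this handle; the
/// `readyz` function and its return type are illustrative, not the actual
/// endpoint wiring:
///
/// ```ignore
/// async fn readyz(health: &HealthChecker) -> (u16, &'static str) {
///     if health.check_ready().await {
///         (200, "all shards are ready")
///     } else {
///         (503, "node is not ready yet")
///     }
/// }
/// ```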
pub async fn check_ready(&self) -> bool {
if self.is_ready() {
return true;
}
self.notify_task();
self.wait_ready().await
}
pub fn is_ready(&self) -> bool {
self.is_ready.load(atomic::Ordering::Relaxed)
}
pub fn notify_task(&self) {
self.check_ready_signal.notify_one();
}
async fn wait_ready(&self) -> bool {
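// Create the `Notified` future *before* re-checking the flag: it is
// guaranteed to receive wakeups from `notify_waiters` from the moment it is
// created, so a notification can't slip in between the check and the await.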
let is_ready_signal = self.is_ready_signal.notified();
if self.is_ready() {
return true;
}
time::timeout(READY_CHECK_TIMEOUT, is_ready_signal)
.await
.is_ok()
}
}
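/// Background task that performs the actual readiness checks and sets
/// `is_ready` once the node has caught up with the cluster and all local
/// shards are healthy.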
pub struct Task {
toc: Arc<TableOfContent>,
consensus_state: ConsensusStateRef,
// Shared state with the health checker
// Once set to `true`, it should not change back to `false`.
is_ready: Arc<AtomicBool>,
// Used to notify the health checker service that the state has changed.
is_ready_signal: Arc<sync::Notify>,
// Driver signal for the health checker task
// Once received, the task should proceed with an attempt to check the state.
// Usually comes from the API call, but can be triggered by the task itself.
check_ready_signal: Arc<sync::Notify>,
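// Token used to cancel this task.
// Cancelled when the associated `HealthChecker` handle is dropped.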
cancel: cancel::CancellationToken,
wait_for_bootstrap: bool,
}
impl Task {
pub async fn exec(mut self) {
while let Err(err) = self.exec_catch_unwind().await {
let message = common::panic::downcast_str(&err).unwrap_or("");
let separator = if !message.is_empty() { ": " } else { "" };
log::error!("HealthChecker task panicked, retrying{separator}{message}");
}
}
async fn exec_catch_unwind(&mut self) -> thread::Result<()> {
panic::AssertUnwindSafe(self.exec_cancel())
.catch_unwind()
.await
}
async fn exec_cancel(&mut self) {
let _ = cancel::future::cancel_on_token(self.cancel.clone(), self.exec_impl()).await;
}
async fn exec_impl(&mut self) {
// Wait until the node joins a cluster for the first time
//
// If this is a new deployment and the `--bootstrap` CLI parameter was specified...
if self.wait_for_bootstrap {
// Check if this is the only node in the cluster
while self.consensus_state.peer_count() <= 1 {
// If there are no other peers yet, make another attempt to check
// after we receive another call to `/readyz`
//
// Wait for `/readyz` signal
self.check_ready_signal.notified().await;
}
}
// Artificially simulate a signal from the `/readyz` endpoint,
// as if it had already been called by the user.
// This allows checking the happy path without waiting for the first call.
self.check_ready_signal.notify_one();
// Get *cluster* commit index, or check if this is the only node in the cluster
let Some(cluster_commit_index) = self.cluster_commit_index().await else {
self.set_ready();
return;
};
// Check if *local* commit index >= *cluster* commit index...
while self.commit_index() < cluster_commit_index {
// Wait for `/readyz` signal
self.check_ready_signal.notified().await;
// If not:
//
// - Check if this is the only node in the cluster
if self.consensus_state.peer_count() <= 1 {
self.set_ready();
return;
}
// TODO: Do we want to update `cluster_commit_index` here?
//
// I.e.:
// - If we *don't* update `cluster_commit_index`, then we will only wait till the node
// catches up with the cluster commit index *at the moment the node was started*
// - If we *do* update `cluster_commit_index`, then we will keep tracking cluster
// commit index updates and wait till the node *completely* catches up with the leader,
// which might be hard (if not impossible) in some situations
}
// Collect "unhealthy" shards list
let mut unhealthy_shards = self.unhealthy_shards().await;
// Check if all shards are "healthy"...
while !unhealthy_shards.is_empty() {
// If not:
//
// - Wait for `/readyz` signal
self.check_ready_signal.notified().await;
// - Refresh "unhealthy" shards list
let current_unhealthy_shards = self.unhealthy_shards().await;
// - Check if any shards "healed" since last check
unhealthy_shards.retain(|shard| current_unhealthy_shards.contains(shard));
}
self.set_ready();
}
async fn cluster_commit_index(&self) -> Option<u64> {
// Wait for `/readyz` signal
self.check_ready_signal.notified().await;
// Check if there is only 1 node in the cluster
if self.consensus_state.peer_count() <= 1 {
return None;
}
// Get *cluster* commit index
let peer_address_by_id = self.consensus_state.peer_address_by_id();
let transport_channel_pool = &self.toc.get_channel_service().channel_pool;
let this_peer_id = self.toc.this_peer_id;
let this_peer_uri = peer_address_by_id.get(&this_peer_id);
let mut requests = peer_address_by_id
.values()
// Do not get the current commit from ourselves
.filter(|&uri| Some(uri) != this_peer_uri)
// Historic peers might use the same URLs as our current peers, so request each URI only once
.unique()
.map(|uri| get_consensus_commit(transport_channel_pool, uri))
.collect::<FuturesUnordered<_>>()
.inspect_err(|err| log::error!("GetConsensusCommit request failed: {err}"))
.filter_map(|res| future::ready(res.ok()));
// Raft commits a consensus operation once the majority of nodes have persisted it.
//
// This means that if we check a majority of nodes (i.e., `total nodes / 2 + 1`), at least one
// of these nodes will *always* have an up-to-date commit index. And so, the highest commit
// index among a majority of nodes *is* the cluster commit index.
//
// Our current node *is* one of the cluster nodes, so it's enough to query `total nodes / 2`
// *additional* nodes to get the cluster commit index.
//
// The check goes like this:
// - Either at least one of the "additional" nodes returns a *higher* commit index, which
// means our node is *not* up-to-date, and we have to wait until it reaches this commit index
// - Or *all* of them return a *lower* commit index, which means the current node is *already*
// up-to-date, and the `/readyz` check will proceed to the next step
//
// Example:
//
// Total nodes: 2
// Required: 2 / 2 = 1
//
// Total nodes: 3
// Required: 3 / 2 = 1
//
// Total nodes: 4
// Required: 4 / 2 = 2
//
// Total nodes: 5
// Required: 5 / 2 = 2
let sufficient_commit_indices_count = peer_address_by_id.len() / 2;
// *Wait* for `total nodes / 2` successful responses...
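// Poll the stream by `&mut` reference, so the remaining responses can still
// be drained from the same stream below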
let mut commit_indices: Vec<_> = (&mut requests)
.take(sufficient_commit_indices_count)
.collect()
.await;
// ...and also collect any additional responses that we might have *already* received
while let Ok(Some(resp)) = time::timeout(Duration::ZERO, requests.next()).await {
commit_indices.push(resp);
}
// Find the maximum commit index among all responses.
//
// Note that we progress even if most (or even *all*) requests failed (e.g., because all
// other nodes are unavailable or don't support the `GetConsensusCommit` gRPC API).
//
// So this check is not 100% reliable and can give a false-positive result!
let cluster_commit_index = commit_indices
.into_iter()
.map(|resp| resp.into_inner().commit)
.max()
.unwrap_or(0);
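// `as _` converts the commit value from the gRPC response to the `u64`
// returned by this function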
Some(cluster_commit_index as _)
}
fn commit_index(&self) -> u64 {
// TODO: Blocking call in async context!?
self.consensus_state
.persistent
.read()
.last_applied_entry()
.unwrap_or(0)
}
/// List shards that are unhealthy, which may undergo automatic recovery.
///
/// Shards in resharding state are not considered unhealthy and are excluded here.
/// They require an external driver to make them active or to drop them.
async fn unhealthy_shards(&self) -> HashSet<Shard> {
let this_peer_id = self.toc.this_peer_id;
let collections = self
.toc
.all_collections(&Access::full("For health check"))
.await;
let mut unhealthy_shards = HashSet::new();
for collection_pass in &collections {
let state = match self.toc.get_collection(collection_pass).await {
Ok(collection) => collection.state().await,
Err(_) => continue,
};
for (&shard, info) in state.shards.iter() {
let Some(state) = info.replicas.get(&this_peer_id) else {
continue;
};
if state.is_active_or_listener_or_resharding() {
continue;
}
unhealthy_shards.insert(Shard::new(collection_pass.name(), shard));
}
}
unhealthy_shards
}
fn set_ready(&self) {
self.is_ready.store(true, atomic::Ordering::Relaxed);
self.is_ready_signal.notify_waiters();
}
}
fn get_consensus_commit<'a>(
transport_channel_pool: &'a TransportChannelPool,
uri: &'a tonic::transport::Uri,
) -> impl Future<Output = GetConsensusCommitResult> + 'a {
transport_channel_pool.with_channel_timeout(
uri,
|channel| async {
let mut client = QdrantInternalClient::new(channel);
let mut request = tonic::Request::new(GetConsensusCommitRequest {});
request.set_timeout(defaults::CONSENSUS_META_OP_WAIT);
client.get_consensus_commit(request).await
},
Some(defaults::CONSENSUS_META_OP_WAIT),
GET_CONSENSUS_COMMITS_RETRIES,
)
}
type GetConsensusCommitResult = Result<
tonic::Response<GetConsensusCommitResponse>,
transport_channel_pool::RequestError<tonic::Status>,
>;
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct Shard {
collection: CollectionId,
shard: ShardId,
}
impl Shard {
pub fn new(collection: impl Into<CollectionId>, shard: ShardId) -> Self {
Self {
collection: collection.into(),
shard,
}
}
}
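// A minimal sketch (not part of the original file) of the "healing"
// bookkeeping performed in `Task::exec_impl`: shards missing from the
// refreshed unhealthy set are dropped from the tracked set. It assumes that
// `CollectionId` converts from `&str` and that `ShardId` is an integer alias,
// as the `Shard::new` signature above suggests.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn healed_shards_are_dropped_from_tracking() {
        // Two shards of a hypothetical `points` collection start out unhealthy.
        let mut unhealthy = HashSet::from([Shard::new("points", 0), Shard::new("points", 1)]);

        // A refreshed check reports only shard 1 as still unhealthy.
        let current = HashSet::from([Shard::new("points", 1)]);

        // Same `retain` call as in `Task::exec_impl`: shard 0 "healed", so it
        // is dropped from the tracked set.
        unhealthy.retain(|shard| current.contains(shard));
        assert_eq!(unhealthy, current);
    }
}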