// src/consensus.rs
use std::collections::{HashMap, HashSet};
use std::str::FromStr;
use std::sync::{mpsc, Arc};
use std::thread::JoinHandle;
use std::time::{Duration, Instant};
use std::{fmt, thread};
use anyhow::{anyhow, Context as _};
use api::grpc::dynamic_channel_pool::make_grpc_channel;
use api::grpc::qdrant::raft_client::RaftClient;
use api::grpc::qdrant::{AllPeers, PeerId as GrpcPeerId, RaftMessage as GrpcRaftMessage};
use api::grpc::transport_channel_pool::TransportChannelPool;
use collection::shards::channel_service::ChannelService;
use collection::shards::shard::PeerId;
#[cfg(target_os = "linux")]
use common::cpu::linux_high_thread_priority;
use common::defaults;
use raft::eraftpb::Message as RaftMessage;
use raft::prelude::*;
use raft::{SoftState, StateRole, INVALID_ID};
use storage::content_manager::consensus_manager::ConsensusStateRef;
use storage::content_manager::consensus_ops::{ConsensusOperations, SnapshotStatus};
use storage::content_manager::toc::TableOfContent;
use tokio::runtime::Handle;
use tokio::sync::mpsc::{Receiver, Sender};
use tokio::sync::watch;
use tokio::time::sleep;
use tonic::transport::{ClientTlsConfig, Uri};
use crate::common::helpers;
use crate::common::telemetry_ops::requests_telemetry::TonicTelemetryCollector;
use crate::settings::{ConsensusConfig, Settings};
use crate::tonic::init_internal;
type Node = RawNode<ConsensusStateRef>;
const RECOVERY_RETRY_TIMEOUT: Duration = Duration::from_secs(1);
const RECOVERY_MAX_RETRY_COUNT: usize = 3;
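/// A message for the consensus thread: either a proposal coming from a local client
/// or a raw Raft message forwarded from another peer.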
pub enum Message {
FromClient(ConsensusOperations),
FromPeer(Box<RaftMessage>),
}
/// Aka Consensus Thread
/// Manages proposed changes to consensus state, ensures that everything is ordered properly
pub struct Consensus {
/// Raft structure which handles raft-related state
node: Node,
/// Receives proposals from peers and client for applying in consensus
receiver: Receiver<Message>,
/// Runtime for async message sending
runtime: Handle,
/// Uri to some other known peer, used to join the consensus
    /// ToDo: Make it many
config: ConsensusConfig,
broker: RaftMessageBroker,
}
impl Consensus {
/// Create and run consensus node
#[allow(clippy::too_many_arguments)]
pub fn run(
logger: &slog::Logger,
state_ref: ConsensusStateRef,
bootstrap_peer: Option<Uri>,
uri: Option<String>,
settings: Settings,
channel_service: ChannelService,
propose_receiver: mpsc::Receiver<ConsensusOperations>,
telemetry_collector: Arc<parking_lot::Mutex<TonicTelemetryCollector>>,
toc: Arc<TableOfContent>,
runtime: Handle,
reinit: bool,
) -> anyhow::Result<JoinHandle<std::io::Result<()>>> {
let tls_client_config = helpers::load_tls_client_config(&settings)?;
let p2p_host = settings.service.host.clone();
let p2p_port = settings.cluster.p2p.port.expect("P2P port is not set");
let config = settings.cluster.consensus.clone();
let (mut consensus, message_sender) = Self::new(
logger,
state_ref.clone(),
bootstrap_peer,
uri,
p2p_port,
config,
tls_client_config,
channel_service,
runtime.clone(),
reinit,
)?;
let state_ref_clone = state_ref.clone();
thread::Builder::new()
.name("consensus".to_string())
.spawn(move || {
// On Linux, try to use high thread priority because consensus is important
// Likely fails as we cannot set a higher priority by default due to permissions
#[cfg(target_os = "linux")]
if let Err(err) = linux_high_thread_priority() {
log::debug!(
"Failed to set high thread priority for consensus, ignoring: {err}"
);
}
if let Err(err) = consensus.start() {
log::error!("Consensus stopped with error: {err:#}");
state_ref_clone.on_consensus_thread_err(err);
} else {
log::info!("Consensus stopped");
state_ref_clone.on_consensus_stopped();
}
})?;
let message_sender_moved = message_sender.clone();
thread::Builder::new()
.name("forward-proposals".to_string())
.spawn(move || {
// On Linux, try to use high thread priority because consensus is important
// Likely fails as we cannot set a higher priority by default due to permissions
#[cfg(target_os = "linux")]
if let Err(err) = linux_high_thread_priority() {
log::debug!(
"Failed to set high thread priority for consensus, ignoring: {err}"
);
}
while let Ok(entry) = propose_receiver.recv() {
if message_sender_moved
.blocking_send(Message::FromClient(entry))
.is_err()
{
log::error!("Can not forward new entry to consensus as it was stopped.");
break;
}
}
})?;
let server_tls = if settings.cluster.p2p.enable_tls {
let tls_config = settings
.tls
.clone()
.ok_or_else(Settings::tls_config_is_undefined_error)?;
Some(helpers::load_tls_internal_server_config(&tls_config)?)
} else {
None
};
let handle = thread::Builder::new()
.name("grpc_internal".to_string())
.spawn(move || {
init_internal(
toc,
state_ref,
telemetry_collector,
settings,
p2p_host,
p2p_port,
server_tls,
message_sender,
runtime,
)
})
.unwrap();
Ok(handle)
}
    /// If a `bootstrap_peer` is supplied, then either `uri` or `p2p_port` should also be supplied
#[allow(clippy::too_many_arguments)]
pub fn new(
logger: &slog::Logger,
state_ref: ConsensusStateRef,
bootstrap_peer: Option<Uri>,
uri: Option<String>,
p2p_port: u16,
config: ConsensusConfig,
tls_config: Option<ClientTlsConfig>,
channel_service: ChannelService,
runtime: Handle,
reinit: bool,
) -> anyhow::Result<(Self, Sender<Message>)> {
// If we want to re-initialize consensus, we need to prevent other peers
        // from replaying consensus WAL operations, as they should already have them applied.
        // To ensure that, we force WAL compaction on the first re-initialized peer,
        // which should trigger snapshot transfer instead of WAL replay.
let force_compact_wal = reinit && bootstrap_peer.is_none();
        // On the bootstrapped peers, during re-initialization of the consensus,
        // we want to make sure that only the bootstrap peer holds the true state.
        // Therefore we clear the WAL on the bootstrapped peers to force them to request a snapshot
let clear_wal = reinit && bootstrap_peer.is_some();
if clear_wal {
log::debug!("Clearing WAL on the bootstrap peer to force snapshot transfer");
state_ref.clear_wal()?;
}
// raft will not return entries to the application smaller or equal to `applied`
let last_applied = state_ref.last_applied_entry().unwrap_or_default();
let raft_config = Config {
id: state_ref.this_peer_id(),
applied: last_applied,
..Default::default()
};
raft_config.validate()?;
let op_wait = defaults::CONSENSUS_META_OP_WAIT;
// Commit might take up to 4 ticks as:
// 1 tick - send proposal to leader
// 2 tick - leader sends append entries to peers
// 3 tick - peer answers leader, that entry is persisted
// 4 tick - leader increases commit index and sends it
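        // E.g. with a tick period of 100ms, a commit may take up to 4 * 100ms = 400ms,
        // which must stay below `op_wait`.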
if 4 * Duration::from_millis(config.tick_period_ms) > op_wait {
            log::warn!(
                "With current tick period of {}ms, operation commit time might exceed default wait timeout: {}ms",
                config.tick_period_ms,
                op_wait.as_millis(),
            )
}
// bounded channel for backpressure
let (sender, receiver) = tokio::sync::mpsc::channel(config.max_message_queue_size);
        // State might be initialized, but the node might have been shut down without actually syncing or committing anything.
if state_ref.is_new_deployment() || reinit {
let leader_established_in_ms =
config.tick_period_ms * raft_config.max_election_tick() as u64;
Self::init(
&state_ref,
bootstrap_peer.clone(),
uri,
p2p_port,
&config,
tls_config.clone(),
&runtime,
leader_established_in_ms,
)
.map_err(|err| anyhow!("Failed to initialize Consensus for new Raft state: {}", err))?;
} else {
runtime
.block_on(Self::recover(
&state_ref,
uri.clone(),
p2p_port,
&config,
tls_config.clone(),
))
.map_err(|err| {
anyhow!(
"Failed to recover Consensus from existing Raft state: {}",
err
)
})?;
if bootstrap_peer.is_some() || uri.is_some() {
log::debug!("Local raft state found - bootstrap and uri cli arguments were ignored")
}
log::debug!("Local raft state found - skipping initialization");
};
let mut node = Node::new(&raft_config, state_ref.clone(), logger)?;
node.set_batch_append(true);
// Before consensus has started apply any unapplied committed entries
// They might have not been applied due to unplanned Qdrant shutdown
let _stop_consensus = state_ref.apply_entries(&mut node)?;
if force_compact_wal {
// Making sure that the WAL will be compacted on start
state_ref.compact_wal(1)?;
} else {
state_ref.compact_wal(config.compact_wal_entries)?;
}
let broker = RaftMessageBroker::new(
runtime.clone(),
bootstrap_peer,
tls_config,
config.clone(),
node.store().clone(),
channel_service.channel_pool,
);
let consensus = Self {
node,
receiver,
runtime,
config,
broker,
};
if !state_ref.is_new_deployment() {
state_ref.recover_first_voter()?;
}
Ok((consensus, sender))
}
#[allow(clippy::too_many_arguments)]
fn init(
state_ref: &ConsensusStateRef,
bootstrap_peer: Option<Uri>,
uri: Option<String>,
p2p_port: u16,
config: &ConsensusConfig,
tls_config: Option<ClientTlsConfig>,
runtime: &Handle,
leader_established_in_ms: u64,
) -> anyhow::Result<()> {
if let Some(bootstrap_peer) = bootstrap_peer {
log::debug!("Bootstrapping from peer with address: {bootstrap_peer}");
runtime.block_on(Self::bootstrap(
state_ref,
bootstrap_peer,
uri,
p2p_port,
config,
tls_config,
))?;
Ok(())
} else {
log::debug!(
"Bootstrapping is disabled. Assuming this peer is the first in the network"
);
let tick_period = config.tick_period_ms;
log::info!("With current tick period of {tick_period}ms, leader will be established in approximately {leader_established_in_ms}ms. To avoid rejected operations - add peers and submit operations only after this period.");
// First peer needs to add its own address
state_ref.add_peer(
state_ref.this_peer_id(),
uri.ok_or_else(|| anyhow::anyhow!("First peer should specify its uri."))?
.parse()?,
)?;
Ok(())
}
}
async fn add_peer_to_known_for(
this_peer_id: PeerId,
cluster_uri: Uri,
current_uri: Option<String>,
p2p_port: u16,
config: &ConsensusConfig,
tls_config: Option<ClientTlsConfig>,
) -> anyhow::Result<AllPeers> {
// Use dedicated transport channel for bootstrapping because of specific timeout
let channel = make_grpc_channel(
Duration::from_secs(config.bootstrap_timeout_sec),
Duration::from_secs(config.bootstrap_timeout_sec),
cluster_uri,
tls_config,
)
.await
.map_err(|err| anyhow!("Failed to create timeout channel: {err}"))?;
let mut client = RaftClient::new(channel);
let all_peers = client
.add_peer_to_known(tonic::Request::new(
api::grpc::qdrant::AddPeerToKnownMessage {
uri: current_uri,
port: Some(u32::from(p2p_port)),
id: this_peer_id,
},
))
.await
.map_err(|err| anyhow!("Failed to add peer to known: {err}"))?
.into_inner();
Ok(all_peers)
}
// Re-attach peer to the consensus:
    // Notifies the cluster (any node) that this node has changed its address
async fn recover(
state_ref: &ConsensusStateRef,
uri: Option<String>,
p2p_port: u16,
config: &ConsensusConfig,
tls_config: Option<ClientTlsConfig>,
) -> anyhow::Result<()> {
let this_peer_id = state_ref.this_peer_id();
let mut peer_to_uri = state_ref
.persistent
.read()
.peer_address_by_id
.read()
.clone();
let this_peer_url = peer_to_uri.remove(&this_peer_id);
// Recover url if a different one is provided
let do_recover = match (&this_peer_url, &uri) {
(Some(this_peer_url), Some(uri)) => this_peer_url != &Uri::from_str(uri)?,
_ => false,
};
if do_recover {
let mut tries = RECOVERY_MAX_RETRY_COUNT;
while tries > 0 {
// Try to inform any peer about the change of address
for (peer_id, peer_uri) in &peer_to_uri {
let res = Self::add_peer_to_known_for(
this_peer_id,
peer_uri.clone(),
uri.clone(),
p2p_port,
config,
tls_config.clone(),
)
.await;
if res.is_err() {
log::warn!(
"Failed to recover from peer with id {} at {} with error {:?}, trying others",
peer_id,
peer_uri,
res
);
} else {
log::debug!(
"Successfully recovered from peer with id {} at {}",
peer_id,
peer_uri
);
return Ok(());
}
}
tries -= 1;
log::warn!(
"Retrying recovering from known peers (retry {})",
RECOVERY_MAX_RETRY_COUNT - tries
);
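                // Back off before the next retry round; the delay grows linearly with the number of failed rounds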
let exp_timeout =
RECOVERY_RETRY_TIMEOUT * (RECOVERY_MAX_RETRY_COUNT - tries) as u32;
sleep(exp_timeout).await;
}
return Err(anyhow::anyhow!("Failed to recover from any known peers"));
}
Ok(())
}
/// Add node sequence:
///
/// 1. Add current node as a learner
/// 2. Start applying entries from consensus
/// 3. Eventually leader submits the promotion proposal
/// 4. Learners become voters once they read about the promotion from consensus log
async fn bootstrap(
state_ref: &ConsensusStateRef,
bootstrap_peer: Uri,
uri: Option<String>,
p2p_port: u16,
config: &ConsensusConfig,
tls_config: Option<ClientTlsConfig>,
) -> anyhow::Result<()> {
let this_peer_id = state_ref.this_peer_id();
let all_peers = Self::add_peer_to_known_for(
this_peer_id,
bootstrap_peer,
uri.clone(),
p2p_port,
config,
tls_config,
)
.await?;
// Although peer addresses are synchronized with consensus, addresses need to be pre-fetched in the case of a new peer
// or it will not know how to answer the Raft leader
for peer in all_peers.all_peers {
state_ref
.add_peer(
peer.id,
peer.uri
.parse()
.context(format!("Failed to parse peer URI: {}", peer.uri))?,
)
.map_err(|err| anyhow!("Failed to add peer: {}", err))?
}
        // Only the first peer has itself as a voter in the initial conf state.
// This needs to be propagated manually to other peers as it is not contained in any log entry.
// So we skip the learner phase for the first peer.
state_ref.set_first_voter(all_peers.first_peer_id)?;
state_ref.set_conf_state(ConfState::from((vec![all_peers.first_peer_id], vec![])))?;
Ok(())
}
pub fn start(&mut self) -> anyhow::Result<()> {
        // If this is the only peer in the cluster, tick the Raft node a few times so that it
        // instantly elects itself as the Raft leader
if self.node.store().peer_count() == 1 {
while !self.node.has_ready() {
self.node.tick();
}
}
let tick_period = Duration::from_millis(self.config.tick_period_ms);
let mut previous_tick = Instant::now();
loop {
            // Apply in-memory changes to the Raft State Machine.
            // If updates == None, this step was skipped due to timing limits.
            // If updates == Some(0), it means we did not receive any updates explicitly.
let updates = self.advance_node(previous_tick, tick_period)?;
let mut elapsed = previous_tick.elapsed();
while elapsed > tick_period {
self.node.tick();
previous_tick += tick_period;
elapsed -= tick_period;
}
if self.node.has_ready() {
// Persist AND apply changes, which were committed in the Raft State Machine
let stop_consensus = self.on_ready()?;
if stop_consensus {
return Ok(());
}
} else if updates == Some(0) {
                // Assume consensus is up-to-date, so we can sync the local state,
                // which involves resolving inconsistencies and trying to recover data marked as dead
self.try_sync_local_state()?;
}
}
}
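    /// Feeds a batch of incoming messages into the Raft node.
    ///
    /// Returns `None` if the tick period has already elapsed (the caller should tick the node),
    /// or `Some(n)` where `n` is the number of updates fed into the Raft state machine.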
fn advance_node(
&mut self,
previous_tick: Instant,
tick_period: Duration,
) -> anyhow::Result<Option<usize>> {
if previous_tick.elapsed() >= tick_period {
return Ok(None);
}
match self.try_add_origin() {
// `try_add_origin` is not applicable:
// - either current peer is not an origin peer
// - or cluster is already established
Ok(false) => (),
// Successfully proposed origin peer to consensus, return to consensus loop to handle `on_ready`
Ok(true) => return Ok(Some(1)),
// Origin peer is not a leader yet, wait for the next tick and return to consensus loop
// to tick Raft node
Err(err @ TryAddOriginError::NotLeader) => {
log::debug!("{err}");
let next_tick = previous_tick + tick_period;
let duration_until_next_tick = next_tick.saturating_duration_since(Instant::now());
thread::sleep(duration_until_next_tick);
return Ok(None);
}
// Failed to propose origin peer ID to consensus (which should never happen!),
// log error and continue regular consensus loop
Err(err) => {
log::error!("{err}");
}
}
if self
.try_promote_learner()
.context("failed to promote learner")?
{
return Ok(Some(1));
}
let mut updates = 0;
let mut timeout_at = previous_tick + tick_period;
// We need to limit the batch size, as application of one batch should be limited in time.
const RAFT_BATCH_SIZE: usize = 128;
let wait_timeout_for_consecutive_messages = tick_period / 10;
        // This loop batches incoming messages, so we only need to "apply" them once per batch.
        // The "apply" step is expensive, so batching is done for performance reasons.
        // On the other hand, we still want to react to rare individual messages as fast as possible.
        // To fulfill both requirements, we proceed as follows:
        // 1. Wait for the first message for a full tick period.
        // 2. Once a message is received, wait for each subsequent message for only 1/10 of the tick period.
loop {
            // This queue has 2 types of events:
            // - Messages from the leader, like pings, requests to append log entries, acks, etc.
            // - Messages from users, like requests to start shard transfers, etc.
            //
            // The timeout defines how long we can wait for the next message.
            // Since this thread is sync, we can't wait indefinitely.
            // The timeout is set to be about the duration of one tick.
let Ok(message) = self.recv_update(timeout_at) else {
break;
};
            // Those messages should not be batched, so we interrupt the loop if we see them.
            // Motivation: if we change the peer set, it should be done immediately,
            // otherwise we lose the update for this new peer
let is_conf_change = matches!(
message,
Message::FromClient(
ConsensusOperations::AddPeer { .. } | ConsensusOperations::RemovePeer(_)
),
);
            // We feed the message into the Raft state machine.
            // This update is held in memory but will not be persisted yet.
            // E.g. if it is a ping, we don't need to persist anything for it.
if let Err(err) = self.advance_node_impl(message) {
log::warn!("{err}");
continue;
}
updates += 1;
timeout_at = Instant::now() + wait_timeout_for_consecutive_messages;
if previous_tick.elapsed() >= tick_period
|| updates >= RAFT_BATCH_SIZE
|| is_conf_change
{
break;
}
}
Ok(Some(updates))
}
fn recv_update(&mut self, timeout_at: Instant) -> Result<Message, TryRecvUpdateError> {
self.runtime.block_on(async {
tokio::select! {
biased;
_ = tokio::time::sleep_until(timeout_at.into()) => Err(TryRecvUpdateError::Timeout),
message = self.receiver.recv() => message.ok_or(TryRecvUpdateError::Closed),
}
})
}
fn advance_node_impl(&mut self, message: Message) -> anyhow::Result<()> {
match message {
Message::FromClient(ConsensusOperations::AddPeer { peer_id, uri }) => {
let mut change = ConfChangeV2::default();
change.set_changes(vec![raft_proto::new_conf_change_single(
peer_id,
ConfChangeType::AddLearnerNode,
)]);
log::debug!("Proposing network configuration change: {:?}", change);
self.node
.propose_conf_change(uri.into_bytes(), change)
.context("failed to propose conf change")?;
}
Message::FromClient(ConsensusOperations::RemovePeer(peer_id)) => {
let mut change = ConfChangeV2::default();
change.set_changes(vec![raft_proto::new_conf_change_single(
peer_id,
ConfChangeType::RemoveNode,
)]);
log::debug!("Proposing network configuration change: {:?}", change);
self.node
.propose_conf_change(vec![], change)
.context("failed to propose conf change")?;
}
Message::FromClient(ConsensusOperations::RequestSnapshot) => {
self.node
.request_snapshot()
.context("failed to request snapshot")?;
}
Message::FromClient(ConsensusOperations::ReportSnapshot { peer_id, status }) => {
self.node.report_snapshot(peer_id, status.into());
}
Message::FromClient(operation) => {
let data =
serde_cbor::to_vec(&operation).context("failed to serialize operation")?;
log::trace!("Proposing entry from client with length: {}", data.len());
self.node
.propose(vec![], data)
.context("failed to propose entry")?;
}
Message::FromPeer(message) => {
let is_heartbeat = matches!(
message.get_msg_type(),
MessageType::MsgHeartbeat | MessageType::MsgHeartbeatResponse,
);
if !is_heartbeat {
log::trace!(
"Received a message from peer with progress: {:?}. Message: {:?}",
self.node.raft.prs().get(message.from),
message,
);
}
self.node.step(*message).context("failed to step message")?;
}
}
Ok(())
}
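    /// Syncs local shard state, but only when consensus has nothing else to do
    /// and the leader is established.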
fn try_sync_local_state(&mut self) -> anyhow::Result<()> {
if !self.node.has_ready() {
// No updates to process
let store = self.node.store();
let pending_operations = store.persistent.read().unapplied_entities_count();
if pending_operations == 0 && store.is_leader_established.check_ready() {
// If leader is established and there is nothing else to do on this iteration,
// then we can check if there are any un-synchronized local state left.
store.sync_local_state()?;
}
}
Ok(())
}
    /// Tries to propose the "origin peer" (the very first peer, which starts a new cluster) to consensus
fn try_add_origin(&mut self) -> Result<bool, TryAddOriginError> {
        // We can determine the origin peer from the consensus state:
        // - it should be the only peer in the cluster
        // - and its commit index should be at 0 or 1
        //
        // When we add a new node to an existing cluster, we have to bootstrap it from an existing
        // cluster node, and during bootstrap we explicitly add all current peers to the consensus
        // state. So *all* peers added to the cluster after the origin will always see at least two peers.
        //
        // When the origin peer starts a new cluster, it elects itself as the leader and commits an empty
        // operation with index 1. It is impossible to commit anything to consensus before this
        // operation is committed. And to add another (second/third/etc) peer to the cluster, we
        // have to commit a conf-change operation. Which means that only the origin peer can ever be at
        // commit index 0 or 1.
// Check that we are the only peer in the cluster
if self.node.store().peer_count() > 1 {
return Ok(false);
}
let status = self.node.status();
// Check that we are at index 0 or 1
if status.hs.commit > 1 {
return Ok(false);
}
        // If we reached this point, we are the origin peer, but it is impossible to propose anything
        // to consensus before a leader is elected (`propose_conf_change` would return an error),
        // so we have to wait a few ticks for self-election
if status.ss.raft_state != StateRole::Leader {
return Err(TryAddOriginError::NotLeader);
}
// Propose origin peer to consensus
let mut change = ConfChangeV2::default();
change.set_changes(vec![raft_proto::new_conf_change_single(
status.id,
ConfChangeType::AddNode,
)]);
let peer_uri = self
.node
.store()
.persistent
.read()
.peer_address_by_id
.read()
.get(&status.id)
.ok_or_else(|| TryAddOriginError::UriNotFound)?
.to_string();
self.node.propose_conf_change(peer_uri.into(), change)?;
Ok(true)
}
/// Returns `true` if learner promotion was proposed, `false` otherwise.
    /// A learner node does not vote in elections, because it might not have the full picture yet.
    /// So consensus should guarantee that learners are promoted one-by-one.
    /// Promotions are done by the leader and only once it has no pending entries,
    /// which guarantees that a learner will start voting only after it has applied all the changes in the log
fn try_promote_learner(&mut self) -> anyhow::Result<bool> {
// Promote only if leader
if self.node.status().ss.raft_state != StateRole::Leader {
return Ok(false);
}
// Promote only when there are no uncommitted changes.
let store = self.node.store();
let commit = store.hard_state().commit;
let last_log_entry = store.last_index()?;
if commit != last_log_entry {
return Ok(false);
}
let Some(learner) = self.find_learner_to_promote() else {
return Ok(false);
};
log::debug!("Proposing promotion for learner {learner} to voter");
let mut change = ConfChangeV2::default();
change.set_changes(vec![raft_proto::new_conf_change_single(
learner,
ConfChangeType::AddNode,
)]);
self.node.propose_conf_change(vec![], change)?;
Ok(true)
}
fn find_learner_to_promote(&self) -> Option<u64> {
let commit = self.node.store().hard_state().commit;
let learners: HashSet<_> = self
.node
.store()
.conf_state()
.learners
.into_iter()
.collect();
let status = self.node.status();
status
.progress?
.iter()
.find(|(id, progress)| learners.contains(id) && progress.matched == commit)
.map(|(id, _)| *id)
}
/// Returns `true` if consensus should be stopped, `false` otherwise.
fn on_ready(&mut self) -> anyhow::Result<bool> {
if !self.node.has_ready() {
// No updates to process
return Ok(false);
}
self.store().record_consensus_working();
// Get the `Ready` with `RawNode::ready` interface.
let ready = self.node.ready();
let (Some(light_ready), role_change) = self.process_ready(ready)? else {
// No light ready, so we need to stop consensus.
return Ok(true);
};
let result = self.process_light_ready(light_ready)?;
if let Some(role_change) = role_change {
self.process_role_change(role_change);
}
self.store().compact_wal(self.config.compact_wal_entries)?;
Ok(result)
}
fn process_role_change(&self, role_change: StateRole) {
// Explicit match here for better readability
match role_change {
StateRole::Candidate | StateRole::PreCandidate => {
self.store().is_leader_established.make_not_ready()
}
StateRole::Leader | StateRole::Follower => {
if self.node.raft.leader_id != INVALID_ID {
self.store().is_leader_established.make_ready()
} else {
self.store().is_leader_established.make_not_ready()
}
}
}
}
/// Tries to process raft's ready state. Happens on each tick.
///
    /// The order of operations in this function is critical, changing it might lead to bugs.
    ///
    /// Returns an error on failure to apply the state.
    /// If it receives a message to stop the consensus, returns `None` instead of a `LightReady`.
fn process_ready(
&mut self,
mut ready: raft::Ready,
) -> anyhow::Result<(Option<raft::LightReady>, Option<StateRole>)> {
let store = self.store();
if !ready.messages().is_empty() {
log::trace!("Handling {} messages", ready.messages().len());
self.send_messages(ready.take_messages());
}
if !ready.snapshot().is_empty() {
            // This is a snapshot, so we need to apply it first.
log::debug!("Applying snapshot");
if let Err(err) = store.apply_snapshot(&ready.snapshot().clone())? {
log::error!("Failed to apply snapshot: {err}");
}
}
if !ready.entries().is_empty() {
// Append entries to the Raft log.
log::debug!("Appending {} entries to raft log", ready.entries().len());
store
.append_entries(ready.take_entries())
.map_err(|err| anyhow!("Failed to append entries: {}", err))?
}
if let Some(hs) = ready.hs() {
// Raft HardState changed, and we need to persist it.
log::debug!("Changing hard state. New hard state: {hs:?}");
store
.set_hard_state(hs.clone())
.map_err(|err| anyhow!("Failed to set hard state: {}", err))?
}
let role_change = ready.ss().map(|ss| ss.raft_state);
if let Some(ss) = ready.ss() {
log::debug!("Changing soft state. New soft state: {ss:?}");
self.handle_soft_state(ss);
}
if !ready.persisted_messages().is_empty() {
log::trace!(
"Handling {} persisted messages",
ready.persisted_messages().len()
);
self.send_messages(ready.take_persisted_messages());
}
// Should be done after Hard State is saved, so that `applied` index is never bigger than `commit`.
let stop_consensus =
handle_committed_entries(&ready.take_committed_entries(), &store, &mut self.node)
.context("Failed to handle committed entries")?;
if stop_consensus {
return Ok((None, None));
}
// Advance the Raft.
let light_rd = self.node.advance(ready);
Ok((Some(light_rd), role_change))
}
/// Tries to process raft's light ready state.
///
    /// The order of operations in this function is critical, changing it might lead to bugs.
    ///
    /// Returns an error on failure to apply the state.
    /// If it receives a message to stop the consensus, returns `true`, otherwise `false`.
fn process_light_ready(&mut self, mut light_rd: raft::LightReady) -> anyhow::Result<bool> {
let store = self.store();
// Update commit index.
if let Some(commit) = light_rd.commit_index() {
log::debug!("Updating commit index to {commit}");
store
.set_commit_index(commit)
.map_err(|err| anyhow!("Failed to set commit index: {}", err))?;
}
self.send_messages(light_rd.take_messages());
// Apply all committed entries.
let stop_consensus =
handle_committed_entries(&light_rd.take_committed_entries(), &store, &mut self.node)
.context("Failed to apply committed entries")?;
// Advance the apply index.
self.node.advance_apply();
Ok(stop_consensus)
}
fn store(&self) -> ConsensusStateRef {
self.node.store().clone()
}
fn handle_soft_state(&self, state: &SoftState) {
let store = self.node.store();
store.set_raft_soft_state(state);
}
fn send_messages(&mut self, messages: Vec<RaftMessage>) {
self.broker.send(messages);
}
}
#[derive(Copy, Clone, Debug, thiserror::Error)]
enum TryRecvUpdateError {
#[error("timeout elapsed")]
Timeout,
#[error("channel closed")]
Closed,
}
#[derive(Debug, thiserror::Error)]
enum TryAddOriginError {
#[error("origin peer is not a leader")]
NotLeader,
#[error("origin peer URI not found")]
UriNotFound,
#[error("failed to propose origin peer URI to consensus: {0}")]
RaftError(#[from] raft::Error),
}
/// This function actually applies the committed entries to the state machine.
/// Returns `true` if consensus should be stopped, `false` otherwise.
fn handle_committed_entries(
entries: &[Entry],
state: &ConsensusStateRef,
raw_node: &mut RawNode<ConsensusStateRef>,
) -> anyhow::Result<bool> {
let mut stop_consensus = false;
if let (Some(first), Some(last)) = (entries.first(), entries.last()) {
state.set_unapplied_entries(first.index, last.index)?;
stop_consensus = state.apply_entries(raw_node)?;
}
Ok(stop_consensus)
}
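/// Dispatches outgoing Raft messages to per-peer sender tasks,
/// spawning a sender task for a peer on demand.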
struct RaftMessageBroker {
senders: HashMap<PeerId, RaftMessageSenderHandle>,
runtime: Handle,
bootstrap_uri: Option<Uri>,
tls_config: Option<ClientTlsConfig>,
consensus_config: Arc<ConsensusConfig>,
consensus_state: ConsensusStateRef,
transport_channel_pool: Arc<TransportChannelPool>,
}
impl RaftMessageBroker {
pub fn new(
runtime: Handle,
bootstrap_uri: Option<Uri>,
tls_config: Option<ClientTlsConfig>,
consensus_config: ConsensusConfig,
consensus_state: ConsensusStateRef,
transport_channel_pool: Arc<TransportChannelPool>,
) -> Self {
Self {
senders: HashMap::new(),
runtime,
bootstrap_uri,
tls_config,
consensus_config: consensus_config.into(),
consensus_state,
transport_channel_pool,
}
}
pub fn send(&mut self, messages: impl IntoIterator<Item = RaftMessage>) {
let mut messages = messages.into_iter();
let mut retry = None;
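        // If forwarding to a peer's sender task fails because the task has stopped,
        // the message is stashed in `retry`, the task is respawned on the next iteration,
        // and the message is sent again.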
while let Some(message) = retry.take().or_else(|| messages.next()) {
let peer_id = message.to;
let sender = match self.senders.get_mut(&peer_id) {
Some(sender) => sender,
None => {
log::debug!("Spawning message sender task for peer {peer_id}...");
let (task, handle) = self.message_sender();
let future = self.runtime.spawn(task.exec());
drop(future); // drop `JoinFuture` explicitly to make clippy happy
self.senders.insert(peer_id, handle);
self.senders
.get_mut(&peer_id)
.expect("message sender task spawned")
}
};
let failed_to_forward = |message: &RaftMessage, description: &str| {
let peer_id = message.to;
let is_debug = log::max_level() >= log::Level::Debug;
let space = if is_debug { " " } else { "" };
let message: &dyn fmt::Debug = if is_debug { &message } else { &"" }; // TODO: `fmt::Debug` for `""` prints `""`... 😒
log::error!(
"Failed to forward message{space}{message:?} to message sender task {peer_id}: \
{description}"
);
};
match sender.send(message) {
Ok(()) => (),
Err(tokio::sync::mpsc::error::TrySendError::Full((_, message))) => {
failed_to_forward(
&message,
"message sender task queue is full. Message will be dropped.",
);
}
Err(tokio::sync::mpsc::error::TrySendError::Closed((_, message))) => {
failed_to_forward(
&message,
"message sender task queue is closed. \
Message sender task will be restarted and message will be retried.",
);
self.senders.remove(&peer_id);
retry = Some(message);
}
}
}
}
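    /// Creates a message sender task for a single peer, together with the handle
    /// used to feed messages into it.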
fn message_sender(&self) -> (RaftMessageSender, RaftMessageSenderHandle) {
let (messages_tx, messages_rx) = tokio::sync::mpsc::channel(128);
let (heartbeat_tx, heartbeat_rx) = tokio::sync::watch::channel(Default::default());
let task = RaftMessageSender {
messages: messages_rx,
heartbeat: heartbeat_rx,
bootstrap_uri: self.bootstrap_uri.clone(),
tls_config: self.tls_config.clone(),
consensus_config: self.consensus_config.clone(),
consensus_state: self.consensus_state.clone(),
transport_channel_pool: self.transport_channel_pool.clone(),
};
let handle = RaftMessageSenderHandle {
messages: messages_tx,
heartbeat: heartbeat_tx,
index: 0,
};
(task, handle)
}
}
#[derive(Debug)]
struct RaftMessageSenderHandle {
messages: Sender<(usize, RaftMessage)>,
heartbeat: watch::Sender<(usize, RaftMessage)>,
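    /// Monotonically increasing sequence number attached to every message; the sender task
    /// uses it to drop heartbeats that were overtaken by more recent messages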
index: usize,
}
impl RaftMessageSenderHandle {
#[allow(clippy::result_large_err)]
pub fn send(&mut self, message: RaftMessage) -> RaftMessageSenderResult<()> {
if !is_heartbeat(&message) {
self.messages.try_send((self.index, message))?;
} else {
self.heartbeat.send((self.index, message)).map_err(
|tokio::sync::watch::error::SendError(message)| {
tokio::sync::mpsc::error::TrySendError::Closed(message)
},
)?;
}
self.index += 1;
Ok(())
}
}
type RaftMessageSenderResult<T, E = RaftMessageSenderError> = Result<T, E>;
type RaftMessageSenderError = tokio::sync::mpsc::error::TrySendError<(usize, RaftMessage)>;
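/// Background task that delivers Raft messages to a single peer over gRPC.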
struct RaftMessageSender {
messages: Receiver<(usize, RaftMessage)>,
heartbeat: watch::Receiver<(usize, RaftMessage)>,
bootstrap_uri: Option<Uri>,
tls_config: Option<ClientTlsConfig>,
consensus_config: Arc<ConsensusConfig>,
consensus_state: ConsensusStateRef,
transport_channel_pool: Arc<TransportChannelPool>,
}
impl RaftMessageSender {
pub async fn exec(mut self) {
// Imagine that `raft` crate put four messages to be sent to some other Raft node into
// `RaftMessageSender`'s queue:
//
// | 4: AppendLog | 3: Heartbeat | 2: Heartbeat | 1: AppendLog |
//
// Heartbeat is the most basic message type in Raft. It only carries common "metadata"
// without any additional "payload". And all other message types in Raft also carry
// the same basic metadata as the heartbeat message.
//
// This way, message `3` instantly "outdates" message `2`: they both carry the same data
// fields, but message `3` was produced more recently, and so it might contain newer values
// of these data fields.
//
// And because all messages carry the same basic data as the heartbeat message, message `4`
// instantly "outdates" both message `2` and `3`.
//
        // This way, if there is more than one message queued for the `RaftMessageSender`,
// we can optimize delivery a bit and skip any heartbeat message if there's a more
// recent message scheduled later in the queue.
//
        // `RaftMessageSender` has two separate "queues":
// - `messages` queue for non-heartbeat messages
// - and `heartbeat` "watch" channel for heartbeat messages
// - "watch" is a special channel in Tokio, that only retains the *last* sent value
// - so any heartbeat received from the `heartbeat` channel is always the *most recent* one
//
// We are using `tokio::select` to "simultaneously" check both queues for new messages...
// but we are using `tokio::select` in a "biased" mode!
//
// - in this mode select always polls `messages.recv()` future first
// - so even if there are new messages in both queues, it will always return a non-heartbeat
// message from `messages` queue first
        // - and it will only return a heartbeat message from the `heartbeat` channel if there are no
        //   messages left in the `messages` queue
//
// There's one special case that we should be careful about with our two queues:
//
// If we return to the diagram above, and imagine four messages were sent in the same order
// into our two queues, then `RaftMessageSender` might pull them from the queues in the
// `1`, `4`, `3` order.
//
// E.g., we pull non-heartbeat messages `1` and `4` first, heartbeat `2` was overwritten
// by heartbeat `3` (because of the "watch" channel), so once `messages` queue is empty
// we receive heartbeat `3`, which is now out-of-order.
//
// To handle this we explicitly enumerate each message and only send a message if its index
        // is higher than or equal to the index of the previous one. (This check could be expressed with
        // either a strict "higher" or a "higher-or-equal" comparison, I just like the "or-equal" version
// a bit better.)
//
// If either `messages` queue or `heartbeat` channel is closed (e.g., `messages.recv()`
// returns `None` or `heartbeat.changed()` returns an error), we assume that
// `RaftMessageSenderHandle` has been dropped, and treat it as a "shutdown"/"cancellation"
// signal (and break from the loop).
let mut prev_index = 0;
loop {
let (index, message) = tokio::select! {
biased;
Some(message) = self.messages.recv() => message,
Ok(()) = self.heartbeat.changed() => self.heartbeat.borrow_and_update().clone(),
else => break,
};
if prev_index <= index {
self.send(&message).await;
prev_index = index;
}
}
}
async fn send(&mut self, message: &RaftMessage) {
if let Err(err) = self.try_send(message).await {
let peer_id = message.to;
if log::max_level() >= log::Level::Debug {
log::error!("Failed to send Raft message {message:?} to peer {peer_id}: {err}");
} else {
log::error!("Failed to send Raft message to peer {peer_id}: {err}");
}
}
}
async fn try_send(&mut self, message: &RaftMessage) -> anyhow::Result<()> {
let peer_id = message.to;
let uri = self.uri(peer_id).await?;
let mut bytes = Vec::new();
<RaftMessage as prost_for_raft::Message>::encode(message, &mut bytes)
.context("failed to encode Raft message")?;
let grpc_message = GrpcRaftMessage { message: bytes };
let timeout = Duration::from_millis(
self.consensus_config.message_timeout_ticks * self.consensus_config.tick_period_ms,
);
let res = self
.transport_channel_pool
.with_channel_timeout(
&uri,
|channel| async {
let mut client = RaftClient::new(channel);
let mut request = tonic::Request::new(grpc_message.clone());
request.set_timeout(timeout);
client.send(request).await
},
Some(timeout),
0,
)
.await;
if message.msg_type == raft::eraftpb::MessageType::MsgSnapshot as i32 {
let res = self.consensus_state.report_snapshot(
peer_id,
if res.is_ok() {
SnapshotStatus::Finish
} else {
SnapshotStatus::Failure
},
);
// Should we ignore the error? Seems like it will only produce noise.
//
// - `send_message` is only called by the sub-task spawned by the consensus thread.
// - `report_snapshot` sends a message back to the consensus thread.
// - It can only fail, if the "receiver" end of the channel is closed.
// - Which means consensus thread either resolved successfully, or failed.
// - So, if the consensus thread is shutting down, no need to log a misleading error...
// - ...or, if the consensus thread failed, then we should already have an error,
// and it will only produce more noise.
if let Err(err) = res {
log::error!("{}", err);
}
}
match res {
Ok(_) => self.consensus_state.record_message_send_success(&uri),
Err(err) => self.consensus_state.record_message_send_failure(&uri, err),
}
Ok(())
}
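    /// Resolves the URI of a peer: first from the local consensus state,
    /// otherwise by asking the bootstrap peer via `who_is`.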
async fn uri(&mut self, peer_id: PeerId) -> anyhow::Result<Uri> {
let uri = self
.consensus_state
.peer_address_by_id()
.get(&peer_id)
.cloned();
match uri {
Some(uri) => Ok(uri),
None => self.who_is(peer_id).await,
}
}
async fn who_is(&mut self, peer_id: PeerId) -> anyhow::Result<Uri> {
let bootstrap_uri = self
.bootstrap_uri
.clone()
.ok_or_else(|| anyhow::format_err!("No bootstrap URI provided"))?;
let bootstrap_timeout = Duration::from_secs(self.consensus_config.bootstrap_timeout_sec);
// Use dedicated transport channel for who_is because of specific timeout
let channel = make_grpc_channel(
bootstrap_timeout,
bootstrap_timeout,
bootstrap_uri,
self.tls_config.clone(),
)
.await
.map_err(|err| anyhow::format_err!("Failed to create who-is channel: {}", err))?;
let uri = RaftClient::new(channel)
.who_is(tonic::Request::new(GrpcPeerId { id: peer_id }))
.await?
.into_inner()
.uri
.parse()?;
Ok(uri)
}
}
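/// Returns `true` for heartbeat and heartbeat-response messages,
/// which are delivered through the dedicated `watch` channel.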
fn is_heartbeat(message: &RaftMessage) -> bool {
message.msg_type == raft::eraftpb::MessageType::MsgHeartbeat as i32
|| message.msg_type == raft::eraftpb::MessageType::MsgHeartbeatResponse as i32
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use std::thread;
use collection::operations::vector_params_builder::VectorParamsBuilder;
use collection::shards::channel_service::ChannelService;
use common::cpu::CpuBudget;
use segment::types::Distance;
use slog::Drain;
use storage::content_manager::collection_meta_ops::{
CollectionMetaOperations, CreateCollection, CreateCollectionOperation,
};
use storage::content_manager::consensus::operation_sender::OperationSender;
use storage::content_manager::consensus::persistent::Persistent;
use storage::content_manager::consensus_manager::{ConsensusManager, ConsensusStateRef};
use storage::content_manager::toc::TableOfContent;
use storage::dispatcher::Dispatcher;
use storage::rbac::Access;
use tempfile::Builder;
use super::Consensus;
use crate::common::helpers::create_general_purpose_runtime;
use crate::settings::ConsensusConfig;
#[test]
fn collection_creation_passes_consensus() {
// Given
let storage_dir = Builder::new().prefix("storage").tempdir().unwrap();
let mut settings = crate::Settings::new(None).expect("Can't read config.");
settings.storage.storage_path = storage_dir.path().to_str().unwrap().to_string();
tracing_subscriber::fmt::init();
let search_runtime =
crate::create_search_runtime(settings.storage.performance.max_search_threads)
.expect("Can't create search runtime.");
let update_runtime =
crate::create_update_runtime(settings.storage.performance.max_search_threads)
.expect("Can't create update runtime.");
let general_runtime =
create_general_purpose_runtime().expect("Can't create general purpose runtime.");
let handle = general_runtime.handle().clone();
let (propose_sender, propose_receiver) = std::sync::mpsc::channel();
let persistent_state =
Persistent::load_or_init(&settings.storage.storage_path, true, false).unwrap();
let operation_sender = OperationSender::new(propose_sender);
let toc = TableOfContent::new(
&settings.storage,
search_runtime,
update_runtime,
general_runtime,
CpuBudget::default(),
ChannelService::new(settings.service.http_port, None),
persistent_state.this_peer_id(),
Some(operation_sender.clone()),
);
let toc_arc = Arc::new(toc);
let storage_path = toc_arc.storage_path();
let consensus_state: ConsensusStateRef = ConsensusManager::new(
persistent_state,
toc_arc.clone(),
operation_sender,
storage_path,
)
.into();
let dispatcher = Dispatcher::new(toc_arc.clone()).with_consensus(consensus_state.clone());
let slog_logger = slog::Logger::root(slog_stdlog::StdLog.fuse(), slog::o!());
let (mut consensus, message_sender) = Consensus::new(
&slog_logger,
consensus_state.clone(),
None,
Some("http://127.0.0.1:6335".parse().unwrap()),
6335,
ConsensusConfig::default(),
None,
ChannelService::new(settings.service.http_port, None),
handle.clone(),
false,
)
.unwrap();
let is_leader_established = consensus_state.is_leader_established.clone();
thread::spawn(move || consensus.start().unwrap());
thread::spawn(move || {
while let Ok(entry) = propose_receiver.recv() {
if message_sender
.blocking_send(super::Message::FromClient(entry))
.is_err()
{
log::error!("Can not forward new entry to consensus as it was stopped.");
break;
}
}
});
// Wait for Raft to establish the leader
is_leader_established.await_ready();
// Leader election produces a raft log entry, and then origin peer adds itself to consensus
assert_eq!(consensus_state.hard_state().commit, 2);
// Initially there are 0 collections
assert_eq!(toc_arc.all_collections_sync().len(), 0);
// When
// New runtime is used as timers need to be enabled.
handle
.block_on(
dispatcher.submit_collection_meta_op(
CollectionMetaOperations::CreateCollection(CreateCollectionOperation::new(
"test".to_string(),
CreateCollection {
vectors: VectorParamsBuilder::new(10, Distance::Cosine)
.build()
.into(),
sparse_vectors: None,
hnsw_config: None,
wal_config: None,
optimizers_config: None,
shard_number: Some(2),
on_disk_payload: None,
replication_factor: None,
write_consistency_factor: None,
init_from: None,
quantization_config: None,
sharding_method: None,
strict_mode_config: None,
uuid: None,
},
)),
Access::full("For test"),
None,
),
)
.unwrap();
// Then
assert_eq!(consensus_state.hard_state().commit, 5); // first peer self-election + add first peer + create collection + activate shard x2
assert_eq!(toc_arc.all_collections_sync(), vec!["test"]);
}
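
    /// A minimal sketch of the routing rule used by `RaftMessageSenderHandle::send`:
    /// heartbeats and heartbeat responses go through the `watch` channel (only the latest
    /// one is kept), while everything else goes through the bounded `mpsc` queue.
    /// The test name and message construction here are illustrative assumptions.
    #[test]
    fn heartbeat_messages_are_detected() {
        let mut message = super::RaftMessage::default();

        message.msg_type = raft::eraftpb::MessageType::MsgHeartbeat as i32;
        assert!(super::is_heartbeat(&message));

        message.msg_type = raft::eraftpb::MessageType::MsgHeartbeatResponse as i32;
        assert!(super::is_heartbeat(&message));

        // Any other message type is delivered through the regular per-peer queue
        message.msg_type = raft::eraftpb::MessageType::MsgAppend as i32;
        assert!(!super::is_heartbeat(&message));
    }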
}