// src/consensus.rs
use std::collections::{HashMap, HashSet};
use std::str::FromStr;
use std::sync::{mpsc, Arc};
use std::thread::JoinHandle;
use std::time::{Duration, Instant};
use std::{fmt, thread};
use anyhow::{anyhow, Context as _};
use api::grpc::dynamic_channel_pool::make_grpc_channel;
use api::grpc::qdrant::raft_client::RaftClient;
use api::grpc::qdrant::{AllPeers, PeerId as GrpcPeerId, RaftMessage as GrpcRaftMessage};
use api::grpc::transport_channel_pool::TransportChannelPool;
use collection::shards::channel_service::ChannelService;
use collection::shards::shard::PeerId;
#[cfg(target_os = "linux")]
use common::cpu::linux_high_thread_priority;
use common::defaults;
use raft::eraftpb::Message as RaftMessage;
use raft::prelude::*;
use raft::{SoftState, StateRole, INVALID_ID};
use storage::content_manager::consensus_manager::ConsensusStateRef;
use storage::content_manager::consensus_ops::{ConsensusOperations, SnapshotStatus};
use storage::content_manager::toc::TableOfContent;
use tokio::runtime::Handle;
use tokio::sync::mpsc::{Receiver, Sender};
use tokio::sync::watch;
use tokio::time::sleep;
use tonic::transport::{ClientTlsConfig, Uri};
use crate::common::helpers;
use crate::common::telemetry_ops::requests_telemetry::TonicTelemetryCollector;
use crate::settings::{ConsensusConfig, Settings};
use crate::tonic::init_internal;
type Node = RawNode<ConsensusStateRef>;
const RECOVERY_RETRY_TIMEOUT: Duration = Duration::from_secs(1);
const RECOVERY_MAX_RETRY_COUNT: usize = 3;
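/// A message for the consensus thread: either a proposal coming from a local client
/// or a raw Raft message forwarded from another peer.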
pub enum Message {
FromClient(ConsensusOperations),
FromPeer(Box<RaftMessage>),
}
/// Aka Consensus Thread
/// Manages proposed changes to consensus state, ensures that everything is ordered properly
pub struct Consensus {
/// Raft structure which handles raft-related state
node: Node,
/// Receives proposals from peers and client for applying in consensus
receiver: Receiver<Message>,
/// Runtime for async message sending
runtime: Handle,
/// Uri to some other known peer, used to join the consensus
    /// ToDo: Make it many
config: ConsensusConfig,
broker: RaftMessageBroker,
}
impl Consensus {
/// Create and run consensus node
#[allow(clippy::too_many_arguments)]
pub fn run(
logger: &slog::Logger,
state_ref: ConsensusStateRef,
bootstrap_peer: Option<Uri>,
uri: Option<String>,
settings: Settings,
channel_service: ChannelService,
propose_receiver: mpsc::Receiver<ConsensusOperations>,
telemetry_collector: Arc<parking_lot::Mutex<TonicTelemetryCollector>>,
toc: Arc<TableOfContent>,
runtime: Handle,
reinit: bool,
) -> anyhow::Result<JoinHandle<std::io::Result<()>>> {
let tls_client_config = helpers::load_tls_client_config(&settings)?;
let p2p_host = settings.service.host.clone();
let p2p_port = settings.cluster.p2p.port.expect("P2P port is not set");
let config = settings.cluster.consensus.clone();
let (mut consensus, message_sender) = Self::new(
logger,
state_ref.clone(),
bootstrap_peer,
uri,
p2p_port,
config,
tls_client_config,
channel_service,
runtime.clone(),
reinit,
)?;
let state_ref_clone = state_ref.clone();
thread::Builder::new()
.name("consensus".to_string())
.spawn(move || {
// On Linux, try to use high thread priority because consensus is important
// Likely fails as we cannot set a higher priority by default due to permissions
#[cfg(target_os = "linux")]
if let Err(err) = linux_high_thread_priority() {
log::debug!(
"Failed to set high thread priority for consensus, ignoring: {err}"
);
}
if let Err(err) = consensus.start() {
log::error!("Consensus stopped with error: {err:#}");
state_ref_clone.on_consensus_thread_err(err);
} else {
log::info!("Consensus stopped");
state_ref_clone.on_consensus_stopped();
}
})?;
let message_sender_moved = message_sender.clone();
thread::Builder::new()
.name("forward-proposals".to_string())
.spawn(move || {
// On Linux, try to use high thread priority because consensus is important
// Likely fails as we cannot set a higher priority by default due to permissions
#[cfg(target_os = "linux")]
if let Err(err) = linux_high_thread_priority() {
log::debug!(
"Failed to set high thread priority for consensus, ignoring: {err}"
);
}
while let Ok(entry) = propose_receiver.recv() {
if message_sender_moved
.blocking_send(Message::FromClient(entry))
.is_err()
{
log::error!("Can not forward new entry to consensus as it was stopped.");
break;
}
}
})?;
let server_tls = if settings.cluster.p2p.enable_tls {
let tls_config = settings
.tls
.clone()
.ok_or_else(Settings::tls_config_is_undefined_error)?;
Some(helpers::load_tls_internal_server_config(&tls_config)?)
} else {
None
};
let handle = thread::Builder::new()
.name("grpc_internal".to_string())
.spawn(move || {
init_internal(
toc,
state_ref,
telemetry_collector,
settings,
p2p_host,
p2p_port,
server_tls,
message_sender,
runtime,
)
})
.unwrap();
Ok(handle)
}
    /// If a `bootstrap_peer` is supplied, then either `uri` or `p2p_port` should also be supplied
#[allow(clippy::too_many_arguments)]
pub fn new(
logger: &slog::Logger,
state_ref: ConsensusStateRef,
bootstrap_peer: Option<Uri>,
uri: Option<String>,
p2p_port: u16,
config: ConsensusConfig,
tls_config: Option<ClientTlsConfig>,
channel_service: ChannelService,
runtime: Handle,
reinit: bool,
) -> anyhow::Result<(Self, Sender<Message>)> {
// If we want to re-initialize consensus, we need to prevent other peers
        // from replaying consensus WAL operations, as they should already have them applied.
        // To ensure that, we force WAL compaction on the first re-initialized peer,
        // which should trigger snapshot transfer instead of WAL replay.
let force_compact_wal = reinit && bootstrap_peer.is_none();
        // On the bootstrapped peers, during re-initialization of the consensus,
        // we want to make sure that only the bootstrap peer holds the true state.
        // Therefore we clear the WAL on the bootstrapped peers to force them to request a snapshot
let clear_wal = reinit && bootstrap_peer.is_some();
if clear_wal {
log::debug!("Clearing WAL on the bootstrap peer to force snapshot transfer");
state_ref.clear_wal()?;
}
// raft will not return entries to the application smaller or equal to `applied`
let last_applied = state_ref.last_applied_entry().unwrap_or_default();
let raft_config = Config {
id: state_ref.this_peer_id(),
applied: last_applied,
..Default::default()
};
raft_config.validate()?;
let op_wait = defaults::CONSENSUS_META_OP_WAIT;
// Commit might take up to 4 ticks as:
// 1 tick - send proposal to leader
// 2 tick - leader sends append entries to peers
// 3 tick - peer answers leader, that entry is persisted
// 4 tick - leader increases commit index and sends it
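        // E.g. with a tick period of 100ms, a commit may take up to 4 * 100ms = 400ms,
        // which must stay below `op_wait`.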
if 4 * Duration::from_millis(config.tick_period_ms) > op_wait {
            log::warn!(
                "With current tick period of {}ms, operation commit time might exceed default wait timeout: {}ms",
                config.tick_period_ms,
                op_wait.as_millis(),
            )
}
// bounded channel for backpressure
let (sender, receiver) = tokio::sync::mpsc::channel(config.max_message_queue_size);
        // State might be initialized, but the node might have been shut down without actually syncing or committing anything.
if state_ref.is_new_deployment() || reinit {
let leader_established_in_ms =
config.tick_period_ms * raft_config.max_election_tick() as u64;
Self::init(
&state_ref,
bootstrap_peer.clone(),
uri,
p2p_port,
&config,
tls_config.clone(),
&runtime,
leader_established_in_ms,
)
.map_err(|err| anyhow!("Failed to initialize Consensus for new Raft state: {}", err))?;
} else {
runtime
.block_on(Self::recover(
&state_ref,
uri.clone(),
p2p_port,
&config,
tls_config.clone(),
))
.map_err(|err| {
anyhow!(
"Failed to recover Consensus from existing Raft state: {}",
err
)
})?;
if bootstrap_peer.is_some() || uri.is_some() {
log::debug!("Local raft state found - bootstrap and uri cli arguments were ignored")
}
log::debug!("Local raft state found - skipping initialization");
};
let mut node = Node::new(&raft_config, state_ref.clone(), logger)?;
node.set_batch_append(true);
// Before consensus has started apply any unapplied committed entries
// They might have not been applied due to unplanned Qdrant shutdown
let _stop_consensus = state_ref.apply_entries(&mut node)?;
if force_compact_wal {
// Making sure that the WAL will be compacted on start
state_ref.compact_wal(1)?;
} else {
state_ref.compact_wal(config.compact_wal_entries)?;
}
let broker = RaftMessageBroker::new(
runtime.clone(),
bootstrap_peer,
tls_config,
config.clone(),
node.store().clone(),
channel_service.channel_pool,
);
let consensus = Self {
node,
receiver,
runtime,
config,
broker,
};
if !state_ref.is_new_deployment() {
state_ref.recover_first_voter()?;
}
Ok((consensus, sender))
}
#[allow(clippy::too_many_arguments)]
fn init(
state_ref: &ConsensusStateRef,
bootstrap_peer: Option<Uri>,
uri: Option<String>,
p2p_port: u16,
config: &ConsensusConfig,
tls_config: Option<ClientTlsConfig>,
runtime: &Handle,
leader_established_in_ms: u64,
) -> anyhow::Result<()> {
if let Some(bootstrap_peer) = bootstrap_peer {
log::debug!("Bootstrapping from peer with address: {bootstrap_peer}");
runtime.block_on(Self::bootstrap(
state_ref,
bootstrap_peer,
uri,
p2p_port,
config,
tls_config,
))?;
Ok(())
} else {
log::debug!(
"Bootstrapping is disabled. Assuming this peer is the first in the network"
);
let tick_period = config.tick_period_ms;
log::info!("With current tick period of {tick_period}ms, leader will be established in approximately {leader_established_in_ms}ms. To avoid rejected operations - add peers and submit operations only after this period.");
// First peer needs to add its own address
state_ref.add_peer(
state_ref.this_peer_id(),
uri.ok_or_else(|| anyhow::anyhow!("First peer should specify its uri."))?
.parse()?,
)?;
Ok(())
}
}
async fn add_peer_to_known_for(
this_peer_id: PeerId,
cluster_uri: Uri,
current_uri: Option<String>,
p2p_port: u16,
config: &ConsensusConfig,
tls_config: Option<ClientTlsConfig>,
) -> anyhow::Result<AllPeers> {
// Use dedicated transport channel for bootstrapping because of specific timeout
let channel = make_grpc_channel(
Duration::from_secs(config.bootstrap_timeout_sec),
Duration::from_secs(config.bootstrap_timeout_sec),
cluster_uri,
tls_config,
)
.await
.map_err(|err| anyhow!("Failed to create timeout channel: {err}"))?;
let mut client = RaftClient::new(channel);
let all_peers = client
.add_peer_to_known(tonic::Request::new(
api::grpc::qdrant::AddPeerToKnownMessage {
uri: current_uri,
port: Some(u32::from(p2p_port)),
id: this_peer_id,
},
))
.await
.map_err(|err| anyhow!("Failed to add peer to known: {err}"))?
.into_inner();
Ok(all_peers)
}
// Re-attach peer to the consensus:
    // Notifies the cluster (any node) that this node has changed its address
async fn recover(
state_ref: &ConsensusStateRef,
uri: Option<String>,
p2p_port: u16,
config: &ConsensusConfig,
tls_config: Option<ClientTlsConfig>,
) -> anyhow::Result<()> {
let this_peer_id = state_ref.this_peer_id();
let mut peer_to_uri = state_ref
.persistent
.read()
.peer_address_by_id
.read()
.clone();
let this_peer_url = peer_to_uri.remove(&this_peer_id);
// Recover url if a different one is provided
let do_recover = match (&this_peer_url, &uri) {
(Some(this_peer_url), Some(uri)) => this_peer_url != &Uri::from_str(uri)?,
_ => false,
};
if do_recover {
let mut tries = RECOVERY_MAX_RETRY_COUNT;
while tries > 0 {
// Try to inform any peer about the change of address
for (peer_id, peer_uri) in &peer_to_uri {
let res = Self::add_peer_to_known_for(
this_peer_id,
peer_uri.clone(),
uri.clone(),
p2p_port,
config,
tls_config.clone(),
)
.await;
if res.is_err() {
log::warn!(
"Failed to recover from peer with id {} at {} with error {:?}, trying others",
peer_id,
peer_uri,
res
);
} else {
log::debug!(
"Successfully recovered from peer with id {} at {}",
peer_id,
peer_uri
);
return Ok(());
}
}
tries -= 1;
log::warn!(
"Retrying recovering from known peers (retry {})",
RECOVERY_MAX_RETRY_COUNT - tries
);
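                // Back off before the next retry round; the delay grows linearly with the number of failed rounds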
let exp_timeout =
RECOVERY_RETRY_TIMEOUT * (RECOVERY_MAX_RETRY_COUNT - tries) as u32;
sleep(exp_timeout).await;
}
return Err(anyhow::anyhow!("Failed to recover from any known peers"));
}
Ok(())
}
/// Add node sequence:
///
/// 1. Add current node as a learner
/// 2. Start applying entries from consensus
/// 3. Eventually leader submits the promotion proposal
/// 4. Learners become voters once they read about the promotion from consensus log
async fn bootstrap(
state_ref: &ConsensusStateRef,
bootstrap_peer: Uri,
uri: Option<String>,
p2p_port: u16,
config: &ConsensusConfig,
tls_config: Option<ClientTlsConfig>,
) -> anyhow::Result<()> {
let this_peer_id = state_ref.this_peer_id();
let all_peers = Self::add_peer_to_known_for(
this_peer_id,
bootstrap_peer,
uri.clone(),
p2p_port,
config,
tls_config,
)
.await?;
// Although peer addresses are synchronized with consensus, addresses need to be pre-fetched in the case of a new peer
// or it will not know how to answer the Raft leader
for peer in all_peers.all_peers {
state_ref
.add_peer(
peer.id,
peer.uri
.parse()
.context(format!("Failed to parse peer URI: {}", peer.uri))?,
)
.map_err(|err| anyhow!("Failed to add peer: {}", err))?
}
        // Only the first peer has itself as a voter in the initial conf state.
// This needs to be propagated manually to other peers as it is not contained in any log entry.
// So we skip the learner phase for the first peer.
state_ref.set_first_voter(all_peers.first_peer_id)?;
state_ref.set_conf_state(ConfState::from((vec![all_peers.first_peer_id], vec![])))?;
Ok(())
}
pub fn start(&mut self) -> anyhow::Result<()> {
        // If this is the only peer in the cluster, tick the Raft node a few times so that it
        // instantly elects itself as the Raft leader
if self.node.store().peer_count() == 1 {
while !self.node.has_ready() {
self.node.tick();
}
}
let tick_period = Duration::from_millis(self.config.tick_period_ms);
let mut previous_tick = Instant::now();
loop {
            // Apply in-memory changes to the Raft State Machine.
            // If updates == None, this step was skipped due to timing limits.
            // If updates == Some(0), it means we did not receive any updates explicitly.
let updates = self.advance_node(previous_tick, tick_period)?;
let mut elapsed = previous_tick.elapsed();
while elapsed > tick_period {
self.node.tick();
previous_tick += tick_period;
elapsed -= tick_period;
}
if self.node.has_ready() {
// Persist AND apply changes, which were committed in the Raft State Machine
let stop_consensus = self.on_ready()?;
if stop_consensus {
return Ok(());
}
} else if updates == Some(0) {
                // Assume consensus is up-to-date, so we can sync the local state,
                // which involves resolving inconsistencies and trying to recover data marked as dead
self.try_sync_local_state()?;
}
}
}
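    /// Feeds a batch of incoming messages into the Raft node.
    ///
    /// Returns `None` if the tick period has already elapsed (the caller should tick the node),
    /// or `Some(n)` where `n` is the number of updates fed into the Raft state machine.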
fn advance_node(
&mut self,
previous_tick: Instant,
tick_period: Duration,
) -> anyhow::Result<Option<usize>> {
if previous_tick.elapsed() >= tick_period {
return Ok(None);
}
match self.try_add_origin() {
// `try_add_origin` is not applicable:
// - either current peer is not an origin peer
// - or cluster is already established
Ok(false) => (),
// Successfully proposed origin peer to consensus, return to consensus loop to handle `on_ready`
Ok(true) => return Ok(Some(1)),
// Origin peer is not a leader yet, wait for the next tick and return to consensus loop
// to tick Raft node
Err(err @ TryAddOriginError::NotLeader) => {
log::debug!("{err}");
let next_tick = previous_tick + tick_period;
let duration_until_next_tick = next_tick.saturating_duration_since(Instant::now());
thread::sleep(duration_until_next_tick);
return Ok(None);
}
// Failed to propose origin peer ID to consensus (which should never happen!),
// log error and continue regular consensus loop
Err(err) => {
log::error!("{err}");
}
}
if self
.try_promote_learner()
.context("failed to promote learner")?
{
return Ok(Some(1));
}
let mut updates = 0;
let mut timeout_at = previous_tick + tick_period;
// We need to limit the batch size, as application of one batch should be limited in time.
const RAFT_BATCH_SIZE: usize = 128;
let wait_timeout_for_consecutive_messages = tick_period / 10;
        // This loop batches incoming messages, so we only need to "apply" them once per batch.
        // The "apply" step is expensive, so batching is done for performance reasons.
        // On the other hand, we still want to react to rare individual messages as fast as possible.
        // To fulfill both requirements, we proceed as follows:
        // 1. Wait for the first message for a full tick period.
        // 2. Once a message is received, wait for each subsequent message for only 1/10 of the tick period.
loop {
            // This queue has 2 types of events:
            // - Messages from the leader, like pings, requests to append log entries, acks, etc.
            // - Messages from users, like requests to start shard transfers, etc.
            //
            // The timeout defines how long we can wait for the next message.
            // Since this thread is sync, we can't wait indefinitely.
            // The timeout is set to be about the duration of one tick.
let Ok(message) = self.recv_update(timeout_at) else {
break;
};
            // Those messages should not be batched, so we interrupt the loop if we see them.
            // Motivation: if we change the peer set, it should be done immediately,
            // otherwise we lose the update for this new peer
let is_conf_change = matches!(
message,
Message::FromClient(
ConsensusOperations::AddPeer { .. } | ConsensusOperations::RemovePeer(_)
),
);
            // We feed the message into the Raft state machine.
            // This update is held in memory but will not be persisted yet.
            // E.g. if it is a ping, we don't need to persist anything for it.
if let Err(err) = self.advance_node_impl(message) {
log::warn!("{err}");
continue;
}
updates += 1;
timeout_at = Instant::now() + wait_timeout_for_consecutive_messages;
if previous_tick.elapsed() >= tick_period
|| updates >= RAFT_BATCH_SIZE
|| is_conf_change
{
break;
}
}
Ok(Some(updates))
}
fn recv_update(&mut self, timeout_at: Instant) -> Result<Message, TryRecvUpdateError> {
self.runtime.block_on(async {
tokio::select! {
biased;
_ = tokio::time::sleep_until(timeout_at.into()) => Err(TryRecvUpdateError::Timeout),
message = self.receiver.recv() => message.ok_or(TryRecvUpdateError::Closed),
}
})
}
fn advance_node_impl(&mut self, message: Message) -> anyhow::Result<()> {
match message {
Message::FromClient(ConsensusOperations::AddPeer { peer_id, uri }) => {
let mut change = ConfChangeV2::default();
change.set_changes(vec![raft_proto::new_conf_change_single(
peer_id,
ConfChangeType::AddLearnerNode,
)]);
log::debug!("Proposing network configuration change: {:?}", change);
self.node
.propose_conf_change(uri.into_bytes(), change)
.context("failed to propose conf change")?;
}
Message::FromClient(ConsensusOperations::RemovePeer(peer_id)) => {
let mut change = ConfChangeV2::default();
change.set_changes(vec![raft_proto::new_conf_change_single(
peer_id,
ConfChangeType::RemoveNode,
)]);
log::debug!("Proposing network configuration change: {:?}", change);
self.node
.propose_conf_change(vec![], change)
.context("failed to propose conf change")?;
}
Message::FromClient(ConsensusOperations::RequestSnapshot) => {
self.node
.request_snapshot()
.context("failed to request snapshot")?;
}
Message::FromClient(ConsensusOperations::ReportSnapshot { peer_id, status }) => {
self.node.report_snapshot(peer_id, status.into());
}
Message::FromClient(operation) => {
let data =
serde_cbor::to_vec(&operation).context("failed to serialize operation")?;
log::trace!("Proposing entry from client with length: {}", data.len());
self.node
.propose(vec![], data)
.context("failed to propose entry")?;
}
Message::FromPeer(message) => {
let is_heartbeat = matches!(
message.get_msg_type(),
MessageType::MsgHeartbeat | MessageType::MsgHeartbeatResponse,
);
if !is_heartbeat {
log::trace!(
"Received a message from peer with progress: {:?}. Message: {:?}",
self.node.raft.prs().get(message.from),
message,
);
}
self.node.step(*message).context("failed to step message")?;
}
}
Ok(())
}
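    /// Syncs local shard state, but only when consensus has nothing else to do
    /// and the leader is established.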
fn try_sync_local_state(&mut self) -> anyhow::Result<()> {
if !self.node.has_ready() {
// No updates to process
let store = self.node.store();
let pending_operations = store.persistent.read().unapplied_entities_count();
if pending_operations == 0 && store.is_leader_established.check_ready() {
// If leader is established and there is nothing else to do on this iteration,
// then we can check if there are any un-synchronized local state left.
store.sync_local_state()?;
}
}
Ok(())
}
    /// Tries to propose the "origin peer" (the very first peer, which starts a new cluster) to consensus
fn try_add_origin(&mut self) -> Result<bool, TryAddOriginError> {
        // We can determine the origin peer from the consensus state:
        // - it should be the only peer in the cluster
        // - and its commit index should be at 0 or 1
        //
        // When we add a new node to an existing cluster, we have to bootstrap it from an existing
        // cluster node, and during bootstrap we explicitly add all current peers to the consensus
        // state. So *all* peers added to the cluster after the origin will always see at least two peers.
        //
        // When the origin peer starts a new cluster, it elects itself as the leader and commits an empty
        // operation with index 1. It is impossible to commit anything to consensus before this
        // operation is committed. And to add another (second/third/etc) peer to the cluster, we
        // have to commit a conf-change operation. Which means that only the origin peer can ever be at
        // commit index 0 or 1.
// Check that we are the only peer in the cluster
if self.node.store().peer_count() > 1 {
return Ok(false);
}
let status = self.node.status();
// Check that we are at index 0 or 1
if status.hs.commit > 1 {
return Ok(false);
}
        // If we reached this point, we are the origin peer, but it is impossible to propose anything
        // to consensus before a leader is elected (`propose_conf_change` would return an error),
        // so we have to wait a few ticks for self-election
if status.ss.raft_state != StateRole::Leader {
return Err(TryAddOriginError::NotLeader);
}
// Propose origin peer to consensus
let mut change = ConfChangeV2::default();
change.set_changes(vec![raft_proto::new_conf_change_single(
status.id,
ConfChangeType::AddNode,
)]);
let peer_uri = self
.node
.store()
.persistent
.read()
.peer_address_by_id
.read()
.get(&status.id)
.ok_or_else(|| TryAddOriginError::UriNotFound)?
.to_string();
self.node.propose_conf_change(peer_uri.into(), change)?;
Ok(true)
}
/// Returns `true` if learner promotion was proposed, `false` otherwise.
    /// A learner node does not vote in elections, because it might not have the full picture yet.
    /// So consensus should guarantee that learners are promoted one-by-one.
    /// Promotions are done by the leader and only once it has no pending entries,
    /// which guarantees that a learner will start voting only after it has applied all the changes in the log
fn try_promote_learner(&mut self) -> anyhow::Result<bool> {
// Promote only if leader
if self.node.status().ss.raft_state != StateRole::Leader {
return Ok(false);
}
// Promote only when there are no uncommitted changes.
let store = self.node.store();
let commit = store.hard_state().commit;
let last_log_entry = store.last_index()?;
if commit != last_log_entry {
return Ok(false);
}
let Some(learner) = self.find_learner_to_promote() else {
return Ok(false);
};
log::debug!("Proposing promotion for learner {learner} to voter");
let mut change = ConfChangeV2::default();
change.set_changes(vec![raft_proto::new_conf_change_single(
learner,
ConfChangeType::AddNode,
)]);
self.node.propose_conf_change(vec![], change)?;
Ok(true)
}
fn find_learner_to_promote(&self) -> Option<u64> {
let commit = self.node.store().hard_state().commit;
let learners: HashSet<_> = self
.node
.store()
.conf_state()
.learners
.into_iter()
.collect();
let status = self.node.status();
status
.progress?
.iter()
.find(|(id, progress)| learners.contains(id) && progress.matched == commit)
.map(|(id, _)| *id)
}
/// Returns `true` if consensus should be stopped, `false` otherwise.
fn on_ready(&mut self) -> anyhow::Result<bool> {
if !self.node.has_ready() {
// No updates to process
return Ok(false);
}
self.store().record_consensus_working();
// Get the `Ready` with `RawNode::ready` interface.
let ready = self.node.ready();
let (Some(light_ready), role_change) = self.process_ready(ready)? else {
// No light ready, so we need to stop consensus.
return Ok(true);
};
let result = self.process_light_ready(light_ready)?;
if let Some(role_change) = role_change {
self.process_role_change(role_change);
}
self.store().compact_wal(self.config.compact_wal_entries)?;
Ok(result)
}
fn process_role_change(&self, role_change: StateRole) {
// Explicit match here for better readability
match role_change {
StateRole::Candidate | StateRole::PreCandidate => {
self.store().is_leader_established.make_not_ready()
}
StateRole::Leader | StateRole::Follower => {
if self.node.raft.leader_id != INVALID_ID {
self.store().is_leader_established.make_ready()
} else {
self.store().is_leader_established.make_not_ready()
}
}
}
}
/// Tries to process raft's ready state. Happens on each tick.
///
    /// The order of operations in this function is critical, changing it might lead to bugs.
    ///
    /// Returns an error on failure to apply the state.
    /// If it receives a message to stop the consensus, returns `None` instead of a `LightReady`.
fn process_ready(
&mut self,
mut ready: raft::Ready,
) -> anyhow::Result<(Option<raft::LightReady>, Option<StateRole>)> {
let store = self.store();
if !ready.messages().is_empty() {
log::trace!("Handling {} messages", ready.messages().len());
self.send_messages(ready.take_messages());
}
if !ready.snapshot().is_empty() {
            // This is a snapshot, so we need to apply it first.
log::debug!("Applying snapshot");
if let Err(err) = store.apply_snapshot(&ready.snapshot().clone())? {
log::error!("Failed to apply snapshot: {err}");
}
}
if !ready.entries().is_empty() {
// Append entries to the Raft log.
log::debug!("Appending {} entries to raft log", ready.entries().len());
store
.append_entries(ready.take_entries())
.map_err(|err| anyhow!("Failed to append entries: {}", err))?
}
if let Some(hs) = ready.hs() {
// Raft HardState changed, and we need to persist it.
log::debug!("Changing hard state. New hard state: {hs:?}");
store
.set_hard_state(hs.clone())
.map_err(|err| anyhow!("Failed to set hard state: {}", err))?
}
let role_change = ready.ss().map(|ss| ss.raft_state);
if let Some(ss) = ready.ss() {
log::debug!("Changing soft state. New soft state: {ss:?}");
self.handle_soft_state(ss);
}
if !ready.persisted_messages().is_empty() {
log::trace!(
"Handling {} persisted messages",
ready.persisted_messages().len()
);
self.send_messages(ready.take_persisted_messages());
}
// Should be done after Hard State is saved, so that `applied` index is never bigger than `commit`.
let stop_consensus =
handle_committed_entries(&ready.take_committed_entries(), &store, &mut self.node)
.context("Failed to handle committed entries")?;
if stop_consensus {
return Ok((None, None));
}
// Advance the Raft.
let light_rd = self.node.advance(ready);
Ok((Some(light_rd), role_change))
}
/// Tries to process raft's light ready state.
///
    /// The order of operations in this function is critical, changing it might lead to bugs.
    ///
    /// Returns an error on failure to apply the state.
    /// If it receives a message to stop the consensus, returns `true`, otherwise `false`.
fn process_light_ready(&mut self, mut light_rd: raft::LightReady) -> anyhow::Result<bool> {
let store = self.store();
// Update commit index.
if let Some(commit) = light_rd.commit_index() {
log::debug!("Updating commit index to {commit}");
store
.set_commit_index(commit)
.map_err(|err| anyhow!("Failed to set commit index: {}", err))?;
}
self.send_messages(light_rd.take_messages());
// Apply all committed entries.
let stop_consensus =
handle_committed_entries(&light_rd.take_committed_entries(), &store, &mut self.node)
.context("Failed to apply committed entries")?;
// Advance the apply index.
self.node.advance_apply();
Ok(stop_consensus)
}
fn store(&self) -> ConsensusStateRef {
self.node.store().clone()
}
fn handle_soft_state(&self, state: &SoftState) {
let store = self.node.store();
store.set_raft_soft_state(state);
}
fn send_messages(&mut self, messages: Vec<RaftMessage>) {
self.broker.send(messages);
}
}
#[derive(Copy, Clone, Debug, thiserror::Error)]
enum TryRecvUpdateError {
#[error("timeout elapsed")]
Timeout,
#[error("channel closed")]
Closed,
}
#[derive(Debug, thiserror::Error)]
enum TryAddOriginError {
#[error("origin peer is not a leader")]
NotLeader,
#[error("origin peer URI not found")]
UriNotFound,
#[error("failed to propose origin peer URI to consensus: {0}")]
RaftError(#[from] raft::Error),
}
/// This function actually applies the committed entries to the state machine.
/// Returns `true` if consensus should be stopped, `false` otherwise.
fn handle_committed_entries(
entries: &[Entry],
state: &ConsensusStateRef,
raw_node: &mut RawNode<ConsensusStateRef>,
) -> anyhow::Result<bool> {
let mut stop_consensus = false;
if let (Some(first), Some(last)) = (entries.first(), entries.last()) {
state.set_unapplied_entries(first.index, last.index)?;
stop_consensus = state.apply_entries(raw_node)?;
}
Ok(stop_consensus)
}
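/// Dispatches outgoing Raft messages to per-peer sender tasks,
/// spawning a sender task for a peer on demand.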
struct RaftMessageBroker {
senders: HashMap<PeerId, RaftMessageSenderHandle>,
runtime: Handle,
bootstrap_uri: Option<Uri>,
tls_config: Option<ClientTlsConfig>,
consensus_config: Arc<ConsensusConfig>,
consensus_state: ConsensusStateRef,
transport_channel_pool: Arc<TransportChannelPool>,
}
impl RaftMessageBroker {
pub fn new(
runtime: Handle,
bootstrap_uri: Option<Uri>,
tls_config: Option<ClientTlsConfig>,
consensus_config: ConsensusConfig,
consensus_state: ConsensusStateRef,
transport_channel_pool: Arc<TransportChannelPool>,
) -> Self {
Self {
senders: HashMap::new(),
runtime,
bootstrap_uri,
tls_config,
consensus_config: consensus_config.into(),
consensus_state,
transport_channel_pool,
}
}
pub fn send(&mut self, messages: impl IntoIterator<Item = RaftMessage>) {
let mut messages = messages.into_iter();
let mut retry = None;
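        // If forwarding to a peer's sender task fails because the task has stopped,
        // the message is stashed in `retry`, the task is respawned on the next iteration,
        // and the message is sent again.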
while let Some(message) = retry.take().or_else(|| messages.next()) {
let peer_id = message.to;
let sender = match self.senders.get_mut(&peer_id) {
Some(sender) => sender,
None => {
log::debug!("Spawning message sender task for peer {peer_id}...");
let (task, handle) = self.message_sender();
let future = self.runtime.spawn(task.exec());
drop(future); // drop `JoinFuture` explicitly to make clippy happy
self.senders.insert(peer_id, handle);
self.senders
.get_mut(&peer_id)
.expect("message sender task spawned")
}
};
let failed_to_forward = |message: &RaftMessage, description: &str| {
let peer_id = message.to;
let is_debug = log::max_level() >= log::Level::Debug;
let space = if is_debug { " " } else { "" };
let message: &dyn fmt::Debug = if is_debug { &message } else { &"" }; // TODO: `fmt::Debug` for `""` prints `""`... 😒
log::error!(
"Failed to forward message{space}{message:?} to message sender task {peer_id}: \
{description}"
);
};
match sender.send(message) {
Ok(()) => (),
Err(tokio::sync::mpsc::error::TrySendError::Full((_, message))) => {
failed_to_forward(
&message,
"message sender task queue is full. Message will be dropped.",
);
}
Err(tokio::sync::mpsc::error::TrySendError::Closed((_, message))) => {
failed_to_forward(
&message,
"message sender task queue is closed. \
Message sender task will be restarted and message will be retried.",
);
self.senders.remove(&peer_id);
retry = Some(message);
}
}
}
}
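    /// Creates a message sender task for a single peer, together with the handle
    /// used to feed messages into it.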
fn message_sender(&self) -> (RaftMessageSender, RaftMessageSenderHandle) {
let (messages_tx, messages_rx) = tokio::sync::mpsc::channel(128);
let (heartbeat_tx, heartbeat_rx) = tokio::sync::watch::channel(Default::default());
let task = RaftMessageSender {
messages: messages_rx,
heartbeat: heartbeat_rx,
bootstrap_uri: self.bootstrap_uri.clone(),
tls_config: self.tls_config.clone(),
consensus_config: self.consensus_config.clone(),
consensus_state: self.consensus_state.clone(),
transport_channel_pool: self.transport_channel_pool.clone(),
};
let handle = RaftMessageSenderHandle {
messages: messages_tx,
heartbeat: heartbeat_tx,
index: 0,
};
(task, handle)
}
}
#[derive(Debug)]
struct RaftMessageSenderHandle {
messages: Sender<(usize, RaftMessage)>,
heartbeat: watch::Sender<(usize, RaftMessage)>,
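    /// Monotonically increasing sequence number attached to every message; the sender task
    /// uses it to drop heartbeats that were overtaken by more recent messages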
index: usize,
}
impl RaftMessageSenderHandle {
#[allow(clippy::result_large_err)]
pub fn send(&mut self, message: RaftMessage) -> RaftMessageSenderResult<()> {
if !is_heartbeat(&message) {
self.messages.try_send((self.index, message))?;
} else {
self.heartbeat.send((self.index, message)).map_err(
|tokio::sync::watch::error::SendError(message)| {
tokio::sync::mpsc::error::TrySendError::Closed(message)
},
)?;
}
self.index += 1;
Ok(())
}
}
type RaftMessageSenderResult<T, E = RaftMessageSenderError> = Result<T, E>;
type RaftMessageSenderError = tokio::sync::mpsc::error::TrySendError<(usize, RaftMessage)>;
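/// Background task that delivers Raft messages to a single peer over gRPC.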
struct RaftMessageSender {
messages: Receiver<(usize, RaftMessage)>,
heartbeat: watch::Receiver<(usize, RaftMessage)>,
bootstrap_uri: Option<Uri>,
tls_config: Option<ClientTlsConfig>,
consensus_config: Arc<ConsensusConfig>,
consensus_state: ConsensusStateRef,
transport_channel_pool: Arc<TransportChannelPool>,
}
impl RaftMessageSender {
pub async fn exec(mut self) {
// Imagine that `raft` crate put four messages to be sent to some other Raft node into
// `RaftMessageSender`'s queue:
//
// | 4: AppendLog | 3: Heartbeat | 2: Heartbeat | 1: AppendLog |
//
// Heartbeat is the most basic message type in Raft. It only carries common "metadata"
// without any additional "payload". And all other message types in Raft also carry
// the same basic metadata as the heartbeat message.
//
// This way, message `3` instantly "outdates" message `2`: they both carry the same data
// fields, but message `3` was produced more recently, and so it might contain newer values
// of these data fields.
//
// And because all messages carry the same basic data as the heartbeat message, message `4`
// instantly "outdates" both message `2` and `3`.
//
        // This way, if there is more than one message queued for the `RaftMessageSender`,
// we can optimize delivery a bit and skip any heartbeat message if there's a more
// recent message scheduled later in the queue.
//
        // `RaftMessageSender` has two separate "queues":
// - `messages` queue for non-heartbeat messages
// - and `heartbeat` "watch" channel for heartbeat messages
// - "watch" is a special channel in Tokio, that only retains the *last* sent value
// - so any heartbeat received from the `heartbeat` channel is always the *most recent* one
//
// We are using `tokio::select` to "simultaneously" check both queues for new messages...
// but we are using `tokio::select` in a "biased" mode!
//
// - in this mode select always polls `messages.recv()` future first
// - so even if there are new messages in both queues, it will always return a non-heartbeat
// message from `messages` queue first
        // - and it will only return a heartbeat message from the `heartbeat` channel if there are no
        //   messages left in the `messages` queue
//
// There's one special case that we should be careful about with our two queues:
//
// If we return to the diagram above, and imagine four messages were sent in the same order
// into our two queues, then `RaftMessageSender` might pull them from the queues in the
// `1`, `4`, `3` order.
//
// E.g., we pull non-heartbeat messages `1` and `4` first, heartbeat `2` was overwritten
// by heartbeat `3` (because of the "watch" channel), so once `messages` queue is empty
// we receive heartbeat `3`, which is now out-of-order.
//
// To handle this we explicitly enumerate each message and only send a message if its index
        // is higher than or equal to the index of the previous one. (This check could be expressed with
        // either a strict "higher" or a "higher-or-equal" comparison, I just like the "or-equal" version
// a bit better.)
//
// If either `messages` queue or `heartbeat` channel is closed (e.g., `messages.recv()`
// returns `None` or `heartbeat.changed()` returns an error), we assume that
// `RaftMessageSenderHandle` has been dropped, and treat it as a "shutdown"/"cancellation"
// signal (and break from the loop).
let mut prev_index = 0;
loop {
let (index, message) = tokio::select! {
biased;
Some(message) = self.messages.recv() => message,
Ok(()) = self.heartbeat.changed() => self.heartbeat.borrow_and_update().clone(),
else => break,
};
if prev_index <= index {
self.send(&message).await;
prev_index = index;
}
}
}
async fn send(&mut self, message: &RaftMessage) {
if let Err(err) = self.try_send(message).await {
let peer_id = message.to;
if log::max_level() >= log::Level::Debug {
log::error!("Failed to send Raft message {message:?} to peer {peer_id}: {err}");
} else {
log::error!("Failed to send Raft message to peer {peer_id}: {err}");
}
}
}
async fn try_send(&mut self, message: &RaftMessage) -> anyhow::Result<()> {
let peer_id = message.to;
let uri = self.uri(peer_id).await?;
let mut bytes = Vec::new();
<RaftMessage as prost_for_raft::Message>::encode(message, &mut bytes)
.context("failed to encode Raft message")?;
let grpc_message = GrpcRaftMessage { message: bytes };
let timeout = Duration::from_millis(
self.consensus_config.message_timeout_ticks * self.consensus_config.tick_period_ms,
);
let res = self
.transport_channel_pool
.with_channel_timeout(
&uri,
|channel| async {
let mut client = RaftClient::new(channel);
let mut request = tonic::Request::new(grpc_message.clone());
request.set_timeout(timeout);
client.send(request).await
},
Some(timeout),
0,
)
.await;
if message.msg_type == raft::eraftpb::MessageType::MsgSnapshot as i32 {
let res = self.consensus_state.report_snapshot(
peer_id,
if res.is_ok() {
SnapshotStatus::Finish
} else {
SnapshotStatus::Failure
},
);
// Should we ignore the error? Seems like it will only produce noise.
//
// - `send_message` is only called by the sub-task spawned by the consensus thread.
// - `report_snapshot` sends a message back to the consensus thread.
// - It can only fail, if the "receiver" end of the channel is closed.
// - Which means consensus thread either resolved successfully, or failed.
// - So, if the consensus thread is shutting down, no need to log a misleading error...
// - ...or, if the consensus thread failed, then we should already have an error,
// and it will only produce more noise.
if let Err(err) = res {
log::error!("{}", err);
}
}
match res {
Ok(_) => self.consensus_state.record_message_send_success(&uri),
Err(err) => self.consensus_state.record_message_send_failure(&uri, err),
}
Ok(())
}
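    /// Resolves the URI of a peer: first from the local consensus state,
    /// otherwise by asking the bootstrap peer via `who_is`.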
async fn uri(&mut self, peer_id: PeerId) -> anyhow::Result<Uri> {
let uri = self
.consensus_state
.peer_address_by_id()
.get(&peer_id)
.cloned();
match uri {
Some(uri) => Ok(uri),
None => self.who_is(peer_id).await,
}
}
async fn who_is(&mut self, peer_id: PeerId) -> anyhow::Result<Uri> {
let bootstrap_uri = self
.bootstrap_uri
.clone()
.ok_or_else(|| anyhow::format_err!("No bootstrap URI provided"))?;
let bootstrap_timeout = Duration::from_secs(self.consensus_config.bootstrap_timeout_sec);
// Use dedicated transport channel for who_is because of specific timeout
let channel = make_grpc_channel(
bootstrap_timeout,
bootstrap_timeout,
bootstrap_uri,
self.tls_config.clone(),
)
.await
.map_err(|err| anyhow::format_err!("Failed to create who-is channel: {}", err))?;
let uri = RaftClient::new(channel)
.who_is(tonic::Request::new(GrpcPeerId { id: peer_id }))
.await?
.into_inner()
.uri
.parse()?;
Ok(uri)
}
}
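/// Returns `true` for heartbeat and heartbeat-response messages,
/// which are delivered through the dedicated `watch` channel.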
fn is_heartbeat(message: &RaftMessage) -> bool {
message.msg_type == raft::eraftpb::MessageType::MsgHeartbeat as i32
|| message.msg_type == raft::eraftpb::MessageType::MsgHeartbeatResponse as i32
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use std::thread;
use collection::operations::vector_params_builder::VectorParamsBuilder;
use collection::shards::channel_service::ChannelService;
use common::cpu::CpuBudget;
use segment::types::Distance;
use slog::Drain;
use storage::content_manager::collection_meta_ops::{
CollectionMetaOperations, CreateCollection, CreateCollectionOperation,
};
use storage::content_manager::consensus::operation_sender::OperationSender;
use storage::content_manager::consensus::persistent::Persistent;
use storage::content_manager::consensus_manager::{ConsensusManager, ConsensusStateRef};
use storage::content_manager::toc::TableOfContent;
use storage::dispatcher::Dispatcher;
use storage::rbac::Access;
use tempfile::Builder;
use super::Consensus;
use crate::common::helpers::create_general_purpose_runtime;
use crate::settings::ConsensusConfig;
#[test]
fn collection_creation_passes_consensus() {
// Given
let storage_dir = Builder::new().prefix("storage").tempdir().unwrap();
let mut settings = crate::Settings::new(None).expect("Can't read config.");
settings.storage.storage_path = storage_dir.path().to_str().unwrap().to_string();
tracing_subscriber::fmt::init();
let search_runtime =
crate::create_search_runtime(settings.storage.performance.max_search_threads)
.expect("Can't create search runtime.");
let update_runtime =
crate::create_update_runtime(settings.storage.performance.max_search_threads)
.expect("Can't create update runtime.");
let general_runtime =
create_general_purpose_runtime().expect("Can't create general purpose runtime.");
let handle = general_runtime.handle().clone();
let (propose_sender, propose_receiver) = std::sync::mpsc::channel();
let persistent_state =
Persistent::load_or_init(&settings.storage.storage_path, true, false).unwrap();
let operation_sender = OperationSender::new(propose_sender);
let toc = TableOfContent::new(
&settings.storage,
search_runtime,
update_runtime,
general_runtime,
CpuBudget::default(),
ChannelService::new(settings.service.http_port, None),
persistent_state.this_peer_id(),
Some(operation_sender.clone()),
);
let toc_arc = Arc::new(toc);
let storage_path = toc_arc.storage_path();
let consensus_state: ConsensusStateRef = ConsensusManager::new(
persistent_state,
toc_arc.clone(),
operation_sender,
storage_path,
)
.into();
let dispatcher = Dispatcher::new(toc_arc.clone()).with_consensus(consensus_state.clone());
let slog_logger = slog::Logger::root(slog_stdlog::StdLog.fuse(), slog::o!());
let (mut consensus, message_sender) = Consensus::new(
&slog_logger,
consensus_state.clone(),
None,
Some("http://127.0.0.1:6335".parse().unwrap()),
6335,
ConsensusConfig::default(),
None,
ChannelService::new(settings.service.http_port, None),
handle.clone(),
false,
)
.unwrap();
let is_leader_established = consensus_state.is_leader_established.clone();
thread::spawn(move || consensus.start().unwrap());
thread::spawn(move || {
while let Ok(entry) = propose_receiver.recv() {
if message_sender
.blocking_send(super::Message::FromClient(entry))
.is_err()
{
log::error!("Can not forward new entry to consensus as it was stopped.");
break;
}
}
});
// Wait for Raft to establish the leader
is_leader_established.await_ready();
// Leader election produces a raft log entry, and then origin peer adds itself to consensus
assert_eq!(consensus_state.hard_state().commit, 2);
// Initially there are 0 collections
assert_eq!(toc_arc.all_collections_sync().len(), 0);
// When
// New runtime is used as timers need to be enabled.
handle
.block_on(
dispatcher.submit_collection_meta_op(
CollectionMetaOperations::CreateCollection(CreateCollectionOperation::new(
"test".to_string(),
CreateCollection {
vectors: VectorParamsBuilder::new(10, Distance::Cosine)
.build()
.into(),
sparse_vectors: None,
hnsw_config: None,
wal_config: None,
optimizers_config: None,
shard_number: Some(2),
on_disk_payload: None,
replication_factor: None,
write_consistency_factor: None,
init_from: None,
quantization_config: None,
sharding_method: None,
strict_mode_config: None,
uuid: None,
},
)),
Access::full("For test"),
None,
),
)
.unwrap();
// Then
assert_eq!(consensus_state.hard_state().commit, 5); // first peer self-election + add first peer + create collection + activate shard x2
assert_eq!(toc_arc.all_collections_sync(), vec!["test"]);
}
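
    /// A minimal sketch of the routing rule used by `RaftMessageSenderHandle::send`:
    /// heartbeats and heartbeat responses go through the `watch` channel (only the latest
    /// one is kept), while everything else goes through the bounded `mpsc` queue.
    /// The test name and message construction here are illustrative assumptions.
    #[test]
    fn heartbeat_messages_are_detected() {
        let mut message = super::RaftMessage::default();

        message.msg_type = raft::eraftpb::MessageType::MsgHeartbeat as i32;
        assert!(super::is_heartbeat(&message));

        message.msg_type = raft::eraftpb::MessageType::MsgHeartbeatResponse as i32;
        assert!(super::is_heartbeat(&message));

        // Any other message type is delivered through the regular per-peer queue
        message.msg_type = raft::eraftpb::MessageType::MsgAppend as i32;
        assert!(!super::is_heartbeat(&message));
    }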
}