use std::collections::{HashMap, HashSet};
use std::str::FromStr;
use std::sync::{mpsc, Arc};
use std::thread::JoinHandle;
use std::time::{Duration, Instant};
use std::{fmt, thread};

use anyhow::{anyhow, Context as _};
use api::grpc::dynamic_channel_pool::make_grpc_channel;
use api::grpc::qdrant::raft_client::RaftClient;
use api::grpc::qdrant::{AllPeers, PeerId as GrpcPeerId, RaftMessage as GrpcRaftMessage};
use api::grpc::transport_channel_pool::TransportChannelPool;
use collection::shards::channel_service::ChannelService;
use collection::shards::shard::PeerId;
#[cfg(target_os = "linux")]
use common::cpu::linux_high_thread_priority;
use common::defaults;
use raft::eraftpb::Message as RaftMessage;
use raft::prelude::*;
use raft::{SoftState, StateRole, INVALID_ID};
use storage::content_manager::consensus_manager::ConsensusStateRef;
use storage::content_manager::consensus_ops::{ConsensusOperations, SnapshotStatus};
use storage::content_manager::toc::TableOfContent;
use tokio::runtime::Handle;
use tokio::sync::mpsc::{Receiver, Sender};
use tokio::sync::watch;
use tokio::time::sleep;
use tonic::transport::{ClientTlsConfig, Uri};

use crate::common::helpers;
use crate::common::telemetry_ops::requests_telemetry::TonicTelemetryCollector;
use crate::settings::{ConsensusConfig, Settings};
use crate::tonic::init_internal;

type Node = RawNode<ConsensusStateRef>;

const RECOVERY_RETRY_TIMEOUT: Duration = Duration::from_secs(1);
const RECOVERY_MAX_RETRY_COUNT: usize = 3;

pub enum Message {
    FromClient(ConsensusOperations),
    FromPeer(Box<RaftMessage>),
}

/// Aka Consensus Thread
/// Manages proposed changes to consensus state, ensures that everything is ordered properly
pub struct Consensus {
    /// Raft structure which handles raft-related state
    node: Node,
    /// Receives proposals from peers and client for applying in consensus
    receiver: Receiver<Message>,
    /// Runtime for async message sending
    runtime: Handle,
    /// Uri to some other known peer, used to join the consensus
    /// ToDo: Make if many
    config: ConsensusConfig,
    broker: RaftMessageBroker,
}

impl Consensus {
    /// Create and run consensus node
    #[allow(clippy::too_many_arguments)]
    pub fn run(
        logger: &slog::Logger,
        state_ref: ConsensusStateRef,
        bootstrap_peer: Option<Uri>,
        uri: Option<String>,
        settings: Settings,
        channel_service: ChannelService,
        propose_receiver: mpsc::Receiver<ConsensusOperations>,
        telemetry_collector: Arc<parking_lot::Mutex<TonicTelemetryCollector>>,
        toc: Arc<TableOfContent>,
        runtime: Handle,
        reinit: bool,
    ) -> anyhow::Result<JoinHandle<std::io::Result<()>>> {
        let tls_client_config = helpers::load_tls_client_config(&settings)?;

        let p2p_host = settings.service.host.clone();
        let p2p_port = settings.cluster.p2p.port.expect("P2P port is not set");
        let config = settings.cluster.consensus.clone();

        let (mut consensus, message_sender) = Self::new(
            logger,
            state_ref.clone(),
            bootstrap_peer,
            uri,
            p2p_port,
            config,
            tls_client_config,
            channel_service,
            runtime.clone(),
            reinit,
        )?;

        let state_ref_clone = state_ref.clone();
        thread::Builder::new()
            .name("consensus".to_string())
            .spawn(move || {
                // On Linux, try to use high thread priority because consensus is important
                // Likely fails as we cannot set a higher priority by default due to permissions
                #[cfg(target_os = "linux")]
                if let Err(err) = linux_high_thread_priority() {
                    log::debug!(
                        "Failed to set high thread priority for consensus, ignoring: {err}"
                    );
                }

                if let Err(err) = consensus.start() {
                    log::error!("Consensus stopped with error: {err:#}");
                    state_ref_clone.on_consensus_thread_err(err);
                } else {
                    log::info!("Consensus stopped");
                    state_ref_clone.on_consensus_stopped();
                }
            })?;

        let message_sender_moved = message_sender.clone();
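        // Forward proposals from the synchronous std `mpsc` channel into the consensus
        // thread's bounded tokio channel on a dedicated thread.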
        thread::Builder::new()
            .name("forward-proposals".to_string())
            .spawn(move || {
                // On Linux, try to use high thread priority because consensus is important
                // Likely fails as we cannot set a higher priority by default due to permissions
                #[cfg(target_os = "linux")]
                if let Err(err) = linux_high_thread_priority() {
                    log::debug!(
                        "Failed to set high thread priority for consensus, ignoring: {err}"
                    );
                }

                while let Ok(entry) = propose_receiver.recv() {
                    if message_sender_moved
                        .blocking_send(Message::FromClient(entry))
                        .is_err()
                    {
                        log::error!("Can not forward new entry to consensus as it was stopped.");
                        break;
                    }
                }
            })?;

        let server_tls = if settings.cluster.p2p.enable_tls {
            let tls_config = settings
                .tls
                .clone()
                .ok_or_else(Settings::tls_config_is_undefined_error)?;

            Some(helpers::load_tls_internal_server_config(&tls_config)?)
        } else {
            None
        };

        let handle = thread::Builder::new()
            .name("grpc_internal".to_string())
            .spawn(move || {
                init_internal(
                    toc,
                    state_ref,
                    telemetry_collector,
                    settings,
                    p2p_host,
                    p2p_port,
                    server_tls,
                    message_sender,
                    runtime,
                )
            })
            .unwrap();

        Ok(handle)
    }

    /// If `bootstrap_peer` peer is supplied, then either `uri` or `p2p_port` should be also supplied
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        logger: &slog::Logger,
        state_ref: ConsensusStateRef,
        bootstrap_peer: Option<Uri>,
        uri: Option<String>,
        p2p_port: u16,
        config: ConsensusConfig,
        tls_config: Option<ClientTlsConfig>,
        channel_service: ChannelService,
        runtime: Handle,
        reinit: bool,
    ) -> anyhow::Result<(Self, Sender<Message>)> {
        // If we want to re-initialize consensus, we need to prevent other peers
        // from re-playing consensus WAL operations, as they should already have them applied.
        // We ensure the WAL is force-compacted on the first re-initialized peer,
        // which should trigger snapshot transferring instead of replaying WAL.
        let force_compact_wal = reinit && bootstrap_peer.is_none();

        // On the bootstrapped peers during reinit of the consensus
        // we want to make sure only the bootstrap peer will hold the true state.
        // Therefore we clear the WAL on the bootstrapped peers to force them to request a snapshot.
        let clear_wal = reinit && bootstrap_peer.is_some();

        if clear_wal {
            log::debug!("Clearing WAL on the bootstrap peer to force snapshot transfer");
            state_ref.clear_wal()?;
        }

        // Raft will not return entries to the application smaller or equal to `applied`
        let last_applied = state_ref.last_applied_entry().unwrap_or_default();
        let raft_config = Config {
            id: state_ref.this_peer_id(),
            applied: last_applied,
            ..Default::default()
        };
        raft_config.validate()?;

        let op_wait = defaults::CONSENSUS_META_OP_WAIT;
        // Commit might take up to 4 ticks as:
        // 1 tick - send proposal to leader
        // 2 tick - leader sends append entries to peers
        // 3 tick - peer answers leader, that entry is persisted
        // 4 tick - leader increases commit index and sends it
        if 4 * Duration::from_millis(config.tick_period_ms) > op_wait {
            log::warn!(
                "With current tick period of {}ms, operation commit time might exceed default wait timeout: {}ms",
                config.tick_period_ms,
                op_wait.as_millis(),
            )
        }

        // Bounded channel for backpressure
        let (sender, receiver) = tokio::sync::mpsc::channel(config.max_message_queue_size);

        // State might be initialized, but the node might have been shut down without actually
        // syncing or committing anything.
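        // A brand new deployment (or an explicit re-init) has to be initialized first: either
        // bootstrap from an existing peer, or assume this peer is the first one in the cluster.
        // An existing deployment is instead recovered from the persisted Raft state.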
        if state_ref.is_new_deployment() || reinit {
            let leader_established_in_ms =
                config.tick_period_ms * raft_config.max_election_tick() as u64;
            Self::init(
                &state_ref,
                bootstrap_peer.clone(),
                uri,
                p2p_port,
                &config,
                tls_config.clone(),
                &runtime,
                leader_established_in_ms,
            )
            .map_err(|err| anyhow!("Failed to initialize Consensus for new Raft state: {}", err))?;
        } else {
            runtime
                .block_on(Self::recover(
                    &state_ref,
                    uri.clone(),
                    p2p_port,
                    &config,
                    tls_config.clone(),
                ))
                .map_err(|err| {
                    anyhow!(
                        "Failed to recover Consensus from existing Raft state: {}",
                        err
                    )
                })?;

            if bootstrap_peer.is_some() || uri.is_some() {
                log::debug!("Local raft state found - bootstrap and uri cli arguments were ignored")
            }
            log::debug!("Local raft state found - skipping initialization");
        };

        let mut node = Node::new(&raft_config, state_ref.clone(), logger)?;
        node.set_batch_append(true);

        // Before consensus has started apply any unapplied committed entries
        // They might have not been applied due to unplanned Qdrant shutdown
        let _stop_consensus = state_ref.apply_entries(&mut node)?;

        if force_compact_wal {
            // Making sure that the WAL will be compacted on start
            state_ref.compact_wal(1)?;
        } else {
            state_ref.compact_wal(config.compact_wal_entries)?;
        }

        let broker = RaftMessageBroker::new(
            runtime.clone(),
            bootstrap_peer,
            tls_config,
            config.clone(),
            node.store().clone(),
            channel_service.channel_pool,
        );

        let consensus = Self {
            node,
            receiver,
            runtime,
            config,
            broker,
        };

        if !state_ref.is_new_deployment() {
            state_ref.recover_first_voter()?;
        }

        Ok((consensus, sender))
    }

    #[allow(clippy::too_many_arguments)]
    fn init(
        state_ref: &ConsensusStateRef,
        bootstrap_peer: Option<Uri>,
        uri: Option<String>,
        p2p_port: u16,
        config: &ConsensusConfig,
        tls_config: Option<ClientTlsConfig>,
        runtime: &Handle,
        leader_established_in_ms: u64,
    ) -> anyhow::Result<()> {
        if let Some(bootstrap_peer) = bootstrap_peer {
            log::debug!("Bootstrapping from peer with address: {bootstrap_peer}");
            runtime.block_on(Self::bootstrap(
                state_ref,
                bootstrap_peer,
                uri,
                p2p_port,
                config,
                tls_config,
            ))?;
            Ok(())
        } else {
            log::debug!(
                "Bootstrapping is disabled. Assuming this peer is the first in the network"
            );
            let tick_period = config.tick_period_ms;
            log::info!("With current tick period of {tick_period}ms, leader will be established in approximately {leader_established_in_ms}ms. To avoid rejected operations - add peers and submit operations only after this period.");
            // First peer needs to add its own address
            state_ref.add_peer(
                state_ref.this_peer_id(),
                uri.ok_or_else(|| anyhow::anyhow!("First peer should specify its uri."))?
                    .parse()?,
            )?;
            Ok(())
        }
    }

    async fn add_peer_to_known_for(
        this_peer_id: PeerId,
        cluster_uri: Uri,
        current_uri: Option<String>,
        p2p_port: u16,
        config: &ConsensusConfig,
        tls_config: Option<ClientTlsConfig>,
    ) -> anyhow::Result<AllPeers> {
        // Use dedicated transport channel for bootstrapping because of specific timeout
        let channel = make_grpc_channel(
            Duration::from_secs(config.bootstrap_timeout_sec),
            Duration::from_secs(config.bootstrap_timeout_sec),
            cluster_uri,
            tls_config,
        )
        .await
        .map_err(|err| anyhow!("Failed to create timeout channel: {err}"))?;
        let mut client = RaftClient::new(channel);
        let all_peers = client
            .add_peer_to_known(tonic::Request::new(
                api::grpc::qdrant::AddPeerToKnownMessage {
                    uri: current_uri,
                    port: Some(u32::from(p2p_port)),
                    id: this_peer_id,
                },
            ))
            .await
            .map_err(|err| anyhow!("Failed to add peer to known: {err}"))?
            .into_inner();
        Ok(all_peers)
    }

    // Re-attach peer to the consensus:
    // Notifies the cluster (any node) that this node changed its address
    async fn recover(
        state_ref: &ConsensusStateRef,
        uri: Option<String>,
        p2p_port: u16,
        config: &ConsensusConfig,
        tls_config: Option<ClientTlsConfig>,
    ) -> anyhow::Result<()> {
        let this_peer_id = state_ref.this_peer_id();
        let mut peer_to_uri = state_ref
            .persistent
            .read()
            .peer_address_by_id
            .read()
            .clone();
        let this_peer_url = peer_to_uri.remove(&this_peer_id);
        // Recover url if a different one is provided
        let do_recover = match (&this_peer_url, &uri) {
            (Some(this_peer_url), Some(uri)) => this_peer_url != &Uri::from_str(uri)?,
            _ => false,
        };

        if do_recover {
            let mut tries = RECOVERY_MAX_RETRY_COUNT;
            while tries > 0 {
                // Try to inform any peer about the change of address
                for (peer_id, peer_uri) in &peer_to_uri {
                    let res = Self::add_peer_to_known_for(
                        this_peer_id,
                        peer_uri.clone(),
                        uri.clone(),
                        p2p_port,
                        config,
                        tls_config.clone(),
                    )
                    .await;
                    if res.is_err() {
                        log::warn!(
                            "Failed to recover from peer with id {} at {} with error {:?}, trying others",
                            peer_id,
                            peer_uri,
                            res
                        );
                    } else {
                        log::debug!(
                            "Successfully recovered from peer with id {} at {}",
                            peer_id,
                            peer_uri
                        );
                        return Ok(());
                    }
                }
                tries -= 1;
                log::warn!(
                    "Retrying recovering from known peers (retry {})",
                    RECOVERY_MAX_RETRY_COUNT - tries
                );
                let exp_timeout =
                    RECOVERY_RETRY_TIMEOUT * (RECOVERY_MAX_RETRY_COUNT - tries) as u32;
                sleep(exp_timeout).await;
            }
            return Err(anyhow::anyhow!("Failed to recover from any known peers"));
        }

        Ok(())
    }

    /// Add node sequence:
    ///
    /// 1. Add current node as a learner
    /// 2. Start applying entries from consensus
    /// 3. Eventually leader submits the promotion proposal
    /// 4. Learners become voters once they read about the promotion from consensus log
    async fn bootstrap(
        state_ref: &ConsensusStateRef,
        bootstrap_peer: Uri,
        uri: Option<String>,
        p2p_port: u16,
        config: &ConsensusConfig,
        tls_config: Option<ClientTlsConfig>,
    ) -> anyhow::Result<()> {
        let this_peer_id = state_ref.this_peer_id();
        let all_peers = Self::add_peer_to_known_for(
            this_peer_id,
            bootstrap_peer,
            uri.clone(),
            p2p_port,
            config,
            tls_config,
        )
        .await?;

        // Although peer addresses are synchronized with consensus, addresses need to be
        // pre-fetched in the case of a new peer, or it will not know how to answer the Raft leader
        for peer in all_peers.all_peers {
            state_ref
                .add_peer(
                    peer.id,
                    peer.uri
                        .parse()
                        .context(format!("Failed to parse peer URI: {}", peer.uri))?,
                )
                .map_err(|err| anyhow!("Failed to add peer: {}", err))?
        }

        // Only the first peer has itself as a voter in the initial conf state.
        // This needs to be propagated manually to other peers as it is not contained in any log entry.
        // So we skip the learner phase for the first peer.
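        // Record the origin peer as the initial voter and mirror its single-voter conf state
        // locally, so this peer joins as a learner until consensus promotes it.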
        state_ref.set_first_voter(all_peers.first_peer_id)?;
        state_ref.set_conf_state(ConfState::from((vec![all_peers.first_peer_id], vec![])))?;
        Ok(())
    }

    pub fn start(&mut self) -> anyhow::Result<()> {
        // If this is the only peer in the cluster, tick Raft node a few times to instantly
        // self-elect itself as Raft leader
        if self.node.store().peer_count() == 1 {
            while !self.node.has_ready() {
                self.node.tick();
            }
        }

        let tick_period = Duration::from_millis(self.config.tick_period_ms);
        let mut previous_tick = Instant::now();

        loop {
            // Apply in-memory changes to the Raft State Machine.
            // If `updates == None`, this step was skipped due to timing limits.
            // If `updates == Some(0)`, it means we didn't receive any updates explicitly.
            let updates = self.advance_node(previous_tick, tick_period)?;

            let mut elapsed = previous_tick.elapsed();
            while elapsed > tick_period {
                self.node.tick();
                previous_tick += tick_period;
                elapsed -= tick_period;
            }

            if self.node.has_ready() {
                // Persist AND apply changes, which were committed in the Raft State Machine
                let stop_consensus = self.on_ready()?;
                if stop_consensus {
                    return Ok(());
                }
            } else if updates == Some(0) {
                // Assume consensus is up-to-date, so we can sync the local state,
                // which involves resolving inconsistencies and trying to recover data marked as dead
                self.try_sync_local_state()?;
            }
        }
    }

    fn advance_node(
        &mut self,
        previous_tick: Instant,
        tick_period: Duration,
    ) -> anyhow::Result<Option<usize>> {
        if previous_tick.elapsed() >= tick_period {
            return Ok(None);
        }

        match self.try_add_origin() {
            // `try_add_origin` is not applicable:
            // - either current peer is not an origin peer
            // - or cluster is already established
            Ok(false) => (),

            // Successfully proposed origin peer to consensus, return to consensus loop to handle `on_ready`
            Ok(true) => return Ok(Some(1)),

            // Origin peer is not a leader yet, wait for the next tick and return to consensus loop
            // to tick Raft node
            Err(err @ TryAddOriginError::NotLeader) => {
                log::debug!("{err}");

                let next_tick = previous_tick + tick_period;
                let duration_until_next_tick = next_tick.saturating_duration_since(Instant::now());
                thread::sleep(duration_until_next_tick);

                return Ok(None);
            }

            // Failed to propose origin peer ID to consensus (which should never happen!),
            // log error and continue regular consensus loop
            Err(err) => {
                log::error!("{err}");
            }
        }

        if self
            .try_promote_learner()
            .context("failed to promote learner")?
        {
            return Ok(Some(1));
        }

        let mut updates = 0;
        let mut timeout_at = previous_tick + tick_period;

        // We need to limit the batch size, as application of one batch should be limited in time.
        const RAFT_BATCH_SIZE: usize = 128;

        let wait_timeout_for_consecutive_messages = tick_period / 10;

        // This loop batches incoming messages, so we need to "apply" them only once.
        // The "apply" step is expensive, so batching is done for performance reasons.
        // On the other hand, we still want to react to rare individual messages as fast as possible.
        // To fulfill both requirements, we go the following way:
        // 1. Wait for the first message for a full tick period.
        // 2. If a message is received, wait for the next message only for 1/10 of the tick period.
        loop {
            // This queue has 2 types of events:
            // - Messages from the leader, like pings, requests to add logs, acks, etc.
            // - Messages from users, like requests to start shard transfers, etc.
            //
            // The timeout defines how long we can wait for the next message.
            // Since this thread is sync, we can't wait indefinitely.
            // The timeout is set up to be about the time of a tick.
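            // `recv_update` returns an error on timeout or when the channel is closed;
            // in either case we stop batching and let the outer loop tick and apply.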
            let Ok(message) = self.recv_update(timeout_at) else {
                break;
            };

            // These messages should not be batched, so we interrupt the loop if we see them.
            // Motivation: if we change the peer set, it should be done immediately,
            // otherwise we lose the update on this new peer.
            let is_conf_change = matches!(
                message,
                Message::FromClient(
                    ConsensusOperations::AddPeer { .. } | ConsensusOperations::RemovePeer(_)
                ),
            );

            // We put the message into the Raft State Machine.
            // This update is held in memory and will not be persisted yet.
            // E.g. if it is a ping, we don't need to persist anything for it.
            if let Err(err) = self.advance_node_impl(message) {
                log::warn!("{err}");
                continue;
            }

            updates += 1;
            timeout_at = Instant::now() + wait_timeout_for_consecutive_messages;

            if previous_tick.elapsed() >= tick_period
                || updates >= RAFT_BATCH_SIZE
                || is_conf_change
            {
                break;
            }
        }

        Ok(Some(updates))
    }

    fn recv_update(&mut self, timeout_at: Instant) -> Result<Message, TryRecvUpdateError> {
        self.runtime.block_on(async {
            tokio::select! {
                biased;
                _ = tokio::time::sleep_until(timeout_at.into()) => Err(TryRecvUpdateError::Timeout),
                message = self.receiver.recv() => message.ok_or(TryRecvUpdateError::Closed),
            }
        })
    }

    fn advance_node_impl(&mut self, message: Message) -> anyhow::Result<()> {
        match message {
            Message::FromClient(ConsensusOperations::AddPeer { peer_id, uri }) => {
                let mut change = ConfChangeV2::default();

                change.set_changes(vec![raft_proto::new_conf_change_single(
                    peer_id,
                    ConfChangeType::AddLearnerNode,
                )]);

                log::debug!("Proposing network configuration change: {:?}", change);

                self.node
                    .propose_conf_change(uri.into_bytes(), change)
                    .context("failed to propose conf change")?;
            }

            Message::FromClient(ConsensusOperations::RemovePeer(peer_id)) => {
                let mut change = ConfChangeV2::default();

                change.set_changes(vec![raft_proto::new_conf_change_single(
                    peer_id,
                    ConfChangeType::RemoveNode,
                )]);

                log::debug!("Proposing network configuration change: {:?}", change);

                self.node
                    .propose_conf_change(vec![], change)
                    .context("failed to propose conf change")?;
            }

            Message::FromClient(ConsensusOperations::RequestSnapshot) => {
                self.node
                    .request_snapshot()
                    .context("failed to request snapshot")?;
            }

            Message::FromClient(ConsensusOperations::ReportSnapshot { peer_id, status }) => {
                self.node.report_snapshot(peer_id, status.into());
            }

            Message::FromClient(operation) => {
                let data =
                    serde_cbor::to_vec(&operation).context("failed to serialize operation")?;

                log::trace!("Proposing entry from client with length: {}", data.len());

                self.node
                    .propose(vec![], data)
                    .context("failed to propose entry")?;
            }

            Message::FromPeer(message) => {
                let is_heartbeat = matches!(
                    message.get_msg_type(),
                    MessageType::MsgHeartbeat | MessageType::MsgHeartbeatResponse,
                );

                if !is_heartbeat {
                    log::trace!(
                        "Received a message from peer with progress: {:?}. Message: {:?}",
                        self.node.raft.prs().get(message.from),
                        message,
                    );
                }

                self.node.step(*message).context("failed to step message")?;
            }
        }

        Ok(())
    }
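    // Called from the consensus loop when there is nothing to apply: once the leader is
    // established and all entries are applied, synchronize any remaining local state
    // (e.g. try to recover data marked as dead).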
    fn try_sync_local_state(&mut self) -> anyhow::Result<()> {
        if !self.node.has_ready() {
            // No updates to process
            let store = self.node.store();
            let pending_operations = store.persistent.read().unapplied_entities_count();
            if pending_operations == 0 && store.is_leader_established.check_ready() {
                // If the leader is established and there is nothing else to do on this iteration,
                // then we can check if there is any un-synchronized local state left.
                store.sync_local_state()?;
            }
        }
        Ok(())
    }

    /// Tries to propose the "origin peer" (the very first peer, which starts a new cluster) to consensus
    fn try_add_origin(&mut self) -> Result<bool, TryAddOriginError> {
        // We can determine the origin peer from consensus state:
        // - it should be the only peer in the cluster
        // - and its commit index should be at 0 or 1
        //
        // When we add a new node to an existing cluster, we have to bootstrap it from an existing
        // cluster node, and during bootstrap we explicitly add all current peers to consensus state.
        // So, *all* peers added to the cluster after the origin will always have at least two peers.
        //
        // When the origin peer starts a new cluster, it self-elects itself as a leader and commits
        // an empty operation with index 1. It is impossible to commit anything to consensus before
        // this operation is committed. And to add another (second/third/etc.) peer to the cluster,
        // we have to commit a conf-change operation. Which means that only the origin peer can
        // ever be at commit index 0 or 1.

        // Check that we are the only peer in the cluster
        if self.node.store().peer_count() > 1 {
            return Ok(false);
        }

        let status = self.node.status();

        // Check that we are at commit index 0 or 1
        if status.hs.commit > 1 {
            return Ok(false);
        }

        // If we reached this point, we are the origin peer, but it's impossible to propose anything
        // to consensus before a leader is elected (`propose_conf_change` will return an error),
        // so we have to wait for a few ticks for self-election
        if status.ss.raft_state != StateRole::Leader {
            return Err(TryAddOriginError::NotLeader);
        }

        // Propose origin peer to consensus
        let mut change = ConfChangeV2::default();

        change.set_changes(vec![raft_proto::new_conf_change_single(
            status.id,
            ConfChangeType::AddNode,
        )]);

        let peer_uri = self
            .node
            .store()
            .persistent
            .read()
            .peer_address_by_id
            .read()
            .get(&status.id)
            .ok_or_else(|| TryAddOriginError::UriNotFound)?
            .to_string();

        self.node.propose_conf_change(peer_uri.into(), change)?;

        Ok(true)
    }

    /// Returns `true` if learner promotion was proposed, `false` otherwise.
    /// A learner node does not vote in elections, because it might not have the big picture yet.
    /// So consensus should guarantee that learners are promoted one-by-one.
    /// Promotions are done by the leader and only after it has no pending entries,
    /// which guarantees that a learner will start voting only after it applies all the changes in the log.
    fn try_promote_learner(&mut self) -> anyhow::Result<bool> {
        // Promote only if leader
        if self.node.status().ss.raft_state != StateRole::Leader {
            return Ok(false);
        }

        // Promote only when there are no uncommitted changes.
        let store = self.node.store();
        let commit = store.hard_state().commit;
        let last_log_entry = store.last_index()?;

        if commit != last_log_entry {
            return Ok(false);
        }

        let Some(learner) = self.find_learner_to_promote() else {
            return Ok(false);
        };

        log::debug!("Proposing promotion for learner {learner} to voter");

        let mut change = ConfChangeV2::default();

        change.set_changes(vec![raft_proto::new_conf_change_single(
            learner,
            ConfChangeType::AddNode,
        )]);

        self.node.propose_conf_change(vec![], change)?;

        Ok(true)
    }

    fn find_learner_to_promote(&self) -> Option<PeerId> {
        let commit = self.node.store().hard_state().commit;
        let learners: HashSet<_> = self
            .node
            .store()
            .conf_state()
            .learners
            .into_iter()
            .collect();
        let status = self.node.status();
        status
            .progress?
            .iter()
            .find(|(id, progress)| learners.contains(id) && progress.matched == commit)
            .map(|(id, _)| *id)
    }

    /// Returns `true` if consensus should be stopped, `false` otherwise.
    fn on_ready(&mut self) -> anyhow::Result<bool> {
        if !self.node.has_ready() {
            // No updates to process
            return Ok(false);
        }

        self.store().record_consensus_working();

        // Get the `Ready` with `RawNode::ready` interface.
        let ready = self.node.ready();

        let (Some(light_ready), role_change) = self.process_ready(ready)? else {
            // No light ready, so we need to stop consensus.
            return Ok(true);
        };

        let result = self.process_light_ready(light_ready)?;

        if let Some(role_change) = role_change {
            self.process_role_change(role_change);
        }

        self.store().compact_wal(self.config.compact_wal_entries)?;

        Ok(result)
    }

    fn process_role_change(&self, role_change: StateRole) {
        // Explicit match here for better readability
        match role_change {
            StateRole::Candidate | StateRole::PreCandidate => {
                self.store().is_leader_established.make_not_ready()
            }
            StateRole::Leader | StateRole::Follower => {
                if self.node.raft.leader_id != INVALID_ID {
                    self.store().is_leader_established.make_ready()
                } else {
                    self.store().is_leader_established.make_not_ready()
                }
            }
        }
    }

    /// Tries to process raft's ready state. Happens on each tick.
    ///
    /// The order of operations in this function is critical, changing it might lead to bugs.
    ///
    /// Returns with err on failure to apply the state.
    /// If it receives a message to stop the consensus - returns None instead of LightReady.
    fn process_ready(
        &mut self,
        mut ready: raft::Ready,
    ) -> anyhow::Result<(Option<raft::LightReady>, Option<StateRole>)> {
        let store = self.store();

        if !ready.messages().is_empty() {
            log::trace!("Handling {} messages", ready.messages().len());
            self.send_messages(ready.take_messages());
        }
        if !ready.snapshot().is_empty() {
            // This is a snapshot, we need to apply the snapshot at first.
            log::debug!("Applying snapshot");
            if let Err(err) = store.apply_snapshot(&ready.snapshot().clone())? {
                log::error!("Failed to apply snapshot: {err}");
            }
        }
        if !ready.entries().is_empty() {
            // Append entries to the Raft log.
            log::debug!("Appending {} entries to raft log", ready.entries().len());
            store
                .append_entries(ready.take_entries())
                .map_err(|err| anyhow!("Failed to append entries: {}", err))?
        }
        if let Some(hs) = ready.hs() {
            // Raft HardState changed, and we need to persist it.
            log::debug!("Changing hard state. New hard state: {hs:?}");
            store
                .set_hard_state(hs.clone())
                .map_err(|err| anyhow!("Failed to set hard state: {}", err))?
        }
        let role_change = ready.ss().map(|ss| ss.raft_state);
        if let Some(ss) = ready.ss() {
            log::debug!("Changing soft state. New soft state: {ss:?}");
            self.handle_soft_state(ss);
        }
        if !ready.persisted_messages().is_empty() {
            log::trace!(
                "Handling {} persisted messages",
                ready.persisted_messages().len()
            );
            self.send_messages(ready.take_persisted_messages());
        }
        // Should be done after the hard state is saved, so that the `applied` index is never
        // bigger than `commit`.
        let stop_consensus =
            handle_committed_entries(&ready.take_committed_entries(), &store, &mut self.node)
                .context("Failed to handle committed entries")?;
        if stop_consensus {
            return Ok((None, None));
        }

        // Advance the Raft.
        let light_rd = self.node.advance(ready);
        Ok((Some(light_rd), role_change))
    }

    /// Tries to process raft's light ready state.
    ///
    /// The order of operations in this function is critical, changing it might lead to bugs.
    ///
    /// Returns with err on failure to apply the state.
    /// If it receives a message to stop the consensus - returns `true`, otherwise `false`.
    fn process_light_ready(&mut self, mut light_rd: raft::LightReady) -> anyhow::Result<bool> {
        let store = self.store();
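        // `LightReady` is produced by `RawNode::advance` after the entries and hard state from
        // the previous `Ready` have been persisted; it carries the updated commit index,
        // follow-up messages and newly committed entries.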
        // Update commit index.
        if let Some(commit) = light_rd.commit_index() {
            log::debug!("Updating commit index to {commit}");
            store
                .set_commit_index(commit)
                .map_err(|err| anyhow!("Failed to set commit index: {}", err))?;
        }

        self.send_messages(light_rd.take_messages());

        // Apply all committed entries.
        let stop_consensus =
            handle_committed_entries(&light_rd.take_committed_entries(), &store, &mut self.node)
                .context("Failed to apply committed entries")?;

        // Advance the apply index.
        self.node.advance_apply();

        Ok(stop_consensus)
    }

    fn store(&self) -> ConsensusStateRef {
        self.node.store().clone()
    }

    fn handle_soft_state(&self, state: &SoftState) {
        let store = self.node.store();
        store.set_raft_soft_state(state);
    }

    fn send_messages(&mut self, messages: Vec<RaftMessage>) {
        self.broker.send(messages);
    }
}

#[derive(Copy, Clone, Debug, thiserror::Error)]
enum TryRecvUpdateError {
    #[error("timeout elapsed")]
    Timeout,

    #[error("channel closed")]
    Closed,
}

#[derive(Debug, thiserror::Error)]
enum TryAddOriginError {
    #[error("origin peer is not a leader")]
    NotLeader,

    #[error("origin peer URI not found")]
    UriNotFound,

    #[error("failed to propose origin peer URI to consensus: {0}")]
    RaftError(#[from] raft::Error),
}

/// This function actually applies the committed entries to the state machine.
/// Returns `true` if consensus should be stopped, `false` otherwise.
fn handle_committed_entries(
    entries: &[Entry],
    state: &ConsensusStateRef,
    raw_node: &mut RawNode<ConsensusStateRef>,
) -> anyhow::Result<bool> {
    let mut stop_consensus = false;
    if let (Some(first), Some(last)) = (entries.first(), entries.last()) {
        state.set_unapplied_entries(first.index, last.index)?;
        stop_consensus = state.apply_entries(raw_node)?;
    }
    Ok(stop_consensus)
}

struct RaftMessageBroker {
    senders: HashMap<PeerId, RaftMessageSenderHandle>,
    runtime: Handle,
    bootstrap_uri: Option<Uri>,
    tls_config: Option<ClientTlsConfig>,
    consensus_config: Arc<ConsensusConfig>,
    consensus_state: ConsensusStateRef,
    transport_channel_pool: Arc<TransportChannelPool>,
}

impl RaftMessageBroker {
    pub fn new(
        runtime: Handle,
        bootstrap_uri: Option<Uri>,
        tls_config: Option<ClientTlsConfig>,
        consensus_config: ConsensusConfig,
        consensus_state: ConsensusStateRef,
        transport_channel_pool: Arc<TransportChannelPool>,
    ) -> Self {
        Self {
            senders: HashMap::new(),
            runtime,
            bootstrap_uri,
            tls_config,
            consensus_config: consensus_config.into(),
            consensus_state,
            transport_channel_pool,
        }
    }

    pub fn send(&mut self, messages: impl IntoIterator<Item = RaftMessage>) {
        let mut messages = messages.into_iter();
        let mut retry = None;

        while let Some(message) = retry.take().or_else(|| messages.next()) {
            let peer_id = message.to;

            let sender = match self.senders.get_mut(&peer_id) {
                Some(sender) => sender,
                None => {
                    log::debug!("Spawning message sender task for peer {peer_id}...");

                    let (task, handle) = self.message_sender();
                    let future = self.runtime.spawn(task.exec());
                    drop(future); // drop `JoinFuture` explicitly to make clippy happy

                    self.senders.insert(peer_id, handle);

                    self.senders
                        .get_mut(&peer_id)
                        .expect("message sender task spawned")
                }
            };

            let failed_to_forward = |message: &RaftMessage, description: &str| {
                let peer_id = message.to;

                let is_debug = log::max_level() >= log::Level::Debug;
                let space = if is_debug { " " } else { "" };
                let message: &dyn fmt::Debug = if is_debug { &message } else { &"" }; // TODO: `fmt::Debug` for `""` prints `""`... 😒

                log::error!(
                    "Failed to forward message{space}{message:?} to message sender task {peer_id}: \
                     {description}"
                );
            };

            match sender.send(message) {
                Ok(()) => (),

                Err(tokio::sync::mpsc::error::TrySendError::Full((_, message))) => {
                    failed_to_forward(
                        &message,
                        "message sender task queue is full. Message will be dropped.",
                    );
                }
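                // A closed queue means the sender task for this peer has stopped: drop its
                // handle so a fresh task is spawned on the next iteration, and retry the message.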
                Err(tokio::sync::mpsc::error::TrySendError::Closed((_, message))) => {
                    failed_to_forward(
                        &message,
                        "message sender task queue is closed. \
                         Message sender task will be restarted and message will be retried.",
                    );

                    self.senders.remove(&peer_id);
                    retry = Some(message);
                }
            }
        }
    }

    fn message_sender(&self) -> (RaftMessageSender, RaftMessageSenderHandle) {
        let (messages_tx, messages_rx) = tokio::sync::mpsc::channel(128);
        let (heartbeat_tx, heartbeat_rx) = tokio::sync::watch::channel(Default::default());

        let task = RaftMessageSender {
            messages: messages_rx,
            heartbeat: heartbeat_rx,
            bootstrap_uri: self.bootstrap_uri.clone(),
            tls_config: self.tls_config.clone(),
            consensus_config: self.consensus_config.clone(),
            consensus_state: self.consensus_state.clone(),
            transport_channel_pool: self.transport_channel_pool.clone(),
        };

        let handle = RaftMessageSenderHandle {
            messages: messages_tx,
            heartbeat: heartbeat_tx,
            index: 0,
        };

        (task, handle)
    }
}

#[derive(Debug)]
struct RaftMessageSenderHandle {
    messages: Sender<(usize, RaftMessage)>,
    heartbeat: watch::Sender<(usize, RaftMessage)>,
    index: usize,
}

impl RaftMessageSenderHandle {
    #[allow(clippy::result_large_err)]
    pub fn send(&mut self, message: RaftMessage) -> RaftMessageSenderResult<()> {
        if !is_heartbeat(&message) {
            self.messages.try_send((self.index, message))?;
        } else {
            self.heartbeat.send((self.index, message)).map_err(
                |tokio::sync::watch::error::SendError(message)| {
                    tokio::sync::mpsc::error::TrySendError::Closed(message)
                },
            )?;
        }

        self.index += 1;

        Ok(())
    }
}

type RaftMessageSenderResult<T, E = RaftMessageSenderError> = Result<T, E>;
type RaftMessageSenderError = tokio::sync::mpsc::error::TrySendError<(usize, RaftMessage)>;

struct RaftMessageSender {
    messages: Receiver<(usize, RaftMessage)>,
    heartbeat: watch::Receiver<(usize, RaftMessage)>,
    bootstrap_uri: Option<Uri>,
    tls_config: Option<ClientTlsConfig>,
    consensus_config: Arc<ConsensusConfig>,
    consensus_state: ConsensusStateRef,
    transport_channel_pool: Arc<TransportChannelPool>,
}
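// `RaftMessageSender` is the long-running per-peer task that owns the receiving ends of the
// queues, while `RaftMessageSenderHandle` is the producing side kept by `RaftMessageBroker`.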
impl RaftMessageSender {
    pub async fn exec(mut self) {
        // Imagine that the `raft` crate put four messages to be sent to some other Raft node into
        // `RaftMessageSender`'s queue:
        //
        // | 4: AppendLog | 3: Heartbeat | 2: Heartbeat | 1: AppendLog |
        //
        // Heartbeat is the most basic message type in Raft. It only carries common "metadata"
        // without any additional "payload". And all other message types in Raft also carry
        // the same basic metadata as the heartbeat message.
        //
        // This way, message `3` instantly "outdates" message `2`: they both carry the same data
        // fields, but message `3` was produced more recently, and so it might contain newer values
        // of these data fields.
        //
        // And because all messages carry the same basic data as the heartbeat message, message `4`
        // instantly "outdates" both message `2` and `3`.
        //
        // This way, if there is more than one message queued for the `RaftMessageSender`,
        // we can optimize delivery a bit and skip any heartbeat message if there's a more
        // recent message scheduled later in the queue.
        //
        // `RaftMessageSender` has two separate "queues":
        // - `messages` queue for non-heartbeat messages
        // - and `heartbeat` "watch" channel for heartbeat messages
        //   - "watch" is a special channel in Tokio, that only retains the *last* sent value
        //   - so any heartbeat received from the `heartbeat` channel is always the *most recent* one
        //
        // We are using `tokio::select` to "simultaneously" check both queues for new messages...
        // but we are using `tokio::select` in a "biased" mode!
        //
        // - in this mode select always polls the `messages.recv()` future first
        // - so even if there are new messages in both queues, it will always return a non-heartbeat
        //   message from the `messages` queue first
        // - and it will only return a heartbeat message from the `heartbeat` channel if there are
        //   no messages left in the `messages` queue
        //
        // There's one special case that we should be careful about with our two queues:
        //
        // If we return to the diagram above, and imagine four messages were sent in the same order
        // into our two queues, then `RaftMessageSender` might pull them from the queues in the
        // `1`, `4`, `3` order.
        //
        // E.g., we pull non-heartbeat messages `1` and `4` first, heartbeat `2` was overwritten
        // by heartbeat `3` (because of the "watch" channel), so once the `messages` queue is empty
        // we receive heartbeat `3`, which is now out-of-order.
        //
        // To handle this we explicitly enumerate each message and only send a message if its index
        // is higher than or equal to the index of the previous one. (This check can be expressed
        // with both a strict "higher" or a "higher-or-equal" conditional, I just like the
        // "or-equal" version a bit better.)
        //
        // If either the `messages` queue or the `heartbeat` channel is closed (e.g., `messages.recv()`
        // returns `None` or `heartbeat.changed()` returns an error), we assume that
        // `RaftMessageSenderHandle` has been dropped, and treat it as a "shutdown"/"cancellation"
        // signal (and break from the loop).

        let mut prev_index = 0;

        loop {
            let (index, message) = tokio::select! {
                biased;
                Some(message) = self.messages.recv() => message,
                Ok(()) = self.heartbeat.changed() => self.heartbeat.borrow_and_update().clone(),
                else => break,
            };

            if prev_index <= index {
                self.send(&message).await;
                prev_index = index;
            }
        }
    }

    async fn send(&mut self, message: &RaftMessage) {
        if let Err(err) = self.try_send(message).await {
            let peer_id = message.to;

            if log::max_level() >= log::Level::Debug {
                log::error!("Failed to send Raft message {message:?} to peer {peer_id}: {err}");
            } else {
                log::error!("Failed to send Raft message to peer {peer_id}: {err}");
            }
        }
    }

    async fn try_send(&mut self, message: &RaftMessage) -> anyhow::Result<()> {
        let peer_id = message.to;

        let uri = self.uri(peer_id).await?;

        let mut bytes = Vec::new();
        <RaftMessage as prost::Message>::encode(message, &mut bytes)
            .context("failed to encode Raft message")?;
        let grpc_message = GrpcRaftMessage { message: bytes };

        let timeout = Duration::from_millis(
            self.consensus_config.message_timeout_ticks * self.consensus_config.tick_period_ms,
        );

        let res = self
            .transport_channel_pool
            .with_channel_timeout(
                &uri,
                |channel| async {
                    let mut client = RaftClient::new(channel);
                    let mut request = tonic::Request::new(grpc_message.clone());
                    request.set_timeout(timeout);
                    client.send(request).await
                },
                Some(timeout),
                0,
            )
            .await;
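        // Raft must be told whether a snapshot transfer succeeded so it can resume or retry
        // log replication for this peer; report the status back to the consensus thread.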
        if message.msg_type == raft::eraftpb::MessageType::MsgSnapshot as i32 {
            let res = self.consensus_state.report_snapshot(
                peer_id,
                if res.is_ok() {
                    SnapshotStatus::Finish
                } else {
                    SnapshotStatus::Failure
                },
            );

            // Should we ignore the error? Seems like it will only produce noise.
            //
            // - `send_message` is only called by the sub-task spawned by the consensus thread.
            // - `report_snapshot` sends a message back to the consensus thread.
            // - It can only fail, if the "receiver" end of the channel is closed.
            // - Which means consensus thread either resolved successfully, or failed.
            // - So, if the consensus thread is shutting down, no need to log a misleading error...
            // - ...or, if the consensus thread failed, then we should already have an error,
            //   and it will only produce more noise.
            if let Err(err) = res {
                log::error!("{}", err);
            }
        }

        match res {
            Ok(_) => self.consensus_state.record_message_send_success(&uri),
            Err(err) => self.consensus_state.record_message_send_failure(&uri, err),
        }

        Ok(())
    }

    async fn uri(&mut self, peer_id: PeerId) -> anyhow::Result<Uri> {
        let uri = self
            .consensus_state
            .peer_address_by_id()
            .get(&peer_id)
            .cloned();

        match uri {
            Some(uri) => Ok(uri),
            None => self.who_is(peer_id).await,
        }
    }

    async fn who_is(&mut self, peer_id: PeerId) -> anyhow::Result<Uri> {
        let bootstrap_uri = self
            .bootstrap_uri
            .clone()
            .ok_or_else(|| anyhow::format_err!("No bootstrap URI provided"))?;

        let bootstrap_timeout = Duration::from_secs(self.consensus_config.bootstrap_timeout_sec);

        // Use dedicated transport channel for who_is because of specific timeout
        let channel = make_grpc_channel(
            bootstrap_timeout,
            bootstrap_timeout,
            bootstrap_uri,
            self.tls_config.clone(),
        )
        .await
        .map_err(|err| anyhow::format_err!("Failed to create who-is channel: {}", err))?;

        let uri = RaftClient::new(channel)
            .who_is(tonic::Request::new(GrpcPeerId { id: peer_id }))
            .await?
            .into_inner()
            .uri
            .parse()?;

        Ok(uri)
    }
}

fn is_heartbeat(message: &RaftMessage) -> bool {
    message.msg_type == raft::eraftpb::MessageType::MsgHeartbeat as i32
        || message.msg_type == raft::eraftpb::MessageType::MsgHeartbeatResponse as i32
}

#[cfg(test)]
mod tests {
    use std::sync::Arc;
    use std::thread;

    use collection::operations::vector_params_builder::VectorParamsBuilder;
    use collection::shards::channel_service::ChannelService;
    use common::cpu::CpuBudget;
    use segment::types::Distance;
    use slog::Drain;
    use storage::content_manager::collection_meta_ops::{
        CollectionMetaOperations, CreateCollection, CreateCollectionOperation,
    };
    use storage::content_manager::consensus::operation_sender::OperationSender;
    use storage::content_manager::consensus::persistent::Persistent;
    use storage::content_manager::consensus_manager::{ConsensusManager, ConsensusStateRef};
    use storage::content_manager::toc::TableOfContent;
    use storage::dispatcher::Dispatcher;
    use storage::rbac::Access;
    use tempfile::Builder;

    use super::Consensus;
    use crate::common::helpers::create_general_purpose_runtime;
    use crate::settings::ConsensusConfig;

    #[test]
    fn collection_creation_passes_consensus() {
        // Given
        let storage_dir = Builder::new().prefix("storage").tempdir().unwrap();
        let mut settings = crate::Settings::new(None).expect("Can't read config.");
        settings.storage.storage_path = storage_dir.path().to_str().unwrap().to_string();

        tracing_subscriber::fmt::init();

        let search_runtime =
            crate::create_search_runtime(settings.storage.performance.max_search_threads)
                .expect("Can't create search runtime.");
        let update_runtime =
            crate::create_update_runtime(settings.storage.performance.max_search_threads)
                .expect("Can't create update runtime.");
        let general_runtime =
            create_general_purpose_runtime().expect("Can't create general purpose runtime.");
        let handle = general_runtime.handle().clone();
        let (propose_sender, propose_receiver) = std::sync::mpsc::channel();
        let persistent_state =
            Persistent::load_or_init(&settings.storage.storage_path, true, false).unwrap();
        let operation_sender = OperationSender::new(propose_sender);
        let toc = TableOfContent::new(
            &settings.storage,
            search_runtime,
            update_runtime,
            general_runtime,
            CpuBudget::default(),
            ChannelService::new(settings.service.http_port, None),
            persistent_state.this_peer_id(),
            Some(operation_sender.clone()),
        );
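        // The `TableOfContent` is shared behind an `Arc` between the consensus state,
        // the dispatcher and the assertions below.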
        let toc_arc = Arc::new(toc);
        let storage_path = toc_arc.storage_path();
        let consensus_state: ConsensusStateRef = ConsensusManager::new(
            persistent_state,
            toc_arc.clone(),
            operation_sender,
            storage_path,
        )
        .into();
        let dispatcher = Dispatcher::new(toc_arc.clone()).with_consensus(consensus_state.clone());
        let slog_logger = slog::Logger::root(slog_stdlog::StdLog.fuse(), slog::o!());
        let (mut consensus, message_sender) = Consensus::new(
            &slog_logger,
            consensus_state.clone(),
            None,
            Some("http://127.0.0.1:6335".parse().unwrap()),
            6335,
            ConsensusConfig::default(),
            None,
            ChannelService::new(settings.service.http_port, None),
            handle.clone(),
            false,
        )
        .unwrap();

        let is_leader_established = consensus_state.is_leader_established.clone();
        thread::spawn(move || consensus.start().unwrap());
        thread::spawn(move || {
            while let Ok(entry) = propose_receiver.recv() {
                if message_sender
                    .blocking_send(super::Message::FromClient(entry))
                    .is_err()
                {
                    log::error!("Can not forward new entry to consensus as it was stopped.");
                    break;
                }
            }
        });

        // Wait for Raft to establish the leader
        is_leader_established.await_ready();

        // Leader election produces a raft log entry, and then origin peer adds itself to consensus
        assert_eq!(consensus_state.hard_state().commit, 2);

        // Initially there are 0 collections
        assert_eq!(toc_arc.all_collections_sync().len(), 0);

        // When

        // New runtime is used as timers need to be enabled.
        handle
            .block_on(
                dispatcher.submit_collection_meta_op(
                    CollectionMetaOperations::CreateCollection(CreateCollectionOperation::new(
                        "test".to_string(),
                        CreateCollection {
                            vectors: VectorParamsBuilder::new(10, Distance::Cosine)
                                .build()
                                .into(),
                            sparse_vectors: None,
                            hnsw_config: None,
                            wal_config: None,
                            optimizers_config: None,
                            shard_number: Some(2),
                            on_disk_payload: None,
                            replication_factor: None,
                            write_consistency_factor: None,
                            init_from: None,
                            quantization_config: None,
                            sharding_method: None,
                            strict_mode_config: None,
                            uuid: None,
                        },
                    )),
                    Access::full("For test"),
                    None,
                ),
            )
            .unwrap();

        // Then
        assert_eq!(consensus_state.hard_state().commit, 5); // first peer self-election + add first peer + create collection + activate shard x2
        assert_eq!(toc_arc.all_collections_sync(), vec!["test"]);
    }
}