use std::num::NonZeroU32;
use std::sync::Arc;
use std::time::{Duration, Instant};

use collection::config::ShardingMethod;
use collection::operations::verification::VerificationPass;
use common::counter::hardware_accumulator::HwMeasurementAcc;
use common::defaults::CONSENSUS_META_OP_WAIT;
use segment::types::default_shard_number_per_node_const;

use crate::content_manager::collection_meta_ops::AliasOperations;
use crate::content_manager::shard_distribution::ShardDistributionProposal;
use crate::rbac::Access;
use crate::{
    ClusterStatus, CollectionMetaOperations, ConsensusOperations, ConsensusStateRef, StorageError,
    TableOfContent,
};

#[derive(Clone)]
pub struct Dispatcher {
    toc: Arc<TableOfContent>,
    consensus_state: Option<ConsensusStateRef>,
}

impl Dispatcher {
    pub fn new(toc: Arc<TableOfContent>) -> Self {
        Self {
            toc,
            consensus_state: None,
        }
    }

    pub fn with_consensus(self, state_ref: ConsensusStateRef) -> Self {
        Self {
            consensus_state: Some(state_ref),
            ..self
        }
    }

    /// Get the table of content.
    ///
    /// The `_access` and `_verification_pass` parameters are not used, but they are required to
    /// verify the caller's possession of both objects.
    pub fn toc(
        &self,
        _access: &Access,
        _verification_pass: &VerificationPass,
    ) -> &Arc<TableOfContent> {
        &self.toc
    }

    pub fn consensus_state(&self) -> Option<&ConsensusStateRef> {
        self.consensus_state.as_ref()
    }

    /// If `wait_timeout` is not supplied, the default duration will be used.
    ///
    /// This function needs to be called from a runtime with timers enabled.
    pub async fn submit_collection_meta_op(
        &self,
        operation: CollectionMetaOperations,
        access: Access,
        wait_timeout: Option<Duration>,
    ) -> Result<bool, StorageError> {
        access.check_collection_meta_operation(&operation)?;

        // If distributed deployment is enabled
        if let Some(state) = self.consensus_state.as_ref() {
            let start = Instant::now();

            // List of operations to await for the collection to become operational
            let mut expect_operations: Vec<ConsensusOperations> = vec![];

            let op = match operation {
                CollectionMetaOperations::CreateCollection(mut op) => {
                    self.toc.check_write_lock()?;
                    if !op.is_distribution_set() {
                        match op.create_collection.sharding_method.unwrap_or_default() {
                            ShardingMethod::Auto => {
                                // Suggest an even distribution of shards across nodes
                                let number_of_peers = state.0.peer_count();
                                let shard_nr_per_node = self
                                    .toc
                                    .storage_config
                                    .collection
                                    .as_ref()
                                    .map(|i| i.shard_number_per_node)
                                    .unwrap_or(default_shard_number_per_node_const());
                                let suggested_shard_nr =
                                    number_of_peers as u32 * shard_nr_per_node;
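                                // For example, a 3-peer cluster with `shard_number_per_node = 2`
                                // yields a suggested total of 3 * 2 = 6 shards.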
                                let shard_distribution = self.toc.suggest_shard_distribution(
                                    &op,
                                    NonZeroU32::new(suggested_shard_nr)
                                        .expect("Peer count should always be >= 1"),
                                );

                                // Expect all replicas to become active eventually
                                for (shard_id, peer_ids) in &shard_distribution.distribution {
                                    for peer_id in peer_ids {
                                        expect_operations.push(
                                            ConsensusOperations::initialize_replica(
                                                op.collection_name.clone(),
                                                *shard_id,
                                                *peer_id,
                                            ),
                                        );
                                    }
                                }
                                op.set_distribution(shard_distribution);
                            }
                            ShardingMethod::Custom => {
                                // If custom sharding is used, we don't create any shards in advance
                                let empty_distribution = ShardDistributionProposal::empty();
                                op.set_distribution(empty_distribution);
                            }
                        }
                    }

                    if let Some(uuid) = &op.create_collection.uuid {
                        log::warn!(
                            "Collection UUID {uuid} explicitly specified \
                             when proposing create collection {} operation, \
                             a new random UUID will be generated instead",
                            op.collection_name,
                        );
                    }
                    op.create_collection.uuid = Some(uuid::Uuid::new_v4());

                    CollectionMetaOperations::CreateCollection(op)
                }
                CollectionMetaOperations::CreateShardKey(op) => {
                    self.toc.check_write_lock()?;
                    CollectionMetaOperations::CreateShardKey(op)
                }
                op => op,
            };

            let operation_awaiter =
                // If an explicit timeout is set, then we need to wait for all expected operations.
                // E.g. in case of `CreateCollection` we will explicitly wait for all replicas to
                // be activated. We need to register the receivers (by calling the function)
                // before submitting the operation.
                if !expect_operations.is_empty() {
                    Some(state.await_for_multiple_operations(expect_operations, wait_timeout))
                } else {
                    None
                };

            let do_sync_nodes = match &op {
                // Sync nodes after collection or shard key creation
                CollectionMetaOperations::CreateCollection(_)
                | CollectionMetaOperations::CreateShardKey(_) => true,

                // Sync nodes when creating or renaming collection aliases
                CollectionMetaOperations::ChangeAliases(changes) => {
                    changes.actions.iter().any(|change| match change {
                        AliasOperations::CreateAlias(_) | AliasOperations::RenameAlias(_) => true,
                        AliasOperations::DeleteAlias(_) => false,
                    })
                }

                // TODO(resharding): Do we need/want to synchronize `Resharding` operations?
                CollectionMetaOperations::Resharding(_, _) => false,

                // No need to sync nodes for other operations
                CollectionMetaOperations::UpdateCollection(_)
                | CollectionMetaOperations::DeleteCollection(_)
                | CollectionMetaOperations::TransferShard(_, _)
                | CollectionMetaOperations::SetShardReplicaState(_)
                | CollectionMetaOperations::DropShardKey(_)
                | CollectionMetaOperations::CreatePayloadIndex(_)
                | CollectionMetaOperations::DropPayloadIndex(_)
                | CollectionMetaOperations::Nop { .. } => false,
            };

            let res = state
                .propose_consensus_op_with_await(
                    ConsensusOperations::CollectionMeta(Box::new(op)),
                    wait_timeout,
                )
                .await?;

            if let Some(operation_awaiter) = operation_awaiter {
                // Actually await the expected operations to complete on the consensus
                match operation_awaiter.await {
                    Ok(Ok(())) => {} // all good
                    Ok(Err(err)) => {
                        log::warn!("Not all expected operations were completed: {err}")
                    }
                    Err(err) => log::warn!("Awaiting expected operations timed out: {err}"),
                }
            }

            // On some operations, synchronize all nodes to ensure all are ready for point operations
            if do_sync_nodes {
                let remaining_timeout =
                    wait_timeout.map(|timeout| timeout.saturating_sub(start.elapsed()));
                if let Err(err) = self.await_consensus_sync(remaining_timeout).await {
                    log::warn!(
                        "Failed to synchronize all nodes after collection operation in time, \
                         some nodes may not be ready: {err}"
                    );
                }
            }

            Ok(res)
        } else {
            if let CollectionMetaOperations::CreateCollection(_) = &operation {
                self.toc.check_write_lock()?;
            }
            self.toc.perform_collection_meta_op(operation).await
        }
    }

    pub fn cluster_status(&self) -> ClusterStatus {
        match self.consensus_state.as_ref() {
            Some(state) => state.cluster_status(),
            None => ClusterStatus::Disabled,
        }
    }

    pub async fn await_consensus_sync(
        &self,
        timeout: Option<Duration>,
    ) -> Result<(), StorageError> {
        let timeout = timeout.unwrap_or(CONSENSUS_META_OP_WAIT);

        if let Some(state) = self.consensus_state.as_ref() {
            let state = state.hard_state();
            let term = state.term;
            let commit = state.commit;

            let channel_service = self.toc.get_channel_service();
            let this_peer_id = self.toc.this_peer_id;
            channel_service
                .await_commit_on_all_peers(this_peer_id, commit, term, timeout)
                .await?;
            log::debug!("Consensus is synchronized with term: {term}, commit: {commit}");
            Ok(())
        } else {
            Ok(())
        }
    }

    #[must_use]
    pub fn get_collection_hw_metrics(&self, collection: String) -> Arc<HwMeasurementAcc> {
        self.toc.get_collection_hw_metrics(collection)
    }
}
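// A minimal usage sketch under assumptions: `build_toc()` and `build_consensus_state()`
// below are hypothetical placeholders for whatever constructs these objects in the
// surrounding application; only `Dispatcher::new`, `with_consensus`, and
// `submit_collection_meta_op` come from this module.
//
//     let toc: Arc<TableOfContent> = build_toc();
//     let consensus_state: ConsensusStateRef = build_consensus_state();
//     let dispatcher = Dispatcher::new(toc).with_consensus(consensus_state);
//
//     // Passing `None` as the timeout falls back to the default wait duration.
//     let applied = dispatcher
//         .submit_collection_meta_op(operation, access, Some(Duration::from_secs(10)))
//         .await?;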