pub mod clock_set;
mod execute_read_operation;
mod locally_disabled_peers;
mod read_ops;
mod shard_transfer;
mod snapshots;
mod update;
use std::collections::{HashMap, HashSet};
use std::ops::Deref as _;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::Duration;
use common::cpu::CpuBudget;
use common::types::TelemetryDetail;
use schemars::JsonSchema;
use segment::types::{ExtendedPointId, Filter};
use serde::{Deserialize, Serialize};
use tokio::runtime::Handle;
use tokio::sync::{Mutex, RwLock};
use super::local_shard::clock_map::RecoveryPoint;
use super::local_shard::LocalShard;
use super::remote_shard::RemoteShard;
use super::transfer::ShardTransfer;
use super::CollectionId;
use crate::collection::payload_index_schema::PayloadIndexSchema;
use crate::common::snapshots_manager::SnapshotStorageManager;
use crate::config::CollectionConfigInternal;
use crate::operations::point_ops::{self};
use crate::operations::shared_storage_config::SharedStorageConfig;
use crate::operations::types::{CollectionError, CollectionResult, UpdateResult, UpdateStatus};
use crate::operations::CollectionUpdateOperations;
use crate::optimizers_builder::OptimizersConfig;
use crate::save_on_disk::SaveOnDisk;
use crate::shards::channel_service::ChannelService;
use crate::shards::dummy_shard::DummyShard;
use crate::shards::replica_set::clock_set::ClockSet;
use crate::shards::shard::{PeerId, Shard, ShardId};
use crate::shards::shard_config::ShardConfig;
use crate::shards::telemetry::ReplicaSetTelemetry;
//    β”‚ Collection Created
//    β”‚
//    β–Ό
//  β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
//  β”‚              β”‚
//  β”‚ Initializing β”‚
//  β”‚              β”‚
//  β””β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”˜
//         β”‚  Report created    β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
//         └────────────────────►           β”‚
//            Activate          β”‚ Consensus β”‚
//        β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€            β”‚
//        β”‚                     β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
//  β”Œβ”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”  User Promote         β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
//  β”‚             β—„β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Ί          β”‚
//  β”‚    Active   β”‚                       β”‚ Listener β”‚
//  β”‚             β—„β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”           β”‚          β”‚
//  β””β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜           β”‚Transfer   β””β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”˜
//     β”‚                      β”‚Finished       β”‚
//     β”‚             β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”        β”‚Update
//     β”‚Update       β”‚               β”‚        β”‚Failure
//     β”‚Failure      β”‚    Partial    β”œβ”€β”€β”€β”    β”‚
//     β”‚             β”‚               β”‚   β”‚    β”‚
//     β”‚             β””β”€β”€β”€β”€β”€β”€β”€β–²β”€β”€β”€β”€β”€β”€β”€β”˜   β”‚    β”‚
//     β”‚                     β”‚           β”‚    β”‚
//  β”Œβ”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” Transfer β”‚           β”‚    β”‚
//  β”‚             β”‚ Started  β”‚           β”‚    β”‚
//  β”‚    Dead     β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜           β”‚    β”‚
//  β”‚             β”‚                      β”‚    β”‚
//  β””β”€β–²β”€β”€β”€β”€β”€β”€β”€β–²β”€β”€β”€β”˜  Transfer            β”‚    β”‚
//    β”‚       β”‚      Failed/Cancelled    β”‚    β”‚
//    β”‚       └──────────────────────────┘    β”‚
//    β”‚                                       β”‚
//    └───────────────────────────────────────┘
//
/// A set of shard replicas.
///
/// Handles operations so that the state is consistent across all the replicas of the shard.
/// Prefers the local shard for read-only operations.
/// Performs updates on all replicas and reports an error if at least one of them fails.
///
pub struct ShardReplicaSet {
local: RwLock<Option<Shard>>, // Abstract Shard to be able to use a Proxy during replication
remotes: RwLock<Vec<RemoteShard>>,
replica_state: Arc<SaveOnDisk<ReplicaSetState>>,
/// List of peers that are marked as dead locally, but whose state change has not yet been submitted to consensus.
/// The list is checked on each consensus round and submitted to consensus.
/// If the state of a peer is changed in consensus, it is removed from this list.
/// Update and read operations are not performed on peers marked as dead.
locally_disabled_peers: parking_lot::RwLock<locally_disabled_peers::Registry>,
pub(crate) shard_path: PathBuf,
pub(crate) shard_id: ShardId,
notify_peer_failure_cb: ChangePeerFromState,
abort_shard_transfer_cb: AbortShardTransfer,
channel_service: ChannelService,
collection_id: CollectionId,
collection_config: Arc<RwLock<CollectionConfigInternal>>,
optimizers_config: OptimizersConfig,
pub(crate) shared_storage_config: Arc<SharedStorageConfig>,
payload_index_schema: Arc<SaveOnDisk<PayloadIndexSchema>>,
update_runtime: Handle,
search_runtime: Handle,
optimizer_cpu_budget: CpuBudget,
/// Lock to serialize write operations on the replica set when write ordering is used.
write_ordering_lock: Mutex<()>,
/// Local clock set, used to tag new operations on this shard.
clock_set: Mutex<ClockSet>,
}
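// Callback types used to report replica failures and state changes to consensus,
// and to abort ongoing shard transfers.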
pub type AbortShardTransfer = Arc<dyn Fn(ShardTransfer, &str) + Send + Sync>;
pub type ChangePeerState = Arc<dyn Fn(PeerId, ShardId) + Send + Sync>;
pub type ChangePeerFromState = Arc<dyn Fn(PeerId, ShardId, Option<ReplicaState>) + Send + Sync>;
const REPLICA_STATE_FILE: &str = "replica_state.json";
impl ShardReplicaSet {
/// Create a new, fresh replica set; no previous state is expected.
#[allow(clippy::too_many_arguments)]
pub async fn build(
shard_id: ShardId,
collection_id: CollectionId,
this_peer_id: PeerId,
local: bool,
remotes: HashSet<PeerId>,
on_peer_failure: ChangePeerFromState,
abort_shard_transfer: AbortShardTransfer,
collection_path: &Path,
collection_config: Arc<RwLock<CollectionConfigInternal>>,
effective_optimizers_config: OptimizersConfig,
shared_storage_config: Arc<SharedStorageConfig>,
payload_index_schema: Arc<SaveOnDisk<PayloadIndexSchema>>,
channel_service: ChannelService,
update_runtime: Handle,
search_runtime: Handle,
optimizer_cpu_budget: CpuBudget,
init_state: Option<ReplicaState>,
) -> CollectionResult<Self> {
let shard_path = super::create_shard_dir(collection_path, shard_id).await?;
let local = if local {
let shard = LocalShard::build(
shard_id,
collection_id.clone(),
&shard_path,
collection_config.clone(),
shared_storage_config.clone(),
payload_index_schema.clone(),
update_runtime.clone(),
search_runtime.clone(),
optimizer_cpu_budget.clone(),
effective_optimizers_config.clone(),
)
.await?;
Some(Shard::Local(shard))
} else {
None
};
let replica_state: SaveOnDisk<ReplicaSetState> =
SaveOnDisk::load_or_init_default(shard_path.join(REPLICA_STATE_FILE))?;
let init_replica_state = init_state.unwrap_or(ReplicaState::Initializing);
replica_state.write(|rs| {
rs.this_peer_id = this_peer_id;
if local.is_some() {
rs.is_local = true;
rs.set_peer_state(this_peer_id, init_replica_state);
}
for peer in remotes {
rs.set_peer_state(peer, init_replica_state);
}
})?;
let remote_shards = Self::init_remote_shards(
shard_id,
collection_id.clone(),
&replica_state.read(),
&channel_service,
);
// Save shard config as the last step, to ensure that the file state is consistent
// Presence of shard config indicates that the shard is ready to be used
let replica_set_shard_config = ShardConfig::new_replica_set();
replica_set_shard_config.save(&shard_path)?;
Ok(Self {
shard_id,
local: RwLock::new(local),
remotes: RwLock::new(remote_shards),
replica_state: replica_state.into(),
locally_disabled_peers: Default::default(),
shard_path,
abort_shard_transfer_cb: abort_shard_transfer,
notify_peer_failure_cb: on_peer_failure,
channel_service,
collection_id,
collection_config,
optimizers_config: effective_optimizers_config,
shared_storage_config,
payload_index_schema,
update_runtime,
search_runtime,
optimizer_cpu_budget,
write_ordering_lock: Mutex::new(()),
clock_set: Default::default(),
})
}
/// Recovers shard from disk.
///
/// WARN: This method is intended to be used only on the initial start of the node.
/// It does not implement any logic to recover from a failure.
/// It will panic or load a partial state if there is a failure.
#[allow(clippy::too_many_arguments)]
pub async fn load(
shard_id: ShardId,
collection_id: CollectionId,
shard_path: &Path,
collection_config: Arc<RwLock<CollectionConfigInternal>>,
effective_optimizers_config: OptimizersConfig,
shared_storage_config: Arc<SharedStorageConfig>,
payload_index_schema: Arc<SaveOnDisk<PayloadIndexSchema>>,
channel_service: ChannelService,
on_peer_failure: ChangePeerFromState,
abort_shard_transfer: AbortShardTransfer,
this_peer_id: PeerId,
update_runtime: Handle,
search_runtime: Handle,
optimizer_cpu_budget: CpuBudget,
) -> Self {
let replica_state: SaveOnDisk<ReplicaSetState> =
SaveOnDisk::load_or_init_default(shard_path.join(REPLICA_STATE_FILE)).unwrap();
if replica_state.read().this_peer_id != this_peer_id {
replica_state
.write(|rs| {
let this_peer_id = rs.this_peer_id;
let local_state = rs.remove_peer_state(this_peer_id);
if let Some(state) = local_state {
rs.set_peer_state(this_peer_id, state);
}
rs.this_peer_id = this_peer_id;
})
.map_err(|e| {
panic!("Failed to update replica state in {shard_path:?}: {e}");
})
.unwrap();
}
let remote_shards: Vec<_> = Self::init_remote_shards(
shard_id,
collection_id.clone(),
&replica_state.read(),
&channel_service,
);
let mut local_load_failure = false;
let local = if replica_state.read().is_local {
let shard = if let Some(recovery_reason) = &shared_storage_config.recovery_mode {
Shard::Dummy(DummyShard::new(recovery_reason))
} else {
let res = LocalShard::load(
shard_id,
collection_id.clone(),
shard_path,
collection_config.clone(),
effective_optimizers_config.clone(),
shared_storage_config.clone(),
payload_index_schema.clone(),
update_runtime.clone(),
search_runtime.clone(),
optimizer_cpu_budget.clone(),
)
.await;
match res {
Ok(shard) => Shard::Local(shard),
Err(err) => {
if !shared_storage_config.handle_collection_load_errors {
panic!("Failed to load local shard {shard_path:?}: {err}")
}
local_load_failure = true;
log::error!(
"Failed to load local shard {shard_path:?}, \
initializing \"dummy\" shard instead: \
{err}"
);
Shard::Dummy(DummyShard::new(format!(
"Failed to load local shard {shard_path:?}: {err}"
)))
}
}
};
Some(shard)
} else {
None
};
let replica_set = Self {
shard_id,
local: RwLock::new(local),
remotes: RwLock::new(remote_shards),
replica_state: replica_state.into(),
// TODO: move to collection config
locally_disabled_peers: Default::default(),
shard_path: shard_path.to_path_buf(),
notify_peer_failure_cb: on_peer_failure,
abort_shard_transfer_cb: abort_shard_transfer,
channel_service,
collection_id,
collection_config,
optimizers_config: effective_optimizers_config,
shared_storage_config,
payload_index_schema,
update_runtime,
search_runtime,
optimizer_cpu_budget,
write_ordering_lock: Mutex::new(()),
clock_set: Default::default(),
};
if local_load_failure && replica_set.active_remote_shards().is_empty() {
replica_set
.locally_disabled_peers
.write()
.disable_peer(this_peer_id);
}
replica_set
}
pub fn this_peer_id(&self) -> PeerId {
self.replica_state.read().this_peer_id
}
pub async fn has_local_shard(&self) -> bool {
self.local.read().await.is_some()
}
pub async fn is_local(&self) -> bool {
let local_read = self.local.read().await;
matches!(*local_read, Some(Shard::Local(_) | Shard::Dummy(_)))
}
pub async fn is_queue_proxy(&self) -> bool {
let local_read = self.local.read().await;
matches!(*local_read, Some(Shard::QueueProxy(_)))
}
pub async fn is_dummy(&self) -> bool {
let local_read = self.local.read().await;
matches!(*local_read, Some(Shard::Dummy(_)))
}
pub fn peers(&self) -> HashMap<PeerId, ReplicaState> {
self.replica_state.read().peers()
}
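/// Check whether the given peer is the only peer in the `Active` state for this shard.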
pub fn is_last_active_replica(&self, peer_id: PeerId) -> bool {
let active_peers = self.replica_state.read().active_peers();
active_peers.len() == 1 && active_peers.contains(&peer_id)
}
pub fn peer_state(&self, peer_id: PeerId) -> Option<ReplicaState> {
self.replica_state.read().get_peer_state(peer_id)
}
/// List the peer IDs on which this shard is active, both the local and remote peers.
pub fn active_shards(&self) -> Vec<PeerId> {
let replica_state = self.replica_state.read();
replica_state
.active_peers()
.into_iter()
.filter(|&peer_id| !self.is_locally_disabled(peer_id))
.collect()
}
/// List the remote peer IDs on which this shard is active, excludes the local peer ID.
pub fn active_remote_shards(&self) -> Vec<PeerId> {
let replica_state = self.replica_state.read();
let this_peer_id = replica_state.this_peer_id;
replica_state
.active_peers()
.into_iter()
.filter(|&peer_id| !self.is_locally_disabled(peer_id) && peer_id != this_peer_id)
.collect()
}
/// Wait for a local shard to be initialized.
///
/// Uses a blocking thread internally.
pub async fn wait_for_local(&self, timeout: Duration) -> CollectionResult<()> {
self.wait_for(|replica_set_state| replica_set_state.is_local, timeout)
.await
}
pub fn wait_for_state_condition_sync<F>(&self, check: F, timeout: Duration) -> bool
where
F: Fn(&ReplicaSetState) -> bool,
{
let replica_state = self.replica_state.clone();
replica_state.wait_for(check, timeout)
}
/// Wait for a local shard to get into `state`
///
/// Uses a blocking thread internally.
pub async fn wait_for_local_state(
&self,
state: ReplicaState,
timeout: Duration,
) -> CollectionResult<()> {
self.wait_for(
move |replica_set_state| {
replica_set_state.get_peer_state(replica_set_state.this_peer_id) == Some(state)
},
timeout,
)
.await
}
/// Wait for a peer shard to get into `state`
///
/// Uses a blocking thread internally.
///
/// # Cancel safety
///
/// This method is cancel safe.
pub async fn wait_for_state(
&self,
peer_id: PeerId,
state: ReplicaState,
timeout: Duration,
) -> CollectionResult<()> {
self.wait_for(
move |replica_set_state| replica_set_state.get_peer_state(peer_id) == Some(state),
timeout,
)
.await
}
/// Wait for a replica set state condition to be true.
///
/// Uses a blocking thread internally.
///
/// # Cancel safety
///
/// This method is cancel safe.
async fn wait_for<F>(&self, check: F, timeout: Duration) -> CollectionResult<()>
where
F: Fn(&ReplicaSetState) -> bool + Send + 'static,
{
// TODO: Propagate cancellation into `spawn_blocking` task!?
let replica_state = self.replica_state.clone();
let timed_out =
!tokio::task::spawn_blocking(move || replica_state.wait_for(check, timeout))
.await
.map_err(|err| {
CollectionError::service_error(format!(
"Failed to wait for replica set state: {err}"
))
})?;
if timed_out {
return Err(CollectionError::service_error(
"Failed to wait for replica set state, timed out",
));
}
Ok(())
}
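/// Replace the local shard with a newly built, empty one.
///
/// On failure, the previous local shard is restored.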
pub async fn init_empty_local_shard(&self) -> CollectionResult<()> {
let mut local = self.local.write().await;
let current_shard = local.take();
// ToDo: Remove shard files here?
let local_shard_res = LocalShard::build(
self.shard_id,
self.collection_id.clone(),
&self.shard_path,
self.collection_config.clone(),
self.shared_storage_config.clone(),
self.payload_index_schema.clone(),
self.update_runtime.clone(),
self.search_runtime.clone(),
self.optimizer_cpu_budget.clone(),
self.optimizers_config.clone(),
)
.await;
match local_shard_res {
Ok(local_shard) => {
*local = Some(Shard::Local(local_shard));
Ok(())
}
Err(err) => {
log::error!(
"Failed to initialize local shard {:?}: {err}",
self.shard_path
);
*local = current_shard;
Err(err)
}
}
}
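/// Set the local shard to the given one, optionally updating this peer's replica state.
///
/// Returns the previous local shard, if any.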
pub async fn set_local(
&self,
local: LocalShard,
state: Option<ReplicaState>,
) -> CollectionResult<Option<Shard>> {
let old_shard = self.local.write().await.replace(Shard::Local(local));
if !self.replica_state.read().is_local || state.is_some() {
self.replica_state.write(|rs| {
rs.is_local = true;
if let Some(active) = state {
rs.set_peer_state(self.this_peer_id(), active);
}
})?;
}
self.update_locally_disabled(self.this_peer_id());
Ok(old_shard)
}
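/// Remove the local shard of this replica set and clear its data from disk.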
pub async fn remove_local(&self) -> CollectionResult<()> {
// TODO: Ensure cancel safety!
self.replica_state.write(|rs| {
rs.is_local = false;
let this_peer_id = rs.this_peer_id;
rs.remove_peer_state(this_peer_id);
})?;
self.update_locally_disabled(self.this_peer_id());
let removing_local = {
let mut local = self.local.write().await;
local.take()
};
if let Some(removing_local) = removing_local {
// stop ongoing tasks and delete data
drop(removing_local);
LocalShard::clear(&self.shard_path).await?;
}
Ok(())
}
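/// Register a remote shard on `peer_id` with the given replica `state`, creating it if it does not exist yet.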
pub async fn add_remote(&self, peer_id: PeerId, state: ReplicaState) -> CollectionResult<()> {
debug_assert!(peer_id != self.this_peer_id());
self.replica_state.write(|rs| {
rs.set_peer_state(peer_id, state);
})?;
self.update_locally_disabled(peer_id);
let mut remotes = self.remotes.write().await;
// Check whether this remote already exists
if remotes.iter().any(|remote| remote.peer_id == peer_id) {
return Ok(());
}
remotes.push(RemoteShard::new(
self.shard_id,
self.collection_id.clone(),
peer_id,
self.channel_service.clone(),
));
Ok(())
}
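/// Remove the remote shard on `peer_id` and forget its replica state.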
pub async fn remove_remote(&self, peer_id: PeerId) -> CollectionResult<()> {
self.replica_state.write(|rs| {
rs.remove_peer_state(peer_id);
})?;
self.update_locally_disabled(peer_id);
let mut remotes = self.remotes.write().await;
remotes.retain(|remote| remote.peer_id != peer_id);
Ok(())
}
/// Change the state of the replica to the given state.
/// Ensure that the remote shard is initialized if necessary.
pub async fn ensure_replica_with_state(
&self,
peer_id: PeerId,
state: ReplicaState,
) -> CollectionResult<()> {
if peer_id == self.this_peer_id() {
self.set_replica_state(peer_id, state)?;
} else {
// Create remote shard if necessary
self.add_remote(peer_id, state).await?;
}
Ok(())
}
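/// Set the replica state of the given peer and persist it on disk.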
pub fn set_replica_state(&self, peer_id: PeerId, state: ReplicaState) -> CollectionResult<()> {
log::debug!(
"Changing local shard {}:{} state from {:?} to {state:?}",
self.collection_id,
self.shard_id,
self.replica_state.read().get_peer_state(peer_id),
);
self.replica_state.write(|rs| {
if rs.this_peer_id == peer_id {
rs.is_local = true;
}
rs.set_peer_state(peer_id, state);
})?;
self.update_locally_disabled(peer_id);
Ok(())
}
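/// Remove the replica on the given peer, whether it is the local shard or a remote one.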
pub async fn remove_peer(&self, peer_id: PeerId) -> CollectionResult<()> {
if self.this_peer_id() == peer_id {
self.remove_local().await?;
} else {
self.remove_remote(peer_id).await?;
}
Ok(())
}
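/// Apply the replica set state as dictated by consensus.
///
/// Removes replicas that are no longer listed and creates missing local or remote replicas.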
pub async fn apply_state(
&self,
replicas: HashMap<PeerId, ReplicaState>,
) -> CollectionResult<()> {
let old_peers = self.replica_state.read().peers();
self.replica_state.write(|state| {
state.set_peers(replicas.clone());
})?;
self.locally_disabled_peers.write().clear();
let removed_peers = old_peers
.keys()
.filter(|peer_id| !replicas.contains_key(peer_id))
.copied()
.collect::<Vec<_>>();
for peer_id in removed_peers {
self.remove_peer(peer_id).await?;
}
for (peer_id, state) in replicas {
let peer_already_exists = old_peers.contains_key(&peer_id);
if peer_already_exists {
// do nothing
// We only need to change state and it is already saved
continue;
}
if peer_id == self.this_peer_id() {
// Consensus wants a local replica on this peer
let local_shard = LocalShard::build(
self.shard_id,
self.collection_id.clone(),
&self.shard_path,
self.collection_config.clone(),
self.shared_storage_config.clone(),
self.payload_index_schema.clone(),
self.update_runtime.clone(),
self.search_runtime.clone(),
self.optimizer_cpu_budget.clone(),
self.optimizers_config.clone(),
)
.await?;
match state {
ReplicaState::Active | ReplicaState::Listener => {
// No way we can provide up-to-date replica right away at this point,
// so we report a failure to consensus
self.set_local(local_shard, Some(state)).await?;
self.notify_peer_failure(peer_id, Some(state));
}
ReplicaState::Dead
| ReplicaState::Partial
| ReplicaState::Initializing
| ReplicaState::PartialSnapshot
| ReplicaState::Recovery
| ReplicaState::Resharding => {
self.set_local(local_shard, Some(state)).await?;
}
}
continue;
}
// Otherwise it is a missing remote replica, we simply create it
let new_remote = RemoteShard::new(
self.shard_id,
self.collection_id.clone(),
peer_id,
self.channel_service.clone(),
);
self.remotes.write().await.push(new_remote);
}
Ok(())
}
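/// Propagate an optimizer configuration update to the local shard, if any.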
pub(crate) async fn on_optimizer_config_update(&self) -> CollectionResult<()> {
let read_local = self.local.read().await;
if let Some(shard) = &*read_local {
shard.on_optimizer_config_update().await
} else {
Ok(())
}
}
/// Check whether there are any locally disabled peers,
/// and if so, report them to consensus.
pub fn sync_local_state<F>(&self, get_shard_transfers: F) -> CollectionResult<()>
where
F: Fn(ShardId, PeerId) -> Vec<ShardTransfer>,
{
let peers_to_notify: Vec<_> = self
.locally_disabled_peers
.write()
.notify_elapsed()
.collect();
for (failed_peer_id, from_state) in peers_to_notify {
self.notify_peer_failure(failed_peer_id, from_state);
for transfer in get_shard_transfers(self.shard_id, failed_peer_id) {
self.abort_shard_transfer(
transfer,
&format!(
"{failed_peer_id}/{}:{} replica failed",
self.collection_id, self.shard_id,
),
);
}
}
Ok(())
}
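/// Collect telemetry from the local shard and all remote shards of this replica set.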
pub(crate) async fn get_telemetry_data(&self, detail: TelemetryDetail) -> ReplicaSetTelemetry {
let local_shard = self.local.read().await;
let local = local_shard.as_ref();
let local_telemetry = match local {
Some(local_shard) => Some(local_shard.get_telemetry_data(detail).await),
None => None,
};
ReplicaSetTelemetry {
id: self.shard_id,
local: local_telemetry,
remote: self
.remotes
.read()
.await
.iter()
.map(|remote| remote.get_telemetry_data(detail))
.collect(),
replicate_states: self.replica_state.read().peers(),
}
}
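/// Check availability of the remote shard replica on the given peer.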
pub(crate) async fn health_check(&self, peer_id: PeerId) -> CollectionResult<()> {
let remotes = self.remotes.read().await;
let Some(remote) = remotes.iter().find(|remote| remote.peer_id == peer_id) else {
return Err(CollectionError::NotFound {
what: format!("{}/{}:{} shard", peer_id, self.collection_id, self.shard_id),
});
};
remote.health_check().await?;
Ok(())
}
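/// Delete all points matching the given filter from the local shard only.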
pub async fn delete_local_points(&self, filter: Filter) -> CollectionResult<UpdateResult> {
let local_shard_guard = self.local.read().await;
let Some(local_shard) = local_shard_guard.deref() else {
return Err(CollectionError::NotFound {
what: format!("local shard {}:{}", self.collection_id, self.shard_id),
});
};
let mut next_offset = Some(ExtendedPointId::NumId(0));
let mut ids = Vec::new();
while let Some(current_offset) = next_offset {
const BATCH_SIZE: usize = 1000;
let mut points = local_shard
.get()
.scroll_by(
Some(current_offset),
BATCH_SIZE + 1,
&false.into(),
&false.into(),
Some(&filter),
&self.search_runtime,
None,
None,
)
.await?;
if points.len() > BATCH_SIZE {
next_offset = points.pop().map(|points| points.id);
} else {
next_offset = None;
}
ids.extend(points.into_iter().map(|points| points.id));
}
if ids.is_empty() {
return Ok(UpdateResult {
operation_id: None,
status: UpdateStatus::Completed,
clock_tag: None,
});
}
drop(local_shard_guard);
let op =
CollectionUpdateOperations::PointOperation(point_ops::PointOperations::DeletePoints {
ids,
});
// TODO(resharding): Assign clock tag to the operation!? πŸ€”
let result = self.update_local(op.into(), false).await?.ok_or_else(|| {
CollectionError::bad_request(format!(
"local shard {}:{} does not exist or is unavailable",
self.collection_id, self.shard_id,
))
})?;
Ok(result)
}
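/// Build remote shard handles for all peers in the replica set state, excluding this peer.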
fn init_remote_shards(
shard_id: ShardId,
collection_id: CollectionId,
state: &ReplicaSetState,
channel_service: &ChannelService,
) -> Vec<RemoteShard> {
state
.peers()
.iter()
.filter(|(peer, _)| **peer != state.this_peer_id)
.map(|(peer_id, _is_active)| {
RemoteShard::new(
shard_id,
collection_id.clone(),
*peer_id,
channel_service.clone(),
)
})
.collect()
}
/// Check whether a peer is registered as `active`.
/// Unknown peers are not active.
fn peer_is_active(&self, peer_id: PeerId) -> bool {
self.peer_state(peer_id) == Some(ReplicaState::Active) && !self.is_locally_disabled(peer_id)
}
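/// Check whether a peer is in the `Active` or `Resharding` state and not locally disabled.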
fn peer_is_active_or_resharding(&self, peer_id: PeerId) -> bool {
let is_active_or_resharding = matches!(
self.peer_state(peer_id),
Some(ReplicaState::Active | ReplicaState::Resharding)
);
let is_locally_disabled = self.is_locally_disabled(peer_id);
is_active_or_resharding && !is_locally_disabled
}
fn is_locally_disabled(&self, peer_id: PeerId) -> bool {
self.locally_disabled_peers.read().is_disabled(peer_id)
}
/// Locally disable the given peer
///
/// Disables the peer and notifies consensus periodically.
///
/// Prevents disabling the last peer (according to consensus).
///
/// If `from_state` is given, the peer will only be disabled if the given state matches
/// consensus.
fn add_locally_disabled(
&self,
state: &ReplicaSetState,
peer_id: PeerId,
from_state: Option<ReplicaState>,
) {
let other_peers = state
.active_or_resharding_peers()
.filter(|id| id != &peer_id);
let mut locally_disabled_peers_guard = self.locally_disabled_peers.upgradable_read();
// Prevent disabling last peer in consensus
{
if !locally_disabled_peers_guard.is_disabled(peer_id)
&& locally_disabled_peers_guard.is_all_disabled(other_peers)
{
log::warn!("Cannot locally disable last active peer {peer_id} for replica");
return;
}
}
locally_disabled_peers_guard.with_upgraded(|locally_disabled_peers| {
if locally_disabled_peers.disable_peer_and_notify_if_elapsed(peer_id, from_state) {
self.notify_peer_failure(peer_id, from_state);
}
});
}
/// Make sure that locally disabled peers do not contradict the consensus
fn update_locally_disabled(&self, peer_id_to_remove: PeerId) {
let mut locally_disabled_peers = self.locally_disabled_peers.write();
// Check that we are not trying to disable the last active peer
if locally_disabled_peers
.is_all_disabled(self.replica_state.read().active_or_resharding_peers())
{
log::warn!("Resolving consensus/local state inconsistency");
locally_disabled_peers.clear();
} else {
locally_disabled_peers.enable_peer(peer_id_to_remove);
}
}
fn notify_peer_failure(&self, peer_id: PeerId, from_state: Option<ReplicaState>) {
log::debug!("Notify peer failure: {}", peer_id);
self.notify_peer_failure_cb.deref()(peer_id, self.shard_id, from_state)
}
fn abort_shard_transfer(&self, transfer: ShardTransfer, reason: &str) {
log::debug!(
"Abort {}:{} / {} -> {} shard transfer",
self.collection_id,
transfer.shard_id,
transfer.from,
transfer.to,
);
self.abort_shard_transfer_cb.deref()(transfer, reason)
}
/// Get shard recovery point for WAL.
pub(crate) async fn shard_recovery_point(&self) -> CollectionResult<RecoveryPoint> {
let local_shard = self.local.read().await;
let Some(local_shard) = local_shard.as_ref() else {
return Err(CollectionError::NotFound {
what: "Peer does not have local shard".into(),
});
};
local_shard.shard_recovery_point().await
}
/// Update the cutoff point for the local shard.
pub(crate) async fn update_shard_cutoff_point(
&self,
cutoff: &RecoveryPoint,
) -> CollectionResult<()> {
let local_shard = self.local.read().await;
let Some(local_shard) = local_shard.as_ref() else {
return Err(CollectionError::NotFound {
what: "Peer does not have local shard".into(),
});
};
local_shard.update_cutoff(cutoff).await
}
pub(crate) fn get_snapshots_storage_manager(&self) -> CollectionResult<SnapshotStorageManager> {
SnapshotStorageManager::new(&self.shared_storage_config.snapshots_config)
}
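/// Trigger optimizers on the local shard, if any.
///
/// Returns `false` if there is no local shard.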
pub(crate) async fn trigger_optimizers(&self) -> bool {
let shard = self.local.read().await;
let Some(shard) = shard.as_ref() else {
return false;
};
shard.trigger_optimizers();
true
}
}
/// Represents a replica set state
#[derive(Debug, Deserialize, Serialize, Default, PartialEq, Eq, Clone)]
pub struct ReplicaSetState {
pub is_local: bool,
pub this_peer_id: PeerId,
peers: HashMap<PeerId, ReplicaState>,
}
impl ReplicaSetState {
pub fn get_peer_state(&self, peer_id: PeerId) -> Option<ReplicaState> {
self.peers.get(&peer_id).copied()
}
pub fn set_peer_state(&mut self, peer_id: PeerId, state: ReplicaState) {
self.peers.insert(peer_id, state);
}
pub fn remove_peer_state(&mut self, peer_id: PeerId) -> Option<ReplicaState> {
self.peers.remove(&peer_id)
}
pub fn peers(&self) -> HashMap<PeerId, ReplicaState> {
self.peers.clone()
}
pub fn active_peers(&self) -> Vec<PeerId> {
self.peers
.iter()
.filter_map(|(peer_id, state)| {
matches!(state, ReplicaState::Active).then_some(*peer_id)
})
.collect()
}
pub fn active_or_resharding_peers(&self) -> impl Iterator<Item = PeerId> + '_ {
self.peers.iter().filter_map(|(peer_id, state)| {
matches!(state, ReplicaState::Active | ReplicaState::Resharding).then_some(*peer_id)
})
}
pub fn set_peers(&mut self, peers: HashMap<PeerId, ReplicaState>) {
self.peers = peers;
}
}
/// State of the single shard within a replica set.
#[derive(Debug, Deserialize, Serialize, JsonSchema, Default, PartialEq, Eq, Hash, Clone, Copy)]
pub enum ReplicaState {
// Active and sound
#[default]
Active,
// Failed for some reason
Dead,
// The shard is partially loaded and is currently receiving data from other shards
Partial,
// Collection is being created
Initializing,
// A shard which receives data, but is not used for search
// Useful for backup shards
Listener,
// Deprecated since Qdrant 1.9.0, used in Qdrant 1.7.0 and 1.8.0
//
// Snapshot shard transfer is in progress, updates aren't sent to the shard
// Normally rejects updates. Since 1.8 it allows updates if force is true.
#[schemars(skip)]
PartialSnapshot,
// Shard is undergoing recovery by an external node
// Normally rejects updates, accepts updates if force is true
Recovery,
// Points are being migrated to this shard as part of resharding
#[schemars(skip)]
Resharding,
}
impl ReplicaState {
/// Check whether the replica state is active or listener or resharding.
pub fn is_active_or_listener_or_resharding(self) -> bool {
match self {
ReplicaState::Active | ReplicaState::Listener | ReplicaState::Resharding => true,
ReplicaState::Dead
| ReplicaState::Initializing
| ReplicaState::Partial
| ReplicaState::PartialSnapshot
| ReplicaState::Recovery => false,
}
}
/// Check whether the replica state is partial or partial-like.
///
/// In other words: is the state related to shard transfers?
pub fn is_partial_or_recovery(self) -> bool {
match self {
ReplicaState::Partial
| ReplicaState::PartialSnapshot
| ReplicaState::Recovery
| ReplicaState::Resharding => true,
ReplicaState::Active
| ReplicaState::Dead
| ReplicaState::Initializing
| ReplicaState::Listener => false,
}
}
}
/// Represents a change in replica set, due to scaling of `replication_factor`
#[derive(Debug, Deserialize, Serialize, PartialEq, Eq, Hash, Clone)]
pub enum Change {
Remove(ShardId, PeerId),
}