// colibri.qdrant / lib/collection/src/shards/forward_proxy_shard.rs
// Author: Gouzi Mohaled — "Ajout du dossier lib" (adds the lib folder), commit 84d2a97
use std::path::Path;
use std::sync::Arc;
use std::time::Duration;
use async_trait::async_trait;
use common::counter::hardware_accumulator::HwMeasurementAcc;
use common::tar_ext;
use common::types::TelemetryDetail;
use segment::data_types::facets::{FacetParams, FacetResponse};
use segment::data_types::order_by::OrderBy;
use segment::types::{
ExtendedPointId, Filter, PointIdType, ScoredPoint, SnapshotFormat, WithPayload,
WithPayloadInterface, WithVector,
};
use tokio::runtime::Handle;
use tokio::sync::Mutex;
use super::shard::ShardId;
use super::update_tracker::UpdateTracker;
use crate::hash_ring::HashRingRouter;
use crate::operations::point_ops::{
PointInsertOperationsInternal, PointOperations, PointStructPersisted, PointSyncOperation,
};
use crate::operations::types::{
CollectionError, CollectionInfo, CollectionResult, CoreSearchRequestBatch,
CountRequestInternal, CountResult, PointRequestInternal, RecordInternal, UpdateResult,
UpdateStatus,
};
use crate::operations::universal_query::shard_query::{ShardQueryRequest, ShardQueryResponse};
use crate::operations::{
CollectionUpdateOperations, CreateIndex, FieldIndexOperations, OperationToShard,
OperationWithClockTag, SplitByShard as _,
};
use crate::shards::local_shard::LocalShard;
use crate::shards::remote_shard::RemoteShard;
use crate::shards::shard_trait::ShardOperation;
use crate::shards::telemetry::LocalShardTelemetry;
/// ForwardProxyShard
///
/// ForwardProxyShard is a wrapper type for a LocalShard.
///
/// It can be used to provide all read and write operations while the wrapped shard is being transferred to another node.
/// Proxy forwards all operations to remote shards.
pub struct ForwardProxyShard {
    /// ID of the wrapped (local) shard. For a regular transfer this equals
    /// `remote_shard.id`; for a resharding transfer the two differ
    /// (see the `debug_assert!` in [`ForwardProxyShard::new`]).
    shard_id: ShardId,
    /// The local shard that serves all reads and applies updates locally
    /// before they are forwarded.
    pub(crate) wrapped_shard: LocalShard,
    /// The remote shard that indexes, points, and updates are forwarded to.
    pub(crate) remote_shard: RemoteShard,
    /// Set only for *resharding* transfers: used in `update` to split each
    /// operation and forward only the part that belongs to the remote shard.
    resharding_hash_ring: Option<HashRingRouter>,
    /// Lock required to protect transfer-in-progress updates.
    /// It should block data updating operations while the batch is being transferred.
    update_lock: Mutex<()>,
}
impl ForwardProxyShard {
    /// Construct a proxy around `wrapped_shard` that forwards operations to `remote_shard`.
    ///
    /// `resharding_hash_ring` must be `Some` if and only if this proxy is used for a
    /// *resharding* transfer (i.e. `shard_id != remote_shard.id`).
    pub fn new(
        shard_id: ShardId,
        wrapped_shard: LocalShard,
        remote_shard: RemoteShard,
        resharding_hash_ring: Option<HashRingRouter>,
    ) -> Self {
        // Validate that `ForwardProxyShard` initialized correctly:
        // regular transfer => same ids, no hashring; resharding => different ids, hashring present.
        debug_assert!({
            let is_regular = shard_id == remote_shard.id && resharding_hash_ring.is_none();
            let is_resharding = shard_id != remote_shard.id && resharding_hash_ring.is_some();
            is_regular || is_resharding
        });
        // In release builds (where `debug_assert!` is a no-op) surface the
        // inconsistent combination in the log instead of aborting.
        if shard_id == remote_shard.id && resharding_hash_ring.is_some() {
            log::warn!(
                "ForwardProxyShard initialized with resharding hashring, \
                but wrapped shard id and remote shard id are the same",
            );
        }
        Self {
            shard_id,
            wrapped_shard,
            remote_shard,
            resharding_hash_ring,
            update_lock: Mutex::new(()),
        }
    }
    /// Create payload indexes in the remote shard same as in the wrapped shard.
    ///
    /// # Cancel safety
    ///
    /// This method is cancel safe.
    pub async fn transfer_indexes(&self) -> CollectionResult<()> {
        // Block concurrent data updates while index definitions are replicated.
        let _update_lock = self.update_lock.lock().await;
        for (index_key, index_type) in self.wrapped_shard.info().await?.payload_schema {
            // TODO: Is cancelling `RemoteShard::update` safe for *receiver*?
            self.remote_shard
                .update(
                    // TODO: Assign clock tag!? 🤔
                    OperationWithClockTag::from(CollectionUpdateOperations::FieldIndexOperation(
                        FieldIndexOperations::CreateIndex(CreateIndex {
                            field_name: index_key,
                            field_schema: Some(index_type.try_into()?),
                        }),
                    )),
                    false,
                )
                .await?;
        }
        Ok(())
    }
    /// Move batch of points to the remote shard.
    /// Returns an offset of the next batch to be transferred.
    ///
    /// - `offset`: point ID to start scrolling from (`None` = from the beginning)
    /// - `batch_size`: number of points to transfer in this batch (must be > 0)
    /// - `hashring_filter`: if set, only points that hash to the remote shard are sent
    /// - `merge_points`: if `true`, upsert points on the receiver (merging transfers);
    ///   if `false`, sync-replace the `[offset, next_offset)` range on the receiver
    /// - `runtime_handle`: runtime used for the local scroll
    ///
    /// # Cancel safety
    ///
    /// This method is cancel safe.
    pub async fn transfer_batch(
        &self,
        offset: Option<PointIdType>,
        batch_size: usize,
        hashring_filter: Option<&HashRingRouter>,
        merge_points: bool,
        runtime_handle: &Handle,
    ) -> CollectionResult<Option<PointIdType>> {
        debug_assert!(batch_size > 0);
        // Fetch one extra point to detect whether another page follows.
        let limit = batch_size + 1;
        // Block concurrent data updates while this batch is in flight.
        let _update_lock = self.update_lock.lock().await;
        let mut batch = self
            .wrapped_shard
            .scroll_by(
                offset,
                limit,
                &WithPayloadInterface::Bool(true),
                &true.into(),
                None,
                runtime_handle,
                None,
                None, // no timeout
            )
            .await?;
        let next_page_offset = if batch.len() < limit {
            // This was the last page
            None
        } else {
            // remove extra point, it would be a first point of the next page
            Some(batch.pop().unwrap().id)
        };
        let points: Result<Vec<PointStructPersisted>, String> = batch
            .into_iter()
            // If using a hashring filter, only transfer points that moved, otherwise transfer all
            .filter(|point| {
                hashring_filter
                    .map(|hashring| hashring.is_in_shard(&point.id, self.remote_shard.id))
                    .unwrap_or(true)
            })
            .map(PointStructPersisted::try_from)
            .collect();
        let points = points?;
        // Use sync API to leverage potentially existing points
        // Normally use SyncPoints, to completely replace everything in the target shard
        // For resharding we need to merge points from multiple transfers, requiring a different operation
        let point_operation = if !merge_points {
            PointOperations::SyncPoints(PointSyncOperation {
                from_id: offset,
                to_id: next_page_offset,
                points,
            })
        } else {
            PointOperations::UpsertPoints(PointInsertOperationsInternal::PointsList(points))
        };
        let insert_points_operation = CollectionUpdateOperations::PointOperation(point_operation);
        // We only need to wait for the last batch.
        let wait = next_page_offset.is_none();
        // TODO: Is cancelling `RemoteShard::update` safe for *receiver*?
        self.remote_shard
            .update(OperationWithClockTag::from(insert_points_operation), wait) // TODO: Assign clock tag!? 🤔
            .await?;
        Ok(next_page_offset)
    }
    /// Dissolve the proxy, returning the wrapped local shard and the remote shard.
    pub fn deconstruct(self) -> (LocalShard, RemoteShard) {
        (self.wrapped_shard, self.remote_shard)
    }
    /// Forward `create_snapshot` to `wrapped_shard`
    pub async fn create_snapshot(
        &self,
        temp_path: &Path,
        tar: &tar_ext::BuilderExt,
        format: SnapshotFormat,
        save_wal: bool,
    ) -> CollectionResult<()> {
        self.wrapped_shard
            .create_snapshot(temp_path, tar, format, save_wal)
            .await
    }
    /// Forward `on_optimizer_config_update` to `wrapped_shard`
    pub async fn on_optimizer_config_update(&self) -> CollectionResult<()> {
        self.wrapped_shard.on_optimizer_config_update().await
    }
    /// Forward `trigger_optimizers` to `wrapped_shard`
    pub fn trigger_optimizers(&self) {
        self.wrapped_shard.trigger_optimizers();
    }
    /// Forward `get_telemetry_data` to `wrapped_shard`
    pub fn get_telemetry_data(&self, detail: TelemetryDetail) -> LocalShardTelemetry {
        self.wrapped_shard.get_telemetry_data(detail)
    }
    /// Expose the wrapped shard's update tracker.
    pub fn update_tracker(&self) -> &UpdateTracker {
        self.wrapped_shard.update_tracker()
    }
}
#[async_trait]
impl ShardOperation for ForwardProxyShard {
    /// Update `wrapped_shard` while keeping track of the changed points
    ///
    /// Applies the operation locally first (always waiting for the result), then
    /// forwards it — or, for resharding, the part of it that belongs to the remote
    /// shard — to `remote_shard`.
    ///
    /// # Cancel safety
    ///
    /// This method is *not* cancel safe.
    async fn update(
        &self,
        operation: OperationWithClockTag,
        _wait: bool,
    ) -> CollectionResult<UpdateResult> {
        // If we apply `local_shard` update, we *have to* execute `remote_shard` update to completion
        // (or we *might* introduce an inconsistency between shards?), so this method is not cancel
        // safe.
        let _update_lock = self.update_lock.lock().await;
        // Shard update is within a write lock scope, because we need a way to block the shard updates
        // during the transfer restart and finalization.
        // We always have to wait for the result of the update, cause after we release the lock,
        // the transfer needs to have access to the latest version of points.
        let mut result = self.wrapped_shard.update(operation.clone(), true).await?;
        let forward_operation = if let Some(ring) = &self.resharding_hash_ring {
            // If `ForwardProxyShard::resharding_hash_ring` is `Some`, we assume that proxy is used
            // during *resharding* shard transfer, which forwards points to a remote shard with
            // *different* shard ID.
            debug_assert_ne!(self.shard_id, self.remote_shard.id);
            // Only forward a *part* of the operation that belongs to remote shard.
            let op = match operation.operation.split_by_shard(ring) {
                OperationToShard::ToAll(op) => Some(op),
                OperationToShard::ByShard(by_shard) => by_shard
                    .into_iter()
                    .find(|&(shard_id, _)| shard_id == self.remote_shard.id)
                    .map(|(_, op)| op),
            };
            // Strip the clock tag from the operation, because clock tags are incompatible between
            // different shards.
            //
            // Even though we expect (and assert) that this whole branch is only executed when
            // forwarding to a *different* remote shard, we still handle the case when local and
            // remote shards are the same, *just in case*.
            //
            // In such case `split_by_shard` call above would be a no-op, and we can preserve the
            // clock tag.
            let tag = if self.shard_id != self.remote_shard.id {
                None
            } else {
                log::warn!(
                    "ForwardProxyShard contains resharding hashring, \
                    but wrapped shard id and remote shard id are the same",
                );
                operation.clock_tag
            };
            op.map(|op| OperationWithClockTag::new(op, tag))
        } else {
            // If `ForwardProxyShard::resharding_hash_ring` is `None`, we assume that proxy is used
            // during *regular* shard transfer, so operation can be forwarded as-is, without any
            // additional handling.
            debug_assert_eq!(self.shard_id, self.remote_shard.id);
            Some(operation)
        };
        // `forward_operation` is `None` only when the resharding split produced no work
        // for the remote shard; in that case the local result stands on its own.
        if let Some(operation) = forward_operation {
            let remote_result =
                self.remote_shard
                    .update(operation, false)
                    .await
                    .map_err(|err| {
                        CollectionError::forward_proxy_error(self.remote_shard.peer_id, err)
                    })?;
            // Merge `result` and `remote_result`:
            //
            // - Pick `clock_tag` with *newer* `clock_tick`
            //   (`Option` ordering makes `None` compare less than any `Some`,
            //   so a present remote tick beats an absent local one)
            let tick = result.clock_tag.map(|tag| tag.clock_tick);
            let remote_tick = remote_result.clock_tag.map(|tag| tag.clock_tick);
            if remote_tick > tick || tick.is_none() {
                result.clock_tag = remote_result.clock_tag;
            }
            // - If any node *rejected* the operation, propagate `UpdateStatus::ClockRejected`
            if remote_result.status == UpdateStatus::ClockRejected {
                result.status = UpdateStatus::ClockRejected;
            }
        }
        Ok(result)
    }
    /// Forward read-only `scroll_by` to `wrapped_shard`
    async fn scroll_by(
        &self,
        offset: Option<ExtendedPointId>,
        limit: usize,
        with_payload_interface: &WithPayloadInterface,
        with_vector: &WithVector,
        filter: Option<&Filter>,
        search_runtime_handle: &Handle,
        order_by: Option<&OrderBy>,
        timeout: Option<Duration>,
    ) -> CollectionResult<Vec<RecordInternal>> {
        let local_shard = &self.wrapped_shard;
        local_shard
            .scroll_by(
                offset,
                limit,
                with_payload_interface,
                with_vector,
                filter,
                search_runtime_handle,
                order_by,
                timeout,
            )
            .await
    }
    /// Forward read-only `info` to `wrapped_shard`
    async fn info(&self) -> CollectionResult<CollectionInfo> {
        let local_shard = &self.wrapped_shard;
        local_shard.info().await
    }
    /// Forward read-only `core_search` to `wrapped_shard`
    async fn core_search(
        &self,
        request: Arc<CoreSearchRequestBatch>,
        search_runtime_handle: &Handle,
        timeout: Option<Duration>,
        hw_measurement_acc: &HwMeasurementAcc,
    ) -> CollectionResult<Vec<Vec<ScoredPoint>>> {
        let local_shard = &self.wrapped_shard;
        local_shard
            .core_search(request, search_runtime_handle, timeout, hw_measurement_acc)
            .await
    }
    /// Forward read-only `count` to `wrapped_shard`
    async fn count(
        &self,
        request: Arc<CountRequestInternal>,
        search_runtime_handle: &Handle,
        timeout: Option<Duration>,
        hw_measurement_acc: &HwMeasurementAcc,
    ) -> CollectionResult<CountResult> {
        let local_shard = &self.wrapped_shard;
        local_shard
            .count(request, search_runtime_handle, timeout, hw_measurement_acc)
            .await
    }
    /// Forward read-only `retrieve` to `wrapped_shard`
    async fn retrieve(
        &self,
        request: Arc<PointRequestInternal>,
        with_payload: &WithPayload,
        with_vector: &WithVector,
        search_runtime_handle: &Handle,
        timeout: Option<Duration>,
    ) -> CollectionResult<Vec<RecordInternal>> {
        let local_shard = &self.wrapped_shard;
        local_shard
            .retrieve(
                request,
                with_payload,
                with_vector,
                search_runtime_handle,
                timeout,
            )
            .await
    }
    /// Forward read-only `query_batch` to `wrapped_shard`
    async fn query_batch(
        &self,
        requests: Arc<Vec<ShardQueryRequest>>,
        search_runtime_handle: &Handle,
        timeout: Option<Duration>,
        hw_measurement_acc: &HwMeasurementAcc,
    ) -> CollectionResult<Vec<ShardQueryResponse>> {
        let local_shard = &self.wrapped_shard;
        local_shard
            .query_batch(requests, search_runtime_handle, timeout, hw_measurement_acc)
            .await
    }
    /// Forward read-only `facet` to `wrapped_shard`
    async fn facet(
        &self,
        request: Arc<FacetParams>,
        search_runtime_handle: &Handle,
        timeout: Option<Duration>,
    ) -> CollectionResult<FacetResponse> {
        let local_shard = &self.wrapped_shard;
        local_shard
            .facet(request, search_runtime_handle, timeout)
            .await
    }
}