Spaces:

reztilop
/

colibri.qdrant

Build error

File size: 15,186 Bytes

84d2a97

use std::path::Path;
use std::sync::Arc;
use std::time::Duration;

use async_trait::async_trait;
use common::counter::hardware_accumulator::HwMeasurementAcc;
use common::tar_ext;
use common::types::TelemetryDetail;
use segment::data_types::facets::{FacetParams, FacetResponse};
use segment::data_types::order_by::OrderBy;
use segment::types::{
    ExtendedPointId, Filter, PointIdType, ScoredPoint, SnapshotFormat, WithPayload,
    WithPayloadInterface, WithVector,
};
use tokio::runtime::Handle;
use tokio::sync::Mutex;

use super::shard::ShardId;
use super::update_tracker::UpdateTracker;
use crate::hash_ring::HashRingRouter;
use crate::operations::point_ops::{
    PointInsertOperationsInternal, PointOperations, PointStructPersisted, PointSyncOperation,
};
use crate::operations::types::{
    CollectionError, CollectionInfo, CollectionResult, CoreSearchRequestBatch,
    CountRequestInternal, CountResult, PointRequestInternal, RecordInternal, UpdateResult,
    UpdateStatus,
};
use crate::operations::universal_query::shard_query::{ShardQueryRequest, ShardQueryResponse};
use crate::operations::{
    CollectionUpdateOperations, CreateIndex, FieldIndexOperations, OperationToShard,
    OperationWithClockTag, SplitByShard as _,
};
use crate::shards::local_shard::LocalShard;
use crate::shards::remote_shard::RemoteShard;
use crate::shards::shard_trait::ShardOperation;
use crate::shards::telemetry::LocalShardTelemetry;

/// ForwardProxyShard
///
/// ForwardProxyShard is a wrapper type for a LocalShard.
///
/// It can be used to provide all read and write operations while the wrapped shard is being
/// transferred to another node.
///
/// Read operations are answered by the wrapped local shard only. Update operations are applied
/// to the wrapped shard first and are then forwarded to the remote shard.
pub struct ForwardProxyShard {
    /// ID that is treated as the ID of the wrapped (local) shard.
    shard_id: ShardId,
    /// Local shard that serves reads and receives every update first.
    pub(crate) wrapped_shard: LocalShard,
    /// Remote shard that receives transferred batches, indexes and forwarded updates.
    pub(crate) remote_shard: RemoteShard,
    /// Hash ring used to split updates by shard before forwarding.
    ///
    /// When this is `Some`, the proxy assumes it is used during a *resharding* transfer (remote
    /// shard has a *different* shard ID) and forwards only the part of an update that the ring
    /// assigns to the remote shard. When `None`, updates are forwarded as-is.
    resharding_hash_ring: Option<HashRingRouter>,
    /// Lock required to protect transfer-in-progress updates.
    /// It should block data updating operations while the batch is being transferred.
    update_lock: Mutex<()>,
}

impl ForwardProxyShard {
    /// Wrap `wrapped_shard` into a proxy that forwards updates to `remote_shard`.
    ///
    /// Two configurations are expected:
    ///
    /// - regular transfer: `shard_id` equals the remote shard ID and no hash ring is provided
    /// - resharding transfer: `shard_id` differs from the remote shard ID and a hash ring is provided
    ///
    /// Any other combination fails a debug assertion. A hash ring combined with identical shard
    /// IDs is additionally reported with a warning.
    pub fn new(
        shard_id: ShardId,
        wrapped_shard: LocalShard,
        remote_shard: RemoteShard,
        resharding_hash_ring: Option<HashRingRouter>,
    ) -> Self {
        // Sanity-check the combination of shard IDs and hash ring (debug builds only)
        debug_assert!({
            let is_regular = shard_id == remote_shard.id && resharding_hash_ring.is_none();
            let is_resharding = shard_id != remote_shard.id && resharding_hash_ring.is_some();

            is_regular || is_resharding
        });

        // A hash ring only makes sense when forwarding to a *different* shard
        let ring_for_same_shard = resharding_hash_ring.is_some() && shard_id == remote_shard.id;

        if ring_for_same_shard {
            log::warn!(
                "ForwardProxyShard initialized with resharding hashring, \
                 but wrapped shard id and remote shard id are the same",
            );
        }

        let update_lock = Mutex::new(());

        Self {
            shard_id,
            wrapped_shard,
            remote_shard,
            resharding_hash_ring,
            update_lock,
        }
    }

    /// Replicate the payload indexes of the wrapped shard on the remote shard.
    ///
    /// Reads the payload schema from the wrapped shard's info and sends one `CreateIndex`
    /// operation per schema entry to the remote shard, stopping at the first error.
    /// The update lock is held for the whole duration of the call.
    ///
    /// # Cancel safety
    ///
    /// This method is cancel safe.
    pub async fn transfer_indexes(&self) -> CollectionResult<()> {
        let _update_lock = self.update_lock.lock().await;

        let payload_schema = self.wrapped_shard.info().await?.payload_schema;

        for (field_name, index_type) in payload_schema {
            let create_index = FieldIndexOperations::CreateIndex(CreateIndex {
                field_name,
                field_schema: Some(index_type.try_into()?),
            });

            // TODO: Assign clock tag!? 🤔
            let operation = OperationWithClockTag::from(
                CollectionUpdateOperations::FieldIndexOperation(create_index),
            );

            // TODO: Is cancelling `RemoteShard::update` safe for *receiver*?
            self.remote_shard.update(operation, false).await?;
        }

        Ok(())
    }

    /// Copy one page of points from the wrapped shard to the remote shard.
    ///
    /// Scrolls `batch_size + 1` points (with payload and vectors) starting at `offset`. The extra
    /// point, if present, is not transferred: its ID becomes the offset of the next page.
    ///
    /// If `hashring_filter` is provided, only points that the ring assigns to the remote shard
    /// are sent. If `merge_points` is `true` the points are upserted, otherwise they are sent
    /// as a sync operation covering the `offset..next offset` range.
    ///
    /// Returns an offset of the next batch to be transferred, or `None` after the last batch.
    ///
    /// # Cancel safety
    ///
    /// This method is cancel safe.
    pub async fn transfer_batch(
        &self,
        offset: Option<PointIdType>,
        batch_size: usize,
        hashring_filter: Option<&HashRingRouter>,
        merge_points: bool,
        runtime_handle: &Handle,
    ) -> CollectionResult<Option<PointIdType>> {
        debug_assert!(batch_size > 0);

        // One point more than requested, to learn whether (and where) a next page starts
        let scroll_limit = batch_size + 1;

        let _update_lock = self.update_lock.lock().await;

        let mut records = self
            .wrapped_shard
            .scroll_by(
                offset,
                scroll_limit,
                &WithPayloadInterface::Bool(true),
                &true.into(),
                None,
                runtime_handle,
                None,
                None, // no timeout
            )
            .await?;

        // A full page means there is more to transfer: the extra point is removed from this
        // batch and marks the beginning of the next one
        let has_next_page = records.len() >= scroll_limit;
        let next_page_offset = has_next_page.then(|| records.pop().unwrap().id);

        let mut points = Vec::new();

        for record in records {
            // If using a hashring filter, only transfer points that moved, otherwise transfer all
            let should_transfer = match hashring_filter {
                Some(hashring) => hashring.is_in_shard(&record.id, self.remote_shard.id),
                None => true,
            };

            if should_transfer {
                points.push(PointStructPersisted::try_from(record)?);
            }
        }

        // We only need to wait for the last batch.
        let wait = next_page_offset.is_none();

        // Use sync API to leverage potentially existing points
        // Normally use SyncPoints, to completely replace everything in the target shard
        // For resharding we need to merge points from multiple transfers, requiring a different operation
        let point_operation = if merge_points {
            PointOperations::UpsertPoints(PointInsertOperationsInternal::PointsList(points))
        } else {
            PointOperations::SyncPoints(PointSyncOperation {
                from_id: offset,
                to_id: next_page_offset,
                points,
            })
        };

        // TODO: Assign clock tag!? 🤔
        let operation = OperationWithClockTag::from(CollectionUpdateOperations::PointOperation(
            point_operation,
        ));

        // TODO: Is cancelling `RemoteShard::update` safe for *receiver*?
        self.remote_shard.update(operation, wait).await?;

        Ok(next_page_offset)
    }

    pub fn deconstruct(self) -> (LocalShard, RemoteShard) {
        (self.wrapped_shard, self.remote_shard)
    }

    /// Create a snapshot of the wrapped shard.
    ///
    /// All arguments are passed to `wrapped_shard` unchanged; the remote shard is not involved.
    pub async fn create_snapshot(
        &self,
        temp_path: &Path,
        tar: &tar_ext::BuilderExt,
        format: SnapshotFormat,
        save_wal: bool,
    ) -> CollectionResult<()> {
        let local_shard = &self.wrapped_shard;

        local_shard
            .create_snapshot(temp_path, tar, format, save_wal)
            .await
    }

    /// Forward `on_optimizer_config_update` to `wrapped_shard`
    pub async fn on_optimizer_config_update(&self) -> CollectionResult<()> {
        let local_shard = &self.wrapped_shard;
        local_shard.on_optimizer_config_update().await
    }

    /// Forward `trigger_optimizers` to `wrapped_shard`
    pub fn trigger_optimizers(&self) {
        let local_shard = &self.wrapped_shard;
        local_shard.trigger_optimizers();
    }

    /// Return telemetry of `wrapped_shard` at the requested level of `detail`
    pub fn get_telemetry_data(&self, detail: TelemetryDetail) -> LocalShardTelemetry {
        let local_shard = &self.wrapped_shard;
        local_shard.get_telemetry_data(detail)
    }

    /// Return the update tracker of `wrapped_shard`
    pub fn update_tracker(&self) -> &UpdateTracker {
        let local_shard = &self.wrapped_shard;
        local_shard.update_tracker()
    }
}

#[async_trait]
impl ShardOperation for ForwardProxyShard {
    /// Apply an update to `wrapped_shard` and forward it to `remote_shard`
    ///
    /// The operation is first applied to the wrapped shard, always waiting for the result; the
    /// `_wait` argument is ignored. The operation is then sent to the remote shard with the wait
    /// flag set to `false`.
    ///
    /// What is forwarded depends on `resharding_hash_ring`:
    ///
    /// - `None` (regular transfer): the operation is forwarded as-is, including its clock tag
    /// - `Some` (resharding transfer): the operation is split by the hash ring and only the part
    ///   that belongs to the remote shard is forwarded, without a clock tag; if no part belongs
    ///   to the remote shard, nothing is forwarded
    ///
    /// The returned value is the result of the local update, adjusted by the remote result:
    /// the clock tag with the newer clock tick wins, and `UpdateStatus::ClockRejected` reported
    /// by the remote shard is propagated.
    ///
    /// # Errors
    ///
    /// Errors of the local update are returned unchanged. Errors of the remote update are
    /// converted with `CollectionError::forward_proxy_error`.
    ///
    /// # Cancel safety
    ///
    /// This method is *not* cancel safe.
    async fn update(
        &self,
        operation: OperationWithClockTag,
        _wait: bool,
    ) -> CollectionResult<UpdateResult> {
        // If we apply `local_shard` update, we *have to* execute `remote_shard` update to completion
        // (or we *might* introduce an inconsistency between shards?), so this method is not cancel
        // safe.

        // Held until the end of the method, so transfer batches and updates never interleave
        let _update_lock = self.update_lock.lock().await;

        // Shard update is within a write lock scope, because we need a way to block the shard updates
        // during the transfer restart and finalization.

        // We always have to wait for the result of the update, cause after we release the lock,
        // the transfer needs to have access to the latest version of points.
        let mut result = self.wrapped_shard.update(operation.clone(), true).await?;

        // `None` means there is nothing to send to the remote shard for this operation
        let forward_operation = if let Some(ring) = &self.resharding_hash_ring {
            // If `ForwardProxyShard::resharding_hash_ring` is `Some`, we assume that proxy is used
            // during *resharding* shard transfer, which forwards points to a remote shard with
            // *different* shard ID.

            debug_assert_ne!(self.shard_id, self.remote_shard.id);

            // Only forward a *part* of the operation that belongs to remote shard.
            let op = match operation.operation.split_by_shard(ring) {
                // Operation targets every shard, so the remote shard gets it in full
                OperationToShard::ToAll(op) => Some(op),
                // Operation was split per shard: pick the remote shard's part, if there is one
                OperationToShard::ByShard(by_shard) => by_shard
                    .into_iter()
                    .find(|&(shard_id, _)| shard_id == self.remote_shard.id)
                    .map(|(_, op)| op),
            };

            // Strip the clock tag from the operation, because clock tags are incompatible between
            // different shards.
            //
            // Even though we expect (and assert) that this whole branch is only executed when
            // forwarding to a *different* remote shard, we still handle the case when local and
            // remote shards are the same, *just in case*.
            //
            // In such case `split_by_shard` call above would be a no-op, and we can preserve the
            // clock tag.
            let tag = if self.shard_id != self.remote_shard.id {
                None
            } else {
                log::warn!(
                    "ForwardProxyShard contains resharding hashring, \
                     but wrapped shard id and remote shard id are the same",
                );

                operation.clock_tag
            };

            op.map(|op| OperationWithClockTag::new(op, tag))
        } else {
            // If `ForwardProxyShard::resharding_hash_ring` is `None`, we assume that proxy is used
            // during *regular* shard transfer, so operation can be forwarded as-is, without any
            // additional handling.

            debug_assert_eq!(self.shard_id, self.remote_shard.id);

            Some(operation)
        };

        if let Some(operation) = forward_operation {
            // Remote failures are wrapped, so they can be told apart from local update errors
            let remote_result =
                self.remote_shard
                    .update(operation, false)
                    .await
                    .map_err(|err| {
                        CollectionError::forward_proxy_error(self.remote_shard.peer_id, err)
                    })?;

            // Merge `result` and `remote_result`:
            //
            // - Pick `clock_tag` with *newer* `clock_tick`
            let tick = result.clock_tag.map(|tag| tag.clock_tick);
            let remote_tick = remote_result.clock_tag.map(|tag| tag.clock_tick);

            // `None` orders before any `Some`, so a missing local tag is replaced by a remote one
            if remote_tick > tick || tick.is_none() {
                result.clock_tag = remote_result.clock_tag;
            }

            // - If any node *rejected* the operation, propagate `UpdateStatus::ClockRejected`
            if remote_result.status == UpdateStatus::ClockRejected {
                result.status = UpdateStatus::ClockRejected;
            }
        }

        Ok(result)
    }

    /// Forward read-only `scroll_by` to `wrapped_shard`
    ///
    /// All arguments are passed through unchanged; the remote shard is not queried.
    async fn scroll_by(
        &self,
        offset: Option<ExtendedPointId>,
        limit: usize,
        with_payload_interface: &WithPayloadInterface,
        with_vector: &WithVector,
        filter: Option<&Filter>,
        search_runtime_handle: &Handle,
        order_by: Option<&OrderBy>,
        timeout: Option<Duration>,
    ) -> CollectionResult<Vec<RecordInternal>> {
        self.wrapped_shard
            .scroll_by(
                offset,
                limit,
                with_payload_interface,
                with_vector,
                filter,
                search_runtime_handle,
                order_by,
                timeout,
            )
            .await
    }

    /// Forward read-only `info` to `wrapped_shard`
    async fn info(&self) -> CollectionResult<CollectionInfo> {
        self.wrapped_shard.info().await
    }
    /// Forward read-only `core_search` to `wrapped_shard`
    async fn core_search(
        &self,
        request: Arc<CoreSearchRequestBatch>,
        search_runtime_handle: &Handle,
        timeout: Option<Duration>,
        hw_measurement_acc: &HwMeasurementAcc,
    ) -> CollectionResult<Vec<Vec<ScoredPoint>>> {
        self.wrapped_shard
            .core_search(request, search_runtime_handle, timeout, hw_measurement_acc)
            .await
    }

    /// Forward read-only `count` to `wrapped_shard`
    async fn count(
        &self,
        request: Arc<CountRequestInternal>,
        search_runtime_handle: &Handle,
        timeout: Option<Duration>,
        hw_measurement_acc: &HwMeasurementAcc,
    ) -> CollectionResult<CountResult> {
        self.wrapped_shard
            .count(request, search_runtime_handle, timeout, hw_measurement_acc)
            .await
    }

    /// Forward read-only `retrieve` to `wrapped_shard`
    async fn retrieve(
        &self,
        request: Arc<PointRequestInternal>,
        with_payload: &WithPayload,
        with_vector: &WithVector,
        search_runtime_handle: &Handle,
        timeout: Option<Duration>,
    ) -> CollectionResult<Vec<RecordInternal>> {
        self.wrapped_shard
            .retrieve(
                request,
                with_payload,
                with_vector,
                search_runtime_handle,
                timeout,
            )
            .await
    }

    /// Forward read-only `query_batch` to `wrapped_shard`
    async fn query_batch(
        &self,
        requests: Arc<Vec<ShardQueryRequest>>,
        search_runtime_handle: &Handle,
        timeout: Option<Duration>,
        hw_measurement_acc: &HwMeasurementAcc,
    ) -> CollectionResult<Vec<ShardQueryResponse>> {
        self.wrapped_shard
            .query_batch(requests, search_runtime_handle, timeout, hw_measurement_acc)
            .await
    }

    /// Forward read-only `facet` to `wrapped_shard`
    async fn facet(
        &self,
        request: Arc<FacetParams>,
        search_runtime_handle: &Handle,
        timeout: Option<Duration>,
    ) -> CollectionResult<FacetResponse> {
        self.wrapped_shard
            .facet(request, search_runtime_handle, timeout)
            .await
    }
}