Spaces:

reztilop
/

colibri.qdrant

Build error

File size: 14,643 Bytes

84d2a97

use std::any::TypeId;
use std::collections::{HashMap, HashSet};
use std::sync::OnceLock;
use std::time::Duration;

use http::header::CONTENT_TYPE;
use http::{HeaderMap, HeaderValue, Method, Uri};
use issues::{Action, Code, ImmediateSolution, Issue, Solution};
use itertools::Itertools;
use strum::IntoEnumIterator as _;

use crate::common::operation_error::OperationError;
use crate::data_types::index::{TextIndexParams, TextIndexType, TokenizerType};
use crate::json_path::JsonPath;
use crate::types::{
    AnyVariants, Condition, FieldCondition, Filter, Match, MatchValue, PayloadFieldSchema,
    PayloadKeyType, PayloadSchemaParams, PayloadSchemaType, RangeInterface, UuidPayloadType,
};
#[derive(Debug)]
pub struct UnindexedField {
    field_name: JsonPath,
    field_schemas: HashSet<PayloadFieldSchema>,
    collection_name: String,
    endpoint: Uri,
    instance_id: String,
}

/// Don't use this directly, use `UnindexedField::slow_query_threshold()` instead
pub static SLOW_QUERY_THRESHOLD: OnceLock<Duration> = OnceLock::new();

impl UnindexedField {
    const DEFAULT_SLOW_QUERY_SECS: f32 = 1.2;

    pub fn slow_query_threshold() -> Duration {
        *SLOW_QUERY_THRESHOLD.get_or_init(|| Duration::from_secs_f32(Self::DEFAULT_SLOW_QUERY_SECS))
    }

    pub fn get_instance_id(collection_name: &str, field_name: &JsonPath) -> String {
        format!("{collection_name}/{field_name}")
    }

    pub fn get_collection_name(code: &Code) -> &str {
        debug_assert!(code.issue_type == TypeId::of::<Self>());
        code.instance_id.split('/').next().unwrap_or("") // Code format is always the same
    }

    /// Try to form an issue from a field condition and a collection name
    ///
    /// # Errors
    ///
    /// Will fail if the field condition cannot be used for inferring an appropriate schema.
    /// For example, when there is no index that can be built to improve performance.
    pub fn try_new(
        field_name: JsonPath,
        field_schemas: HashSet<PayloadFieldSchema>,
        collection_name: String,
    ) -> Result<Self, OperationError> {
        if field_schemas.is_empty() {
            return Err(OperationError::ValidationError {
                description: "Cannot create issue which won't have a solution".to_string(),
            });
        }

        let endpoint = match Uri::builder()
            .path_and_query(format!("/collections/{collection_name}/index").as_str())
            .build()
        {
            Ok(uri) => uri,
            Err(e) => {
                log::trace!("Failed to build uri: {e}");
                return Err(OperationError::ValidationError {
                    description: "Bad collection name".to_string(),
                });
            }
        };

        let instance_id = Self::get_instance_id(&collection_name, &field_name);

        Ok(Self {
            field_name,
            field_schemas,
            collection_name,
            endpoint,
            instance_id,
        })
    }

    pub fn submit_possible_suspects(
        filter: &Filter,
        payload_schema: &HashMap<PayloadKeyType, PayloadFieldSchema>,
        collection_name: String,
    ) {
        let unindexed_issues =
            IssueExtractor::new(filter, payload_schema, collection_name).into_issues();

        log::trace!("Found unindexed issues: {unindexed_issues:#?}");

        for issue in unindexed_issues {
            issue.submit();
        }
    }
}

impl Issue for UnindexedField {
    fn instance_id(&self) -> &str {
        &self.instance_id
    }

    fn name() -> &'static str {
        "UNINDEXED_FIELD"
    }

    fn description(&self) -> String {
        format!(
            "Unindexed field '{}' might be slowing queries down in collection '{}'",
            self.field_name, self.collection_name
        )
    }

    fn solution(&self) -> Solution {
        let mut solutions = self.field_schemas.iter().cloned().map(|field_schema| {
            let request_body = serde_json::json!({
                "field_name": self.field_name,
                "field_schema": field_schema,
            })
            .as_object()
            .unwrap()
            .clone();

            let headers = HeaderMap::from_iter([
                (CONTENT_TYPE, HeaderValue::from_static("application/json")),
            ]);

            ImmediateSolution {
                message: format!(
                    "Create an index on field '{}' of schema {} in collection '{}'. Check the documentation for more details: https://qdrant.tech/documentation/concepts/indexing/#payload-index",
                    self.field_name, serde_json::to_string(&field_schema).unwrap(), self.collection_name
                ),
                action: Action {
                    method: Method::PUT,
                    uri: self.endpoint.clone(),
                    headers,
                    body: Some(request_body),
                },
            }
        }).collect_vec();

        match solutions.len() {
            0 => unreachable!(
                "Cannot create a solution without a field schema, protected by try_new()"
            ),
            1 => Solution::Immediate(solutions.pop().unwrap()),
            _ => Solution::ImmediateChoice(solutions),
        }
    }
}

/// Suggest any index, let user choose depending on their data type
fn all_indexes() -> impl Iterator<Item = PayloadFieldSchema> {
    PayloadSchemaType::iter().map(PayloadFieldSchema::FieldType)
}

fn infer_schema_from_match_value(value: &MatchValue) -> Vec<PayloadFieldSchema> {
    match &value.value {
        crate::types::ValueVariants::String(string) => {
            let mut inferred = Vec::new();

            if UuidPayloadType::parse_str(string).is_ok() {
                inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Uuid))
            }

            inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Keyword));

            inferred
        }
        crate::types::ValueVariants::Integer(_integer) => {
            vec![PayloadFieldSchema::FieldType(PayloadSchemaType::Integer)]
        }
        crate::types::ValueVariants::Bool(_boolean) => {
            vec![PayloadFieldSchema::FieldType(PayloadSchemaType::Bool)]
        }
    }
}

fn infer_schema_from_any_variants(value: &AnyVariants) -> Vec<PayloadFieldSchema> {
    match value {
        AnyVariants::Strings(strings) => {
            let mut inferred = Vec::new();

            if strings
                .iter()
                .all(|s| UuidPayloadType::parse_str(s).is_ok())
            {
                inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Uuid))
            }

            inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Keyword));

            inferred
        }
        AnyVariants::Integers(_integers) => {
            vec![PayloadFieldSchema::FieldType(PayloadSchemaType::Integer)]
        }
    }
}

fn infer_schema_from_field_condition(field_condition: &FieldCondition) -> Vec<PayloadFieldSchema> {
    let FieldCondition {
        key: _key,
        r#match,
        range,
        geo_bounding_box,
        geo_radius,
        geo_polygon,
        values_count,
    } = field_condition;

    let mut inferred = Vec::new();

    if let Some(r#match) = r#match {
        inferred.extend(match r#match {
            Match::Value(match_value) => infer_schema_from_match_value(match_value),
            Match::Text(_match_text) => {
                vec![PayloadFieldSchema::FieldParams(PayloadSchemaParams::Text(
                    TextIndexParams {
                        r#type: TextIndexType::Text,
                        tokenizer: TokenizerType::default(),
                        min_token_len: None,
                        max_token_len: None,
                        lowercase: None,
                        on_disk: None,
                    },
                ))]
            }
            Match::Any(match_any) => infer_schema_from_any_variants(&match_any.any),
            Match::Except(match_except) => infer_schema_from_any_variants(&match_except.except),
        })
    }
    if let Some(range_interface) = range {
        match range_interface {
            RangeInterface::DateTime(_) => {
                inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Datetime));
            }
            RangeInterface::Float(_) => {
                inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Float));
                inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Integer));
            }
        }
    }
    if geo_bounding_box.is_some() || geo_radius.is_some() || geo_polygon.is_some() {
        inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Geo));
    }
    if values_count.is_some() {
        // Any index will do, let user choose depending on their data type
        inferred.extend(all_indexes());
    }

    inferred
}

pub struct IssueExtractor<'a> {
    extractor: Extractor<'a>,
    collection_name: String,
}

impl<'a> IssueExtractor<'a> {
    pub fn new(
        filter: &Filter,
        payload_schema: &'a HashMap<PayloadKeyType, PayloadFieldSchema>,
        collection_name: String,
    ) -> Self {
        let extractor = Extractor::new_eager(filter, payload_schema);

        Self {
            extractor,
            collection_name,
        }
    }

    fn into_issues(self) -> Vec<UnindexedField> {
        self.extractor
            .unindexed_schema
            .into_iter()
            .filter_map(|(key, field_schemas)| {
                let field_schemas: HashSet<_> = field_schemas
                    .iter()
                    .map(PayloadFieldSchema::kind)
                    .filter(|kind| {
                        let is_advanced = matches!(kind, PayloadSchemaType::Uuid);
                        !is_advanced
                    })
                    .map(PayloadFieldSchema::from)
                    .collect();

                UnindexedField::try_new(key, field_schemas, self.collection_name.clone()).ok()
            })
            .collect()
    }
}

pub struct Extractor<'a> {
    payload_schema: &'a HashMap<PayloadKeyType, PayloadFieldSchema>,
    unindexed_schema: HashMap<PayloadKeyType, Vec<PayloadFieldSchema>>,
}

impl<'a> Extractor<'a> {
    /// Creates an extractor and eagerly extracts all unindexed fields from the provided filter.
    fn new_eager(
        filter: &Filter,
        payload_schema: &'a HashMap<PayloadKeyType, PayloadFieldSchema>,
    ) -> Self {
        let mut extractor = Self {
            payload_schema,
            unindexed_schema: HashMap::new(),
        };

        extractor.update_from_filter(None, filter);

        extractor
    }

    /// Creates a new lazy 'Extractor'. It needs to call some update method to extract unindexed fields.
    pub fn new(payload_schema: &'a HashMap<PayloadKeyType, PayloadFieldSchema>) -> Self {
        Self {
            payload_schema,
            unindexed_schema: HashMap::new(),
        }
    }

    /// Current unindexed schema.
    pub fn unindexed_schema(&self) -> &HashMap<PayloadKeyType, Vec<PayloadFieldSchema>> {
        &self.unindexed_schema
    }

    /// Checks the filter for unindexed fields.
    fn update_from_filter(&mut self, nested_prefix: Option<&JsonPath>, filter: &Filter) {
        for condition in filter.iter_conditions() {
            self.update_from_condition(nested_prefix, condition);
        }
    }

    /// Checks the filter for an unindexed field, stops at the first one found.
    pub fn update_from_filter_once(&mut self, nested_prefix: Option<&JsonPath>, filter: &Filter) {
        for condition in filter.iter_conditions() {
            self.update_from_condition(nested_prefix, condition);
            if !self.unindexed_schema.is_empty() {
                break;
            }
        }
    }

    fn update_from_condition(&mut self, nested_prefix: Option<&JsonPath>, condition: &Condition) {
        let key;
        let inferred;

        match condition {
            Condition::Field(field_condition) => {
                key = &field_condition.key;
                inferred = infer_schema_from_field_condition(field_condition);
            }
            Condition::Filter(filter) => {
                self.update_from_filter(nested_prefix, filter);
                return;
            }
            Condition::Nested(nested) => {
                self.update_from_filter(
                    Some(&JsonPath::extend_or_new(nested_prefix, nested.raw_key())),
                    nested.filter(),
                );
                return;
            }
            // Any index will suffice
            Condition::IsEmpty(is_empty) => {
                key = &is_empty.is_empty.key;
                inferred = all_indexes().collect();
            }
            Condition::IsNull(is_null) => {
                key = &is_null.is_null.key;
                inferred = all_indexes().collect();
            }
            // No index needed
            Condition::HasId(_) => return,
            Condition::CustomIdChecker(_) => return,
            Condition::HasVector(_) => return,
        };

        let full_key = JsonPath::extend_or_new(nested_prefix, key);

        let needs_index = match self.payload_schema.get(&full_key) {
            Some(index_info) => {
                let index_info_kind = index_info.kind();

                let already_indexed = inferred
                    .iter()
                    // TODO(strict-mode):
                    // Use better comparisons for parametrized indexes. An idea is to make the inferring step
                    // also output valid parametrized indexes and compare those instead of just the kind (index type)
                    //
                    // The only reason why it would be needed is because integer index can be parametrized
                    // with just lookup or just range, so it is possible to make a false negative here. E.g.
                    //
                    // condition: MatchValue
                    // inferred: FieldType(Integer)
                    // index_info: FieldParams(IntegerIndex(range))
                    //
                    // In this case, we would assume that the field is indexed correctly when it is not
                    .map(PayloadFieldSchema::kind)
                    .any(|inferred| inferred == index_info_kind);

                !already_indexed
            }
            None => true,
        };

        if needs_index {
            self.unindexed_schema
                .entry(full_key)
                .or_default()
                .extend(inferred);
        }
    }
}