use std::any::TypeId; use std::collections::{HashMap, HashSet}; use std::sync::OnceLock; use std::time::Duration; use http::header::CONTENT_TYPE; use http::{HeaderMap, HeaderValue, Method, Uri}; use issues::{Action, Code, ImmediateSolution, Issue, Solution}; use itertools::Itertools; use strum::IntoEnumIterator as _; use crate::common::operation_error::OperationError; use crate::data_types::index::{TextIndexParams, TextIndexType, TokenizerType}; use crate::json_path::JsonPath; use crate::types::{ AnyVariants, Condition, FieldCondition, Filter, Match, MatchValue, PayloadFieldSchema, PayloadKeyType, PayloadSchemaParams, PayloadSchemaType, RangeInterface, UuidPayloadType, }; #[derive(Debug)] pub struct UnindexedField { field_name: JsonPath, field_schemas: HashSet, collection_name: String, endpoint: Uri, instance_id: String, } /// Don't use this directly, use `UnindexedField::slow_query_threshold()` instead pub static SLOW_QUERY_THRESHOLD: OnceLock = OnceLock::new(); impl UnindexedField { const DEFAULT_SLOW_QUERY_SECS: f32 = 1.2; pub fn slow_query_threshold() -> Duration { *SLOW_QUERY_THRESHOLD.get_or_init(|| Duration::from_secs_f32(Self::DEFAULT_SLOW_QUERY_SECS)) } pub fn get_instance_id(collection_name: &str, field_name: &JsonPath) -> String { format!("{collection_name}/{field_name}") } pub fn get_collection_name(code: &Code) -> &str { debug_assert!(code.issue_type == TypeId::of::()); code.instance_id.split('/').next().unwrap_or("") // Code format is always the same } /// Try to form an issue from a field condition and a collection name /// /// # Errors /// /// Will fail if the field condition cannot be used for inferring an appropriate schema. /// For example, when there is no index that can be built to improve performance. pub fn try_new( field_name: JsonPath, field_schemas: HashSet, collection_name: String, ) -> Result { if field_schemas.is_empty() { return Err(OperationError::ValidationError { description: "Cannot create issue which won't have a solution".to_string(), }); } let endpoint = match Uri::builder() .path_and_query(format!("/collections/{collection_name}/index").as_str()) .build() { Ok(uri) => uri, Err(e) => { log::trace!("Failed to build uri: {e}"); return Err(OperationError::ValidationError { description: "Bad collection name".to_string(), }); } }; let instance_id = Self::get_instance_id(&collection_name, &field_name); Ok(Self { field_name, field_schemas, collection_name, endpoint, instance_id, }) } pub fn submit_possible_suspects( filter: &Filter, payload_schema: &HashMap, collection_name: String, ) { let unindexed_issues = IssueExtractor::new(filter, payload_schema, collection_name).into_issues(); log::trace!("Found unindexed issues: {unindexed_issues:#?}"); for issue in unindexed_issues { issue.submit(); } } } impl Issue for UnindexedField { fn instance_id(&self) -> &str { &self.instance_id } fn name() -> &'static str { "UNINDEXED_FIELD" } fn description(&self) -> String { format!( "Unindexed field '{}' might be slowing queries down in collection '{}'", self.field_name, self.collection_name ) } fn solution(&self) -> Solution { let mut solutions = self.field_schemas.iter().cloned().map(|field_schema| { let request_body = serde_json::json!({ "field_name": self.field_name, "field_schema": field_schema, }) .as_object() .unwrap() .clone(); let headers = HeaderMap::from_iter([ (CONTENT_TYPE, HeaderValue::from_static("application/json")), ]); ImmediateSolution { message: format!( "Create an index on field '{}' of schema {} in collection '{}'. Check the documentation for more details: https://qdrant.tech/documentation/concepts/indexing/#payload-index", self.field_name, serde_json::to_string(&field_schema).unwrap(), self.collection_name ), action: Action { method: Method::PUT, uri: self.endpoint.clone(), headers, body: Some(request_body), }, } }).collect_vec(); match solutions.len() { 0 => unreachable!( "Cannot create a solution without a field schema, protected by try_new()" ), 1 => Solution::Immediate(solutions.pop().unwrap()), _ => Solution::ImmediateChoice(solutions), } } } /// Suggest any index, let user choose depending on their data type fn all_indexes() -> impl Iterator { PayloadSchemaType::iter().map(PayloadFieldSchema::FieldType) } fn infer_schema_from_match_value(value: &MatchValue) -> Vec { match &value.value { crate::types::ValueVariants::String(string) => { let mut inferred = Vec::new(); if UuidPayloadType::parse_str(string).is_ok() { inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Uuid)) } inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Keyword)); inferred } crate::types::ValueVariants::Integer(_integer) => { vec![PayloadFieldSchema::FieldType(PayloadSchemaType::Integer)] } crate::types::ValueVariants::Bool(_boolean) => { vec![PayloadFieldSchema::FieldType(PayloadSchemaType::Bool)] } } } fn infer_schema_from_any_variants(value: &AnyVariants) -> Vec { match value { AnyVariants::Strings(strings) => { let mut inferred = Vec::new(); if strings .iter() .all(|s| UuidPayloadType::parse_str(s).is_ok()) { inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Uuid)) } inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Keyword)); inferred } AnyVariants::Integers(_integers) => { vec![PayloadFieldSchema::FieldType(PayloadSchemaType::Integer)] } } } fn infer_schema_from_field_condition(field_condition: &FieldCondition) -> Vec { let FieldCondition { key: _key, r#match, range, geo_bounding_box, geo_radius, geo_polygon, values_count, } = field_condition; let mut inferred = Vec::new(); if let Some(r#match) = r#match { inferred.extend(match r#match { Match::Value(match_value) => infer_schema_from_match_value(match_value), Match::Text(_match_text) => { vec![PayloadFieldSchema::FieldParams(PayloadSchemaParams::Text( TextIndexParams { r#type: TextIndexType::Text, tokenizer: TokenizerType::default(), min_token_len: None, max_token_len: None, lowercase: None, on_disk: None, }, ))] } Match::Any(match_any) => infer_schema_from_any_variants(&match_any.any), Match::Except(match_except) => infer_schema_from_any_variants(&match_except.except), }) } if let Some(range_interface) = range { match range_interface { RangeInterface::DateTime(_) => { inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Datetime)); } RangeInterface::Float(_) => { inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Float)); inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Integer)); } } } if geo_bounding_box.is_some() || geo_radius.is_some() || geo_polygon.is_some() { inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Geo)); } if values_count.is_some() { // Any index will do, let user choose depending on their data type inferred.extend(all_indexes()); } inferred } pub struct IssueExtractor<'a> { extractor: Extractor<'a>, collection_name: String, } impl<'a> IssueExtractor<'a> { pub fn new( filter: &Filter, payload_schema: &'a HashMap, collection_name: String, ) -> Self { let extractor = Extractor::new_eager(filter, payload_schema); Self { extractor, collection_name, } } fn into_issues(self) -> Vec { self.extractor .unindexed_schema .into_iter() .filter_map(|(key, field_schemas)| { let field_schemas: HashSet<_> = field_schemas .iter() .map(PayloadFieldSchema::kind) .filter(|kind| { let is_advanced = matches!(kind, PayloadSchemaType::Uuid); !is_advanced }) .map(PayloadFieldSchema::from) .collect(); UnindexedField::try_new(key, field_schemas, self.collection_name.clone()).ok() }) .collect() } } pub struct Extractor<'a> { payload_schema: &'a HashMap, unindexed_schema: HashMap>, } impl<'a> Extractor<'a> { /// Creates an extractor and eagerly extracts all unindexed fields from the provided filter. fn new_eager( filter: &Filter, payload_schema: &'a HashMap, ) -> Self { let mut extractor = Self { payload_schema, unindexed_schema: HashMap::new(), }; extractor.update_from_filter(None, filter); extractor } /// Creates a new lazy 'Extractor'. It needs to call some update method to extract unindexed fields. pub fn new(payload_schema: &'a HashMap) -> Self { Self { payload_schema, unindexed_schema: HashMap::new(), } } /// Current unindexed schema. pub fn unindexed_schema(&self) -> &HashMap> { &self.unindexed_schema } /// Checks the filter for unindexed fields. fn update_from_filter(&mut self, nested_prefix: Option<&JsonPath>, filter: &Filter) { for condition in filter.iter_conditions() { self.update_from_condition(nested_prefix, condition); } } /// Checks the filter for an unindexed field, stops at the first one found. pub fn update_from_filter_once(&mut self, nested_prefix: Option<&JsonPath>, filter: &Filter) { for condition in filter.iter_conditions() { self.update_from_condition(nested_prefix, condition); if !self.unindexed_schema.is_empty() { break; } } } fn update_from_condition(&mut self, nested_prefix: Option<&JsonPath>, condition: &Condition) { let key; let inferred; match condition { Condition::Field(field_condition) => { key = &field_condition.key; inferred = infer_schema_from_field_condition(field_condition); } Condition::Filter(filter) => { self.update_from_filter(nested_prefix, filter); return; } Condition::Nested(nested) => { self.update_from_filter( Some(&JsonPath::extend_or_new(nested_prefix, nested.raw_key())), nested.filter(), ); return; } // Any index will suffice Condition::IsEmpty(is_empty) => { key = &is_empty.is_empty.key; inferred = all_indexes().collect(); } Condition::IsNull(is_null) => { key = &is_null.is_null.key; inferred = all_indexes().collect(); } // No index needed Condition::HasId(_) => return, Condition::CustomIdChecker(_) => return, Condition::HasVector(_) => return, }; let full_key = JsonPath::extend_or_new(nested_prefix, key); let needs_index = match self.payload_schema.get(&full_key) { Some(index_info) => { let index_info_kind = index_info.kind(); let already_indexed = inferred .iter() // TODO(strict-mode): // Use better comparisons for parametrized indexes. An idea is to make the inferring step // also output valid parametrized indexes and compare those instead of just the kind (index type) // // The only reason why it would be needed is because integer index can be parametrized // with just lookup or just range, so it is possible to make a false negative here. E.g. // // condition: MatchValue // inferred: FieldType(Integer) // index_info: FieldParams(IntegerIndex(range)) // // In this case, we would assume that the field is indexed correctly when it is not .map(PayloadFieldSchema::kind) .any(|inferred| inferred == index_info_kind); !already_indexed } None => true, }; if needs_index { self.unindexed_schema .entry(full_key) .or_default() .extend(inferred); } } }