Spaces:

reztilop
/

colibri.qdrant

Build error

colibri.qdrant / lib /segment /src /problems /unindexed_field.rs

Gouzi Mohaled

Ajout du dossier lib

84d2a97 7 months ago

14.6 kB

	use std::any::TypeId;
	use std::collections::{HashMap, HashSet};
	use std::sync::OnceLock;
	use std::time::Duration;

	use http::header::CONTENT_TYPE;
	use http::{HeaderMap, HeaderValue, Method, Uri};
	use issues::{Action, Code, ImmediateSolution, Issue, Solution};
	use itertools::Itertools;
	use strum::IntoEnumIterator as _;

	use crate::common::operation_error::OperationError;
	use crate::data_types::index::{TextIndexParams, TextIndexType, TokenizerType};
	use crate::json_path::JsonPath;
	use crate::types::{
	AnyVariants, Condition, FieldCondition, Filter, Match, MatchValue, PayloadFieldSchema,
	PayloadKeyType, PayloadSchemaParams, PayloadSchemaType, RangeInterface, UuidPayloadType,
	};
	/// Issue reporting that a payload field used in a filter has no suitable index.
	///
	/// Instances are created through `UnindexedField::try_new`, which guarantees that
	/// `field_schemas` is non-empty and that `endpoint` is a valid URI.
	#[derive(Debug)]
	pub struct UnindexedField {
	/// Path of the payload field that lacks an index.
	field_name: JsonPath,
	/// Candidate index schemas; each one is offered as a possible solution.
	field_schemas: HashSet<PayloadFieldSchema>,
	/// Name of the collection the field belongs to.
	collection_name: String,
	/// Target of the suggested index-creation request (`/collections/{collection_name}/index`).
	endpoint: Uri,
	/// Identifier of this issue instance, formatted as `{collection_name}/{field_name}`.
	instance_id: String,
	}

	/// Set-once storage for the slow query threshold.
	///
	/// Don't use this directly, use `UnindexedField::slow_query_threshold()` instead.
	/// If nothing has been stored here by the time that accessor is first called,
	/// it initializes this cell with the default of 1.2 seconds.
	pub static SLOW_QUERY_THRESHOLD: OnceLock<Duration> = OnceLock::new();

	impl UnindexedField {
	    /// Threshold, in seconds, used when `SLOW_QUERY_THRESHOLD` has not been set explicitly.
	    const DEFAULT_SLOW_QUERY_SECS: f32 = 1.2;

	    /// Returns the slow query threshold.
	    ///
	    /// The first call initializes `SLOW_QUERY_THRESHOLD` with the default value
	    /// if nothing has been stored in it yet; later calls return the stored value.
	    pub fn slow_query_threshold() -> Duration {
	        let threshold = SLOW_QUERY_THRESHOLD
	            .get_or_init(|| Duration::from_secs_f32(Self::DEFAULT_SLOW_QUERY_SECS));
	        *threshold
	    }

	    /// Builds the instance id of an issue: `{collection_name}/{field_name}`.
	    pub fn get_instance_id(collection_name: &str, field_name: &JsonPath) -> String {
	        format!("{collection_name}/{field_name}")
	    }

	    /// Extracts the collection name from the code of an `UnindexedField` issue.
	    ///
	    /// The collection name is everything before the first `/` of the instance id,
	    /// mirroring the format produced by `get_instance_id`.
	    pub fn get_collection_name(code: &Code) -> &str {
	        debug_assert!(code.issue_type == TypeId::of::<Self>());

	        // Code format is always the same, so the first segment is the collection name
	        let mut segments = code.instance_id.split('/');
	        segments.next().unwrap_or("")
	    }

	    /// Try to form an issue from a field name, its candidate index schemas and a collection name
	    ///
	    /// # Errors
	    ///
	    /// Returns `OperationError::ValidationError` when:
	    /// - `field_schemas` is empty, meaning there is no index that could be suggested
	    ///   to improve performance, or
	    /// - the index endpoint URI cannot be built from `collection_name`.
	    pub fn try_new(
	        field_name: JsonPath,
	        field_schemas: HashSet<PayloadFieldSchema>,
	        collection_name: String,
	    ) -> Result<Self, OperationError> {
	        if field_schemas.is_empty() {
	            return Err(OperationError::ValidationError {
	                description: "Cannot create issue which won't have a solution".to_string(),
	            });
	        }

	        let index_path = format!("/collections/{collection_name}/index");
	        let endpoint = Uri::builder()
	            .path_and_query(index_path.as_str())
	            .build()
	            .map_err(|e| {
	                log::trace!("Failed to build uri: {e}");
	                OperationError::ValidationError {
	                    description: "Bad collection name".to_string(),
	                }
	            })?;

	        let instance_id = Self::get_instance_id(&collection_name, &field_name);

	        Ok(Self {
	            field_name,
	            field_schemas,
	            collection_name,
	            endpoint,
	            instance_id,
	        })
	    }

	    /// Extracts every unindexed field referenced by `filter` and submits one issue per field.
	    ///
	    /// `payload_schema` describes the indexes that already exist; fields for which no
	    /// issue can be formed are skipped.
	    pub fn submit_possible_suspects(
	        filter: &Filter,
	        payload_schema: &HashMap<PayloadKeyType, PayloadFieldSchema>,
	        collection_name: String,
	    ) {
	        let issue_extractor = IssueExtractor::new(filter, payload_schema, collection_name);
	        let unindexed_issues = issue_extractor.into_issues();

	        log::trace!("Found unindexed issues: {unindexed_issues:#?}");

	        unindexed_issues.into_iter().for_each(|issue| {
	            issue.submit();
	        });
	    }
	}

	impl Issue for UnindexedField {
	fn instance_id(&self) -> &str {
	&self.instance_id
	}

	fn name() -> &'static str {
	"UNINDEXED_FIELD"
	}

	fn description(&self) -> String {
	format!(
	"Unindexed field '{}' might be slowing queries down in collection '{}'",
	self.field_name, self.collection_name
	)
	}

	fn solution(&self) -> Solution {
	let mut solutions = self.field_schemas.iter().cloned().map(\|field_schema\| {
	let request_body = serde_json::json!({
	"field_name": self.field_name,
	"field_schema": field_schema,
	})
	.as_object()
	.unwrap()
	.clone();

	let headers = HeaderMap::from_iter([
	(CONTENT_TYPE, HeaderValue::from_static("application/json")),
	]);

	ImmediateSolution {
	message: format!(
	"Create an index on field '{}' of schema {} in collection '{}'. Check the documentation for more details: https://qdrant.tech/documentation/concepts/indexing/#payload-index",
	self.field_name, serde_json::to_string(&field_schema).unwrap(), self.collection_name
	),
	action: Action {
	method: Method::PUT,
	uri: self.endpoint.clone(),
	headers,
	body: Some(request_body),
	},
	}
	}).collect_vec();

	match solutions.len() {
	0 => unreachable!(
	"Cannot create a solution without a field schema, protected by try_new()"
	),
	1 => Solution::Immediate(solutions.pop().unwrap()),
	_ => Solution::ImmediateChoice(solutions),
	}
	}
	}

	/// Suggest every index type, so the user can choose depending on their data type.
	///
	/// Yields one `PayloadFieldSchema::FieldType` per variant of `PayloadSchemaType`.
	fn all_indexes() -> impl Iterator<Item = PayloadFieldSchema> {
	    let schema_types = PayloadSchemaType::iter();
	    schema_types.map(PayloadFieldSchema::FieldType)
	}

	fn infer_schema_from_match_value(value: &MatchValue) -> Vec<PayloadFieldSchema> {
	match &value.value {
	crate::types::ValueVariants::String(string) => {
	let mut inferred = Vec::new();

	if UuidPayloadType::parse_str(string).is_ok() {
	inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Uuid))
	}

	inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Keyword));

	inferred
	}
	crate::types::ValueVariants::Integer(_integer) => {
	vec![PayloadFieldSchema::FieldType(PayloadSchemaType::Integer)]
	}
	crate::types::ValueVariants::Bool(_boolean) => {
	vec![PayloadFieldSchema::FieldType(PayloadSchemaType::Bool)]
	}
	}
	}

	/// Infers the index schemas able to serve a match against a list of values.
	///
	/// - Integers suggest an integer index.
	/// - Strings suggest a keyword index, preceded by a UUID index when every string
	///   parses as a UUID (an empty collection of strings satisfies this too).
	fn infer_schema_from_any_variants(value: &AnyVariants) -> Vec<PayloadFieldSchema> {
	    let strings = match value {
	        AnyVariants::Integers(_) => {
	            return vec![PayloadFieldSchema::FieldType(PayloadSchemaType::Integer)];
	        }
	        AnyVariants::Strings(strings) => strings,
	    };

	    let every_string_is_uuid = strings
	        .iter()
	        .all(|s| UuidPayloadType::parse_str(s).is_ok());

	    let keyword = PayloadFieldSchema::FieldType(PayloadSchemaType::Keyword);

	    if every_string_is_uuid {
	        let uuid = PayloadFieldSchema::FieldType(PayloadSchemaType::Uuid);
	        vec![uuid, keyword]
	    } else {
	        vec![keyword]
	    }
	}

	fn infer_schema_from_field_condition(field_condition: &FieldCondition) -> Vec<PayloadFieldSchema> {
	let FieldCondition {
	key: _key,
	r#match,
	range,
	geo_bounding_box,
	geo_radius,
	geo_polygon,
	values_count,
	} = field_condition;

	let mut inferred = Vec::new();

	if let Some(r#match) = r#match {
	inferred.extend(match r#match {
	Match::Value(match_value) => infer_schema_from_match_value(match_value),
	Match::Text(_match_text) => {
	vec![PayloadFieldSchema::FieldParams(PayloadSchemaParams::Text(
	TextIndexParams {
	r#type: TextIndexType::Text,
	tokenizer: TokenizerType::default(),
	min_token_len: None,
	max_token_len: None,
	lowercase: None,
	on_disk: None,
	},
	))]
	}
	Match::Any(match_any) => infer_schema_from_any_variants(&match_any.any),
	Match::Except(match_except) => infer_schema_from_any_variants(&match_except.except),
	})
	}
	if let Some(range_interface) = range {
	match range_interface {
	RangeInterface::DateTime(_) => {
	inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Datetime));
	}
	RangeInterface::Float(_) => {
	inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Float));
	inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Integer));
	}
	}
	}
	if geo_bounding_box.is_some() \|\| geo_radius.is_some() \|\| geo_polygon.is_some() {
	inferred.push(PayloadFieldSchema::FieldType(PayloadSchemaType::Geo));
	}
	if values_count.is_some() {
	// Any index will do, let user choose depending on their data type
	inferred.extend(all_indexes());
	}

	inferred
	}

	/// Turns the unindexed fields found in a filter into `UnindexedField` issues
	/// for a given collection.
	pub struct IssueExtractor<'a> {
	/// Extractor holding the unindexed fields and their inferred schemas.
	extractor: Extractor<'a>,
	/// Name of the collection the resulting issues are attributed to.
	collection_name: String,
	}

	impl<'a> IssueExtractor<'a> {
	    /// Creates an issue extractor, eagerly scanning `filter` for fields that are
	    /// not covered by `payload_schema`.
	    pub fn new(
	        filter: &Filter,
	        payload_schema: &'a HashMap<PayloadKeyType, PayloadFieldSchema>,
	        collection_name: String,
	    ) -> Self {
	        Self {
	            extractor: Extractor::new_eager(filter, payload_schema),
	            collection_name,
	        }
	    }

	    /// Converts every unindexed field into an issue.
	    ///
	    /// Inferred schemas are reduced to their kind and deduplicated; the UUID kind is
	    /// treated as advanced and never suggested. Fields for which `UnindexedField::try_new`
	    /// fails (e.g. no suggestion is left) are dropped.
	    fn into_issues(self) -> Vec<UnindexedField> {
	        let Self {
	            extractor,
	            collection_name,
	        } = self;

	        let mut issues = Vec::new();

	        for (key, inferred_schemas) in extractor.unindexed_schema {
	            let field_schemas: HashSet<_> = inferred_schemas
	                .iter()
	                .map(PayloadFieldSchema::kind)
	                .filter(|kind| !matches!(kind, PayloadSchemaType::Uuid))
	                .map(PayloadFieldSchema::from)
	                .collect();

	            if let Ok(issue) = UnindexedField::try_new(key, field_schemas, collection_name.clone())
	            {
	                issues.push(issue);
	            }
	        }

	        issues
	    }
	}

	/// Collects the payload fields used by filter conditions that have no matching index.
	pub struct Extractor<'a> {
	/// Existing payload indexes, keyed by field path.
	payload_schema: &'a HashMap<PayloadKeyType, PayloadFieldSchema>,
	/// Unindexed fields found so far, each with the schemas inferred from the conditions using it.
	unindexed_schema: HashMap<PayloadKeyType, Vec<PayloadFieldSchema>>,
	}

	impl<'a> Extractor<'a> {
	/// Creates an extractor and eagerly extracts all unindexed fields from the provided filter.
	fn new_eager(
	filter: &Filter,
	payload_schema: &'a HashMap<PayloadKeyType, PayloadFieldSchema>,
	) -> Self {
	let mut extractor = Self {
	payload_schema,
	unindexed_schema: HashMap::new(),
	};

	extractor.update_from_filter(None, filter);

	extractor
	}

	/// Creates a new lazy 'Extractor'. It needs to call some update method to extract unindexed fields.
	pub fn new(payload_schema: &'a HashMap<PayloadKeyType, PayloadFieldSchema>) -> Self {
	Self {
	payload_schema,
	unindexed_schema: HashMap::new(),
	}
	}

	/// Current unindexed schema.
	pub fn unindexed_schema(&self) -> &HashMap<PayloadKeyType, Vec<PayloadFieldSchema>> {
	&self.unindexed_schema
	}

	/// Checks the filter for unindexed fields.
	fn update_from_filter(&mut self, nested_prefix: Option<&JsonPath>, filter: &Filter) {
	for condition in filter.iter_conditions() {
	self.update_from_condition(nested_prefix, condition);
	}
	}

	/// Checks the filter for an unindexed field, stops at the first one found.
	pub fn update_from_filter_once(&mut self, nested_prefix: Option<&JsonPath>, filter: &Filter) {
	for condition in filter.iter_conditions() {
	self.update_from_condition(nested_prefix, condition);
	if !self.unindexed_schema.is_empty() {
	break;
	}
	}
	}

	fn update_from_condition(&mut self, nested_prefix: Option<&JsonPath>, condition: &Condition) {
	let key;
	let inferred;

	match condition {
	Condition::Field(field_condition) => {
	key = &field_condition.key;
	inferred = infer_schema_from_field_condition(field_condition);
	}
	Condition::Filter(filter) => {
	self.update_from_filter(nested_prefix, filter);
	return;
	}
	Condition::Nested(nested) => {
	self.update_from_filter(
	Some(&JsonPath::extend_or_new(nested_prefix, nested.raw_key())),
	nested.filter(),
	);
	return;
	}
	// Any index will suffice
	Condition::IsEmpty(is_empty) => {
	key = &is_empty.is_empty.key;
	inferred = all_indexes().collect();
	}
	Condition::IsNull(is_null) => {
	key = &is_null.is_null.key;
	inferred = all_indexes().collect();
	}
	// No index needed
	Condition::HasId(_) => return,
	Condition::CustomIdChecker(_) => return,
	Condition::HasVector(_) => return,
	};

	let full_key = JsonPath::extend_or_new(nested_prefix, key);

	let needs_index = match self.payload_schema.get(&full_key) {
	Some(index_info) => {
	let index_info_kind = index_info.kind();

	let already_indexed = inferred
	.iter()
	// TODO(strict-mode):
	// Use better comparisons for parametrized indexes. An idea is to make the inferring step
	// also output valid parametrized indexes and compare those instead of just the kind (index type)
	//
	// The only reason why it would be needed is because integer index can be parametrized
	// with just lookup or just range, so it is possible to make a false negative here. E.g.
	//
	// condition: MatchValue
	// inferred: FieldType(Integer)
	// index_info: FieldParams(IntegerIndex(range))
	//
	// In this case, we would assume that the field is indexed correctly when it is not
	.map(PayloadFieldSchema::kind)
	.any(\|inferred\| inferred == index_info_kind);

	!already_indexed
	}
	None => true,
	};

	if needs_index {
	self.unindexed_schema
	.entry(full_key)
	.or_default()
	.extend(inferred);
	}
	}
	}