Spaces:
Build error
Build error
//! Contains functions for estimating of how many points should be processed for a given filter query | |
//! | |
//! Filter query is used e.g. for determining how would be faster to process the query: | |
//! - use vector index or payload index first | |
use std::cmp::{max, min}; | |
use itertools::Itertools; | |
use crate::index::field_index::{CardinalityEstimation, PrimaryCondition}; | |
use crate::types::{Condition, Filter, MinShould}; | |
/// Re-estimate cardinality based on number of available vectors | |
/// Assuming that deleted vectors are not correlated with the filter | |
/// | |
/// # Arguments | |
/// | |
/// * `estimation` - cardinality estimations of number of points selected by payload filter | |
/// * `available_vectors` - number of available vectors for the named vector storage | |
/// * `total_vectors` - total number of points in the segment | |
/// | |
/// # Result | |
/// | |
/// * `CardinalityEstimation` - new cardinality estimation | |
pub fn adjust_to_available_vectors( | |
estimation: CardinalityEstimation, | |
available_vectors: usize, | |
available_points: usize, | |
) -> CardinalityEstimation { | |
if available_points == 0 || available_vectors == 0 { | |
return CardinalityEstimation { | |
primary_clauses: estimation.primary_clauses, | |
min: 0, | |
exp: 0, | |
max: 0, | |
}; | |
} | |
let number_of_deleted_vectors = available_points.saturating_sub(available_vectors); | |
// It is possible, all deleted vectors are selected in worst case | |
let min = estimation.min.saturating_sub(number_of_deleted_vectors); | |
// Another extreme case - all deleted vectors are not selected | |
let max = estimation.max.min(available_vectors).min(available_points); | |
let availability_prob = (available_vectors as f64 / available_points as f64).min(1.0); | |
let exp = (estimation.exp as f64 * availability_prob).round() as usize; | |
debug_assert!( | |
min <= exp, | |
"estimation: {estimation:?}, available_vectors: {available_vectors}, available_points: {available_points}, min: {min}, exp: {exp}" | |
); | |
debug_assert!( | |
exp <= max, | |
"estimation: {estimation:?}, available_vectors: {available_vectors}, available_points: {available_points}, exp: {exp}, max: {max}" | |
); | |
CardinalityEstimation { | |
primary_clauses: estimation.primary_clauses, | |
min, | |
exp, | |
max, | |
} | |
} | |
pub fn combine_should_estimations( | |
estimations: &[CardinalityEstimation], | |
total: usize, | |
) -> CardinalityEstimation { | |
let mut clauses: Vec<PrimaryCondition> = vec![]; | |
for estimation in estimations { | |
if estimation.primary_clauses.is_empty() { | |
// If some branch is un-indexed - we can't make | |
// any assumptions about the whole `should` clause | |
clauses = vec![]; | |
break; | |
} | |
clauses.append(&mut estimation.primary_clauses.clone()); | |
} | |
let element_not_hit_prob: f64 = estimations | |
.iter() | |
.map(|x| (total - x.exp) as f64 / (total as f64)) | |
.product(); | |
let element_hit_prob = 1.0 - element_not_hit_prob; | |
let expected_count = (element_hit_prob * (total as f64)).round() as usize; | |
CardinalityEstimation { | |
primary_clauses: clauses, | |
min: estimations.iter().map(|x| x.min).max().unwrap_or(0), | |
exp: expected_count, | |
max: min(estimations.iter().map(|x| x.max).sum(), total), | |
} | |
} | |
pub fn combine_min_should_estimations( | |
estimations: &[CardinalityEstimation], | |
min_count: usize, | |
total: usize, | |
) -> CardinalityEstimation { | |
/* | |
| First estimate cardinality of intersections and then combine the estimations | |
| ex) min_count : 2, # of estimations : 4 | |
| |(A β B) βͺ (A β C) βͺ (A β D) βͺ (B β C) βͺ (B β D) βͺ (C β D)| | |
*/ | |
let intersection_estimations = estimations | |
.iter() | |
.combinations(min_count) | |
.map(|intersection| { | |
combine_must_estimations(&intersection.into_iter().cloned().collect_vec(), total) | |
}) | |
.collect_vec(); | |
combine_should_estimations(&intersection_estimations, total) | |
} | |
pub fn combine_must_estimations( | |
estimations: &[CardinalityEstimation], | |
total: usize, | |
) -> CardinalityEstimation { | |
let min_estimation = estimations | |
.iter() | |
.map(|x| x.min) | |
.fold(total as i64, |acc, x| { | |
max(0, acc + (x as i64) - (total as i64)) | |
}) as usize; | |
let max_estimation = estimations.iter().map(|x| x.max).min().unwrap_or(total); | |
let exp_estimation_prob: f64 = estimations | |
.iter() | |
.map(|x| (x.exp as f64) / (total as f64)) | |
.product(); | |
let exp_estimation = (exp_estimation_prob * (total as f64)).round() as usize; | |
let clauses = estimations | |
.iter() | |
.filter(|x| !x.primary_clauses.is_empty()) | |
.min_by_key(|x| x.exp) | |
.map(|x| x.primary_clauses.clone()) | |
.unwrap_or_default(); | |
CardinalityEstimation { | |
primary_clauses: clauses, | |
min: min_estimation, | |
exp: exp_estimation, | |
max: max_estimation, | |
} | |
} | |
fn estimate_condition<F>( | |
estimator: &F, | |
condition: &Condition, | |
total: usize, | |
) -> CardinalityEstimation | |
where | |
F: Fn(&Condition) -> CardinalityEstimation, | |
{ | |
match condition { | |
Condition::Filter(filter) => estimate_filter(estimator, filter, total), | |
_ => estimator(condition), | |
} | |
} | |
pub fn estimate_filter<F>(estimator: &F, filter: &Filter, total: usize) -> CardinalityEstimation | |
where | |
F: Fn(&Condition) -> CardinalityEstimation, | |
{ | |
let mut filter_estimations: Vec<CardinalityEstimation> = vec![]; | |
match &filter.must { | |
None => {} | |
Some(conditions) => { | |
if !conditions.is_empty() { | |
filter_estimations.push(estimate_must(estimator, conditions, total)); | |
} | |
} | |
} | |
match &filter.should { | |
None => {} | |
Some(conditions) => { | |
if !conditions.is_empty() { | |
filter_estimations.push(estimate_should(estimator, conditions, total)); | |
} | |
} | |
} | |
match &filter.min_should { | |
None => {} | |
Some(MinShould { | |
conditions, | |
min_count, | |
}) => filter_estimations.push(estimate_min_should( | |
estimator, conditions, *min_count, total, | |
)), | |
} | |
match &filter.must_not { | |
None => {} | |
Some(conditions) => { | |
if !conditions.is_empty() { | |
filter_estimations.push(estimate_must_not(estimator, conditions, total)) | |
} | |
} | |
} | |
combine_must_estimations(&filter_estimations, total) | |
} | |
fn estimate_should<F>( | |
estimator: &F, | |
conditions: &[Condition], | |
total: usize, | |
) -> CardinalityEstimation | |
where | |
F: Fn(&Condition) -> CardinalityEstimation, | |
{ | |
let estimate = |x| estimate_condition(estimator, x, total); | |
let should_estimations = conditions.iter().map(estimate).collect_vec(); | |
combine_should_estimations(&should_estimations, total) | |
} | |
fn estimate_min_should<F>( | |
estimator: &F, | |
conditions: &[Condition], | |
min_count: usize, | |
total: usize, | |
) -> CardinalityEstimation | |
where | |
F: Fn(&Condition) -> CardinalityEstimation, | |
{ | |
let estimate = |x| estimate_condition(estimator, x, total); | |
let min_should_estimations = conditions.iter().map(estimate).collect_vec(); | |
combine_min_should_estimations(&min_should_estimations, min_count, total) | |
} | |
fn estimate_must<F>(estimator: &F, conditions: &[Condition], total: usize) -> CardinalityEstimation | |
where | |
F: Fn(&Condition) -> CardinalityEstimation, | |
{ | |
let estimate = |x| estimate_condition(estimator, x, total); | |
let must_estimations = conditions.iter().map(estimate).collect_vec(); | |
combine_must_estimations(&must_estimations, total) | |
} | |
pub fn invert_estimation( | |
estimation: &CardinalityEstimation, | |
total: usize, | |
) -> CardinalityEstimation { | |
CardinalityEstimation { | |
primary_clauses: vec![], | |
min: total - estimation.max, | |
exp: total - estimation.exp, | |
max: total - estimation.min, | |
} | |
} | |
fn estimate_must_not<F>( | |
estimator: &F, | |
conditions: &[Condition], | |
total: usize, | |
) -> CardinalityEstimation | |
where | |
F: Fn(&Condition) -> CardinalityEstimation, | |
{ | |
let estimate = |x| invert_estimation(&estimate_condition(estimator, x, total), total); | |
let must_not_estimations = conditions.iter().map(estimate).collect_vec(); | |
combine_must_estimations(&must_not_estimations, total) | |
} | |
mod tests { | |
use super::*; | |
use crate::json_path::JsonPath; | |
use crate::types::{FieldCondition, HasIdCondition}; | |
const TOTAL: usize = 1000; | |
fn test_condition(key: &str) -> Condition { | |
Condition::Field(FieldCondition { | |
key: JsonPath::new(key), | |
r#match: None, | |
range: None, | |
geo_bounding_box: None, | |
geo_radius: None, | |
values_count: None, | |
geo_polygon: None, | |
}) | |
} | |
fn test_estimator(condition: &Condition) -> CardinalityEstimation { | |
match condition { | |
Condition::Filter(_) => panic!("unexpected Filter"), | |
Condition::Nested(_) => panic!("unexpected Nested"), | |
Condition::CustomIdChecker(_) => panic!("unexpected CustomIdChecker"), | |
Condition::Field(field) => match field.key.to_string().as_str() { | |
"color" => CardinalityEstimation { | |
primary_clauses: vec![PrimaryCondition::Condition(field.clone())], | |
min: 100, | |
exp: 200, | |
max: 300, | |
}, | |
"size" => CardinalityEstimation { | |
primary_clauses: vec![PrimaryCondition::Condition(field.clone())], | |
min: 100, | |
exp: 100, | |
max: 100, | |
}, | |
"price" => CardinalityEstimation { | |
primary_clauses: vec![PrimaryCondition::Condition(field.clone())], | |
min: 10, | |
exp: 15, | |
max: 20, | |
}, | |
_ => CardinalityEstimation::unknown(TOTAL), | |
}, | |
Condition::HasId(has_id) => CardinalityEstimation { | |
primary_clauses: vec![PrimaryCondition::Ids( | |
has_id | |
.has_id | |
.iter() | |
.map(|&x| format!("{x}").parse().unwrap()) // hack to convert ID as "number" | |
.collect(), | |
)], | |
min: has_id.has_id.len(), | |
exp: has_id.has_id.len(), | |
max: has_id.has_id.len(), | |
}, | |
Condition::IsEmpty(condition) => CardinalityEstimation { | |
primary_clauses: vec![PrimaryCondition::IsEmpty(condition.to_owned())], | |
min: 0, | |
exp: TOTAL / 2, | |
max: TOTAL, | |
}, | |
Condition::IsNull(condition) => CardinalityEstimation { | |
primary_clauses: vec![PrimaryCondition::IsNull(condition.to_owned())], | |
min: 0, | |
exp: TOTAL / 2, | |
max: TOTAL, | |
}, | |
Condition::HasVector(condition) => CardinalityEstimation { | |
primary_clauses: vec![PrimaryCondition::HasVector(condition.has_vector.clone())], | |
min: 0, | |
exp: TOTAL / 2, | |
max: TOTAL, | |
}, | |
} | |
} | |
fn simple_query_estimation_test() { | |
let query = Filter::new_must(test_condition("color")); | |
let estimation = estimate_filter(&test_estimator, &query, TOTAL); | |
assert_eq!(estimation.exp, 200); | |
assert!(!estimation.primary_clauses.is_empty()); | |
} | |
fn must_estimation_query_test() { | |
let query = Filter { | |
should: None, | |
min_should: None, | |
must: Some(vec![ | |
test_condition("color"), | |
test_condition("size"), | |
test_condition("un-indexed"), | |
]), | |
must_not: None, | |
}; | |
let estimation = estimate_filter(&test_estimator, &query, TOTAL); | |
assert_eq!(estimation.primary_clauses.len(), 1); | |
match &estimation.primary_clauses[0] { | |
PrimaryCondition::Condition(field) => assert_eq!(&field.key.to_string(), "size"), | |
_ => panic!(), | |
} | |
assert!(estimation.max <= TOTAL); | |
assert!(estimation.exp <= estimation.max); | |
assert!(estimation.min <= estimation.exp); | |
} | |
fn should_estimation_query_test() { | |
let query = Filter { | |
should: Some(vec![test_condition("color"), test_condition("size")]), | |
min_should: None, | |
must: None, | |
must_not: None, | |
}; | |
let estimation = estimate_filter(&test_estimator, &query, TOTAL); | |
assert_eq!(estimation.primary_clauses.len(), 2); | |
assert!(estimation.max <= TOTAL); | |
assert!(estimation.exp <= estimation.max); | |
assert!(estimation.min <= estimation.exp); | |
} | |
fn another_should_estimation_query_test() { | |
let query = Filter { | |
should: Some(vec![ | |
test_condition("color"), | |
test_condition("size"), | |
test_condition("un-indexed"), | |
]), | |
min_should: None, | |
must: None, | |
must_not: None, | |
}; | |
let estimation = estimate_filter(&test_estimator, &query, TOTAL); | |
assert_eq!(estimation.primary_clauses.len(), 0); | |
eprintln!("estimation = {estimation:#?}"); | |
assert!(estimation.max <= TOTAL); | |
assert!(estimation.exp <= estimation.max); | |
assert!(estimation.min <= estimation.exp); | |
} | |
fn min_should_estimation_query_test() { | |
let query = Filter::new_min_should(MinShould { | |
conditions: vec![test_condition("color"), test_condition("size")], | |
min_count: 1, | |
}); | |
let estimation = estimate_filter(&test_estimator, &query, TOTAL); | |
assert_eq!(estimation.primary_clauses.len(), 2); | |
assert!(estimation.max <= TOTAL); | |
assert!(estimation.exp <= estimation.max); | |
assert!(estimation.min <= estimation.exp); | |
} | |
fn another_min_should_estimation_query_test() { | |
let query = Filter::new_min_should(MinShould { | |
conditions: vec![ | |
test_condition("color"), | |
test_condition("size"), | |
test_condition("price"), | |
], | |
min_count: 2, | |
}); | |
let estimation = estimate_filter(&test_estimator, &query, TOTAL); | |
assert_eq!(estimation.primary_clauses.len(), 3); | |
assert!(estimation.max <= TOTAL); | |
assert!(estimation.exp <= estimation.max); | |
assert!(estimation.min <= estimation.exp); | |
} | |
fn min_should_with_min_count_same_as_condition_count_is_equivalent_to_must() { | |
let conditions = vec![ | |
test_condition("color"), | |
test_condition("size"), | |
test_condition("price"), | |
]; | |
let min_should_query = Filter::new_min_should(MinShould { | |
conditions: conditions.clone(), | |
min_count: 3, | |
}); | |
let estimation = estimate_filter(&test_estimator, &min_should_query, TOTAL); | |
let must_query = Filter { | |
should: None, | |
min_should: None, | |
must: Some(conditions), | |
must_not: None, | |
}; | |
let expected_estimation = estimate_filter(&test_estimator, &must_query, TOTAL); | |
assert_eq!( | |
estimation.primary_clauses, | |
expected_estimation.primary_clauses | |
); | |
assert_eq!(estimation.max, expected_estimation.max); | |
assert_eq!(estimation.exp, expected_estimation.exp); | |
assert_eq!(estimation.min, expected_estimation.min); | |
} | |
fn complex_estimation_query_test() { | |
let query = Filter { | |
should: Some(vec![ | |
Condition::Filter(Filter { | |
should: None, | |
min_should: None, | |
must: Some(vec![test_condition("color"), test_condition("size")]), | |
must_not: None, | |
}), | |
Condition::Filter(Filter { | |
should: None, | |
min_should: None, | |
must: Some(vec![test_condition("price"), test_condition("size")]), | |
must_not: None, | |
}), | |
]), | |
min_should: None, | |
must: None, | |
must_not: Some(vec![Condition::HasId(HasIdCondition { | |
has_id: [1, 2, 3, 4, 5].into_iter().map(|x| x.into()).collect(), | |
})]), | |
}; | |
let estimation = estimate_filter(&test_estimator, &query, TOTAL); | |
assert_eq!(estimation.primary_clauses.len(), 2); | |
assert!(estimation.max <= TOTAL); | |
assert!(estimation.exp <= estimation.max); | |
assert!(estimation.min <= estimation.exp); | |
} | |
fn another_complex_estimation_query_test() { | |
let query = Filter { | |
should: None, | |
min_should: None, | |
must: Some(vec![ | |
Condition::Filter(Filter { | |
must: None, | |
should: Some(vec![test_condition("color"), test_condition("size")]), | |
min_should: None, | |
must_not: None, | |
}), | |
Condition::Filter(Filter { | |
must: None, | |
should: Some(vec![test_condition("price"), test_condition("size")]), | |
min_should: None, | |
must_not: None, | |
}), | |
]), | |
must_not: Some(vec![Condition::HasId(HasIdCondition { | |
has_id: [1, 2, 3, 4, 5].into_iter().map(|x| x.into()).collect(), | |
})]), | |
}; | |
let estimation = estimate_filter(&test_estimator, &query, TOTAL); | |
assert_eq!(estimation.primary_clauses.len(), 2); | |
estimation.primary_clauses.iter().for_each(|x| match x { | |
PrimaryCondition::Condition(field) => { | |
assert!(["price", "size"].contains(&field.key.to_string().as_str())) | |
} | |
_ => panic!("Should not go here"), | |
}); | |
assert!(estimation.max <= TOTAL); | |
assert!(estimation.exp <= estimation.max); | |
assert!(estimation.min <= estimation.exp); | |
} | |
fn test_combine_must_estimations() { | |
let estimations = vec![CardinalityEstimation { | |
primary_clauses: vec![], | |
min: 12, | |
exp: 12, | |
max: 12, | |
}]; | |
let res = combine_must_estimations(&estimations, 10_000); | |
eprintln!("res = {res:#?}"); | |
} | |
fn test_adjust_to_available_vectors() { | |
let estimation = CardinalityEstimation { | |
primary_clauses: vec![], | |
min: 0, | |
exp: 64, | |
max: 100, | |
}; | |
let new_estimation = adjust_to_available_vectors(estimation, 50, 200); | |
assert_eq!(new_estimation.min, 0); | |
assert_eq!(new_estimation.exp, 16); | |
assert_eq!(new_estimation.max, 50); | |
} | |
} | |