Spaces:

reztilop
/

colibri.qdrant

Build error

colibri.qdrant / lib /segment /src /index /field_index /stat_tools.rs

Gouzi Mohaled

Ajout du dossier lib

84d2a97 8 months ago

6.43 kB

	use std::f64::consts::{E, PI};

	/// Estimates how many distinct points a filter selected, given only the number of
	/// payload values that matched.
	///
	/// Each point is assumed to own, on average, `total_values / total_points` values, while
	/// the query execution can only report how many associated values matched. This function
	/// converts that value count into an expected count of unique points.
	///
	/// # Arguments
	///
	/// * `total_points` - total number of the unique points in the whole collection
	/// * `total_values` - total number of payload values in the collection
	/// * `selected_values_count` - amount of values selected during the query
	///
	/// # Result
	///
	/// Expected amount of unique points contained in the selected values.
	/// The result might overflow at some corner cases, so callers are advised to
	/// bound it with min and max.
	///
	/// # Panics
	///
	/// Panics if `total_values < total_points`.
	pub fn estimate_multi_value_selection_cardinality(
	    total_points: usize,
	    total_values: usize,
	    selected_values_count: usize,
	) -> f64 {
	    // The assertion ensures the average computed below is >= 1.0
	    assert!(total_values >= total_points);

	    let point_count = total_points as f64;
	    let avg_values_per_point = total_values as f64 / point_count;

	    // Chance that a single point is missed by every selected value
	    let miss_probability =
	        prob_not_select(total_values, avg_values_per_point, selected_values_count);

	    (1. - miss_probability) * point_count
	}

	/// Fast approximate computation of $\ln(n!)$ using Stirling's formula:
	/// $\ln(n!) \approx \ln(\sqrt{2 \pi n}) + n \ln(n / e)$
	///
	/// See: <https://en.wikipedia.org/wiki/Stirling%27s_approximation>
	///
	/// For `n < 1.0` the function returns `0.0`, because $0! = 1$ and therefore
	/// $\ln(0!) = 0$. The same value is returned for negative `n`, which
	/// `prob_not_select` can pass in corner cases; this keeps the result finite
	/// instead of taking the logarithm of a negative number.
	fn approx_fact_log(n: f64) -> f64 {
	    if n < 1.0 {
	        // The function returns a logarithm: ln(0!) = ln(1) = 0, not 1
	        return 0.0;
	    }
	    (2. * PI * n).sqrt().ln() + n * (n / E).ln()
	}

	/// Probability that an individual unique point is NOT selected by the query
	///
	/// Straight equation:
	/// $\prod_{i=0}^{N-1} \frac{total - avg - i}{total - i}$
	/// , where `N` is the number of selected values (the `selected` argument)
	///
	/// Proof:
	///
	/// $$
	/// \prod_{i=0}^{N-1} \frac{total - avg - i}{total - i}
	/// = \frac{\prod_{i=0}^{N-1} (total - avg - i)}{\prod_{i=0}^{N-1}(total - i)}
	/// = \frac{\prod_{i=1}^{N} (total - avg - i + 1)}{\prod_{i=1}^{N}(total - i + 1)}\\
	/// = \frac{\prod_{i=1}^{N} (total - avg - (N - i + 1) + 1)}{\prod_{i=1}^{N}(total - (N - i + 1) + 1)}
	/// = \frac{\prod_{i=1}^{N} (i + total - avg - N)}{\prod_{i=1}^{N}(i + total - N)}\\
	/// = \frac{\prod_{i=1}^{total - avg} i}{\prod_{i=1}^{total - avg - N} i} \frac{\prod_{i=1}^{total - N} i}{\prod_{i=1}^{total} i}
	/// = \frac{(total - avg)!(total - N)!}{(total - avg - N)!(total)!}
	/// = \exp(\ln{\frac{(total - avg)!(total - N)!}{(total - avg - N)!(total)!}})\\
	/// = \exp(\ln((total - avg)!(total - N)!) - \ln((total - avg - N)!(total)!))
	/// = \exp( \ln((total - avg)!) + \ln((total - N)!) - \ln((total - avg - N)!) - \ln(total!))
	/// $$
	///
	/// Hint: use <https://latex.codecogs.com/eqneditor/editor.php> to render formula
	fn prob_not_select(total: usize, avg: f64, selected: usize) -> f64 {
	    let total_f = total as f64;
	    let selected_f = selected as f64;

	    // ln of the numerator: (total - avg)! * (total - N)!
	    let ln_numerator =
	        approx_fact_log(total_f - avg) + approx_fact_log(total_f - selected_f);
	    // ln of the two denominator factorials: (total - avg - N)! and total!
	    let ln_remaining = approx_fact_log(total_f - avg - selected_f);
	    let ln_total = approx_fact_log(total_f);

	    // Go back from log-space to the probability itself
	    f64::exp(ln_numerator - ln_remaining - ln_total)
	}

	/// Estimates the number of selected points from the amount of matched values.
	///
	/// The model assumes values are spread randomly over the points and that a single point
	/// may hold several values. A point is counted as selected with probability
	/// `1 - exp(-values / points)`; the expected count is rounded to the nearest integer.
	/// Math is based on: <https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives>
	pub fn number_of_selected_points(points: usize, values: usize) -> usize {
	    let point_count = points as f64;
	    let values_per_point = values as f64 / point_count;
	    // Probability that a given point receives none of the values
	    let miss_probability = (-values_per_point).exp();
	    let hit_probability = 1. - miss_probability;
	    let expected_points = hit_probability * point_count;
	    expected_points.round() as usize
	}

	#[cfg(test)]
	mod tests {
	    use std::collections::HashSet;

	    use rand::prelude::StdRng;
	    use rand::seq::SliceRandom;
	    use rand::SeedableRng;

	    use super::*;

	    /// Checks the point-count estimate for a dense and a sparse value distribution.
	    #[test]
	    fn test_selected_points_est() {
	        // Ten times more values than points
	        let dense = number_of_selected_points(100, 1000);
	        assert!(dense > 95);
	        assert!(dense <= 100);

	        // A hundred times fewer values than points
	        let sparse = number_of_selected_points(1000, 10);
	        assert!(sparse > 5);
	        assert!(sparse <= 10);
	    }

	    /// Reference experiment: builds `uniq` points with `avg` values each, shuffles all
	    /// values with a fixed seed, takes the first `selected` of them and counts how many
	    /// distinct points they belong to.
	    fn simulate(uniq: usize, avg: usize, selected: usize) -> usize {
	        let mut values: Vec<usize> = (0..uniq)
	            .flat_map(|point| std::iter::repeat(point).take(avg))
	            .collect();
	        // Fixed seed keeps the experiment reproducible
	        values.shuffle(&mut StdRng::seed_from_u64(42));

	        values
	            .into_iter()
	            .take(selected)
	            .collect::<HashSet<_>>()
	            .len()
	    }

	    /// Relative error between the analytical estimation and the simulated experiment.
	    fn estimation_error(unique: usize, total: usize, selected: usize) -> f64 {
	        let estimation = estimate_multi_value_selection_cardinality(unique, total, selected);
	        let experiment = simulate(unique, total / unique, selected);
	        (estimation / experiment as f64 - 1.0).abs()
	    }

	    /// The approximation of 10! must be within 1% of the exact value.
	    #[test]
	    fn approx_factorial() {
	        let exact = f64::from((2..=10).product::<i32>());
	        let approximated = approx_fact_log(10.).exp();
	        let error = (approximated / exact - 1.0).abs();
	        assert!(error < 0.01);
	    }

	    /// Corner cases must never produce NaN.
	    #[test]
	    fn test_estimation_corner_cases() {
	        // (total_points, total_values, selected_values_count)
	        let cases = [
	            (10, 20, 20),
	            (100, 100, 100),
	            (100, 100, 50),
	            (10, 10, 10),
	            (1, 1, 1),
	            (1, 1, 0),
	        ];

	        for &(points, values, selected) in cases.iter() {
	            let count = estimate_multi_value_selection_cardinality(points, values, selected);
	            assert!(!count.is_nan());
	            eprintln!("count = {count:#?}");
	        }
	    }

	    /// Small selection: 50 of 2000 values over 1000 points.
	    #[test]
	    fn test_estimation_1() {
	        let error = estimation_error(1000, 2000, 50);
	        assert!(error < 0.05);
	    }

	    /// Larger selection: 300 of 2000 values over 1000 points.
	    #[test]
	    fn test_estimation_2() {
	        let error = estimation_error(1000, 2000, 300);
	        assert!(error < 0.05);
	    }
	}