Gouzi Mohaled
Ajout du dossier lib
84d2a97
use std::f64::consts::{E, PI};
/// Estimates how many unique points were selected by a filter, given only the
/// number of payload values that matched.
///
/// It is assumed that each point owns, on average, `total_values / total_points`
/// values. Executing the query only tells us how many values matched, so the
/// number of distinct points behind those values has to be estimated.
///
/// # Arguments
///
/// * `total_points` - total number of the unique points in the whole collection
/// * `total_values` - total number of payload values in the collection
/// * `selected_values_count` - amount of values selected during the query
///
/// # Result
///
/// Expected amount of unique points contained in the selected values.
/// An empty collection (`total_points == 0`) yields `0.0`.
/// The result might leave the expected range at some corner cases,
/// so it is better to limit its value with min and max.
///
/// # Panics
///
/// Panics if `total_values < total_points`.
pub fn estimate_multi_value_selection_cardinality(
    total_points: usize,
    total_values: usize,
    selected_values_count: usize,
) -> f64 {
    // Guarantees that the average number of values per point is >= 1.0
    assert!(total_values >= total_points);

    if total_points == 0 {
        // Nothing can be selected from an empty collection; returning early
        // also avoids the `0 / 0 = NaN` division below.
        return 0.0;
    }

    let point_count = total_points as f64;
    let values_per_point = total_values as f64 / point_count;

    // Probability for each unique point to have at least one of its values selected
    let prob_select = 1. - prob_not_select(total_values, values_per_point, selected_values_count);
    prob_select * point_count
}
/// Fast approximate computation of $ln(n!)$
/// See: <https://en.wikipedia.org/wiki/Stirling%27s_approximation>
///
/// For `n < 1` the factorial is treated as `0! = 1`, so the returned logarithm is `0`.
fn approx_fact_log(n: f64) -> f64 {
    if n < 1.0 {
        // 0! = 1 by definition, hence ln(0!) = 0
        return 0.0;
    }
    // Stirling: ln(n!) ~ ln(sqrt(2 * pi * n)) + n * ln(n / e)
    (2. * PI * n).sqrt().ln() + n * (n / E).ln()
}
/// Probability of each individual unique point to NOT be selected with the query,
/// i.e. that none of its `avg` values is among the `selected` values.
///
/// Straight equation:
/// $\prod_{i=0}^{N-1} \frac{total - avg - i}{total - i}$
/// , where `N` - number of selected values
///
/// Proof:
///
/// $$
/// \prod_{i=0}^{N-1} \frac{total - avg - i}{total - i}
/// = \frac{\prod_{i=0}^{N-1} (total - avg - i)}{\prod_{i=0}^{N-1}(total - i)}
/// = \frac{\prod_{i=1}^{N} (total - avg - i + 1)}{\prod_{i=1}^{N}(total - i + 1)}\\
/// = \frac{\prod_{i=1}^{N} (total - avg - (N - i + 1) + 1)}{\prod_{i=1}^{N}(total - (N - i + 1) + 1)}
/// = \frac{\prod_{i=1}^{N} (i + total - avg - N)}{\prod_{i=1}^{N}(i + total - N)}\\
/// = \frac{\prod_{i=1}^{total - avg} i}{\prod_{i=1}^{total - avg - N} i} \frac{\prod_{i=1}^{total - N} i}{\prod_{i=1}^{total} i}
/// = \frac{(total - avg)!(total - N)!}{(total - avg - N)!(total)!}
/// = \exp(\ln{\frac{(total - avg)!(total - N)!}{(total - avg - N)!(total)!}})\\
/// = \exp(\ln((total - avg)!(total - N)!) - \ln((total - avg - N)!(total)!))
/// = \exp( \ln((total - avg)!) + \ln((total - N)!) - \ln((total - avg - N)!) - \ln(total!))
/// $$
///
/// Hint: use <https://latex.codecogs.com/eqneditor/editor.php> to render formula
///
/// The factorial logarithms are computed with `approx_fact_log`. The result is a
/// probability and is therefore limited to the `[0, 1]` range.
fn prob_not_select(total: usize, avg: f64, selected: usize) -> f64 {
    let total = total as f64;
    let selected = selected as f64;

    // The smallest numerator factor of the straight equation is
    // `total - avg - (N - 1)`. Once it is non-positive, more values were selected
    // than exist outside of the point, so the point cannot be missed. The
    // factorial form is not valid in that range, hence the explicit answer.
    if selected >= 1.0 && total - avg - (selected - 1.0) <= 0.0 {
        return 0.0;
    }

    let log_probability = approx_fact_log(total - avg) + approx_fact_log(total - selected)
        - approx_fact_log(total - avg - selected)
        - approx_fact_log(total);

    // Approximation error may push the value slightly above 1
    log_probability.exp().clamp(0.0, 1.0)
}
/// Estimates how many distinct points are hit when `values` matched values are
/// spread over `points` points.
///
/// Values are assumed to be randomly distributed among points, and a single point
/// may hold several values.
/// Math is based on: <https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives>
pub fn number_of_selected_points(points: usize, values: usize) -> usize {
    let point_count = points as f64;
    // Average number of matched values falling onto a single point
    let values_per_point = values as f64 / point_count;
    // Chance that a given point receives none of the matched values
    let miss_probability = (-values_per_point).exp();
    let expected_points = (1. - miss_probability) * point_count;
    expected_points.round() as usize
}
#[cfg(test)]
mod tests {
    use std::collections::HashSet;

    use rand::prelude::StdRng;
    use rand::seq::SliceRandom;
    use rand::SeedableRng;

    use super::*;

    /// Empirical counterpart of the estimator: builds `uniq` points with `avg`
    /// values each, shuffles them with a fixed seed, takes the first `selected`
    /// values and counts how many distinct points they belong to.
    fn simulate(uniq: usize, avg: usize, selected: usize) -> usize {
        let mut values: Vec<usize> = (0..uniq)
            .flat_map(|point| (0..avg).map(move |_| point))
            .collect();
        values.shuffle(&mut StdRng::seed_from_u64(42));

        let hit_points: HashSet<usize> = values.into_iter().take(selected).collect();
        hit_points.len()
    }

    /// Asserts that the estimation stays within 5% of the simulated experiment.
    fn assert_estimation_matches_simulation(total: usize, unique: usize, selected: usize) {
        let estimation = estimate_multi_value_selection_cardinality(unique, total, selected);
        let experiment = simulate(unique, total / unique, selected);
        let error = (estimation / experiment as f64 - 1.0).abs();
        assert!(error < 0.05);
    }

    #[test]
    fn test_selected_points_est() {
        // Many more values than points: nearly every point is expected to be hit
        let dense = number_of_selected_points(100, 1000);
        assert!((96..=100).contains(&dense));

        // Far fewer values than points: nearly every value lands on its own point
        let sparse = number_of_selected_points(1000, 10);
        assert!((6..=10).contains(&sparse));
    }

    #[test]
    fn approx_factorial() {
        let real = f64::from((2..=10).product::<i32>());
        let approx = approx_fact_log(10.).exp();
        let error = (approx / real - 1.0).abs();
        assert!(error < 0.01);
    }

    #[test]
    fn test_estimation_corner_cases() {
        // (total_points, total_values, selected_values_count)
        let cases = [
            (10, 20, 20),
            (100, 100, 100),
            (100, 100, 50),
            (10, 10, 10),
            (1, 1, 1),
            (1, 1, 0),
        ];

        for (points, values, selected) in cases {
            let count = estimate_multi_value_selection_cardinality(points, values, selected);
            assert!(!count.is_nan());
            eprintln!("count = {count:#?}");
        }
    }

    #[test]
    fn test_estimation_1() {
        assert_estimation_matches_simulation(2000, 1000, 50);
    }

    #[test]
    fn test_estimation_2() {
        assert_estimation_matches_simulation(2000, 1000, 300);
    }
}