Spaces:
Build error
Build error
File size: 6,432 Bytes
84d2a97 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 |
use std::f64::consts::{E, PI};
/// Estimates how many unique points stand behind the values matched by a filter.
///
/// Every point is assumed to own, on average, `total_values / total_points` values.
/// A query only reports how many values matched, so the number of distinct points is
/// estimated as `total_points * (1 - P(a point has none of its values matched))`.
///
/// # Arguments
///
/// * `total_points` - total number of the unique points in the whole collection
/// * `total_values` - total number of payload values in the collection
/// * `selected_values_count` - amount of values selected during the query
///
/// # Result
///
/// Expected amount of unique points contained in selected values.
/// The result might fall out of the expected range at some corner cases,
/// so it is better to limit its value with min and max.
/// For an empty collection (`total_points == 0`) the result is `0.0`.
///
/// # Panics
///
/// Panics if `total_values < total_points`.
///
pub fn estimate_multi_value_selection_cardinality(
    total_points: usize,
    total_values: usize,
    selected_values_count: usize,
) -> f64 {
    // Guarantees that the average number of values per point is >= 1.0
    assert!(total_values >= total_points);
    // Without points nothing can be selected. Returning early also avoids the
    // division by zero below, which would turn the whole estimate into NaN.
    if total_points == 0 {
        return 0.0;
    }
    let values_per_point = total_values as f64 / total_points as f64;
    // Probability that at least one value of a given point is among the selected ones
    let prob_select = 1. - prob_not_select(total_values, values_per_point, selected_values_count);
    prob_select * total_points as f64
}
/// Fast approximate computation of $ln(n!)$
/// See: <https://en.wikipedia.org/wiki/Stirling%27s_approximation>
///
/// Uses $\ln(n!) \approx \ln(\sqrt{2 \pi n}) + n \ln(n / e)$.
///
/// Any argument below `1.0` (negative values included) is treated as `0`,
/// and since $0! = 1$ the returned logarithm is `0.0`.
fn approx_fact_log(n: f64) -> f64 {
    if n < 1.0 {
        // 0! = 1 by definition, therefore ln(0!) = 0
        return 0.0;
    }
    (2. * PI * n).sqrt().ln() + n * (n / E).ln()
}
/// Probability that an individual unique point is NOT selected with the query,
/// i.e. that none of its `avg` values is among the `N` selected values.
///
/// Straight equation:
/// $\prod_{i=0}^{N-1} \frac{total - avg - i}{total - i}$
/// , where `N` - number of selected values
///
/// The product is rewritten through factorials so it can be evaluated with
/// the approximate logarithm of the factorial instead of a loop.
///
/// Proof:
///
/// $$
/// \prod_{i=0}^{N-1} \frac{total - avg - i}{total - i}
/// = \frac{\prod_{i=0}^{N-1} (total - avg - i)}{\prod_{i=0}^{N-1}(total - i)}
/// = \frac{\prod_{i=1}^{N} (total - avg - i + 1)}{\prod_{i=1}^{N}(total - i + 1)}\\
/// = \frac{\prod_{i=1}^{N} (total - avg - (N - i + 1) + 1)}{\prod_{i=1}^{N}(total - (N - i + 1) + 1)}
/// = \frac{\prod_{i=1}^{N} (i + total - avg - N)}{\prod_{i=1}^{N}(i + total - N)}\\
/// = \frac{\prod_{i=1}^{total - avg} i}{\prod_{i=1}^{total - avg - N} i} \frac{\prod_{i=1}^{total - N} i}{\prod_{i=1}^{total} i}
/// = \frac{(total - avg)!(total - N)!}{(total - avg - N)!(total)!}
/// = \exp(\ln{\frac{(total - avg)!(total - N)!}{(total - avg - N)!(total)!}})\\
/// = \exp(\ln((total - avg)!(total - N)!) - \ln((total - avg - N)!(total)!))
/// = \exp( \ln((total - avg)!) + \ln((total - N)!) - \ln((total - avg - N)!) - \ln(total!))
/// $$
///
/// Hint: use <https://latex.codecogs.com/eqneditor/editor.php> to render formula
fn prob_not_select(total: usize, avg: f64, selected: usize) -> f64 {
    let n_total = total as f64;
    let n_selected = selected as f64;

    // Logarithms of the four factorials from the last line of the proof
    let ln_foreign_fact = approx_fact_log(n_total - avg);
    let ln_unselected_fact = approx_fact_log(n_total - n_selected);
    let ln_foreign_unselected_fact = approx_fact_log(n_total - avg - n_selected);
    let ln_total_fact = approx_fact_log(n_total);

    let ln_prob =
        ln_foreign_fact + ln_unselected_fact - ln_foreign_unselected_fact - ln_total_fact;
    ln_prob.exp()
}
/// Estimates how many distinct points are hit when `values` values are matched.
///
/// Values are assumed to be spread randomly over `points` points, where a single point
/// may hold several values. The chance of a point being hit is `1 - exp(-values / points)`,
/// and the result is that chance multiplied by `points`, rounded to the nearest integer.
/// Math is based on: <https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives>
pub fn number_of_selected_points(points: usize, values: usize) -> usize {
    let point_count = points as f64;
    let values_per_point = values as f64 / point_count;
    // Chance that a given point receives none of the matched values
    let miss_probability = (-values_per_point).exp();
    let expected_points = (1. - miss_probability) * point_count;
    expected_points.round() as usize
}
#[cfg(test)]
mod tests {
    use std::collections::HashSet;

    use rand::prelude::StdRng;
    use rand::seq::SliceRandom;
    use rand::SeedableRng;

    use super::*;

    /// Checks the estimate for a value count far above and far below the point count.
    #[test]
    fn test_selected_points_est() {
        let many_values = number_of_selected_points(100, 1000);
        assert!(many_values > 95);
        assert!(many_values <= 100);

        let few_values = number_of_selected_points(1000, 10);
        assert!(few_values > 5);
        assert!(few_values <= 10);
    }

    /// Builds `uniq` points with `avg` values each, shuffles the values with a fixed seed,
    /// takes the first `selected` of them and counts the distinct points among those.
    fn simulate(uniq: usize, avg: usize, selected: usize) -> usize {
        let mut values: Vec<usize> = (0..uniq)
            .flat_map(|point| std::iter::repeat(point).take(avg))
            .collect();
        values.shuffle(&mut StdRng::seed_from_u64(42));
        values
            .into_iter()
            .take(selected)
            .collect::<HashSet<_>>()
            .len()
    }

    /// Relative difference between the analytical estimate and the simulated outcome.
    fn estimation_error(unique: usize, total: usize, selected: usize) -> f64 {
        let estimation = estimate_multi_value_selection_cardinality(unique, total, selected);
        let experiment = simulate(unique, total / unique, selected);
        (estimation / experiment as f64 - 1.0).abs()
    }

    /// The approximation of `10!` must stay within 1% of the exact value.
    #[test]
    fn approx_factorial() {
        let exact = f64::from((2..=10).product::<i32>());
        let estimated = approx_fact_log(10.).exp();
        let error = (estimated / exact - 1.0).abs();
        assert!(error < 0.01);
    }

    /// Boundary inputs must never produce NaN.
    #[test]
    fn test_estimation_corner_cases() {
        // (total_points, total_values, selected_values_count)
        let cases = [
            (10, 20, 20),
            (100, 100, 100),
            (100, 100, 50),
            (10, 10, 10),
            (1, 1, 1),
            (1, 1, 0),
        ];
        for (total_points, total_values, selected_values) in cases {
            let count = estimate_multi_value_selection_cardinality(
                total_points,
                total_values,
                selected_values,
            );
            assert!(!count.is_nan());
            eprintln!("count = {count:#?}");
        }
    }

    #[test]
    fn test_estimation_1() {
        let error = estimation_error(1000, 2000, 50);
        assert!(error < 0.05);
    }

    #[test]
    fn test_estimation_2() {
        let error = estimation_error(1000, 2000, 300);
        assert!(error < 0.05);
    }
}
|