use std::collections::BTreeMap; use std::collections::Bound::{Excluded, Included, Unbounded}; use std::ops::Bound; use std::path::{Path, PathBuf}; use common::types::PointOffsetType; use io::file_operations::{atomic_save_bin, atomic_save_json, read_bin, read_json}; use itertools::Itertools; use num_traits::Num; use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; use crate::common::operation_error::OperationResult; use crate::index::field_index::utils::check_boundaries; const MIN_BUCKET_SIZE: usize = 10; const CONFIG_PATH: &str = "histogram_config.json"; const BORDERS_PATH: &str = "histogram_borders.bin"; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct Counts { pub left: usize, pub right: usize, } #[derive(PartialEq, PartialOrd, Debug, Clone, Serialize, Deserialize)] #[repr(C)] pub struct Point { pub val: T, pub idx: PointOffsetType, } impl Point { pub fn new(val: T, idx: PointOffsetType) -> Self { Self { val, idx } } } impl Eq for Point {} #[allow(clippy::derive_ord_xor_partial_ord)] impl Ord for Point { fn cmp(&self, other: &Point) -> std::cmp::Ordering { (self.val, self.idx) .partial_cmp(&(other.val, other.idx)) .unwrap() } } /// A trait that should represent common properties of integer and floating point types. /// In particular, i64 and f64. pub trait Numericable: Num + PartialEq + PartialOrd + Copy { fn min_value() -> Self; fn max_value() -> Self; fn to_f64(self) -> f64; fn from_f64(x: f64) -> Self; fn from_u128(x: u128) -> Self; fn min(self, b: Self) -> Self { if self < b { self } else { b } } fn max(self, b: Self) -> Self { if self > b { self } else { b } } fn abs_diff(self, b: Self) -> Self { if self > b { self - b } else { b - self } } } impl Numericable for i64 { fn min_value() -> Self { i64::MIN } fn max_value() -> Self { i64::MAX } fn to_f64(self) -> f64 { self as f64 } fn from_f64(x: f64) -> Self { x as Self } fn from_u128(x: u128) -> Self { x as i64 } fn abs_diff(self, b: Self) -> Self { i64::abs_diff(self, b) as i64 } } impl Numericable for f64 { fn min_value() -> Self { f64::MIN } fn max_value() -> Self { f64::MAX } fn to_f64(self) -> f64 { self } fn from_f64(x: f64) -> Self { x } fn from_u128(x: u128) -> Self { x as Self } } impl Numericable for u128 { fn min_value() -> Self { u128::MIN } fn max_value() -> Self { u128::MAX } fn to_f64(self) -> f64 { self as f64 } fn from_f64(x: f64) -> Self { x as u128 } fn from_u128(x: u128) -> Self { x } fn abs_diff(self, b: Self) -> Self { u128::abs_diff(self, b) } } #[derive(Debug, PartialEq)] pub struct Histogram { max_bucket_size: usize, precision: f64, total_count: usize, borders: BTreeMap, Counts>, } #[derive(Debug, Serialize, Deserialize)] struct HistogramConfig { max_bucket_size: usize, precision: f64, total_count: usize, } impl Histogram { pub fn new(max_bucket_size: usize, precision: f64) -> Self { assert!(precision < 1.0); assert!(precision > 0.0); Self { max_bucket_size, precision, total_count: 0, borders: BTreeMap::default(), } } pub fn load(path: &Path) -> OperationResult { let config_path = path.join(CONFIG_PATH); let borders_path = path.join(BORDERS_PATH); let histogram_config: HistogramConfig = read_json(&config_path)?; let histogram_buckets: Vec<(Point, Counts)> = read_bin(&borders_path)?; Ok(Self { max_bucket_size: histogram_config.max_bucket_size, precision: histogram_config.precision, total_count: histogram_config.total_count, borders: histogram_buckets.into_iter().collect(), }) } pub fn save(&self, path: &Path) -> OperationResult<()> { let config_path = path.join(CONFIG_PATH); let borders_path = path.join(BORDERS_PATH); atomic_save_json( &config_path, &HistogramConfig { max_bucket_size: self.max_bucket_size, precision: self.precision, total_count: self.total_count, }, )?; let borders: Vec<(Point, Counts)> = self .borders .iter() .map(|(k, v)| (k.clone(), v.clone())) .collect(); atomic_save_bin(&borders_path, &borders)?; Ok(()) } pub fn files(path: &Path) -> Vec { vec![path.join(CONFIG_PATH), path.join(BORDERS_PATH)] } #[cfg(test)] pub fn total_count(&self) -> usize { self.total_count } #[cfg(test)] pub fn borders(&self) -> &BTreeMap, Counts> { &self.borders } pub fn current_bucket_size(&self) -> usize { let bucket_size = (self.total_count as f64 * self.precision) as usize; bucket_size.clamp(MIN_BUCKET_SIZE, self.max_bucket_size) } pub fn get_total_count(&self) -> usize { self.total_count } /// Infers boundaries for bucket of given size and starting point. /// Returns `to` range of values starting provided `from`value which is expected to contain /// `range_size` values /// /// Returns `Unbounded` if there are no points stored pub fn get_range_by_size(&self, from: Bound, range_size: usize) -> Bound { let from_ = match from { Included(val) => Included(Point { val, idx: PointOffsetType::MIN, }), Excluded(val) => Excluded(Point { val, idx: PointOffsetType::MAX, }), Unbounded => Unbounded, }; let mut reached_count = 0; for (border, counts) in self.borders.range((from_, Unbounded)) { if reached_count + counts.left > range_size { // required size reached return Included(border.val); } else { // Size not yet reached reached_count += counts.left; } } Unbounded } pub fn estimate(&self, from: Bound, to: Bound) -> (usize, usize, usize) { let from_ = match &from { Included(val) => Included(Point { val: *val, idx: PointOffsetType::MIN, }), Excluded(val) => Excluded(Point { val: *val, idx: PointOffsetType::MAX, }), Unbounded => Unbounded, }; let to_ = match &to { Included(val) => Included(Point { val: *val, idx: PointOffsetType::MAX, }), Excluded(val) => Excluded(Point { val: *val, idx: PointOffsetType::MIN, }), Unbounded => Unbounded, }; // Value for range fraction estimation let from_val = match from { Included(val) => val, Excluded(val) => val, Unbounded => T::min_value(), }; let to_val = match to { Included(val) => val, Excluded(val) => val, Unbounded => T::max_value(), }; let left_border = { if matches!(from_, Unbounded) { None } else { self.borders.range((Unbounded, from_.clone())).next_back() } }; let right_border = { if matches!(to_, Unbounded) { None } else { self.borders.range((to_.clone(), Unbounded)).next() } }; if !check_boundaries(&from_, &to_) { return (0, 0, 0); } let estimation = left_border .into_iter() .chain(self.borders.range((from_, to_))) .chain(right_border) .tuple_windows() .map( |((a, a_count), (b, _b_count)): ((&Point, &Counts), (&Point, _))| { let val_range = (b.val - a.val).to_f64(); if val_range == 0. { // Zero-length range is always covered let estimates = a_count.right + 1; return (estimates, estimates, estimates); } if a_count.right == 0 { // Range covers most-right border return (1, 1, 1); } let cover_range = (to_val.min(b.val) - from_val.max(a.val)).to_f64(); let covered_frac = cover_range / val_range; let estimate = (a_count.right as f64 * covered_frac).round() as usize + 1; let min_estimate = if cover_range == val_range { a_count.right + 1 } else { 0 }; let max_estimate = a_count.right + 1; (min_estimate, estimate, max_estimate) }, ) .reduce(|a, b| (a.0 + b.0, a.1 + b.1, a.2 + b.2)) .unwrap_or((0, 0, 0)); estimation } pub fn remove(&mut self, val: &Point, left_neighbour: F, right_neighbour: G) where F: Fn(&Point) -> Option>, G: Fn(&Point) -> Option>, { let (mut close_neighbors, (mut far_left_neighbor, mut far_right_neighbor)) = { let mut left_iterator = self .borders .range((Unbounded, Included(val.clone()))) .map(|(k, v)| (k.clone(), v.clone())); let mut right_iterator = self .borders .range((Excluded(val.clone()), Unbounded)) .map(|(k, v)| (k.clone(), v.clone())); ( (left_iterator.next_back(), right_iterator.next()), (left_iterator.next_back(), right_iterator.next()), ) }; let (to_remove, to_create, removed) = match &mut close_neighbors { (None, None) => (None, None, false), // histogram is empty (Some((left_border, ref mut left_border_count)), None) => { if left_border == val { // ....| // ...| if left_border_count.left == 0 { // ...|| // ...| (Some(left_border.clone()), None, true) } else { // ...|..| // ...|.| if let Some((_fln, ref mut fln_count)) = &mut far_left_neighbor { fln_count.right -= 1 } let (new_border, new_border_count) = ( left_neighbour(left_border).unwrap(), Counts { left: left_border_count.left - 1, right: 0, }, ); ( Some(left_border.clone()), Some((new_border, new_border_count)), true, ) } } else { (None, None, false) } } (None, Some((right_border, ref mut right_border_count))) => { if right_border == val { // |... // |.. if right_border_count.right == 0 { // ||... // |... (Some(right_border.clone()), None, true) } else { // |..|... // |.|... if let Some((_frn, ref mut frn_count)) = &mut far_right_neighbor { frn_count.left -= 1 } let (new_border, new_border_count) = ( right_neighbour(right_border).unwrap(), Counts { left: 0, right: right_border_count.right - 1, }, ); ( Some(right_border.clone()), Some((new_border, new_border_count)), true, ) } } else { (None, None, false) } } ( Some((left_border, ref mut left_border_count)), Some((right_border, ref mut right_border_count)), ) => { // ...|...x.|... if left_border == val { // ...|....|... // ... |...|... if left_border_count.right == 0 { // ...||... // ... |... right_border_count.left = left_border_count.left; (Some(left_border.clone()), None, true) } else if right_border_count.left + left_border_count.left <= self.current_bucket_size() && far_left_neighbor.is_some() { // ...|.l..r... // ...|. ..r... if let Some((_fln, ref mut fln_count)) = &mut far_left_neighbor { fln_count.right += right_border_count.left; right_border_count.left = fln_count.right; } (Some(left_border.clone()), None, true) } else { // ...|..|... // ... |.|... right_border_count.left -= 1; let (new_border, new_border_count) = ( right_neighbour(left_border).unwrap(), Counts { left: left_border_count.left, right: left_border_count.right - 1, }, ); ( Some(left_border.clone()), Some((new_border, new_border_count)), true, ) } } else if right_border == val { // ...|....|... // ...|...| ... if right_border_count.left == 0 { // ...||... // ...| ... left_border_count.right = right_border_count.left; (Some(right_border.clone()), None, true) } else if left_border_count.right + right_border_count.right <= self.current_bucket_size() && far_right_neighbor.is_some() { // ...l..r.|... // ...l.. .|... if let Some((_frn, ref mut frn_count)) = &mut far_right_neighbor { frn_count.left += left_border_count.right; left_border_count.right = frn_count.left; } (Some(right_border.clone()), None, true) } else { // ...|..|... // ...|.| ... left_border_count.right -= 1; let (new_border, new_border_count) = ( left_neighbour(right_border).unwrap(), Counts { left: right_border_count.right, right: right_border_count.left - 1, }, ); ( Some(right_border.clone()), Some((new_border, new_border_count)), true, ) } } else if right_border_count.left == 0 { // ...||... // ...||... (None, None, false) } else { // ...|...|... // ...|. .|... right_border_count.left -= 1; left_border_count.right -= 1; (None, None, true) } } }; if removed { self.total_count -= 1; } let (left_border_opt, right_border_opt) = close_neighbors; if let Some((k, v)) = left_border_opt { self.borders.insert(k, v); } if let Some((k, v)) = right_border_opt { self.borders.insert(k, v); } if let Some((k, v)) = far_left_neighbor { self.borders.insert(k, v); } if let Some((k, v)) = far_right_neighbor { self.borders.insert(k, v); } if let Some(remove_border) = to_remove { self.borders.remove(&remove_border); } if let Some((new_border, new_border_count)) = to_create { self.borders.insert(new_border, new_border_count); } } /// Warn: `val` should be unique pub fn insert(&mut self, val: Point, left_neighbour: F, right_neighbour: G) where F: Fn(&Point) -> Option>, G: Fn(&Point) -> Option>, { self.total_count += 1; if self.borders.len() < 2 { self.borders.insert(val, Counts { left: 0, right: 0 }); return; } let (mut close_neighbors, (mut far_left_neighbor, mut far_right_neighbor)) = { let mut left_iterator = self .borders .range((Unbounded, Included(val.clone()))) .map(|(k, v)| (k.clone(), v.clone())); let mut right_iterator = self .borders .range((Excluded(val.clone()), Unbounded)) .map(|(k, v)| (k.clone(), v.clone())); ( (left_iterator.next_back(), right_iterator.next()), (left_iterator.next_back(), right_iterator.next()), ) }; let (to_remove, to_create) = match &mut close_neighbors { (None, Some((right_border, right_border_count))) => { // x|.....|... let new_count = right_border_count.right + 1; let (new_border, mut new_border_count) = ( val, Counts { left: 0, right: new_count, }, ); if new_count > self.current_bucket_size() { // Too many values, can't move the border // x|.....|... // ||.....|... new_border_count.right = 0; (None, Some((new_border, new_border_count))) } else { // x|.....|... // |......|... if let Some((_frn, frn_count)) = &mut far_right_neighbor { frn_count.left = new_count; } ( Some(right_border.clone()), Some((new_border, new_border_count)), ) } } (Some((left_border, left_border_count)), None) => { // ...|.....|x let new_count = left_border_count.left + 1; let (new_border, mut new_border_count) = ( val, Counts { left: new_count, right: 0, }, ); if new_count > self.current_bucket_size() { // Too many values, can't move the border // ...|.....|x // ...|.....|| new_border_count.left = 0; (None, Some((new_border, new_border_count))) } else { // ...|.....|x // ...|......| if let Some((_fln, ref mut fln_count)) = &mut far_left_neighbor { fln_count.right = new_count } ( Some(left_border.clone()), Some((new_border, new_border_count)), ) } } (Some((left_border, left_border_count)), Some((right_border, right_border_count))) => { assert_eq!(left_border_count.right, right_border_count.left); let new_count = left_border_count.right + 1; if new_count > self.current_bucket_size() { // Too many values, let's adjust // Decide which border to move let left_dist = val.val.abs_diff(left_border.val); let right_dist = val.val.abs_diff(right_border.val); if left_dist < right_dist { // left border closer: // ...|..x.........|... let (new_border, mut new_border_count) = ( right_neighbour(left_border).unwrap(), Counts { left: left_border_count.left + 1, right: left_border_count.right, }, ); if left_border_count.left < self.current_bucket_size() && far_left_neighbor.is_some() { //we can move // ...|..x.........|... // ....|.x.........|... if let Some((_fln, ref mut fln_count)) = &mut far_left_neighbor { fln_count.right = new_border_count.left } ( Some(left_border.clone()), Some((new_border, new_border_count)), ) } else { // Can't be moved anymore, create an additional one // ...|..x.........|... // ...||.x.........|... new_border_count.left = 0; left_border_count.right = 0; (None, Some((new_border, new_border_count))) } } else { // right border closer // ...|........x...|... let (new_border, mut new_border_count) = ( left_neighbour(right_border).unwrap(), Counts { left: right_border_count.left, right: right_border_count.right + 1, }, ); if right_border_count.right < self.current_bucket_size() && far_right_neighbor.is_some() { // it's ok, we can move // 1: ...|........x...|... // 2: ...|........x..|.... if let Some((_frn, frn_count)) = &mut far_right_neighbor { frn_count.left = new_border_count.right } ( Some(right_border.clone()), Some((new_border, new_border_count)), ) } else { // Can't be moved anymore, create a new one // 1: ...|........x...|... // 2: ...|........x..||... new_border_count.right = 0; right_border_count.left = 0; (None, Some((new_border, new_border_count))) } } } else { left_border_count.right = new_count; right_border_count.left = new_count; (None, None) } } (None, None) => unreachable!(), }; let (left_border_opt, right_border_opt) = close_neighbors; if let Some((k, v)) = left_border_opt { self.borders.insert(k, v); } if let Some((k, v)) = right_border_opt { self.borders.insert(k, v); } if let Some((k, v)) = far_left_neighbor { self.borders.insert(k, v); } if let Some((k, v)) = far_right_neighbor { self.borders.insert(k, v); } if let Some(remove_border) = to_remove { self.borders.remove(&remove_border); } if let Some((new_border, new_border_count)) = to_create { self.borders.insert(new_border, new_border_count); } } }