Gouzi Mohaled
Ajout du dossier lib
84d2a97
use std::borrow::Cow;
use std::cmp::max;
use std::path::{Path, PathBuf};
use common::fixed_length_priority_queue::FixedLengthPriorityQueue;
use common::types::{PointOffsetType, ScoredPointOffset};
use io::file_operations::{atomic_save_bin, read_bin, FileStorageError};
use itertools::Itertools;
use memory::mmap_ops;
use serde::{Deserialize, Serialize};
use super::entry_points::EntryPoint;
use super::graph_links::{GraphLinks, GraphLinksMmap};
use crate::common::operation_error::OperationResult;
use crate::common::utils::rev_range;
use crate::index::hnsw_index::entry_points::EntryPoints;
use crate::index::hnsw_index::graph_links::GraphLinksConverter;
use crate::index::hnsw_index::point_scorer::FilteredScorer;
use crate::index::hnsw_index::search_context::SearchContext;
use crate::index::visited_pool::{VisitedListHandle, VisitedPool};
/// Links of one point on one layer, stored as point offsets.
pub type LinkContainer = Vec<PointOffsetType>;
/// Link lists of a single point, one `LinkContainer` per layer.
/// The tests in this file index the outer collection as `graph_links[point][level]`.
pub type LayersContainer = Vec<LinkContainer>;
/// File name that `GraphLayers::get_path` joins onto the given base path.
pub const HNSW_GRAPH_FILE: &str = "graph.bin";
/// File name that `GraphLayers::get_links_path` joins onto the given base path.
pub const HNSW_LINKS_FILE: &str = "links.bin";
/// Contents of the `graph.bin` file.
///
/// Carries the graph parameters and the entry points only; `GraphLayers::load`
/// reads the links themselves from the separate links file.
/// `entry_points` is a `Cow` so that `GraphLayers::data` can borrow the live
/// value when saving, while `GraphLayers::load` converts the deserialized value
/// into an owned one.
// NOTE(review): if `read_bin` / `atomic_save_bin` use a positional binary
// encoding, the field order below is part of the on-disk format — confirm
// before reordering or inserting fields.
#[derive(Deserialize, Serialize, Debug)]
struct GraphLayerData<'a> {
/// Copied to/from `GraphLayers::m` (link limit for levels above 0).
m: usize,
/// Copied to/from `GraphLayers::m0` (link limit for level 0).
m0: usize,
/// Copied to/from `GraphLayers::ef_construct`.
ef_construct: usize,
/// Borrowed when produced by `GraphLayers::data`, turned into an owned value on load.
entry_points: Cow<'a, EntryPoints>,
}
/// Contents of the `graph.bin` file (Qdrant 0.8.4).
///
/// Legacy layout in which the links are stored inside the graph file itself.
/// `GraphLayers::load` falls back to this type when the current format cannot
/// be read, converts `links_layers` through `GraphLinksConverter`, and then
/// re-saves the graph in the current format.
// NOTE(review): if the encoding is positional, the field order below must keep
// matching what the old version wrote — confirm before touching it.
#[derive(Deserialize, Serialize, Debug)]
struct GraphLayersBackwardCompatibility {
/// Present in the legacy file; not read anywhere in this file.
max_level: usize,
/// Copied into `GraphLayers::m` on conversion.
m: usize,
/// Copied into `GraphLayers::m0` on conversion.
m0: usize,
/// Copied into `GraphLayers::ef_construct` on conversion.
ef_construct: usize,
/// Legacy in-file links, handed to `GraphLinksConverter::new` on conversion.
links_layers: Vec<LayersContainer>,
/// Moved into `GraphLayers::entry_points` on conversion.
entry_points: EntryPoints,
}
/// HNSW graph state used for search: graph parameters, link storage, entry
/// points and a pool of visited lists.
///
/// Generic over the link storage `TGraphLinks`; this file uses it with
/// `GraphLinksMmap` and, in tests, `GraphLinksRam`.
#[derive(Debug)]
pub struct GraphLayers<TGraphLinks: GraphLinks> {
/// Link limit returned by `get_m` for every level above 0.
pub(super) m: usize,
/// Link limit returned by `get_m` for level 0.
pub(super) m0: usize,
/// Persisted together with `m` / `m0`; not read by the search code in this file.
/// NOTE(review): presumably the beam width used while building the graph — confirm.
pub(super) ef_construct: usize,
/// Link storage queried for per-point links, point levels and the point count.
pub(super) links: TGraphLinks,
/// Entry points consulted when a search has no usable custom entry point.
pub(super) entry_points: EntryPoints,
/// Pool from which `get_visited_list_from_pool` obtains visited lists.
pub(super) visited_pool: VisitedPool,
}
/// Search primitives shared by graph implementations.
///
/// Implementors provide link traversal (`links_map`), the per-level link limit
/// (`get_m`) and a visited list; the layer searches below are built on top of
/// those three methods.
pub trait GraphLayersBase {
    /// Returns a visited-list handle used to track which points a search has
    /// already processed.
    fn get_visited_list_from_pool(&self) -> VisitedListHandle;

    /// Calls `f` once for every link of `point_id` on `level`.
    fn links_map<F>(&self, point_id: PointOffsetType, level: usize, f: F)
    where
        F: FnMut(PointOffsetType);

    /// Get M based on current level
    fn get_m(&self, level: usize) -> usize;

    /// Greedy search for closest points within a single graph layer
    ///
    /// Repeatedly pops a candidate from `searcher`, scores its not-yet-visited
    /// links and feeds the scored links back into `searcher`. The loop ends when
    /// no candidates remain or the popped candidate scores below
    /// `searcher.lower_bound()`.
    fn _search_on_level(
        &self,
        searcher: &mut SearchContext,
        level: usize,
        visited_list: &mut VisitedListHandle,
        points_scorer: &mut FilteredScorer,
    ) {
        let limit = self.get_m(level);
        // One buffer reused for every expanded candidate.
        let mut unvisited: Vec<PointOffsetType> = Vec::with_capacity(2 * limit);

        while let Some(candidate) = searcher.candidates.pop() {
            if candidate.score < searcher.lower_bound() {
                break;
            }

            // Collect only the links that have not been marked as visited yet.
            unvisited.clear();
            self.links_map(candidate.idx, level, |link| {
                if !visited_list.check(link) {
                    unvisited.push(link);
                }
            });

            let scored_links = points_scorer.score_points(&mut unvisited, limit);
            for scored in scored_links.iter().copied() {
                searcher.process_candidate(scored);
                visited_list.check_and_update_visited(scored.idx);
            }
        }
    }

    /// Searches a single `level` starting from `level_entry` and returns the
    /// nearest points gathered by a search context created with `ef`.
    ///
    /// The entry point is marked as visited before the search starts.
    fn search_on_level(
        &self,
        level_entry: ScoredPointOffset,
        level: usize,
        ef: usize,
        points_scorer: &mut FilteredScorer,
    ) -> FixedLengthPriorityQueue<ScoredPointOffset> {
        let mut visited = self.get_visited_list_from_pool();
        visited.check_and_update_visited(level_entry.idx);

        let mut context = SearchContext::new(level_entry, ef);
        self._search_on_level(&mut context, level, &mut visited, points_scorer);
        context.nearest
    }

    /// Greedy searches for entry point of level `target_level`.
    /// Beam size is 1.
    ///
    /// On each level yielded by `rev_range(top_level, target_level)` the current
    /// point is replaced by any strictly better-scoring link until a full pass
    /// brings no improvement.
    // NOTE(review): the exact bounds of `rev_range` are defined elsewhere —
    // confirm whether `target_level` itself is visited.
    fn search_entry(
        &self,
        entry_point: PointOffsetType,
        top_level: usize,
        target_level: usize,
        points_scorer: &mut FilteredScorer,
    ) -> ScoredPointOffset {
        let mut neighbours: Vec<PointOffsetType> = Vec::with_capacity(2 * self.get_m(0));

        let mut best = ScoredPointOffset {
            idx: entry_point,
            score: points_scorer.score_point(entry_point),
        };

        for level in rev_range(top_level, target_level) {
            let limit = self.get_m(level);

            // Do-while: always look at the links of the current point at least once.
            loop {
                neighbours.clear();
                self.links_map(best.idx, level, |link| neighbours.push(link));

                let mut improved = false;
                let scored_links = points_scorer.score_points(&mut neighbours, limit);
                for scored in scored_links.iter().copied() {
                    if scored.score > best.score {
                        improved = true;
                        best = scored;
                    }
                }

                if !improved {
                    break;
                }
            }
        }

        best
    }
}
/// `GraphLayersBase` backed by the link storage and visited pool of `GraphLayers`.
impl<TGraphLinks: GraphLinks> GraphLayersBase for GraphLayers<TGraphLinks> {
    /// Requests a visited list from the pool, passing the current number of points.
    fn get_visited_list_from_pool(&self) -> VisitedListHandle {
        let num_points = self.links.num_points();
        self.visited_pool.get(num_points)
    }

    /// Feeds every link of `point_id` on `level` to `f`, in storage order.
    fn links_map<F>(&self, point_id: PointOffsetType, level: usize, f: F)
    where
        F: FnMut(PointOffsetType),
    {
        self.links.links(point_id, level).for_each(f);
    }

    /// Level 0 uses `m0`; every other level uses `m`.
    fn get_m(&self, level: usize) -> usize {
        match level {
            0 => self.m0,
            _ => self.m,
        }
    }
}
/// Query-side API of the HNSW graph: entry point selection, search and file paths.
///
/// All scores are treated as similarities: a larger score means a closer point.
impl<TGraphLinks: GraphLinks> GraphLayers<TGraphLinks> {
    /// Returns the highest level this point is included in
    pub fn point_level(&self, point_id: PointOffsetType) -> usize {
        self.links.point_level(point_id)
    }

    /// Picks the point a search starts from.
    ///
    /// Among `custom_entry_points` that pass `points_scorer.check_vector`, the
    /// one with the highest level wins. If none were given, or none pass the
    /// check, the regular entry points are consulted with the same check.
    fn get_entry_point(
        &self,
        points_scorer: &FilteredScorer,
        custom_entry_points: Option<&[PointOffsetType]>,
    ) -> Option<EntryPoint> {
        let custom_best = match custom_entry_points {
            Some(candidates) => candidates
                .iter()
                .copied()
                .filter(|&point_id| points_scorer.check_vector(point_id))
                .map(|point_id| EntryPoint {
                    point_id,
                    level: self.point_level(point_id),
                })
                .max_by_key(|entry| entry.level),
            None => None,
        };
        if custom_best.is_some() {
            return custom_best;
        }

        // No usable custom entry point: fall back to the regular ones.
        self.entry_points
            .get_entry_point(|point_id| points_scorer.check_vector(point_id))
    }

    /// Searches the graph and returns at most `top` scored points.
    ///
    /// Descends greedily from the chosen entry point to level 0, then searches
    /// level 0 with `max(top, ef)` passed as the `ef` argument. Returns an empty
    /// vector when no entry point is available.
    pub fn search(
        &self,
        top: usize,
        ef: usize,
        mut points_scorer: FilteredScorer,
        custom_entry_points: Option<&[PointOffsetType]>,
    ) -> Vec<ScoredPointOffset> {
        let entry_point = match self.get_entry_point(&points_scorer, custom_entry_points) {
            Some(entry_point) => entry_point,
            None => return Vec::new(),
        };

        let zero_level_entry = self.search_entry(
            entry_point.point_id,
            entry_point.level,
            0,
            &mut points_scorer,
        );

        let level_zero_ef = max(top, ef);
        self.search_on_level(zero_level_entry, 0, level_zero_ef, &mut points_scorer)
            .into_iter()
            .take(top)
            .collect_vec()
    }

    /// Path of the graph file (`HNSW_GRAPH_FILE`) under `path`.
    pub fn get_path(path: &Path) -> PathBuf {
        let mut graph_path = path.to_path_buf();
        graph_path.push(HNSW_GRAPH_FILE);
        graph_path
    }

    /// Path of the links file (`HNSW_LINKS_FILE`) under `path`.
    pub fn get_links_path(path: &Path) -> PathBuf {
        let mut links_path = path.to_path_buf();
        links_path.push(HNSW_LINKS_FILE);
        links_path
    }

    /// Number of points reported by the link storage.
    pub fn num_points(&self) -> usize {
        self.links.num_points()
    }
}
/// Persistence of the graph parameters and entry points.
impl<TGraphLinks> GraphLayers<TGraphLinks>
where
    TGraphLinks: GraphLinks,
{
    /// Loads a graph from `graph_path` and `links_path`.
    ///
    /// The current format is tried first and requires `links_path` to exist.
    /// If that attempt fails for any reason, `graph_path` is re-read in the
    /// legacy layout. On success the legacy links are written to `links_path`
    /// and the graph file is re-saved in the current format. If the legacy read
    /// fails as well, the error from the first attempt is returned.
    ///
    /// # Errors
    /// Returns the first read error when neither format can be read, or any
    /// error raised while loading, converting or saving links and graph data.
    pub fn load(graph_path: &Path, links_path: &Path) -> OperationResult<Self> {
        let try_data: Result<GraphLayerData, FileStorageError> = if !links_path.exists() {
            Err(FileStorageError::generic(format!(
                "Links file does not exists: {links_path:?}"
            )))
        } else {
            read_bin(graph_path)
        };

        // Current format: return straight away, otherwise remember the error.
        let first_error = match try_data {
            Ok(data) => {
                let links = TGraphLinks::load_from_file(links_path)?;
                return Ok(Self {
                    m: data.m,
                    m0: data.m0,
                    ef_construct: data.ef_construct,
                    links,
                    entry_points: data.entry_points.into_owned(),
                    visited_pool: VisitedPool::new(),
                });
            }
            Err(err) => err,
        };

        // Legacy format: links live inside the graph file itself.
        let try_legacy: Result<GraphLayersBackwardCompatibility, _> = read_bin(graph_path);
        let Ok(legacy) = try_legacy else {
            return Err(first_error.into());
        };

        log::debug!("Converting legacy graph to new format");

        // Write the converted links first, then build the in-memory links from
        // the same converter.
        let mut converter = GraphLinksConverter::new(legacy.links_layers);
        converter.save_as(links_path)?;
        let links = TGraphLinks::from_converter(converter)?;

        let converted = Self {
            m: legacy.m,
            m0: legacy.m0,
            ef_construct: legacy.ef_construct,
            links,
            entry_points: legacy.entry_points,
            visited_pool: VisitedPool::new(),
        };
        // Overwrite the legacy graph file with the current format.
        converted.save(graph_path)?;
        Ok(converted)
    }

    /// Saves the graph parameters and entry points to `path` via `atomic_save_bin`.
    /// Links are not written by this method.
    pub fn save(&self, path: &Path) -> OperationResult<()> {
        atomic_save_bin(path, &self.data())?;
        Ok(())
    }

    /// Builds the serializable view of this graph, borrowing the entry points.
    fn data(&self) -> GraphLayerData {
        let Self {
            m,
            m0,
            ef_construct,
            entry_points,
            ..
        } = self;

        GraphLayerData {
            m: *m,
            m0: *m0,
            ef_construct: *ef_construct,
            entry_points: Cow::Borrowed(entry_points),
        }
    }
}
/// Operations available only when the links are backed by `GraphLinksMmap`.
impl GraphLayers<GraphLinksMmap> {
    /// Delegates to `GraphLinksMmap::prefault_mmap_pages` for `path` and returns
    /// its `PrefaultMmapPages` value unchanged.
    pub fn prefault_mmap_pages(&self, path: &Path) -> mmap_ops::PrefaultMmapPages {
        let mmap_links: &GraphLinksMmap = &self.links;
        mmap_links.prefault_mmap_pages(path)
    }
}
// Unit tests for single-level search, save/load round-tripping and search
// quality against a brute-force reference.
// NOTE: every test seeds `StdRng` with 42 and draws from it in a fixed order;
// reordering statements that use `rng` changes the generated data.
#[cfg(test)]
mod tests {
use std::fs::File;
use std::io::Write;
use itertools::Itertools;
use rand::rngs::StdRng;
use rand::SeedableRng;
use tempfile::Builder;
use super::*;
use crate::data_types::vectors::VectorElementType;
use crate::fixtures::index_fixtures::{
random_vector, FakeFilterContext, TestRawScorerProducer,
};
use crate::index::hnsw_index::graph_links::GraphLinksRam;
use crate::index::hnsw_index::tests::create_graph_layer_fixture;
use crate::spaces::metric::Metric;
use crate::spaces::simple::{CosineMetric, DotProductMetric};
use crate::vector_storage::chunked_vector_storage::VectorOffsetType;
/// Runs `graph.search` for `query` with `ef = 16`, a `FakeFilterContext` and no
/// custom entry points, and returns the resulting hits.
fn search_in_graph<TGraphLinks: GraphLinks>(
query: &[VectorElementType],
top: usize,
vector_storage: &TestRawScorerProducer<CosineMetric>,
graph: &GraphLayers<TGraphLinks>,
) -> Vec<ScoredPointOffset> {
let fake_filter_context = FakeFilterContext {};
let raw_scorer = vector_storage.get_raw_scorer(query.to_owned()).unwrap();
let scorer = FilteredScorer::new(raw_scorer.as_ref(), Some(&fake_filter_context));
let ef = 16;
let res = graph.search(top, ef, scorer, None);
// Take the hardware counter and discard its results.
raw_scorer.take_hardware_counter().discard_results();
res
}
/// Value passed to `create_graph_layer_fixture` as its second argument.
const M: usize = 8;
/// Searching level 0 from point 0 must return point 0 plus all of its links,
/// each scored against the query vector (the stored vector of point 7).
#[test]
fn test_search_on_level() {
let dim = 8;
let m = 8;
let ef_construct = 32;
let entry_points_num = 10;
let num_vectors = 10;
let mut rng = StdRng::seed_from_u64(42);
let vector_holder =
TestRawScorerProducer::<DotProductMetric>::new(dim, num_vectors, &mut rng);
// One (empty) level-0 link list per point; only point 0 gets links.
let mut graph_links = vec![vec![Vec::new()]; num_vectors];
graph_links[0][0] = vec![1, 2, 3, 4, 5, 6];
let graph_layers = GraphLayers {
m,
m0: 2 * m,
ef_construct,
links: GraphLinksRam::from_converter(GraphLinksConverter::new(graph_links.clone()))
.unwrap(),
entry_points: EntryPoints::new(entry_points_num),
visited_pool: VisitedPool::new(),
};
// The query is the stored vector of point 7.
let linking_idx: PointOffsetType = 7;
let fake_filter_context = FakeFilterContext {};
let added_vector = vector_holder
.vectors
.get(linking_idx as VectorOffsetType)
.to_vec();
let raw_scorer = vector_holder.get_raw_scorer(added_vector).unwrap();
let mut scorer = FilteredScorer::new(raw_scorer.as_ref(), Some(&fake_filter_context));
let nearest_on_level = graph_layers.search_on_level(
ScoredPointOffset {
idx: 0,
score: scorer.score_point(0),
},
0,
32,
&mut scorer,
);
// Six links of point 0 plus the entry point itself.
assert_eq!(nearest_on_level.len(), graph_links[0][0].len() + 1);
for nearest in &nearest_on_level {
// eprintln!("nearest = {:#?}", nearest);
assert_eq!(
nearest.score,
scorer.score_internal(linking_idx, nearest.idx)
)
}
raw_scorer.take_hardware_counter().discard_results();
}
/// A graph saved with `save` and read back with `load` must return the same
/// search results as the original.
#[test]
fn test_save_and_load() {
let num_vectors = 100;
let dim = 8;
let top = 5;
let mut rng = StdRng::seed_from_u64(42);
let dir = Builder::new().prefix("graph_dir").tempdir().unwrap();
let links_path = GraphLayers::<GraphLinksRam>::get_links_path(dir.path());
// The fixture receives `links_path`; `load` below requires that file to exist.
let (vector_holder, graph_layers) = create_graph_layer_fixture::<CosineMetric, _>(
num_vectors,
M,
dim,
false,
&mut rng,
Some(&links_path),
);
let query = random_vector(&mut rng, dim);
let res1 = search_in_graph(&query, top, &vector_holder, &graph_layers);
let path = GraphLayers::<GraphLinksRam>::get_path(dir.path());
graph_layers.save(&path).unwrap();
let graph2 = GraphLayers::<GraphLinksRam>::load(&path, &links_path).unwrap();
let res2 = search_in_graph(&query, top, &vector_holder, &graph2);
assert_eq!(res1, res2)
}
/// Checks structural properties of a built graph (entry point level, link
/// density on level 0) and compares graph search with a brute-force top-5.
#[test]
fn test_add_points() {
let num_vectors = 1000;
let dim = 8;
let mut rng = StdRng::seed_from_u64(42);
// `M` in type position is this alias; `M` in value position still refers to
// the module-level `const M` (types and values live in separate namespaces).
type M = CosineMetric;
let (vector_holder, graph_layers) =
create_graph_layer_fixture::<M, _>(num_vectors, M, dim, false, &mut rng, None);
let main_entry = graph_layers
.entry_points
.get_entry_point(|_x| true)
.expect("Expect entry point to exists");
assert!(main_entry.level > 0);
// The main entry point must sit on the highest level present in the graph.
let num_levels = (0..num_vectors)
.map(|i| graph_layers.links.point_level(i as PointOffsetType))
.max()
.unwrap();
assert_eq!(main_entry.level, num_levels);
let total_links_0 = (0..num_vectors)
.map(|i| graph_layers.links.links(i as PointOffsetType, 0).count())
.sum::<usize>();
eprintln!("total_links_0 = {total_links_0:#?}");
eprintln!("num_vectors = {num_vectors:#?}");
assert!(total_links_0 > 0);
// Average number of level-0 links per point must exceed the const `M`.
assert!(total_links_0 as f64 / num_vectors as f64 > M as f64);
let top = 5;
let query = random_vector(&mut rng, dim);
let processed_query = <M as Metric<VectorElementType>>::preprocess(query.clone());
// Brute-force reference: score every stored vector against the query.
let mut reference_top = FixedLengthPriorityQueue::new(top);
for idx in 0..vector_holder.vectors.len() as PointOffsetType {
let vec = &vector_holder.vectors.get(idx as VectorOffsetType);
reference_top.push(ScoredPointOffset {
idx,
score: M::similarity(vec, &processed_query),
});
}
let graph_search = search_in_graph(&query, top, &vector_holder, &graph_layers);
assert_eq!(reference_top.into_vec(), graph_search);
}
/// Ignored by default. Serializes `graph_layers.data()` and all vectors as JSON
/// and writes them to `graph.json` (relative path).
#[test]
#[ignore]
fn test_draw_hnsw_graph() {
let dim = 2;
let num_vectors = 500;
let mut rng = StdRng::seed_from_u64(42);
let (vector_holder, graph_layers) = create_graph_layer_fixture::<CosineMetric, _>(
num_vectors,
M,
dim,
true,
&mut rng,
None,
);
let graph_json = serde_json::to_string_pretty(&graph_layers.data()).unwrap();
let vectors_json = serde_json::to_string_pretty(
&(0..vector_holder.vectors.len() as PointOffsetType)
.map(|point_id| {
vector_holder
.vectors
.get(point_id as VectorOffsetType)
.to_vec()
})
.collect_vec(),
)
.unwrap();
let mut file = File::create("graph.json").unwrap();
file.write_all(
format!("{{ \"graph\": {graph_json}, \n \"vectors\": {vectors_json} }}").as_bytes(),
)
.unwrap();
}
}