use std::borrow::Cow; use std::mem::size_of; use std::path::{Path, PathBuf}; use std::sync::Arc; use common::types::PointOffsetType; use io::file_operations::{atomic_save_json, read_json}; use io::storage_version::StorageVersion; use memmap2::{Mmap, MmapMut}; use memory::madvise::{Advice, AdviceSetting}; use memory::mmap_ops::{ create_and_ensure_length, open_read_mmap, open_write_mmap, transmute_from_u8, transmute_from_u8_to_slice, transmute_to_u8, transmute_to_u8_slice, }; use serde::{Deserialize, Serialize}; use super::INDEX_FILE_NAME; use crate::common::sparse_vector::RemappedSparseVector; use crate::common::types::{DimId, DimOffset}; use crate::index::inverted_index::inverted_index_ram::InvertedIndexRam; use crate::index::inverted_index::InvertedIndex; use crate::index::posting_list::PostingListIterator; use crate::index::posting_list_common::PostingElementEx; const POSTING_HEADER_SIZE: usize = size_of::(); const INDEX_CONFIG_FILE_NAME: &str = "inverted_index_config.json"; pub struct Version; impl StorageVersion for Version { fn current_raw() -> &'static str { "0.1.0" } } #[derive(Debug, Default, Clone, Serialize, Deserialize)] pub struct InvertedIndexFileHeader { pub posting_count: usize, // number oof posting lists pub vector_count: usize, // number of unique vectors indexed } /// Inverted flatten index from dimension id to posting list #[derive(Debug)] pub struct InvertedIndexMmap { path: PathBuf, mmap: Arc, pub file_header: InvertedIndexFileHeader, } #[derive(Debug, Default, Clone)] struct PostingListFileHeader { pub start_offset: u64, pub end_offset: u64, } impl InvertedIndex for InvertedIndexMmap { type Iter<'a> = PostingListIterator<'a>; type Version = Version; fn open(path: &Path) -> std::io::Result { Self::load(path) } fn save(&self, path: &Path) -> std::io::Result<()> { debug_assert_eq!(path, self.path); // If Self instance exists, it's either constructed by using `open()` (which reads index // files), or using `from_ram_index()` (which writes them). Both assume that the files // exist. If any of the files are missing, then something went wrong. for file in Self::files(path) { debug_assert!(file.exists()); } Ok(()) } fn get(&self, id: &DimId) -> Option { self.get(id).map(PostingListIterator::new) } fn len(&self) -> usize { self.file_header.posting_count } fn posting_list_len(&self, id: &DimOffset) -> Option { self.get(id).map(|posting_list| posting_list.len()) } fn files(path: &Path) -> Vec { vec![ Self::index_file_path(path), Self::index_config_file_path(path), ] } fn remove(&mut self, _id: PointOffsetType, _old_vector: RemappedSparseVector) { panic!("Cannot remove from a read-only Mmap inverted index") } fn upsert( &mut self, _id: PointOffsetType, _vector: RemappedSparseVector, _old_vector: Option, ) { panic!("Cannot upsert into a read-only Mmap inverted index") } fn from_ram_index>( ram_index: Cow, path: P, ) -> std::io::Result { Self::convert_and_save(&ram_index, path) } fn vector_count(&self) -> usize { self.file_header.vector_count } fn max_index(&self) -> Option { match self.file_header.posting_count { 0 => None, len => Some(len as DimId - 1), } } } impl InvertedIndexMmap { pub fn index_file_path(path: &Path) -> PathBuf { path.join(INDEX_FILE_NAME) } pub fn index_config_file_path(path: &Path) -> PathBuf { path.join(INDEX_CONFIG_FILE_NAME) } pub fn get(&self, id: &DimId) -> Option<&[PostingElementEx]> { // check that the id is not out of bounds (posting_count includes the empty zeroth entry) if *id >= self.file_header.posting_count as DimId { return None; } let header_start = *id as usize * POSTING_HEADER_SIZE; let header = transmute_from_u8::( &self.mmap[header_start..header_start + POSTING_HEADER_SIZE], ) .clone(); let elements_bytes = &self.mmap[header.start_offset as usize..header.end_offset as usize]; Some(transmute_from_u8_to_slice(elements_bytes)) } pub fn convert_and_save>( inverted_index_ram: &InvertedIndexRam, path: P, ) -> std::io::Result { let total_posting_headers_size = Self::total_posting_headers_size(inverted_index_ram); let total_posting_elements_size = Self::total_posting_elements_size(inverted_index_ram); let file_length = total_posting_headers_size + total_posting_elements_size; let file_path = Self::index_file_path(path.as_ref()); create_and_ensure_length(file_path.as_ref(), file_length)?; let mut mmap = open_write_mmap( file_path.as_ref(), AdviceSetting::from(Advice::Normal), false, )?; // file index data Self::save_posting_headers(&mut mmap, inverted_index_ram, total_posting_headers_size); Self::save_posting_elements(&mut mmap, inverted_index_ram, total_posting_headers_size); if file_length > 0 { mmap.flush()?; } // save header properties let posting_count = inverted_index_ram.postings.len(); let vector_count = inverted_index_ram.vector_count(); // finalize data with index file. let file_header = InvertedIndexFileHeader { posting_count, vector_count, }; let config_file_path = Self::index_config_file_path(path.as_ref()); atomic_save_json(&config_file_path, &file_header)?; Ok(Self { path: path.as_ref().to_owned(), mmap: Arc::new(mmap.make_read_only()?), file_header, }) } pub fn load>(path: P) -> std::io::Result { // read index config file let config_file_path = Self::index_config_file_path(path.as_ref()); // if the file header does not exist, the index is malformed let file_header: InvertedIndexFileHeader = read_json(&config_file_path)?; // read index data into mmap let file_path = Self::index_file_path(path.as_ref()); let mmap = open_read_mmap( file_path.as_ref(), AdviceSetting::from(Advice::Normal), false, )?; Ok(Self { path: path.as_ref().to_owned(), mmap: Arc::new(mmap), file_header, }) } fn total_posting_headers_size(inverted_index_ram: &InvertedIndexRam) -> usize { inverted_index_ram.postings.len() * POSTING_HEADER_SIZE } fn total_posting_elements_size(inverted_index_ram: &InvertedIndexRam) -> usize { let mut total_posting_elements_size = 0; for posting in &inverted_index_ram.postings { total_posting_elements_size += posting.elements.len() * size_of::(); } total_posting_elements_size } fn save_posting_headers( mmap: &mut MmapMut, inverted_index_ram: &InvertedIndexRam, total_posting_headers_size: usize, ) { let mut elements_offset: usize = total_posting_headers_size; for (id, posting) in inverted_index_ram.postings.iter().enumerate() { let posting_elements_size = posting.elements.len() * size_of::(); let posting_header = PostingListFileHeader { start_offset: elements_offset as u64, end_offset: (elements_offset + posting_elements_size) as u64, }; elements_offset = posting_header.end_offset as usize; // save posting header let posting_header_bytes = transmute_to_u8(&posting_header); let start_posting_offset = id * POSTING_HEADER_SIZE; let end_posting_offset = (id + 1) * POSTING_HEADER_SIZE; mmap[start_posting_offset..end_posting_offset].copy_from_slice(posting_header_bytes); } } fn save_posting_elements( mmap: &mut MmapMut, inverted_index_ram: &InvertedIndexRam, total_posting_headers_size: usize, ) { let mut offset = total_posting_headers_size; for posting in &inverted_index_ram.postings { // save posting element let posting_elements_bytes = transmute_to_u8_slice(&posting.elements); mmap[offset..offset + posting_elements_bytes.len()] .copy_from_slice(posting_elements_bytes); offset += posting_elements_bytes.len(); } } } #[cfg(test)] mod tests { use tempfile::Builder; use super::*; use crate::index::inverted_index::inverted_index_ram_builder::InvertedIndexBuilder; fn compare_indexes( inverted_index_ram: &InvertedIndexRam, inverted_index_mmap: &InvertedIndexMmap, ) { for id in 0..inverted_index_ram.postings.len() as DimId { let posting_list_ram = inverted_index_ram.get(&id).unwrap().elements.as_slice(); let posting_list_mmap = inverted_index_mmap.get(&id).unwrap(); assert_eq!(posting_list_ram.len(), posting_list_mmap.len()); for i in 0..posting_list_ram.len() { assert_eq!(posting_list_ram[i], posting_list_mmap[i]); } } } #[test] fn test_inverted_index_mmap() { // skip 4th dimension let mut builder = InvertedIndexBuilder::new(); builder.add(1, [(1, 10.0), (2, 10.0), (3, 10.0), (5, 10.0)].into()); builder.add(2, [(1, 20.0), (2, 20.0), (3, 20.0), (5, 20.0)].into()); builder.add(3, [(1, 30.0), (2, 30.0), (3, 30.0)].into()); builder.add(4, [(1, 1.0), (2, 1.0)].into()); builder.add(5, [(1, 2.0)].into()); builder.add(6, [(1, 3.0)].into()); builder.add(7, [(1, 4.0)].into()); builder.add(8, [(1, 5.0)].into()); builder.add(9, [(1, 6.0)].into()); let inverted_index_ram = builder.build(); let tmp_dir_path = Builder::new().prefix("test_index_dir").tempdir().unwrap(); { let inverted_index_mmap = InvertedIndexMmap::convert_and_save(&inverted_index_ram, &tmp_dir_path).unwrap(); compare_indexes(&inverted_index_ram, &inverted_index_mmap); } let inverted_index_mmap = InvertedIndexMmap::load(&tmp_dir_path).unwrap(); // posting_count: 0th entry is always empty + 1st + 2nd + 3rd + 4th empty + 5th assert_eq!(inverted_index_mmap.file_header.posting_count, 6); assert_eq!(inverted_index_mmap.file_header.vector_count, 9); compare_indexes(&inverted_index_ram, &inverted_index_mmap); assert!(inverted_index_mmap.get(&0).unwrap().is_empty()); // the first entry is always empty as dimension ids start at 1 assert_eq!(inverted_index_mmap.get(&1).unwrap().len(), 9); assert_eq!(inverted_index_mmap.get(&2).unwrap().len(), 4); assert_eq!(inverted_index_mmap.get(&3).unwrap().len(), 3); assert!(inverted_index_mmap.get(&4).unwrap().is_empty()); // return empty posting list info for intermediary empty ids assert_eq!(inverted_index_mmap.get(&5).unwrap().len(), 2); // index after the last values are None assert!(inverted_index_mmap.get(&6).is_none()); assert!(inverted_index_mmap.get(&7).is_none()); assert!(inverted_index_mmap.get(&100).is_none()); } }