File size: 11,866 Bytes
84d2a97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
use std::borrow::Cow;
use std::mem::size_of;
use std::path::{Path, PathBuf};
use std::sync::Arc;

use common::types::PointOffsetType;
use io::file_operations::{atomic_save_json, read_json};
use io::storage_version::StorageVersion;
use memmap2::{Mmap, MmapMut};
use memory::madvise::{Advice, AdviceSetting};
use memory::mmap_ops::{
    create_and_ensure_length, open_read_mmap, open_write_mmap, transmute_from_u8,
    transmute_from_u8_to_slice, transmute_to_u8, transmute_to_u8_slice,
};
use serde::{Deserialize, Serialize};

use super::INDEX_FILE_NAME;
use crate::common::sparse_vector::RemappedSparseVector;
use crate::common::types::{DimId, DimOffset};
use crate::index::inverted_index::inverted_index_ram::InvertedIndexRam;
use crate::index::inverted_index::InvertedIndex;
use crate::index::posting_list::PostingListIterator;
use crate::index::posting_list_common::PostingElementEx;

/// Size in bytes of one `PostingListFileHeader` record as stored in the index file.
const POSTING_HEADER_SIZE: usize = size_of::<PostingListFileHeader>();
/// Name of the JSON file, inside the index directory, that stores the `InvertedIndexFileHeader`.
const INDEX_CONFIG_FILE_NAME: &str = "inverted_index_config.json";

/// Storage version marker used as the `InvertedIndex::Version` of `InvertedIndexMmap`.
pub struct Version;

impl StorageVersion for Version {
    /// Returns the raw version string of this index implementation.
    fn current_raw() -> &'static str {
        const CURRENT_RAW_VERSION: &str = "0.1.0";
        CURRENT_RAW_VERSION
    }
}

/// Summary of an mmap inverted index, saved as JSON in the index config file
/// (`INDEX_CONFIG_FILE_NAME`) next to the index data file.
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct InvertedIndexFileHeader {
    /// Number of posting lists stored in the index file.
    pub posting_count: usize,
    /// Number of unique vectors indexed.
    pub vector_count: usize,
}

/// Inverted flatten index from dimension id to posting list.
///
/// The index data is read through a memory map of the index file; `remove` and `upsert`
/// are not supported on this read-only index.
#[derive(Debug)]
pub struct InvertedIndexMmap {
    /// Directory containing the index data file and the index config file.
    path: PathBuf,
    /// Read-only memory map of the index data file (posting headers followed by elements).
    mmap: Arc<Mmap>,
    /// Posting and vector counts, as stored in the index config file.
    pub file_header: InvertedIndexFileHeader,
}

/// Fixed-size record locating one posting list inside the index file.
///
/// Records are stored back to back at the start of the file (record `i` starts at byte
/// `i * POSTING_HEADER_SIZE`) and are written and read by reinterpreting raw bytes.
/// `#[repr(C)]` pins the field order and offsets so that this on-disk layout does not
/// depend on the compiler's unspecified default struct layout.
#[repr(C)]
#[derive(Debug, Default, Clone)]
struct PostingListFileHeader {
    /// Byte offset in the index file at which the posting list's elements begin.
    pub start_offset: u64,
    /// Byte offset in the index file at which the posting list's elements end (exclusive).
    pub end_offset: u64,
}

impl InvertedIndex for InvertedIndexMmap {
    type Iter<'a> = PostingListIterator<'a>;

    type Version = Version;

    /// Opens the index stored in the directory `path`; delegates to [`InvertedIndexMmap::load`].
    fn open(path: &Path) -> std::io::Result<Self> {
        Self::load(path)
    }

    /// Checks (in debug builds only) that the index files are already on disk.
    ///
    /// Nothing is written here and the result is always `Ok(())`.
    fn save(&self, path: &Path) -> std::io::Result<()> {
        debug_assert_eq!(path, self.path);

        // If Self instance exists, it's either constructed by using `open()` (which reads index
        // files), or using `from_ram_index()` (which writes them). Both assume that the files
        // exist. If any of the files are missing, then something went wrong.
        //
        // The whole check lives inside `debug_assert!` so that release builds do not build
        // the file list just to throw it away.
        debug_assert!(
            Self::files(path).iter().all(|file| file.exists()),
            "inverted index files are missing from {}",
            path.display(),
        );

        Ok(())
    }

    /// Returns an iterator over the posting list of dimension `id`, or `None` if `id` is
    /// not covered by the index.
    fn get(&self, id: &DimId) -> Option<PostingListIterator<'_>> {
        // `self.get` resolves to the inherent `InvertedIndexMmap::get`, which yields the raw
        // element slice; it is wrapped into an iterator here.
        self.get(id).map(PostingListIterator::new)
    }

    /// Number of posting lists in the index.
    fn len(&self) -> usize {
        self.file_header.posting_count
    }

    /// Number of elements in the posting list of dimension `id`, or `None` if `id` is not
    /// covered by the index.
    fn posting_list_len(&self, id: &DimOffset) -> Option<usize> {
        self.get(id).map(|posting_list| posting_list.len())
    }

    /// Paths of the files that make up the index stored in the directory `path`.
    fn files(path: &Path) -> Vec<PathBuf> {
        vec![
            Self::index_file_path(path),
            Self::index_config_file_path(path),
        ]
    }

    /// Not supported.
    ///
    /// # Panics
    ///
    /// Always panics.
    fn remove(&mut self, _id: PointOffsetType, _old_vector: RemappedSparseVector) {
        panic!("Cannot remove from a read-only Mmap inverted index")
    }

    /// Not supported.
    ///
    /// # Panics
    ///
    /// Always panics.
    fn upsert(
        &mut self,
        _id: PointOffsetType,
        _vector: RemappedSparseVector,
        _old_vector: Option<RemappedSparseVector>,
    ) {
        panic!("Cannot upsert into a read-only Mmap inverted index")
    }

    /// Writes `ram_index` to the directory `path` and returns the mmap index over the
    /// written files; delegates to [`InvertedIndexMmap::convert_and_save`].
    fn from_ram_index<P: AsRef<Path>>(
        ram_index: Cow<InvertedIndexRam>,
        path: P,
    ) -> std::io::Result<Self> {
        Self::convert_and_save(&ram_index, path)
    }

    /// Number of unique vectors indexed.
    fn vector_count(&self) -> usize {
        self.file_header.vector_count
    }

    /// Largest dimension id covered by the index, or `None` when it holds no posting list.
    fn max_index(&self) -> Option<DimId> {
        // Subtract before narrowing so that a count whose low bits are all zero cannot
        // underflow after the cast.
        self.file_header
            .posting_count
            .checked_sub(1)
            .map(|last| last as DimId)
    }
}

impl InvertedIndexMmap {
    /// Path of the index data file inside the index directory `path`.
    pub fn index_file_path(path: &Path) -> PathBuf {
        path.join(INDEX_FILE_NAME)
    }

    /// Path of the JSON config file inside the index directory `path`.
    pub fn index_config_file_path(path: &Path) -> PathBuf {
        path.join(INDEX_CONFIG_FILE_NAME)
    }

    /// Returns the elements of the posting list of dimension `id`, borrowed from the
    /// memory map, or `None` if `id` is not below the stored posting count.
    ///
    /// # Panics
    ///
    /// Panics if the header record of `id`, or the element range it points to, lies
    /// outside the mapped file.
    pub fn get(&self, id: &DimId) -> Option<&[PostingElementEx]> {
        // check that the id is not out of bounds (posting_count includes the empty zeroth entry)
        if *id >= self.file_header.posting_count as DimId {
            return None;
        }

        // Header records sit back to back at the start of the file, one per dimension id.
        let header_start = *id as usize * POSTING_HEADER_SIZE;
        let header_bytes = &self.mmap[header_start..header_start + POSTING_HEADER_SIZE];
        let header = transmute_from_u8::<PostingListFileHeader>(header_bytes);

        // The header holds absolute byte offsets of the list's elements within the file.
        let elements_start = header.start_offset as usize;
        let elements_end = header.end_offset as usize;
        Some(transmute_from_u8_to_slice(
            &self.mmap[elements_start..elements_end],
        ))
    }

    /// Writes `inverted_index_ram` into the directory `path` and returns an mmap index
    /// over the freshly written file.
    ///
    /// The data file holds all posting headers followed by all posting elements. The JSON
    /// config file is written last, after the data file has been flushed.
    ///
    /// # Errors
    ///
    /// Returns an error if the data file cannot be created, mapped or flushed, or if the
    /// config file cannot be saved.
    pub fn convert_and_save<P: AsRef<Path>>(
        inverted_index_ram: &InvertedIndexRam,
        path: P,
    ) -> std::io::Result<Self> {
        let path = path.as_ref();

        let total_posting_headers_size = Self::total_posting_headers_size(inverted_index_ram);
        let total_posting_elements_size = Self::total_posting_elements_size(inverted_index_ram);
        let file_length = total_posting_headers_size + total_posting_elements_size;

        let file_path = Self::index_file_path(path);
        create_and_ensure_length(file_path.as_ref(), file_length)?;

        let mut mmap = open_write_mmap(
            file_path.as_ref(),
            AdviceSetting::from(Advice::Normal),
            false,
        )?;

        // Fill the file: headers first, then the elements they point to.
        Self::save_posting_headers(&mut mmap, inverted_index_ram, total_posting_headers_size);
        Self::save_posting_elements(&mut mmap, inverted_index_ram, total_posting_headers_size);
        if file_length > 0 {
            mmap.flush()?;
        }

        // Persist the counts needed to interpret the data file.
        let file_header = InvertedIndexFileHeader {
            posting_count: inverted_index_ram.postings.len(),
            vector_count: inverted_index_ram.vector_count(),
        };
        let config_file_path = Self::index_config_file_path(path);
        atomic_save_json(&config_file_path, &file_header)?;

        Ok(Self {
            path: path.to_owned(),
            mmap: Arc::new(mmap.make_read_only()?),
            file_header,
        })
    }

    /// Opens the index stored in the directory `path`.
    ///
    /// # Errors
    ///
    /// Returns an error if the config file cannot be read, if the data file cannot be
    /// mapped, or (`InvalidData`) if the data file is too short to contain the number of
    /// posting headers announced by the config file.
    pub fn load<P: AsRef<Path>>(path: P) -> std::io::Result<Self> {
        let path = path.as_ref();

        // read index config file
        let config_file_path = Self::index_config_file_path(path);
        // if the file header does not exist, the index is malformed
        let file_header: InvertedIndexFileHeader = read_json(&config_file_path)?;

        // read index data into mmap
        let file_path = Self::index_file_path(path);
        let mmap = open_read_mmap(
            file_path.as_ref(),
            AdviceSetting::from(Advice::Normal),
            false,
        )?;

        // Reject a data file that cannot hold the announced headers up front, instead of
        // panicking on an out-of-range slice the first time such a header is looked up.
        let headers_fit = file_header
            .posting_count
            .checked_mul(POSTING_HEADER_SIZE)
            .map_or(false, |headers_size| headers_size <= mmap.len());
        if !headers_fit {
            return Err(std::io::Error::new(
                std::io::ErrorKind::InvalidData,
                format!(
                    "inverted index file {} is too short ({} bytes) to hold {} posting list headers",
                    file_path.display(),
                    mmap.len(),
                    file_header.posting_count,
                ),
            ));
        }

        Ok(Self {
            path: path.to_owned(),
            mmap: Arc::new(mmap),
            file_header,
        })
    }

    /// Total size in bytes of the header section: one header per posting list.
    fn total_posting_headers_size(inverted_index_ram: &InvertedIndexRam) -> usize {
        inverted_index_ram.postings.len() * POSTING_HEADER_SIZE
    }

    /// Total size in bytes of the elements of all posting lists.
    fn total_posting_elements_size(inverted_index_ram: &InvertedIndexRam) -> usize {
        inverted_index_ram
            .postings
            .iter()
            .map(|posting| posting.elements.len() * size_of::<PostingElementEx>())
            .sum()
    }

    /// Writes one header per posting list at the start of `mmap`.
    ///
    /// Element ranges are assigned consecutively, starting right after the header section
    /// (`total_posting_headers_size`), in posting list order.
    fn save_posting_headers(
        mmap: &mut MmapMut,
        inverted_index_ram: &InvertedIndexRam,
        total_posting_headers_size: usize,
    ) {
        let mut elements_offset: usize = total_posting_headers_size;
        for (id, posting) in inverted_index_ram.postings.iter().enumerate() {
            let elements_end =
                elements_offset + posting.elements.len() * size_of::<PostingElementEx>();
            let posting_header = PostingListFileHeader {
                start_offset: elements_offset as u64,
                end_offset: elements_end as u64,
            };

            // save posting header into its slot
            let header_start = id * POSTING_HEADER_SIZE;
            mmap[header_start..header_start + POSTING_HEADER_SIZE]
                .copy_from_slice(transmute_to_u8(&posting_header));

            // the next posting list starts where this one ends
            elements_offset = elements_end;
        }
    }

    /// Writes the elements of every posting list back to back into `mmap`, starting right
    /// after the header section (`total_posting_headers_size`).
    fn save_posting_elements(
        mmap: &mut MmapMut,
        inverted_index_ram: &InvertedIndexRam,
        total_posting_headers_size: usize,
    ) {
        let mut offset = total_posting_headers_size;
        for posting in &inverted_index_ram.postings {
            // save posting elements as raw bytes
            let posting_elements_bytes = transmute_to_u8_slice(&posting.elements);
            let end = offset + posting_elements_bytes.len();
            mmap[offset..end].copy_from_slice(posting_elements_bytes);
            offset = end;
        }
    }
}

#[cfg(test)]
mod tests {
    use tempfile::Builder;

    use super::*;
    use crate::index::inverted_index::inverted_index_ram_builder::InvertedIndexBuilder;

    /// Asserts that every posting list of the RAM index is stored element for element in
    /// the mmap index.
    fn compare_indexes(
        inverted_index_ram: &InvertedIndexRam,
        inverted_index_mmap: &InvertedIndexMmap,
    ) {
        let posting_count = inverted_index_ram.postings.len() as DimId;
        for id in 0..posting_count {
            let expected = inverted_index_ram.get(&id).unwrap().elements.as_slice();
            let actual = inverted_index_mmap.get(&id).unwrap();
            assert_eq!(expected.len(), actual.len());
            for (ram_element, mmap_element) in expected.iter().zip(actual) {
                assert_eq!(ram_element, mmap_element);
            }
        }
    }

    #[test]
    fn test_inverted_index_mmap() {
        // Build a RAM index in which dimension 4 is never used.
        let mut builder = InvertedIndexBuilder::new();
        builder.add(1, [(1, 10.0), (2, 10.0), (3, 10.0), (5, 10.0)].into());
        builder.add(2, [(1, 20.0), (2, 20.0), (3, 20.0), (5, 20.0)].into());
        builder.add(3, [(1, 30.0), (2, 30.0), (3, 30.0)].into());
        builder.add(4, [(1, 1.0), (2, 1.0)].into());
        builder.add(5, [(1, 2.0)].into());
        builder.add(6, [(1, 3.0)].into());
        builder.add(7, [(1, 4.0)].into());
        builder.add(8, [(1, 5.0)].into());
        builder.add(9, [(1, 6.0)].into());
        let inverted_index_ram = builder.build();

        let tmp_dir_path = Builder::new().prefix("test_index_dir").tempdir().unwrap();

        // The index returned right after conversion must match the RAM index; it is
        // dropped at the end of this scope, before the files are loaded again.
        {
            let converted =
                InvertedIndexMmap::convert_and_save(&inverted_index_ram, &tmp_dir_path).unwrap();
            compare_indexes(&inverted_index_ram, &converted);
        }

        // Loading the files back must yield the same content.
        let inverted_index_mmap = InvertedIndexMmap::load(&tmp_dir_path).unwrap();
        // posting_count: 0th entry is always empty + 1st + 2nd + 3rd + 4th empty + 5th
        assert_eq!(inverted_index_mmap.file_header.posting_count, 6);
        assert_eq!(inverted_index_mmap.file_header.vector_count, 9);

        compare_indexes(&inverted_index_ram, &inverted_index_mmap);

        // Expected posting list length per dimension id. The 0th entry is always empty as
        // dimension ids start at 1, and the unused 4th dimension yields an empty list.
        for (dim, expected_len) in [(0, 0), (1, 9), (2, 4), (3, 3), (4, 0), (5, 2)] {
            assert_eq!(inverted_index_mmap.get(&dim).unwrap().len(), expected_len);
        }

        // Ids after the last stored dimension yield None.
        for missing in [6, 7, 100] {
            assert!(inverted_index_mmap.get(&missing).is_none());
        }
    }
}