use std::fmt::Debug; use std::path::Path; use std::sync::Arc; use parking_lot::RwLock; //use atomic_refcell::{AtomicRef, AtomicRefCell}; use rocksdb::{ColumnFamily, DBRecoveryMode, LogLevel, Options, WriteOptions, DB}; //use crate::common::arc_rwlock_iterator::ArcRwLockIterator; use crate::common::operation_error::{OperationError, OperationResult}; use crate::common::Flusher; const DB_CACHE_SIZE: usize = 10 * 1024 * 1024; // 10 mb const DB_MAX_LOG_SIZE: usize = 1024 * 1024; // 1 mb const DB_MAX_OPEN_FILES: usize = 256; const DB_DELETE_OBSOLETE_FILES_PERIOD: u64 = 3 * 60 * 1_000_000; // 3 minutes in microseconds pub const DB_VECTOR_CF: &str = "vector"; pub const DB_PAYLOAD_CF: &str = "payload"; pub const DB_MAPPING_CF: &str = "mapping"; pub const DB_VERSIONS_CF: &str = "version"; /// If there is no Column Family specified, key-value pair is associated with Column Family "default". pub const DB_DEFAULT_CF: &str = "default"; #[derive(Clone)] pub struct DatabaseColumnWrapper { database: Arc>, column_name: String, write_options: Arc, db_options: Arc, } impl Debug for DatabaseColumnWrapper { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("DatabaseColumnWrapper") .field("column_name", &self.column_name) .finish() } } pub struct DatabaseColumnIterator<'a> { pub handle: &'a ColumnFamily, pub iter: rocksdb::DBRawIterator<'a>, } pub struct LockedDatabaseColumnWrapper<'a> { guard: parking_lot::RwLockReadGuard<'a, DB>, column_name: &'a str, } /// RocksDB options (both global and for column families) pub fn make_db_options() -> Options { let mut options: Options = Options::default(); options.set_write_buffer_size(DB_CACHE_SIZE); // write_buffer_size is enforced per column family. options.create_if_missing(true); options.set_log_level(LogLevel::Error); options.set_recycle_log_file_num(1); options.set_keep_log_file_num(1); // must be greater than zero options.set_max_log_file_size(DB_MAX_LOG_SIZE); options.set_delete_obsolete_files_period_micros(DB_DELETE_OBSOLETE_FILES_PERIOD); options.create_missing_column_families(true); options.set_max_open_files(DB_MAX_OPEN_FILES as i32); options.set_compression_type(rocksdb::DBCompressionType::Lz4); // Qdrant relies on it's own WAL for durability options.set_wal_recovery_mode(DBRecoveryMode::TolerateCorruptedTailRecords); #[cfg(debug_assertions)] { options.set_paranoid_checks(true); } options } pub fn open_db>( path: &Path, vector_paths: &[T], ) -> Result>, rocksdb::Error> { let mut column_families = vec![DB_PAYLOAD_CF, DB_MAPPING_CF, DB_VERSIONS_CF, DB_DEFAULT_CF]; for vector_path in vector_paths { column_families.push(vector_path.as_ref()); } let options = make_db_options(); // Make sure that all column families have the same options let column_with_options = column_families .into_iter() .map(|cf| (cf, options.clone())) .collect::>(); let db = DB::open_cf_with_opts(&options, path, column_with_options)?; Ok(Arc::new(RwLock::new(db))) } pub fn check_db_exists(path: &Path) -> bool { let db_file = path.join("CURRENT"); db_file.exists() } pub fn open_db_with_existing_cf(path: &Path) -> Result>, rocksdb::Error> { let options = make_db_options(); let existing_column_families = if check_db_exists(path) { DB::list_cf(&options, path)? } else { vec![] }; // Make sure that all column families have the same options let column_with_options = existing_column_families .into_iter() .map(|cf| (cf, options.clone())) .collect::>(); let db = DB::open_cf_with_opts(&options, path, column_with_options)?; Ok(Arc::new(RwLock::new(db))) } impl DatabaseColumnWrapper { pub fn new(database: Arc>, column_name: &str) -> Self { let write_options = Arc::new(Self::make_write_options()); let db_options = Arc::new(make_db_options()); Self { database, column_name: column_name.to_string(), write_options, db_options, } } pub fn put(&self, key: K, value: V) -> OperationResult<()> where K: AsRef<[u8]>, V: AsRef<[u8]>, { let db = self.database.read(); let cf_handle = self.get_column_family(&db)?; db.put_cf_opt(cf_handle, key, value, &self.write_options) .map_err(|err| OperationError::service_error(format!("RocksDB put_cf error: {err}")))?; Ok(()) } pub fn get(&self, key: K) -> OperationResult> where K: AsRef<[u8]>, { let db = self.database.read(); let cf_handle = self.get_column_family(&db)?; db.get_cf(cf_handle, key) .map_err(|err| OperationError::service_error(format!("RocksDB get_cf error: {err}")))? .ok_or_else(|| OperationError::service_error("RocksDB get_cf error: key not found")) } pub fn get_opt(&self, key: K) -> OperationResult>> where K: AsRef<[u8]>, { let db = self.database.read(); let cf_handle = self.get_column_family(&db)?; db.get_cf(cf_handle, key) .map_err(|err| OperationError::service_error(format!("RocksDB get_cf error: {err}"))) } pub fn get_pinned(&self, key: &[u8], f: F) -> OperationResult> where F: FnOnce(&[u8]) -> T, { let db = self.database.read(); let cf_handle = self.get_column_family(&db)?; let result = db .get_pinned_cf(cf_handle, key) .map_err(|err| { OperationError::service_error(format!("RocksDB get_pinned_cf error: {err}")) })? .map(|value| f(&value)); Ok(result) } pub fn remove(&self, key: K) -> OperationResult<()> where K: AsRef<[u8]>, { let db = self.database.read(); let cf_handle = self.get_column_family(&db)?; db.delete_cf_opt(cf_handle, key, &self.write_options) .map_err(|err| { OperationError::service_error(format!("RocksDB delete_cf error: {err}")) })?; Ok(()) } pub fn lock_db(&self) -> LockedDatabaseColumnWrapper { LockedDatabaseColumnWrapper { guard: self.database.read(), column_name: &self.column_name, } } pub fn flusher(&self) -> Flusher { let database = self.database.clone(); let column_name = self.column_name.clone(); Box::new(move || { let db = database.read(); let Some(column_family) = db.cf_handle(&column_name) else { // It is possible, that the index was removed during the flush by user or another thread. // In this case, non-existing column family is not an error, but an expected behavior. // Still we want to log this event, for potential debugging. log::warn!( "Flush: RocksDB cf_handle error: Cannot find column family {}. Ignoring", &column_name ); return Ok(()); // ignore error }; db.flush_cf(column_family).map_err(|err| { OperationError::service_error(format!("RocksDB flush_cf error: {err}")) })?; Ok(()) }) } pub fn create_column_family_if_not_exists(&self) -> OperationResult<()> { let mut db = self.database.write(); if db.cf_handle(&self.column_name).is_none() { db.create_cf(&self.column_name, &self.db_options) .map_err(|err| { OperationError::service_error(format!("RocksDB create_cf error: {err}")) })?; } Ok(()) } pub fn recreate_column_family(&self) -> OperationResult<()> { self.remove_column_family()?; self.create_column_family_if_not_exists() } pub fn remove_column_family(&self) -> OperationResult<()> { let mut db = self.database.write(); if db.cf_handle(&self.column_name).is_some() { db.drop_cf(&self.column_name).map_err(|err| { OperationError::service_error(format!("RocksDB drop_cf error: {err}")) })?; } Ok(()) } pub fn has_column_family(&self) -> OperationResult { let db = self.database.read(); Ok(db.cf_handle(&self.column_name).is_some()) } fn make_write_options() -> WriteOptions { let mut write_options = WriteOptions::default(); write_options.set_sync(false); // RocksDB WAL is required for durability even if data is flushed write_options.disable_wal(false); write_options } fn get_column_family<'a>( &self, db: &'a parking_lot::RwLockReadGuard<'_, DB>, ) -> OperationResult<&'a ColumnFamily> { db.cf_handle(&self.column_name).ok_or_else(|| { OperationError::service_error(format!( "RocksDB cf_handle error: Cannot find column family {}", &self.column_name )) }) } pub fn get_database(&self) -> Arc> { self.database.clone() } pub fn get_column_name(&self) -> &str { &self.column_name } } impl<'a> LockedDatabaseColumnWrapper<'a> { pub fn iter(&self) -> OperationResult { DatabaseColumnIterator::new(&self.guard, self.column_name) } } impl<'a> DatabaseColumnIterator<'a> { pub fn new(db: &'a DB, column_name: &str) -> OperationResult> { let handle = db.cf_handle(column_name).ok_or_else(|| { OperationError::service_error(format!( "RocksDB cf_handle error: Cannot find column family {column_name}" )) })?; let mut iter = db.raw_iterator_cf(&handle); iter.seek_to_first(); Ok(DatabaseColumnIterator { handle, iter }) } } impl<'a> Iterator for DatabaseColumnIterator<'a> { type Item = (Box<[u8]>, Box<[u8]>); fn next(&mut self) -> Option { // Stop if iterator has ended or errored if !self.iter.valid() { return None; } let item = ( Box::from(self.iter.key().unwrap()), Box::from(self.iter.value().unwrap()), ); // Search to next item for next iteration self.iter.next(); Some(item) } }