Spaces:
Build error
Build error
use std::collections::{BTreeMap, HashMap, HashSet}; | |
use std::fs::File; | |
use std::io::{Read, Write}; | |
use std::num::NonZeroU32; | |
use std::path::Path; | |
use atomicwrites::AtomicFile; | |
use atomicwrites::OverwriteBehavior::AllowOverwrite; | |
use schemars::JsonSchema; | |
use segment::common::anonymize::Anonymize; | |
use segment::data_types::vectors::DEFAULT_VECTOR_NAME; | |
use segment::index::sparse_index::sparse_index_config::{SparseIndexConfig, SparseIndexType}; | |
use segment::types::{ | |
default_replication_factor_const, default_shard_number_const, | |
default_write_consistency_factor_const, Distance, HnswConfig, Indexes, PayloadStorageType, | |
QuantizationConfig, SparseVectorDataConfig, StrictModeConfig, VectorDataConfig, | |
VectorStorageDatatype, VectorStorageType, | |
}; | |
use serde::{Deserialize, Serialize}; | |
use uuid::Uuid; | |
use validator::Validate; | |
use wal::WalOptions; | |
use crate::operations::config_diff::{DiffConfig, QuantizationConfigDiff}; | |
use crate::operations::types::{ | |
CollectionError, CollectionResult, SparseVectorParams, SparseVectorsConfig, VectorParams, | |
VectorParamsDiff, VectorsConfig, VectorsConfigDiff, | |
}; | |
use crate::operations::validation; | |
use crate::optimizers_builder::OptimizersConfig; | |
/// Name of the file, inside a collection directory, that holds the collection configuration.
pub const COLLECTION_CONFIG_FILE: &str = "config.json";
/// Sizing settings for the write-ahead log (WAL) of a collection.
pub struct WalConfig {
    /// Capacity of one WAL segment, in megabytes.
    pub wal_capacity_mb: usize,
    /// How many WAL segments are created ahead of the ones actually in use.
    pub wal_segments_ahead: usize,
}
impl From<&WalConfig> for WalOptions { | |
fn from(config: &WalConfig) -> Self { | |
WalOptions { | |
segment_capacity: config.wal_capacity_mb * 1024 * 1024, | |
segment_queue_len: config.wal_segments_ahead, | |
} | |
} | |
} | |
impl Default for WalConfig { | |
fn default() -> Self { | |
WalConfig { | |
wal_capacity_mb: 32, | |
wal_segments_ahead: 0, | |
} | |
} | |
} | |
/// Strategy used to distribute points across the shards of a collection.
///
/// The standard derives below are required by code in this file:
/// `CollectionParams::check_compatible` reads the value out of `&self`, falls back to the
/// default when it is unset, compares it with `!=` and prints it with `{:?}`.
// NOTE(review): (de)serialization derives are not visible in this excerpt; confirm them
// against the full file, since `CollectionConfigInternal::save`/`load` serialize this type.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum ShardingMethod {
    /// Points are distributed across all available shards. This is the default.
    #[default]
    Auto,
    /// Points are distributed across shards according to their shard key.
    Custom,
}
pub struct CollectionParams { | |
/// Configuration of the vector storage | |
pub vectors: VectorsConfig, | |
/// Number of shards the collection has | |
pub shard_number: NonZeroU32, | |
/// Sharding method | |
/// Default is Auto - points are distributed across all available shards | |
/// Custom - points are distributed across shards according to shard key | |
pub sharding_method: Option<ShardingMethod>, | |
/// Number of replicas for each shard | |
pub replication_factor: NonZeroU32, | |
/// Defines how many replicas should apply the operation for us to consider it successful. | |
/// Increasing this number will make the collection more resilient to inconsistencies, but will | |
/// also make it fail if not enough replicas are available. | |
/// Does not have any performance impact. | |
pub write_consistency_factor: NonZeroU32, | |
/// Defines how many additional replicas should be processing read request at the same time. | |
/// Default value is Auto, which means that fan-out will be determined automatically based on | |
/// the busyness of the local replica. | |
/// Having more than 0 might be useful to smooth latency spikes of individual nodes. | |
pub read_fan_out_factor: Option<u32>, | |
/// If true - point's payload will not be stored in memory. | |
/// It will be read from the disk every time it is requested. | |
/// This setting saves RAM by (slightly) increasing the response time. | |
/// Note: those payload values that are involved in filtering and are indexed - remain in RAM. | |
/// | |
/// Default: true | |
pub on_disk_payload: bool, | |
/// Temporary setting to enable/disable the use of mmap for on-disk payload storage. | |
// TODO: remove this setting after integration is finished | |
pub on_disk_payload_uses_mmap: bool, | |
/// Configuration of the sparse vector storage | |
pub sparse_vectors: Option<BTreeMap<String, SparseVectorParams>>, | |
} | |
impl CollectionParams { | |
pub fn payload_storage_type(&self) -> PayloadStorageType { | |
if self.on_disk_payload { | |
if self.on_disk_payload_uses_mmap { | |
return PayloadStorageType::Mmap; | |
} | |
PayloadStorageType::OnDisk | |
} else { | |
PayloadStorageType::InMemory | |
} | |
} | |
pub fn check_compatible(&self, other: &CollectionParams) -> CollectionResult<()> { | |
let CollectionParams { | |
vectors, | |
shard_number: _, // Maybe be updated by resharding, assume local shards needs to be dropped | |
sharding_method, // Not changeable | |
replication_factor: _, // May be changed | |
write_consistency_factor: _, // May be changed | |
read_fan_out_factor: _, // May be changed | |
on_disk_payload: _, // May be changed | |
on_disk_payload_uses_mmap: _, // Temporary | |
sparse_vectors, // Parameters may be changes, but not the structure | |
} = other; | |
self.vectors.check_compatible(vectors)?; | |
let this_sparse_vectors: HashSet<_> = if let Some(sparse_vectors) = &self.sparse_vectors { | |
sparse_vectors.keys().collect() | |
} else { | |
HashSet::new() | |
}; | |
let other_sparse_vectors: HashSet<_> = if let Some(sparse_vectors) = sparse_vectors { | |
sparse_vectors.keys().collect() | |
} else { | |
HashSet::new() | |
}; | |
if this_sparse_vectors != other_sparse_vectors { | |
return Err(CollectionError::bad_input(format!( | |
"sparse vectors are incompatible: \ | |
origin sparse vectors: {this_sparse_vectors:?}, \ | |
while other sparse vectors: {other_sparse_vectors:?}", | |
))); | |
} | |
let this_sharding_method = self.sharding_method.unwrap_or_default(); | |
let other_sharding_method = sharding_method.unwrap_or_default(); | |
if this_sharding_method != other_sharding_method { | |
return Err(CollectionError::bad_input(format!( | |
"sharding method is incompatible: \ | |
origin sharding method: {this_sharding_method:?}, \ | |
while other sharding method: {other_sharding_method:?}", | |
))); | |
} | |
Ok(()) | |
} | |
} | |
impl Anonymize for CollectionParams { | |
fn anonymize(&self) -> Self { | |
CollectionParams { | |
vectors: self.vectors.anonymize(), | |
shard_number: self.shard_number, | |
sharding_method: self.sharding_method, | |
replication_factor: self.replication_factor, | |
write_consistency_factor: self.write_consistency_factor, | |
read_fan_out_factor: self.read_fan_out_factor, | |
on_disk_payload: self.on_disk_payload, | |
on_disk_payload_uses_mmap: self.on_disk_payload_uses_mmap, | |
sparse_vectors: self.sparse_vectors.anonymize(), | |
} | |
} | |
} | |
pub fn default_shard_number() -> NonZeroU32 { | |
NonZeroU32::new(default_shard_number_const()).unwrap() | |
} | |
pub fn default_replication_factor() -> NonZeroU32 { | |
NonZeroU32::new(default_replication_factor_const()).unwrap() | |
} | |
pub fn default_write_consistency_factor() -> NonZeroU32 { | |
NonZeroU32::new(default_write_consistency_factor_const()).unwrap() | |
} | |
/// Default for `CollectionParams::on_disk_payload`: payloads are stored on disk.
pub const fn default_on_disk_payload() -> bool {
    true
}
pub struct CollectionConfigInternal { | |
pub params: CollectionParams, | |
pub hnsw_config: HnswConfig, | |
pub optimizer_config: OptimizersConfig, | |
pub wal_config: WalConfig, | |
pub quantization_config: Option<QuantizationConfig>, | |
pub strict_mode_config: Option<StrictModeConfig>, | |
pub uuid: Option<Uuid>, | |
} | |
impl CollectionConfigInternal { | |
pub fn to_bytes(&self) -> CollectionResult<Vec<u8>> { | |
serde_json::to_vec(self).map_err(|err| CollectionError::service_error(err.to_string())) | |
} | |
pub fn save(&self, path: &Path) -> CollectionResult<()> { | |
let config_path = path.join(COLLECTION_CONFIG_FILE); | |
let af = AtomicFile::new(&config_path, AllowOverwrite); | |
let state_bytes = serde_json::to_vec(self).unwrap(); | |
af.write(|f| f.write_all(&state_bytes)).map_err(|err| { | |
CollectionError::service_error(format!("Can't write {config_path:?}, error: {err}")) | |
})?; | |
Ok(()) | |
} | |
pub fn load(path: &Path) -> CollectionResult<Self> { | |
let config_path = path.join(COLLECTION_CONFIG_FILE); | |
let mut contents = String::new(); | |
let mut file = File::open(config_path)?; | |
file.read_to_string(&mut contents)?; | |
Ok(serde_json::from_str(&contents)?) | |
} | |
/// Check if collection config exists | |
pub fn check(path: &Path) -> bool { | |
let config_path = path.join(COLLECTION_CONFIG_FILE); | |
config_path.exists() | |
} | |
pub fn validate_and_warn(&self) { | |
if let Err(ref errs) = self.validate() { | |
validation::warn_validation_errors("Collection configuration file", errs); | |
} | |
} | |
} | |
impl CollectionParams { | |
pub fn empty() -> Self { | |
CollectionParams { | |
vectors: Default::default(), | |
shard_number: default_shard_number(), | |
sharding_method: None, | |
replication_factor: default_replication_factor(), | |
write_consistency_factor: default_write_consistency_factor(), | |
read_fan_out_factor: None, | |
on_disk_payload: default_on_disk_payload(), | |
on_disk_payload_uses_mmap: false, | |
sparse_vectors: None, | |
} | |
} | |
fn missing_vector_error(&self, vector_name: &str) -> CollectionError { | |
let mut available_names = vec![]; | |
match &self.vectors { | |
VectorsConfig::Single(_) => { | |
available_names.push(DEFAULT_VECTOR_NAME.to_string()); | |
} | |
VectorsConfig::Multi(vectors) => { | |
for name in vectors.keys() { | |
available_names.push(name.clone()); | |
} | |
} | |
} | |
if let Some(sparse_vectors) = &self.sparse_vectors { | |
for name in sparse_vectors.keys() { | |
available_names.push(name.clone()); | |
} | |
} | |
if available_names.is_empty() { | |
CollectionError::BadInput { | |
description: "Vectors are not configured in this collection".into(), | |
} | |
} else if available_names == vec![DEFAULT_VECTOR_NAME] { | |
return CollectionError::BadInput { | |
description: format!( | |
"Vector with name {vector_name} is not configured in this collection" | |
), | |
}; | |
} else { | |
let available_names = available_names.join(", "); | |
if vector_name == DEFAULT_VECTOR_NAME { | |
return CollectionError::BadInput { | |
description: format!( | |
"Collection requires specified vector name in the request, available names: {available_names}" | |
), | |
}; | |
} | |
CollectionError::BadInput { | |
description: format!( | |
"Vector with name `{vector_name}` is not configured in this collection, available names: {available_names}" | |
), | |
} | |
} | |
} | |
pub fn get_distance(&self, vector_name: &str) -> CollectionResult<Distance> { | |
match self.vectors.get_params(vector_name) { | |
Some(params) => Ok(params.distance), | |
None => { | |
if let Some(sparse_vectors) = &self.sparse_vectors { | |
if let Some(_params) = sparse_vectors.get(vector_name) { | |
return Ok(Distance::Dot); | |
} | |
} | |
Err(self.missing_vector_error(vector_name)) | |
} | |
} | |
} | |
fn get_vector_params_mut(&mut self, vector_name: &str) -> CollectionResult<&mut VectorParams> { | |
self.vectors | |
.get_params_mut(vector_name) | |
.ok_or_else(|| CollectionError::BadInput { | |
description: if vector_name == DEFAULT_VECTOR_NAME { | |
"Default vector params are not specified in config".into() | |
} else { | |
format!("Vector params for {vector_name} are not specified in config") | |
}, | |
}) | |
} | |
pub fn get_sparse_vector_params_opt(&self, vector_name: &str) -> Option<&SparseVectorParams> { | |
self.sparse_vectors | |
.as_ref() | |
.and_then(|sparse_vectors| sparse_vectors.get(vector_name)) | |
} | |
pub fn get_sparse_vector_params_mut( | |
&mut self, | |
vector_name: &str, | |
) -> CollectionResult<&mut SparseVectorParams> { | |
self.sparse_vectors | |
.as_mut() | |
.ok_or_else(|| CollectionError::BadInput { | |
description: format!( | |
"Sparse vector `{vector_name}` is not specified in collection config" | |
), | |
})? | |
.get_mut(vector_name) | |
.ok_or_else(|| CollectionError::BadInput { | |
description: format!( | |
"Sparse vector `{vector_name}` is not specified in collection config" | |
), | |
}) | |
} | |
/// Update collection vectors from the given update vectors config | |
pub fn update_vectors_from_diff( | |
&mut self, | |
update_vectors_diff: &VectorsConfigDiff, | |
) -> CollectionResult<()> { | |
for (vector_name, update_params) in update_vectors_diff.0.iter() { | |
let vector_params = self.get_vector_params_mut(vector_name)?; | |
let VectorParamsDiff { | |
hnsw_config, | |
quantization_config, | |
on_disk, | |
} = update_params.clone(); | |
if let Some(hnsw_diff) = hnsw_config { | |
if let Some(existing_hnsw) = &vector_params.hnsw_config { | |
vector_params.hnsw_config = Some(hnsw_diff.update(existing_hnsw)?); | |
} else { | |
vector_params.hnsw_config = Some(hnsw_diff); | |
} | |
} | |
if let Some(quantization_diff) = quantization_config { | |
vector_params.quantization_config = match quantization_diff.clone() { | |
QuantizationConfigDiff::Scalar(scalar) => { | |
Some(QuantizationConfig::Scalar(scalar)) | |
} | |
QuantizationConfigDiff::Product(product) => { | |
Some(QuantizationConfig::Product(product)) | |
} | |
QuantizationConfigDiff::Binary(binary) => { | |
Some(QuantizationConfig::Binary(binary)) | |
} | |
QuantizationConfigDiff::Disabled(_) => None, | |
} | |
} | |
if let Some(on_disk) = on_disk { | |
vector_params.on_disk = Some(on_disk); | |
} | |
} | |
Ok(()) | |
} | |
/// Update collection vectors from the given update vectors config | |
pub fn update_sparse_vectors_from_other( | |
&mut self, | |
update_vectors: &SparseVectorsConfig, | |
) -> CollectionResult<()> { | |
for (vector_name, update_params) in update_vectors.0.iter() { | |
let sparse_vector_params = self.get_sparse_vector_params_mut(vector_name)?; | |
let SparseVectorParams { index, modifier } = update_params.clone(); | |
if let Some(modifier) = modifier { | |
sparse_vector_params.modifier = Some(modifier); | |
} | |
if let Some(index) = index { | |
if let Some(existing_index) = &mut sparse_vector_params.index { | |
existing_index.update_from_other(index); | |
} else { | |
sparse_vector_params.index.replace(index); | |
} | |
} | |
} | |
Ok(()) | |
} | |
/// Convert into unoptimized named vector data configs | |
/// | |
/// It is the job of the segment optimizer to change this configuration with optimized settings | |
/// based on threshold configurations. | |
pub fn to_base_vector_data(&self) -> CollectionResult<HashMap<String, VectorDataConfig>> { | |
Ok(self | |
.vectors | |
.params_iter() | |
.map(|(name, params)| { | |
( | |
name.into(), | |
VectorDataConfig { | |
size: params.size.get() as usize, | |
distance: params.distance, | |
// Plain (disabled) index | |
index: Indexes::Plain {}, | |
// Disabled quantization | |
quantization_config: None, | |
// Default to in memory storage | |
storage_type: if params.on_disk.unwrap_or_default() { | |
VectorStorageType::ChunkedMmap | |
} else { | |
VectorStorageType::InRamChunkedMmap | |
}, | |
multivector_config: params.multivector_config, | |
datatype: params.datatype.map(VectorStorageDatatype::from), | |
}, | |
) | |
}) | |
.collect()) | |
} | |
/// Convert into unoptimized sparse vector data configs | |
/// | |
/// It is the job of the segment optimizer to change this configuration with optimized settings | |
/// based on threshold configurations. | |
pub fn to_sparse_vector_data( | |
&self, | |
) -> CollectionResult<HashMap<String, SparseVectorDataConfig>> { | |
if let Some(sparse_vectors) = &self.sparse_vectors { | |
sparse_vectors | |
.iter() | |
.map(|(name, params)| { | |
Ok(( | |
name.into(), | |
SparseVectorDataConfig { | |
index: SparseIndexConfig { | |
full_scan_threshold: params | |
.index | |
.and_then(|index| index.full_scan_threshold), | |
index_type: SparseIndexType::MutableRam, | |
datatype: params | |
.index | |
.and_then(|index| index.datatype) | |
.map(VectorStorageDatatype::from), | |
}, | |
}, | |
)) | |
}) | |
.collect() | |
} else { | |
Ok(Default::default()) | |
} | |
} | |
} | |