use prometheus::proto::{Counter, Gauge, LabelPair, Metric, MetricFamily, MetricType};
use prometheus::TextEncoder;
use segment::common::operation_time_statistics::OperationDurationStatistics;

use crate::common::telemetry::TelemetryData;
use crate::common::telemetry_ops::app_telemetry::{AppBuildTelemetry, AppFeaturesTelemetry};
use crate::common::telemetry_ops::cluster_telemetry::{ClusterStatusTelemetry, ClusterTelemetry};
use crate::common::telemetry_ops::collections_telemetry::{
    CollectionTelemetryEnum, CollectionsTelemetry,
};
use crate::common::telemetry_ops::memory_telemetry::MemoryTelemetry;
use crate::common::telemetry_ops::requests_telemetry::{
    GrpcTelemetry, RequestsTelemetry, WebApiTelemetry,
};

/// Whitelist for REST endpoints in metrics output.
///
/// Contains a selection of search, recommend, scroll and upsert endpoints.
///
/// This array *must* be sorted.
const REST_ENDPOINT_WHITELIST: &[&str] = &[
    "/collections/{name}/index",
    "/collections/{name}/points",
    "/collections/{name}/points/batch",
    "/collections/{name}/points/count",
    "/collections/{name}/points/delete",
    "/collections/{name}/points/discover",
    "/collections/{name}/points/discover/batch",
    "/collections/{name}/points/facet",
    "/collections/{name}/points/payload",
    "/collections/{name}/points/payload/clear",
    "/collections/{name}/points/payload/delete",
    "/collections/{name}/points/query",
    "/collections/{name}/points/query/batch",
    "/collections/{name}/points/query/groups",
    "/collections/{name}/points/recommend",
    "/collections/{name}/points/recommend/batch",
    "/collections/{name}/points/recommend/groups",
    "/collections/{name}/points/scroll",
    "/collections/{name}/points/search",
    "/collections/{name}/points/search/batch",
    "/collections/{name}/points/search/groups",
    "/collections/{name}/points/search/matrix/offsets",
    "/collections/{name}/points/search/matrix/pairs",
    "/collections/{name}/points/vectors",
    "/collections/{name}/points/vectors/delete",
];

/// Whitelist for GRPC endpoints in metrics output.
///
/// Contains a selection of search, recommend, scroll and upsert endpoints.
///
/// This array *must* be sorted.
const GRPC_ENDPOINT_WHITELIST: &[&str] = &[
    "/qdrant.Points/ClearPayload",
    "/qdrant.Points/Count",
    "/qdrant.Points/Delete",
    "/qdrant.Points/DeletePayload",
    "/qdrant.Points/Discover",
    "/qdrant.Points/DiscoverBatch",
    "/qdrant.Points/Facet",
    "/qdrant.Points/Get",
    "/qdrant.Points/OverwritePayload",
    "/qdrant.Points/Query",
    "/qdrant.Points/QueryBatch",
    "/qdrant.Points/QueryGroups",
    "/qdrant.Points/Recommend",
    "/qdrant.Points/RecommendBatch",
    "/qdrant.Points/RecommendGroups",
    "/qdrant.Points/Scroll",
    "/qdrant.Points/Search",
    "/qdrant.Points/SearchBatch",
    "/qdrant.Points/SearchGroups",
    "/qdrant.Points/SetPayload",
    "/qdrant.Points/UpdateBatch",
    "/qdrant.Points/UpdateVectors",
    "/qdrant.Points/Upsert",
];

/// For REST requests, only report timings for responses with this HTTP status.
const REST_TIMINGS_FOR_STATUS: u16 = 200;

/// Encapsulates metrics data in Prometheus format.
pub struct MetricsData {
    metrics: Vec<MetricFamily>,
}

impl MetricsData {
    pub fn format_metrics(&self) -> String {
        TextEncoder::new().encode_to_string(&self.metrics).unwrap()
    }
}

impl From<TelemetryData> for MetricsData {
    fn from(telemetry_data: TelemetryData) -> Self {
        let mut metrics = vec![];
        telemetry_data.add_metrics(&mut metrics);
        Self { metrics }
    }
}
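// Illustrative sketch of the exposition text produced by `format_metrics`; the exact
// layout is determined by the `prometheus` crate's `TextEncoder`, and the version
// label below is a placeholder:
//
//     # HELP app_info information about qdrant server
//     # TYPE app_info gauge
//     app_info{name="qdrant",version="<version>"} 1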
trait MetricsProvider {
    /// Add metrics definitions for this telemetry object.
    fn add_metrics(&self, metrics: &mut Vec<MetricFamily>);
}

impl MetricsProvider for TelemetryData {
    fn add_metrics(&self, metrics: &mut Vec<MetricFamily>) {
        self.app.add_metrics(metrics);
        self.collections.add_metrics(metrics);
        self.cluster.add_metrics(metrics);
        self.requests.add_metrics(metrics);
        if let Some(mem) = &self.memory {
            mem.add_metrics(metrics);
        }
    }
}

impl MetricsProvider for AppBuildTelemetry {
    fn add_metrics(&self, metrics: &mut Vec<MetricFamily>) {
        metrics.push(metric_family(
            "app_info",
            "information about qdrant server",
            MetricType::GAUGE,
            vec![gauge(
                1.0,
                &[("name", &self.name), ("version", &self.version)],
            )],
        ));
        self.features.iter().for_each(|f| f.add_metrics(metrics));
    }
}

impl MetricsProvider for AppFeaturesTelemetry {
    fn add_metrics(&self, metrics: &mut Vec<MetricFamily>) {
        metrics.push(metric_family(
            "app_status_recovery_mode",
            "features enabled in qdrant server",
            MetricType::GAUGE,
            vec![gauge(if self.recovery_mode { 1.0 } else { 0.0 }, &[])],
        ))
    }
}

impl MetricsProvider for CollectionsTelemetry {
    fn add_metrics(&self, metrics: &mut Vec<MetricFamily>) {
        let vector_count = self
            .collections
            .iter()
            .flatten()
            .map(|p| match p {
                CollectionTelemetryEnum::Aggregated(a) => a.vectors,
                CollectionTelemetryEnum::Full(c) => c.count_vectors(),
            })
            .sum::<usize>();
        metrics.push(metric_family(
            "collections_total",
            "number of collections",
            MetricType::GAUGE,
            vec![gauge(self.number_of_collections as f64, &[])],
        ));
        metrics.push(metric_family(
            "collections_vector_total",
            "total number of vectors in all collections",
            MetricType::GAUGE,
            vec![gauge(vector_count as f64, &[])],
        ));
    }
}

impl MetricsProvider for ClusterTelemetry {
    fn add_metrics(&self, metrics: &mut Vec<MetricFamily>) {
        let ClusterTelemetry {
            enabled,
            status,
            config: _,
            peers: _,
            metadata: _,
        } = self;

        metrics.push(metric_family(
            "cluster_enabled",
            "is cluster support enabled",
            MetricType::GAUGE,
            vec![gauge(if *enabled { 1.0 } else { 0.0 }, &[])],
        ));

        if let Some(ref status) = status {
            status.add_metrics(metrics);
        }
    }
}

impl MetricsProvider for ClusterStatusTelemetry {
    fn add_metrics(&self, metrics: &mut Vec<MetricFamily>) {
        metrics.push(metric_family(
            "cluster_peers_total",
            "total number of cluster peers",
            MetricType::GAUGE,
            vec![gauge(self.number_of_peers as f64, &[])],
        ));
        metrics.push(metric_family(
            "cluster_term",
            "current cluster term",
            MetricType::COUNTER,
            vec![counter(self.term as f64, &[])],
        ));

        if let Some(ref peer_id) = self.peer_id.map(|p| p.to_string()) {
            metrics.push(metric_family(
                "cluster_commit",
                "index of last committed (finalized) operation cluster peer is aware of",
                MetricType::COUNTER,
                vec![counter(self.commit as f64, &[("peer_id", peer_id)])],
            ));
            metrics.push(metric_family(
                "cluster_pending_operations_total",
                "total number of pending operations for cluster peer",
                MetricType::GAUGE,
                vec![gauge(self.pending_operations as f64, &[])],
            ));
            metrics.push(metric_family(
                "cluster_voter",
                "is cluster peer a voter or learner",
                MetricType::GAUGE,
                vec![gauge(if self.is_voter { 1.0 } else { 0.0 }, &[])],
            ));
        }
    }
}

impl MetricsProvider for RequestsTelemetry {
    fn add_metrics(&self, metrics: &mut Vec<MetricFamily>) {
        self.rest.add_metrics(metrics);
        self.grpc.add_metrics(metrics);
    }
}

impl MetricsProvider for WebApiTelemetry {
    fn add_metrics(&self, metrics: &mut Vec<MetricFamily>) {
        let mut builder = OperationDurationMetricsBuilder::default();
        for (endpoint, responses) in &self.responses {
            // Response keys are formatted as "{method} {endpoint}"
            let Some((method, endpoint)) = endpoint.split_once(' ') else {
                continue;
            };

            // Endpoint must be whitelisted
            if REST_ENDPOINT_WHITELIST.binary_search(&endpoint).is_err() {
                continue;
            }

            for (status, stats) in responses {
                builder.add(
                    stats,
                    &[
                        ("method", method),
("endpoint", endpoint), ("status", &status.to_string()), ], *status == REST_TIMINGS_FOR_STATUS, ); } } builder.build("rest", metrics); } } impl MetricsProvider for GrpcTelemetry { fn add_metrics(&self, metrics: &mut Vec) { let mut builder = OperationDurationMetricsBuilder::default(); for (endpoint, stats) in &self.responses { // Endpoint must be whitelisted if GRPC_ENDPOINT_WHITELIST .binary_search(&endpoint.as_str()) .is_err() { continue; } builder.add(stats, &[("endpoint", endpoint.as_str())], true); } builder.build("grpc", metrics); } } impl MetricsProvider for MemoryTelemetry { fn add_metrics(&self, metrics: &mut Vec) { metrics.push(metric_family( "memory_active_bytes", "Total number of bytes in active pages allocated by the application", MetricType::GAUGE, vec![gauge(self.active_bytes as f64, &[])], )); metrics.push(metric_family( "memory_allocated_bytes", "Total number of bytes allocated by the application", MetricType::GAUGE, vec![gauge(self.allocated_bytes as f64, &[])], )); metrics.push(metric_family( "memory_metadata_bytes", "Total number of bytes dedicated to metadata", MetricType::GAUGE, vec![gauge(self.metadata_bytes as f64, &[])], )); metrics.push(metric_family( "memory_resident_bytes", "Maximum number of bytes in physically resident data pages mapped", MetricType::GAUGE, vec![gauge(self.resident_bytes as f64, &[])], )); metrics.push(metric_family( "memory_retained_bytes", "Total number of bytes in virtual memory mappings", MetricType::GAUGE, vec![gauge(self.retained_bytes as f64, &[])], )); } } /// A helper struct to build a vector of [`MetricFamily`] out of a collection of /// [`OperationDurationStatistics`]. #[derive(Default)] struct OperationDurationMetricsBuilder { total: Vec, fail_total: Vec, avg_secs: Vec, min_secs: Vec, max_secs: Vec, duration_histogram_secs: Vec, } impl OperationDurationMetricsBuilder { /// Add metrics for the provided statistics. /// If `add_timings` is `false`, only the total and fail_total counters will be added. pub fn add( &mut self, stat: &OperationDurationStatistics, labels: &[(&str, &str)], add_timings: bool, ) { self.total.push(counter(stat.count as f64, labels)); self.fail_total .push(counter(stat.fail_count as f64, labels)); if !add_timings { return; } self.avg_secs.push(gauge( f64::from(stat.avg_duration_micros.unwrap_or(0.0)) / 1_000_000.0, labels, )); self.min_secs.push(gauge( f64::from(stat.min_duration_micros.unwrap_or(0.0)) / 1_000_000.0, labels, )); self.max_secs.push(gauge( f64::from(stat.max_duration_micros.unwrap_or(0.0)) / 1_000_000.0, labels, )); self.duration_histogram_secs.push(histogram( stat.count as u64, stat.total_duration_micros as f64 / 1_000_000.0, &stat .duration_micros_histogram .iter() .map(|&(b, c)| (f64::from(b) / 1_000_000.0, c as u64)) .collect::>(), labels, )); } /// Build metrics and add them to the provided vector. 
    pub fn build(self, prefix: &str, metrics: &mut Vec<MetricFamily>) {
        if !self.total.is_empty() {
            metrics.push(metric_family(
                &format!("{prefix}_responses_total"),
                "total number of responses",
                MetricType::COUNTER,
                self.total,
            ));
        }
        if !self.fail_total.is_empty() {
            metrics.push(metric_family(
                &format!("{prefix}_responses_fail_total"),
                "total number of failed responses",
                MetricType::COUNTER,
                self.fail_total,
            ));
        }
        if !self.avg_secs.is_empty() {
            metrics.push(metric_family(
                &format!("{prefix}_responses_avg_duration_seconds"),
                "average response duration",
                MetricType::GAUGE,
                self.avg_secs,
            ));
        }
        if !self.min_secs.is_empty() {
            metrics.push(metric_family(
                &format!("{prefix}_responses_min_duration_seconds"),
                "minimum response duration",
                MetricType::GAUGE,
                self.min_secs,
            ));
        }
        if !self.max_secs.is_empty() {
            metrics.push(metric_family(
                &format!("{prefix}_responses_max_duration_seconds"),
                "maximum response duration",
                MetricType::GAUGE,
                self.max_secs,
            ));
        }
        if !self.duration_histogram_secs.is_empty() {
            metrics.push(metric_family(
                &format!("{prefix}_responses_duration_seconds"),
                "response duration histogram",
                MetricType::HISTOGRAM,
                self.duration_histogram_secs,
            ));
        }
    }
}

fn metric_family(name: &str, help: &str, r#type: MetricType, metrics: Vec<Metric>) -> MetricFamily {
    let mut metric_family = MetricFamily::default();
    metric_family.set_name(name.into());
    metric_family.set_help(help.into());
    metric_family.set_field_type(r#type);
    metric_family.set_metric(metrics);
    metric_family
}

fn counter(value: f64, labels: &[(&str, &str)]) -> Metric {
    let mut metric = Metric::default();
    metric.set_label(labels.iter().map(|(n, v)| label_pair(n, v)).collect());
    metric.set_counter({
        let mut counter = Counter::default();
        counter.set_value(value);
        counter
    });
    metric
}

fn gauge(value: f64, labels: &[(&str, &str)]) -> Metric {
    let mut metric = Metric::default();
    metric.set_label(labels.iter().map(|(n, v)| label_pair(n, v)).collect());
    metric.set_gauge({
        let mut gauge = Gauge::default();
        gauge.set_value(value);
        gauge
    });
    metric
}

fn histogram(
    sample_count: u64,
    sample_sum: f64,
    buckets: &[(f64, u64)],
    labels: &[(&str, &str)],
) -> Metric {
    let mut metric = Metric::default();
    metric.set_label(labels.iter().map(|(n, v)| label_pair(n, v)).collect());
    metric.set_histogram({
        let mut histogram = prometheus::proto::Histogram::default();
        histogram.set_sample_count(sample_count);
        histogram.set_sample_sum(sample_sum);
        histogram.set_bucket(
            buckets
                .iter()
                .map(|&(upper_bound, cumulative_count)| {
                    let mut bucket = prometheus::proto::Bucket::default();
                    bucket.set_cumulative_count(cumulative_count);
                    bucket.set_upper_bound(upper_bound);
                    bucket
                })
                .collect(),
        );
        histogram
    });
    metric
}

fn label_pair(name: &str, value: &str) -> LabelPair {
    let mut label = LabelPair::default();
    label.set_name(name.into());
    label.set_value(value.into());
    label
}

#[cfg(test)]
mod tests {
    #[test]
    fn test_endpoint_whitelists_sorted() {
        use super::{GRPC_ENDPOINT_WHITELIST, REST_ENDPOINT_WHITELIST};

        assert!(
            REST_ENDPOINT_WHITELIST.windows(2).all(|n| n[0] <= n[1]),
            "REST_ENDPOINT_WHITELIST must be sorted in code to allow binary search"
        );
        assert!(
            GRPC_ENDPOINT_WHITELIST.windows(2).all(|n| n[0] <= n[1]),
            "GRPC_ENDPOINT_WHITELIST must be sorted in code to allow binary search"
        );
    }
}
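// Minimal illustrative tests for the local proto helpers and `MetricsData::format_metrics`.
// Label values such as "0.0.0" are placeholders, and the assertions are intentionally
// loose because the exact text layout is determined by the `prometheus` crate's
// `TextEncoder`.
#[cfg(test)]
mod format_tests {
    use super::*;

    #[test]
    fn test_histogram_helper_sets_buckets() {
        // Three samples spread over three cumulative buckets (upper bounds in seconds).
        let metric = histogram(
            3,
            0.6,
            &[(0.1, 1), (0.5, 2), (1.0, 3)],
            &[("endpoint", "/test")],
        );
        let hist = metric.get_histogram();
        assert_eq!(hist.get_sample_count(), 3);
        assert_eq!(hist.get_bucket().len(), 3);
        assert_eq!(hist.get_bucket()[0].get_cumulative_count(), 1);
        assert_eq!(metric.get_label().len(), 1);
    }

    #[test]
    fn test_format_metrics_produces_exposition_text() {
        // Hand-built metrics data with a single gauge family, mirroring `app_info`.
        let data = MetricsData {
            metrics: vec![metric_family(
                "app_info",
                "information about qdrant server",
                MetricType::GAUGE,
                vec![gauge(1.0, &[("name", "qdrant"), ("version", "0.0.0")])],
            )],
        };
        let output = data.format_metrics();
        assert!(output.contains("app_info"));
        assert!(output.contains("gauge"));
    }
}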