File size: 1,877 Bytes
ca1ecab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from aioprometheus import Gauge

# The begin-* and end* here are used by the documentation generator
# to extract the metrics definitions.

# begin-metrics-definitions
gauge_avg_prompt_throughput = Gauge("vllm:avg_prompt_throughput_toks_per_s",
                                    "Average prefill throughput in tokens/s.")
gauge_avg_generation_throughput = Gauge(
    "vllm:avg_generation_throughput_toks_per_s",
    "Average generation throughput in tokens/s.")

gauge_scheduler_running = Gauge(
    "vllm:num_requests_running",
    "Number of requests that is currently running for inference.")
gauge_scheduler_swapped = Gauge("vllm:num_requests_swapped",
                                "Number requests swapped to CPU.")
gauge_scheduler_waiting = Gauge("vllm:num_requests_waiting",
                                "Number of requests waiting to be processed.")

gauge_gpu_cache_usage = Gauge(
    "vllm:gpu_cache_usage_perc",
    "GPU KV-cache usage. 1 means 100 percent usage.")
gauge_cpu_cache_usage = Gauge(
    "vllm:cpu_cache_usage_perc",
    "CPU KV-cache usage. 1 means 100 percent usage.")
# end-metrics-definitions

labels = {}


def add_global_metrics_labels(**kwargs):
    labels.update(kwargs)


def record_metrics(
    avg_prompt_throughput: float,
    avg_generation_throughput: float,
    scheduler_running: int,
    scheduler_swapped: int,
    scheduler_waiting: int,
    gpu_cache_usage: float,
    cpu_cache_usage: float,
):
    gauge_avg_prompt_throughput.set(labels, avg_prompt_throughput)
    gauge_avg_generation_throughput.set(labels, avg_generation_throughput)
    gauge_scheduler_running.set(labels, scheduler_running)
    gauge_scheduler_swapped.set(labels, scheduler_swapped)
    gauge_scheduler_waiting.set(labels, scheduler_waiting)
    gauge_gpu_cache_usage.set(labels, gpu_cache_usage)
    gauge_cpu_cache_usage.set(labels, cpu_cache_usage)