File size: 4,435 Bytes
90e26fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
from collections import Counter, defaultdict
from typing import List

import numpy as np


def get_servers_metrics(model_reports) -> List[str]:
    """Build the server-level metric lines aggregated over all model reports.

    Args:
        model_reports: iterable of report dicts. Each report has a
            ``"server_rows"`` list; each row has a ``"span"`` object whose
            ``server_info`` is either ``None`` or exposes ``next_pings``
            (a mapping of peer -> ping, or ``None``), ``using_relay`` and
            ``version``.

    Returns:
        A list of text lines: section comments (``# ...``) and
        ``name{labels} value`` samples, one per line.
    """
    servers_num_total = 0  # only servers that reported ``next_pings``
    servers_num_relay = 0
    num_peers = 0  # every ping entry seen, reachable or not
    num_ping_infs = 0
    pings = []  # ping values that are not ``inf``
    version_counts = Counter()
    result = ["# SERVER LEVEL METRICS"]
    inf = float("inf")

    for report in model_reports:
        for server in report["server_rows"]:
            info = server["span"].server_info
            if info is None:
                continue

            next_pings = info.next_pings
            if next_pings is not None:
                servers_num_total += 1
                num_peers += len(next_pings)
                not_inf = [value for value in next_pings.values() if value != inf]
                pings.extend(not_inf)
                num_ping_infs += len(next_pings) - len(not_inf)

            if info.using_relay:
                servers_num_relay += 1

            version = info.version
            if version:
                version_counts[version] += 1

    # Guard on the total number of ping entries rather than on the non-inf
    # ones: when every ping is ``inf`` the share is 1.0 and must still be
    # reported. ``num_peers > 0`` also implies ``servers_num_total > 0``.
    if num_peers > 0:
        peers_per_srv = num_peers / servers_num_total
        pings_inf_share = num_ping_infs / num_peers

        result.extend(
            [
                f"peers_per_srv {peers_per_srv:.1f}",
                f"pings_inf_share {pings_inf_share:.3f}",
            ]
        )

    result.append(f"servers_num_total {servers_num_total}")
    result.append(f"servers_num_relay {servers_num_relay}")

    if pings:
        result.append("# PINGS")
        percentiles = (25, 50, 75, 90, 95)
        # np.percentile does not need pre-sorted input and accepts all
        # requested percentiles in a single call.
        for pct, value in zip(percentiles, np.percentile(pings, percentiles)):
            result.append(f'ping_pct{{pct="{pct}"}} {value:.4f}')

    result.append("# VERSIONS")
    for version_number, version_count in version_counts.items():
        result.append(f'server_version{{version_number="{version_number}"}} {version_count}')

    return result


def get_models_metrics(model_reports) -> List[str]:
    """Build the per-model metric lines, one group of lines per report.

    For every report a per-block counter array of length ``num_blocks`` is
    kept for the total number of servers, for each server state, and for the
    summed ``network_rps`` / ``inference_rps`` / ``forward_rps`` of the
    servers whose span covers that block.

    Args:
        model_reports: iterable of report dicts with the keys
            ``"dht_prefix"``, ``"num_blocks"`` and ``"server_rows"``; each
            row has a ``"state"`` and a ``"span"`` with ``start``, ``end``
            and ``server_info``.

    Returns:
        A list of text lines (comments and ``name{labels} value`` samples).
    """
    result = ["# MODEL LEVEL METRICS"]

    for report in model_reports:
        model_name = report["dht_prefix"]
        result.append(f"# MODEL: {model_name} {'-' * 50}")

        num_blocks = report["num_blocks"]
        server_rows = report["server_rows"]
        # Any key that is never written still yields an all-zero array.
        blocks = defaultdict(lambda: np.zeros(num_blocks))

        for row in server_rows:
            span = row["span"]
            for block_idx in range(span.start, span.end):
                blocks["total"][block_idx] += 1
                blocks[row["state"]][block_idx] += 1

                info = span.server_info
                if info is None:
                    continue
                for rps in ("network_rps", "inference_rps", "forward_rps"):
                    rps_value = getattr(info, rps, 0)
                    if rps_value is None:
                        continue
                    blocks[rps][block_idx] += rps_value

        result.append(f'n_blocks{{model="{model_name}"}} {num_blocks}')
        result.append(f'servers_num{{model="{model_name}"}} {len(server_rows)}')
        result.append(f'blocks_total{{model="{model_name}"}} {blocks["total"].sum()}')
        result.append(f'blocks_online_min{{model="{model_name}"}} {blocks["online"].min()}')

        result.extend(
            f'blocks{{model="{model_name}",state="{block_state}"}} {blocks[block_state].sum():.0f}'
            for block_state in ("online", "joining", "offline", "unreachable")
        )

        for rps in ("network_rps", "inference_rps", "forward_rps"):
            rps_type = rps.split("_")[0]
            per_block = blocks[rps]
            result.append(f'rps_avg{{model="{model_name}",rps="{rps_type}"}} {per_block.mean():.1f}')
            result.append(f'rps_min{{model="{model_name}",rps="{rps_type}"}} {per_block.min():.1f}')

    return result


def get_prometheus_metrics(state_dict) -> str:
    """Prepare metrics in Prometheus format.

    Format description: https://prometheus.io/docs/instrumenting/exposition_formats/

    Args:
        state_dict: mapping with a required ``"model_reports"`` entry and an
            optional numeric ``"update_duration"`` entry.

    Returns:
        A multiline string with a single metric (or comment) per line.
    """
    result = ["# GENERAL METRICS"]

    # ``update_duration`` may be absent (or None); formatting None with
    # ``:.1f`` raises TypeError, so the sample is simply omitted in that case.
    update_duration = state_dict.get("update_duration")
    if update_duration is not None:
        result.append(f"update_duration {update_duration:.1f}")

    model_reports = state_dict["model_reports"]
    result.extend(get_servers_metrics(model_reports))
    result.extend(get_models_metrics(model_reports))

    return "\n".join(result)