import os
import gradio as gr
import pandas as pd
from huggingface_hub import HfApi, Repository
from apscheduler.schedulers.background import BackgroundScheduler

from src.assets.text_content import TITLE, INTRODUCTION_TEXT
from src.assets.css_html_js import custom_css, get_window_url_params

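# Hugging Face Hub token used to clone the dataset repo and restart the Space (None if unset)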
OPTIMUM_TOKEN = os.environ.get("OPTIMUM_TOKEN", None)

LLM_PERF_LEADERBOARD_REPO = "optimum/llm-perf-leaderboard"
LLM_PERF_DATASET_REPO = "optimum/llm-perf"


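# Restart the leaderboard Space on the Hub so it reloads fresh benchmark data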
def restart_space():
    HfApi().restart_space(
        repo_id=LLM_PERF_LEADERBOARD_REPO, token=OPTIMUM_TOKEN
    )


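# Clone a local checkout of the llm-perf dataset repo (requires OPTIMUM_TOKEN)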
def load_dataset_repo():
    llm_perf_repo = None
    if OPTIMUM_TOKEN:
        print("Loading LLM-Perf-Dataset from Hub...")
        llm_perf_repo = Repository(
            local_dir="./llm-perf/",
            clone_from=LLM_PERF_DATASET_REPO,
            token=OPTIMUM_TOKEN,
            repo_type="dataset",
        )
        llm_perf_repo.git_pull()

    return llm_perf_repo


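# Build the leaderboard dataframe from the benchmark report:
# keep only the displayed columns, rename them and sort by throughput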
def get_leaderboard_df():
    if llm_perf_repo:
        llm_perf_repo.git_pull()

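    # Read the inference report from the local checkout of the dataset repo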
    df = pd.read_csv("./llm-perf/reports/cuda_1_100/inference_report.csv")
    df = df[["model", "backend.name", "backend.torch_dtype", "backend.quantization",
             "generate.latency(s)", "generate.throughput(tokens/s)"]]

    df.rename(columns={
        "model": "Model",
        "backend.name": "Backend",
        "backend.torch_dtype": "Torch dtype",
        "backend.quantization": "Quantization",
        "generate.latency(s)": "Latency (s)",
        "generate.throughput(tokens/s)": "Throughput (tokens/s)"
    }, inplace=True)

    df.sort_values(by=["Throughput (tokens/s)"], ascending=False, inplace=True)

    return df


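# Rebuild the leaderboard dataframe (not wired to the UI in this file)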
def refresh():
    leaderboard_df = get_leaderboard_df()

    return leaderboard_df


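# Clone the dataset repo once at startup; get_leaderboard_df() pulls it again before each read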
llm_perf_repo = load_dataset_repo()

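# Gradio UI: title, introduction text and a single tab with the leaderboard table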
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("Vanilla Benchmark", elem_id="vanilla-benchmark", id=0):
            leaderboard_df = get_leaderboard_df()
            leaderboard_table_lite = gr.components.Dataframe(
                value=leaderboard_df,
                headers=leaderboard_df.columns.tolist(),
                max_rows=None,
                elem_id="leaderboard-table-lite",
            )

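# Restart the Space every hour (presumably to pick up new benchmark results from the dataset repo)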
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()
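# Queue incoming requests (up to 40 concurrent) and launch the app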
demo.queue(concurrency_count=40).launch()