import os

import gradio as gr
import pandas as pd
from huggingface_hub import HfApi, Repository
from apscheduler.schedulers.background import BackgroundScheduler

from src.assets.text_content import TITLE, INTRODUCTION_TEXT
from src.assets.css_html_js import custom_css, get_window_url_params

OPTIMUM_TOKEN = os.environ.get("OPTIMUM_TOKEN", None)
LLM_PERF_LEADERBOARD_REPO = "optimum/llm-perf-leaderboard"
LLM_PERF_DATASET_REPO = "optimum/llm-perf"


def restart_space():
    # Restarting the Space re-runs this script, which re-clones the dataset
    # repo and rebuilds the leaderboard from the latest benchmark reports.
    HfApi().restart_space(repo_id=LLM_PERF_LEADERBOARD_REPO, token=OPTIMUM_TOKEN)


def load_dataset_repo():
    # Clone the benchmark dataset repo locally; skipped when no token is set
    # (e.g. when running outside the Space), in which case None is returned.
    llm_perf_repo = None
    if OPTIMUM_TOKEN:
        print("Loading LLM-Perf-Dataset from Hub...")
        llm_perf_repo = Repository(
            local_dir="./llm-perf/",
            clone_from=LLM_PERF_DATASET_REPO,
            token=OPTIMUM_TOKEN,
            repo_type="dataset",
        )
        llm_perf_repo.git_pull()

    return llm_perf_repo


def get_leaderboard_df():
    # Pull the latest reports, then keep and rename the columns shown in the
    # leaderboard, sorted by decoding throughput (highest first).
    if llm_perf_repo:
        llm_perf_repo.git_pull()

    df = pd.read_csv("./llm-perf/reports/cuda_1_100/inference_report.csv")
    df = df[
        [
            "model",
            "backend.name",
            "backend.torch_dtype",
            "backend.quantization",
            "generate.latency(s)",
            "generate.throughput(tokens/s)",
        ]
    ]
    df.rename(
        columns={
            "model": "Model",
            "backend.name": "Backend",
            "backend.torch_dtype": "Torch dtype",
            "backend.quantization": "Quantization",
            "generate.latency(s)": "Latency (s)",
            "generate.throughput(tokens/s)": "Throughput (tokens/s)",
        },
        inplace=True,
    )
    df.sort_values(by=["Throughput (tokens/s)"], ascending=False, inplace=True)

    return df


def refresh():
    leaderboard_df = get_leaderboard_df()
    return leaderboard_df


llm_perf_repo = load_dataset_repo()

demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("Vanilla Benchmark", elem_id="vanilla-benchmark", id=0):
            leaderboard_df = get_leaderboard_df()
            leaderboard_table_lite = gr.components.Dataframe(
                value=leaderboard_df,
                headers=leaderboard_df.columns.tolist(),
                max_rows=None,
                elem_id="leaderboard-table-lite",
            )

# Restart the Space every hour so the table reflects newly published reports.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()

demo.queue(concurrency_count=40).launch()
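
# Note: `refresh` above is defined but never wired to any component. A minimal
# sketch of hooking it up, assuming the same Gradio 3.x API already used in
# this file (the button label and its placement inside the `with demo:` block
# are hypothetical, not part of the original app):
#
#     refresh_button = gr.Button("Refresh")
#     refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table_lite])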