|
import os |
|
|
|
import gradio as gr |
|
import pandas as pd |
|
import plotly.express as px |
|
from huggingface_hub.file_download import hf_hub_download |
|
|
|
|
|
from src.utils import process_model_name, process_model_arch |
|
from src.assets.css_html_js import custom_css |
|
from src.assets.text_content import ( |
|
TITLE, |
|
ABOUT_TEXT, |
|
INTRODUCTION_TEXT, |
|
EXAMPLE_CONFIG_TEXT, |
|
CITATION_BUTTON_LABEL, |
|
CITATION_BUTTON_TEXT, |
|
) |
|
|
|
# Hugging Face Hub access token; None when the environment variable is unset.
HF_TOKEN = os.getenv("HF_TOKEN")
LOGO_URL = "https://huggingface.co/spaces/optimum/llm-perf-leaderboard/resolve/main/huggy_bench.png"
LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"

# Raw dataframe column -> header displayed in the leaderboard.
# Insertion order matters: it is the order in which columns are selected and
# shown (get_benchmark_df indexes the frame with list(ALL_COLUMNS_MAPPING.keys())).
# NOTE(review): the trailing symbols in the labels look mis-encoded; they are
# kept verbatim because other code in this file spells the same labels out.
ALL_COLUMNS_MAPPING = {
    # model description
    "Model": "Model π€",
    "Arch": "Arch ποΈ",
    "Size": "Params (B) π",
    # deployment settings
    "backend.name": "Backend π­",
    "backend.torch_dtype": "Dtype π₯",
    "optimization": "Optimization π οΈ",
    "quantization": "Quantization ποΈ",
    # measurements
    "Score": "Open LLM Score (%) β¬οΈ",
    "decode.throughput(tokens/s)": "Decode Throughput (tokens/s) β¬οΈ",
    "generate.throughput(tokens/s)": "E2E Throughput (tokens/s) β¬οΈ",
    "forward.latency(s)": "Prefill Latency (s) β¬οΈ",
    "generate.latency(s)": "E2E Latency (s) β¬οΈ",
    "generate.max_memory_allocated(MB)": "Allocated Memory (MB) β¬οΈ",
    "generate.max_memory_reserved(MB)": "Reserved Memory (MB) β¬οΈ",
    "generate.max_memory_used(MB)": "Used Memory (MB) β¬οΈ",
    "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh) β¬οΈ",
}

# Sort keys (raw column names) and their matching sort directions; both lists
# are passed together to DataFrame.sort_values, so they must stay aligned.
SORTING_COLUMN = ["Score", "generate.throughput(tokens/s)"]
SORTING_ASCENDING = [False] * len(SORTING_COLUMN)
|
# Gradio datatype of each displayed column, listed in the same order as the
# keys of ALL_COLUMNS_MAPPING. There is exactly one entry per column (16); the
# list previously carried a 17th, unmatched "number".
ALL_COLUMNS_DATATYPES = [
    # Model, Arch, Size
    "markdown",
    "markdown",
    "number",
    # backend.name, backend.torch_dtype, optimization, quantization
    "str",
    "str",
    "str",
    "str",
    # Score, decode throughput, generate throughput, forward latency,
    # generate latency, allocated / reserved / used memory, energy
    "number",
    "number",
    "number",
    "number",
    "number",
    "number",
    "number",
    "number",
    "number",
]
|
|
|
def _load_dataset_csv(filename):
    """Download *filename* from the LLM-Perf dataset repo and load it as a DataFrame.

    The file is fetched into the local ``dataset`` directory and then read from
    the path that ``hf_hub_download`` returns, so the read location cannot
    drift from the download location.
    """
    local_path = hf_hub_download(
        repo_id=LLM_PERF_DATASET_REPO,
        filename=filename,
        local_dir="dataset",
        repo_type="dataset",
        token=HF_TOKEN,
    )
    return pd.read_csv(local_path)


# Table that is merged with each machine's perf report on its "Model" column.
OPEN_LLM_DF = _load_dataset_csv("open-llm.csv")

# Benchmark machine name -> label used for that machine's tab in the UI.
MACHINE_TO_HARDWARE = {"hf-dgx-01": "A100-80GB π₯οΈ"}

# Benchmark machine name -> that machine's perf report, loaded once at start-up.
MACHINE_TO_PERF = {
    machine: _load_dataset_csv(f"{machine}/perf-report.csv")
    for machine in MACHINE_TO_HARDWARE
}
|
|
|
|
|
def _is_enabled(flag):
    """Return True only when *flag* is present and truthy (NaN/None count as off)."""
    return bool(pd.notna(flag) and flag)


def _optimization_label(to_bettertransformer, use_flash_attention_2):
    """Name the optimization of one benchmark row from its two backend flags."""
    if _is_enabled(to_bettertransformer):
        return "BetterTransformer"
    if _is_enabled(use_flash_attention_2):
        return "FlashAttentionV2"
    return "None"


def _quantization_label(scheme, exllama_version):
    """Name the quantization of one benchmark row from its scheme and exllama version."""
    if scheme == "bnb":
        return "BnB.4bit"
    if scheme == "gptq":
        if exllama_version == 1:
            return "GPTQ.4bit+ExllamaV1"
        if exllama_version == 2:
            return "GPTQ.4bit+ExllamaV2"
    return "None"


def get_benchmark_df(machine="hf-dgx-01"):
    """Build the leaderboard dataframe for one benchmark machine.

    The machine's perf report is inner-joined with ``OPEN_LLM_DF`` (``Model``
    on the left, ``model`` on the right), derived columns are added, rows are
    sorted by ``SORTING_COLUMN`` / ``SORTING_ASCENDING`` and the result is
    reduced to the columns of ``ALL_COLUMNS_MAPPING``, renamed to their
    display labels.

    Args:
        machine: Key of ``MACHINE_TO_PERF`` selecting the perf report.

    Returns:
        A new DataFrame whose columns are the values of ``ALL_COLUMNS_MAPPING``.

    Raises:
        KeyError: If *machine* has no loaded perf report.
    """
    # merge() returns a new frame and never mutates its inputs, so the cached
    # module-level frames can be used directly.
    merged_df = OPEN_LLM_DF.merge(
        MACHINE_TO_PERF[machine], left_on="Model", right_on="model"
    )

    # Invert kWh/token into tokens/kWh. Missing readings are first replaced by
    # the sentinel 1 so the integer cast succeeds, then every value equal to
    # that sentinel is blanked out again.
    energy_col = "generate.energy_consumption(tokens/kWh)"
    merged_df[energy_col] = (
        1 / merged_df["generate.energy_consumption(kWh/token)"].fillna(1)
    ).astype(int)
    merged_df.loc[merged_df[energy_col] == 1, energy_col] = pd.NA

    # Labels are built with plain comprehensions rather than
    # DataFrame.apply(axis=1): on an empty frame apply() hands back a DataFrame,
    # which cannot be assigned to a single column.
    merged_df["optimization"] = [
        _optimization_label(bettertransformer, flash_attention)
        for bettertransformer, flash_attention in zip(
            merged_df["backend.to_bettertransformer"],
            merged_df["backend.use_flash_attention_2"],
        )
    ]
    merged_df["quantization"] = [
        _quantization_label(scheme, exllama_version)
        for scheme, exllama_version in zip(
            merged_df["backend.quantization_scheme"],
            merged_df["backend.quantization_config.exllama_config.version"],
        )
    ]

    # Decode throughput = tokens / (end-to-end latency - prefill latency).
    # NOTE(review): 1000 is presumably the number of generated tokens per
    # benchmark run (the chart labels latency "Per 1000 Tokens") — confirm.
    merged_df["decode.throughput(tokens/s)"] = (
        1000 / (merged_df["generate.latency(s)"] - merged_df["forward.latency(s)"])
    ).round(2)

    merged_df = merged_df.sort_values(by=SORTING_COLUMN, ascending=SORTING_ASCENDING)

    # Select and rename without inplace=True so no write ever targets a slice
    # of the merged frame.
    return merged_df[list(ALL_COLUMNS_MAPPING)].rename(columns=ALL_COLUMNS_MAPPING)
|
|
|
|
|
def get_benchmark_table(bench_df):
    """Return a display copy of *bench_df* for the leaderboard table.

    The model and architecture columns are passed through
    ``process_model_name`` / ``process_model_arch``, and the score of every
    quantized row gets a ``**`` suffix.

    Args:
        bench_df: Frame produced by ``get_benchmark_df`` (display-label columns).

    Returns:
        A new DataFrame; *bench_df* itself is not modified.
    """
    # Look the labels up instead of spelling them out again, so this function
    # always agrees with ALL_COLUMNS_MAPPING.
    model_col = ALL_COLUMNS_MAPPING["Model"]
    arch_col = ALL_COLUMNS_MAPPING["Arch"]
    score_col = ALL_COLUMNS_MAPPING["Score"]
    quantization_col = ALL_COLUMNS_MAPPING["quantization"]

    copy_df = bench_df.copy()
    copy_df[model_col] = copy_df[model_col].apply(process_model_name)
    copy_df[arch_col] = copy_df[arch_col].apply(process_model_arch)

    # get_benchmark_df labels GPTQ rows "GPTQ.4bit+ExllamaV1"/"...V2", so an
    # exact comparison with "GPTQ.4bit" never matched; compare by prefix.
    quantized_prefixes = ("BnB.4bit", "GPTQ.4bit")

    # A comprehension (rather than DataFrame.apply(axis=1)) also works when the
    # frame is empty, e.g. after a filter that matches nothing.
    copy_df[score_col] = [
        f"{score}**" if str(quantization).startswith(quantized_prefixes) else score
        for score, quantization in zip(copy_df[score_col], copy_df[quantization_col])
    ]
    return copy_df
|
|
|
|
|
def get_benchmark_chart(bench_df):
    """Build the latency / score / memory scatter plot for *bench_df*.

    Each row becomes one point: x is the end-to-end latency, y the Open LLM
    score, marker size the allocated memory and colour the architecture (after
    ``process_model_arch``). Every displayed column is attached as custom data
    and listed in the hover box.

    Args:
        bench_df: Frame produced by ``get_benchmark_df`` (display-label columns).

    Returns:
        The Plotly figure; *bench_df* itself is not modified.
    """
    # Column labels come from ALL_COLUMNS_MAPPING rather than being repeated as
    # literals, so the chart cannot drift from the dataframe's actual headers.
    display_columns = list(ALL_COLUMNS_MAPPING.values())
    arch_col = ALL_COLUMNS_MAPPING["Arch"]

    copy_df = bench_df.copy()
    copy_df[arch_col] = copy_df[arch_col].apply(process_model_arch)

    fig = px.scatter(
        copy_df,
        x=ALL_COLUMNS_MAPPING["generate.latency(s)"],
        y=ALL_COLUMNS_MAPPING["Score"],
        size=ALL_COLUMNS_MAPPING["generate.max_memory_allocated(MB)"],
        color=arch_col,
        custom_data=display_columns,
        color_discrete_sequence=px.colors.qualitative.Light24,
    )
    fig.update_layout(
        title={
            "text": "Latency vs. Score vs. Memory",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        xaxis_title="Per 1000 Tokens Latency (s)",
        yaxis_title="Open LLM Score (%)",
        legend_title="LLM Architecture",
        width=1200,
        height=600,
    )

    # One "<b>label:</b> value" line per column; customdata[i] follows the
    # order of the custom_data list given to px.scatter above.
    hover_lines = [
        f"<b>{column}:</b> %{{customdata[{i}]}}"
        for i, column in enumerate(display_columns)
    ]
    fig.update_traces(hovertemplate="<br>".join(hover_lines))
    return fig
|
|
|
|
|
def filter_query(
    text,
    backends,
    datatypes,
    optimizations,
    quantizations,
    score,
    memory,
    machine,
):
    """Filter one machine's leaderboard and rebuild its table and chart.

    Args:
        text: Search string matched case-insensitively against the model
            column. It is used as a regular expression when it compiles as
            one, and as a plain substring otherwise.
        backends: Accepted values of the backend column.
        datatypes: Accepted values of the dtype column.
        optimizations: Accepted values of the optimization column.
        quantizations: Accepted values of the quantization column.
        score: Minimum Open LLM score (inclusive).
        memory: Maximum allocated memory (inclusive).
        machine: Machine name forwarded to ``get_benchmark_df``.

    Returns:
        A ``(table, chart)`` tuple built by ``get_benchmark_table`` and
        ``get_benchmark_chart`` from the filtered rows.
    """
    import re  # local import: only needed to validate the search pattern

    raw_df = get_benchmark_df(machine=machine)

    # The search box is free-form user input: something like "(" or "c++" is
    # not a valid regex and used to make str.contains raise. Fall back to a
    # literal substring match in that case.
    text = text or ""
    try:
        re.compile(text)
        use_regex = True
    except re.error:
        use_regex = False

    keep = (
        raw_df[ALL_COLUMNS_MAPPING["Model"]].str.contains(
            text, case=False, regex=use_regex, na=False
        )
        & raw_df[ALL_COLUMNS_MAPPING["backend.name"]].isin(backends)
        & raw_df[ALL_COLUMNS_MAPPING["backend.torch_dtype"]].isin(datatypes)
        & raw_df[ALL_COLUMNS_MAPPING["optimization"]].isin(optimizations)
        & raw_df[ALL_COLUMNS_MAPPING["quantization"]].isin(quantizations)
        & (raw_df[ALL_COLUMNS_MAPPING["Score"]] >= score)
        & (raw_df[ALL_COLUMNS_MAPPING["generate.max_memory_allocated(MB)"]] <= memory)
    )
    filtered_df = raw_df[keep]

    filtered_table = get_benchmark_table(filtered_df)
    filtered_chart = get_benchmark_chart(filtered_df)
    return filtered_table, filtered_chart
|
|
|
|
|
|
|
# Gradio interface: one tab per benchmark machine (table + plot), a control
# panel that filters every machine's table and plot, an "About" tab and a
# citation box.
# NOTE(review): the symbols inside the UI labels below look mis-encoded; they
# are kept as found (apart from re-joining the one label that had been split
# across two lines) and should be restored from the original file.
demo = gr.Blocks(css=custom_css)
with demo:
    # logo
    gr.HTML(f'<img src="{LOGO_URL}">', elem_classes="logo")
    # leaderboard title
    gr.HTML(TITLE)
    # introduction text
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="descriptive-text")

    with gr.Tabs(elem_classes="leaderboard-tabs"):
        machine_placeholders = {}
        machine_tables = {}
        machine_plots = {}

        # ---- hardware tabs ----
        for i, (machine, hardware) in enumerate(MACHINE_TO_HARDWARE.items()):
            # Hidden textbox that carries the machine name into filter_query as
            # an ordinary event input.
            machine_placeholders[machine] = gr.Textbox(value=machine, visible=False)

            with gr.TabItem(hardware, id=i):
                with gr.Tabs(elem_classes="machine-tabs"):
                    # Unfiltered data shown until the first filter click.
                    machine_df = get_benchmark_df(machine=machine)
                    with gr.TabItem("Leaderboard π", id=0):
                        gr.HTML(
                            "π Scroll to the right π for additional columns.",
                            elem_id="descriptive-text",
                        )
                        machine_tables[machine] = gr.components.Dataframe(
                            value=get_benchmark_table(machine_df),
                            headers=list(ALL_COLUMNS_MAPPING.values()),
                            datatype=ALL_COLUMNS_DATATYPES,
                            elem_id="machine-table",
                        )
                    with gr.TabItem("Plot π", id=1):
                        gr.HTML(
                            "π Hover over the points π for additional information.",
                            elem_id="descriptive-text",
                        )
                        machine_plots[machine] = gr.components.Plot(
                            value=get_benchmark_chart(machine_df),
                            elem_id="machine-plot",
                            show_label=False,
                        )

        # ---- control panel ----
        with gr.TabItem("Control Panel ποΈ", id=2):
            gr.HTML(
                "Use this control panel to filter the leaderboard's table and plot.",
                elem_id="descriptive-text",
            )
            with gr.Row():
                with gr.Column():
                    search_bar = gr.Textbox(
                        label="Model π€",
                        info="π Search for a model name",
                        elem_id="search-bar",
                    )
            with gr.Row():
                with gr.Column(scale=1):
                    score_slider = gr.Slider(
                        label="Open LLM Score (%) π",
                        info="ποΈ Slide to minimum Open LLM score",
                        value=0,
                        elem_id="threshold-slider",
                    )
                with gr.Column(scale=1):
                    memory_slider = gr.Slider(
                        label="Peak Memory (MB) π",
                        info="ποΈ Slide to maximum Peak Memory",
                        minimum=0,
                        maximum=80 * 1024,
                        value=80 * 1024,
                        elem_id="memory-slider",
                    )
                with gr.Column(scale=1):
                    backend_checkboxes = gr.CheckboxGroup(
                        label="Backends π­",
                        choices=["pytorch", "onnxruntime"],
                        value=["pytorch", "onnxruntime"],
                        info="βοΈ Select the backends",
                        elem_id="backend-checkboxes",
                    )
            with gr.Row():
                with gr.Column(scale=1):
                    datatype_checkboxes = gr.CheckboxGroup(
                        label="Load Dtypes π₯",
                        choices=["float32", "float16"],
                        value=["float32", "float16"],
                        info="βοΈ Select the load dtypes",
                        elem_id="dtype-checkboxes",
                    )
                with gr.Column(scale=1):
                    optimization_checkboxes = gr.CheckboxGroup(
                        label="Optimizations π οΈ",
                        choices=["None", "BetterTransformer", "FlashAttentionV2"],
                        value=["None", "BetterTransformer", "FlashAttentionV2"],
                        info="βοΈ Select the optimization",
                        elem_id="optimization-checkboxes",
                    )
                with gr.Column(scale=1):
                    # The choices must equal the labels get_benchmark_df writes
                    # into the quantization column, because filter_query keeps
                    # rows with isin(). The former "GPTQ.4bit" choice matched
                    # no label, so every GPTQ row vanished on the first click.
                    quantization_checkboxes = gr.CheckboxGroup(
                        label="Quantizations ποΈ",
                        choices=[
                            "None",
                            "BnB.4bit",
                            "GPTQ.4bit+ExllamaV1",
                            "GPTQ.4bit+ExllamaV2",
                        ],
                        value=[
                            "None",
                            "BnB.4bit",
                            "GPTQ.4bit+ExllamaV1",
                            "GPTQ.4bit+ExllamaV2",
                        ],
                        info="βοΈ Select the quantization schemes",
                        elem_id="quantization-checkboxes",
                    )
            with gr.Row():
                filter_button = gr.Button(
                    value="Filter π",
                    elem_id="filter-button",
                )
            # One click handler per machine: each refreshes that machine's own
            # table and plot from the shared filter widgets.
            for machine in MACHINE_TO_HARDWARE:
                filter_button.click(
                    filter_query,
                    [
                        search_bar,
                        backend_checkboxes,
                        datatype_checkboxes,
                        optimization_checkboxes,
                        quantization_checkboxes,
                        score_slider,
                        memory_slider,
                        machine_placeholders[machine],
                    ],
                    [machine_tables[machine], machine_plots[machine]],
                )

        # ---- about tab ----
        with gr.TabItem("About π", id=3):
            gr.HTML(ABOUT_TEXT, elem_classes="descriptive-text")
            gr.Markdown(EXAMPLE_CONFIG_TEXT, elem_classes="descriptive-text")

    # ---- citation ----
    with gr.Row():
        with gr.Accordion("π Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
                show_copy_button=True,
            )

# Enable request queuing, then start the server.
demo.queue().launch()
|
|