Spaces:
Running
Running
Arnav Chavan
commited on
Commit
Β·
2fcb72a
1
Parent(s):
88357e8
initial commit
Browse files- README.md +53 -2
- app.py +118 -0
- dataset/llm-perf-leaderboard-Raspberry Pi 5(8GB).csv +126 -0
- hardware.yaml +10 -0
- requirements.txt +3 -0
- src/assets.py +53 -0
- src/content.py +35 -0
- src/dependency.py +3 -0
- src/hardware.py +26 -0
- src/leaderboard.py +70 -0
- src/llm_perf.py +101 -0
- src/panel.py +206 -0
- src/utils.py +4 -0
README.md
CHANGED
@@ -6,7 +6,58 @@ colorTo: blue
|
|
6 |
sdk: gradio
|
7 |
sdk_version: 5.8.0
|
8 |
app_file: app.py
|
9 |
-
pinned:
|
|
|
|
|
10 |
---
|
11 |
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
sdk: gradio
|
7 |
sdk_version: 5.8.0
|
8 |
app_file: app.py
|
9 |
+
pinned: true
|
10 |
+
license: apache-2.0
|
11 |
+
tags: [edge llm leaderboard, llm edge leaderboard, llm, edge, leaderboard]
|
12 |
---
|
13 |
|
14 |
+
# Edge-LLM Leaderboard
|
15 |
+
|
16 |
+
## π About
|
17 |
+
The Edge-LLM Leaderboard is a leaderboard to gauge practical performance and quality of edge LLMs.
|
18 |
+
Its aim is to benchmark the performance (throughput and memory)
|
19 |
+
of Large Language Models (LLMs) on Edge hardware - starting with a Raspberry Pi 5 (8GB) based on the ARM Cortex A76 CPU.
|
20 |
+
|
21 |
+
Anyone from the community can request a new base model or edge hardware/backend/optimization
|
22 |
+
configuration for automated benchmarking:
|
23 |
+
|
24 |
+
- Model evaluation requests will go live soon; in the meantime, feel free to email arnav[dot]chavan[@]nyunai[dot]com
|
25 |
+
|
26 |
+
## βοΈ Details
|
27 |
+
|
28 |
+
- To avoid multi-thread discrepancies, all 4 threads are used on the Pi 5.
|
29 |
+
- LLMs are running on a singleton batch with a prompt size of 512 and generating 128 tokens.
|
30 |
+
|
31 |
+
All of our throughput benchmarks are run by this single tool
|
32 |
+
[llama-bench](https://github.com/ggerganov/llama.cpp/tree/master/examples/llama-bench)
|
33 |
+
using the power of [llama.cpp](https://github.com/ggerganov/llama.cpp) to guarantee reproducibility and consistency.
|
34 |
+
|
35 |
+
## π How to run locally
|
36 |
+
|
37 |
+
To run the Edge-LLM Leaderboard locally on your machine, follow these steps:
|
38 |
+
|
39 |
+
### 1. Clone the Repository
|
40 |
+
|
41 |
+
First, clone the repository to your local machine:
|
42 |
+
|
43 |
+
```bash
|
44 |
+
git clone https://huggingface.co/spaces/nyunai/edge-llm-leaderboard
|
45 |
+
cd edge-llm-leaderboard
|
46 |
+
```
|
47 |
+
|
48 |
+
### 2. Install the Required Dependencies
|
49 |
+
|
50 |
+
Install the necessary Python packages listed in the requirements.txt file:
|
51 |
+
`pip install -r requirements.txt`
|
52 |
+
|
53 |
+
### 3. Run the Application
|
54 |
+
|
55 |
+
You can run the Gradio application in one of the following ways:
|
56 |
+
- Option 1: Using Python
|
57 |
+
`python app.py`
|
58 |
+
- Option 2: Using Gradio CLI (includes hot-reload)
|
59 |
+
`gradio app.py`
|
60 |
+
|
61 |
+
### 4. Access the Application
|
62 |
+
|
63 |
+
Once the application is running, you can access it locally in your web browser at http://127.0.0.1:7860/
|
app.py
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
|
3 |
+
import src.dependency # noqa
|
4 |
+
from src.assets import custom_css
|
5 |
+
|
6 |
+
# from src.attention import create_attn_plots
|
7 |
+
from src.content import ABOUT, CITATION_BUTTON, CITATION_BUTTON_LABEL, LOGO, TITLE
|
8 |
+
from src.hardware import load_hardware_configs
|
9 |
+
from src.leaderboard import create_leaderboard_table
|
10 |
+
from src.llm_perf import get_llm_perf_df
|
11 |
+
from src.panel import (
|
12 |
+
create_control_callback,
|
13 |
+
create_control_panel,
|
14 |
+
create_select_callback,
|
15 |
+
)
|
16 |
+
|
17 |
+
# Hardware entries are read from hardware.yaml; each one becomes a top-level tab.
configs = load_hardware_configs("hardware.yaml")


demo = gr.Blocks(
    css=custom_css,
    theme=gr.themes.Default(primary_hue="indigo", secondary_hue="indigo"),
)
with demo:
    gr.HTML(LOGO, elem_classes="logo")
    gr.HTML(TITLE, elem_classes="title")
    ####################### HARDWARE TABS #######################
    with gr.Tabs(elem_classes="tabs"):
        # Named `tab_id` (not `id`) so the builtin id() is not shadowed.
        for tab_id, config in enumerate(configs):
            with gr.TabItem(config.description, id=tab_id):
                ####################### HARDWARE DETAILS #######################
                if config.detail:
                    gr.Markdown(config.detail, elem_classes="descriptive-text")

                ######################## CONTROL PANEL #######################
                (
                    filter_button,
                    machine_value,
                    backends_value,
                    hardware_type_value,
                    memory_slider,
                    quantization_checkboxes,
                ) = create_control_panel(
                    machine=config.machine,
                    backends=config.backends,
                    hardware_provider=config.hardware_provider,
                    hardware_type=config.hardware_type,
                )
                ####################### HARDWARE SUBTABS #######################
                with gr.Tabs(elem_classes="subtabs"):
                    open_llm_perf_df = get_llm_perf_df(
                        machine=config.machine,
                        backends=config.backends,
                        hardware_type=config.hardware_type,
                    )
                    ####################### LEADERBOARD TAB #######################
                    with gr.TabItem("Leaderboard π", id=0):
                        search_bar, columns_checkboxes, leaderboard_table = (
                            create_leaderboard_table(open_llm_perf_df)
                        )
                ####################### CONTROL CALLBACK #######################
                # The "Filter" button re-filters the table from the panel inputs.
                create_control_callback(
                    filter_button,
                    # inputs
                    machine_value,
                    backends_value,
                    hardware_type_value,
                    memory_slider,
                    quantization_checkboxes,
                    # interactive
                    columns_checkboxes,
                    search_bar,
                    # outputs
                    leaderboard_table,
                )

                create_select_callback(
                    # inputs
                    machine_value,
                    backends_value,
                    hardware_type_value,
                    # interactive
                    columns_checkboxes,
                    search_bar,
                    # outputs
                    leaderboard_table,
                )

        ####################### ABOUT TAB #######################
        # Placed after the hardware tabs, hence id=len(configs).
        with gr.TabItem("About π", id=len(configs)):
            gr.Markdown(ABOUT, elem_classes="descriptive-text")
    ####################### CITATION
    with gr.Row():
        with gr.Accordion("π Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
                show_copy_button=True,
            )

if __name__ == "__main__":
    # Launch demo
    demo.queue().launch()
|
dataset/llm-perf-leaderboard-Raspberry Pi 5(8GB).csv
ADDED
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,Quantization,Params (B),Model Size (GB),Prefill (tokens/s),Decode (tokens/s),Backend
|
2 |
+
gemma-2-9b,Q8_0,10.159,10.796,2.169,0.012,llama_cpp
|
3 |
+
DeepSeek-V2-Lite,Q4_K_M,15.706,10.36,4.304,1.764,llama_cpp
|
4 |
+
aya-expanse-8b,Q8_0,9.077,9.644,3.1,0.027,llama_cpp
|
5 |
+
Yi-1.5-9B,Q8_0,8.829,9.382,2.585,0.019,llama_cpp
|
6 |
+
Qwen2.5-14B,Q4_K_M,14.77,8.982,1.916,0.018,llama_cpp
|
7 |
+
DeepSeek-V2-Lite,Q4_0_4_4,15.706,8.901,7.788,3.867,llama_cpp
|
8 |
+
Phi-3-medium-128k-instruct,Q4_K_M,13.96,8.566,1.819,0.02,llama_cpp
|
9 |
+
Hermes-3-Llama-3.1-8B,Q8_0,8.03,8.533,3.286,0.922,llama_cpp
|
10 |
+
Qwen2.5-14B,Q4_0_4_4,14.77,8.512,4.698,0.028,llama_cpp
|
11 |
+
internlm2_5-7b-chat,Q8_0,7.738,8.222,3.258,1.238,llama_cpp
|
12 |
+
dolphin-2.9.2-qwen2-7b,Q8_0,7.616,8.093,4.241,1.301,llama_cpp
|
13 |
+
Qwen2.5-7B,Q8_0,7.616,8.093,4.253,1.302,llama_cpp
|
14 |
+
Phi-3-medium-128k-instruct,Q4_0_4_4,13.96,7.896,4.715,0.038,llama_cpp
|
15 |
+
NexusRaven-V2-13B,Q4_K_M,13.016,7.865,2.066,0.035,llama_cpp
|
16 |
+
Mistral-7B-Instruct-v0.3,Q8_0,7.248,7.702,4.104,1.29,llama_cpp
|
17 |
+
dolphin-2.9.3-mistral-7B-32k,Q8_0,7.248,7.702,4.135,1.294,llama_cpp
|
18 |
+
Yarn-Mistral-7b-128k,Q8_0,7.242,7.695,4.082,1.292,llama_cpp
|
19 |
+
Starling-LM-7B-beta,Q8_0,7.242,7.695,4.132,1.296,llama_cpp
|
20 |
+
Mistral-Nemo-Base-2407,Q4_K_M,12.248,7.469,2.453,1.358,llama_cpp
|
21 |
+
NexusRaven-V2-13B,Q4_0_4_4,13.016,7.365,4.979,1.348,llama_cpp
|
22 |
+
OLMoE-1B-7B-0924,Q8_0,6.919,7.358,26.942,7.489,llama_cpp
|
23 |
+
OLMo-7B-0724-hf,Q8_0,6.888,7.319,4.515,1.371,llama_cpp
|
24 |
+
mpt-7b-instruct,Q8_0,6.856,7.285,4.287,1.367,llama_cpp
|
25 |
+
Amber,Q8_0,6.738,7.16,4.442,1.373,llama_cpp
|
26 |
+
Mistral-Nemo-Base-2407,Q4_0_4_4,12.248,7.064,9.103,1.48,llama_cpp
|
27 |
+
gemma-2-9b,Q4_K_M,10.159,6.508,3.531,1.629,llama_cpp
|
28 |
+
Yarn-Solar-10b-64k,Q4_K_M,10.732,6.461,2.905,1.503,llama_cpp
|
29 |
+
SOLAR-10.7B-v1.0,Q4_K_M,10.732,6.461,2.925,1.505,llama_cpp
|
30 |
+
SOLAR-10.7B-Instruct-v1.0,Q4_K_M,10.732,6.461,2.916,1.506,llama_cpp
|
31 |
+
Yi-1.5-6B,Q8_0,6.061,6.441,5.269,1.584,llama_cpp
|
32 |
+
gemma-2-9b,Q4_0_4_4,10.159,6.19,10.553,1.757,llama_cpp
|
33 |
+
SOLAR-10.7B-v1.0,Q4_0_4_4,10.732,6.072,9.315,1.635,llama_cpp
|
34 |
+
SOLAR-10.7B-Instruct-v1.0,Q4_0_4_4,10.732,6.072,9.332,1.635,llama_cpp
|
35 |
+
Yarn-Solar-10b-64k,Q4_0_4_4,10.732,6.072,9.352,1.638,llama_cpp
|
36 |
+
aya-expanse-8b,Q4_K_M,9.077,5.906,4.406,1.911,llama_cpp
|
37 |
+
aya-23-8B,Q4_K_M,9.077,5.906,4.428,1.914,llama_cpp
|
38 |
+
aya-expanse-8b,Q4_0_4_4,9.077,5.647,14.074,2.05,llama_cpp
|
39 |
+
aya-23-8B,Q4_0_4_4,9.077,5.647,14.113,2.051,llama_cpp
|
40 |
+
Yi-1.5-9B,Q4_K_M,8.829,5.327,3.681,1.85,llama_cpp
|
41 |
+
Yi-1.5-9B,Q4_0_4_4,8.829,5.035,11.33,2.0,llama_cpp
|
42 |
+
Hermes-3-Llama-3.1-8B,Q4_K_M,8.03,4.913,4.375,2.078,llama_cpp
|
43 |
+
Llama-3.1-8B,Q4_K_M,8.03,4.913,4.403,2.086,llama_cpp
|
44 |
+
internlm2_5-7b-chat,Q4_K_M,7.738,4.711,4.4,2.133,llama_cpp
|
45 |
+
Qwen2.5-7B,Q4_K_M,7.616,4.677,4.769,2.201,llama_cpp
|
46 |
+
dolphin-2.9.2-qwen2-7b,Q4_K_M,7.616,4.677,4.759,2.204,llama_cpp
|
47 |
+
Llama-3.1-8B,Q4_0_4_4,8.03,4.653,13.99,2.245,llama_cpp
|
48 |
+
Hermes-3-Llama-3.1-8B,Q4_0_4_4,8.03,4.653,14.006,2.245,llama_cpp
|
49 |
+
internlm2_5-7b-chat,Q4_0_4_4,7.738,4.451,14.036,2.31,llama_cpp
|
50 |
+
mpt-7b-instruct,Q4_K_M,6.856,4.442,4.162,2.213,llama_cpp
|
51 |
+
Qwen2.5-7B,Q4_0_4_4,7.616,4.425,15.563,2.386,llama_cpp
|
52 |
+
dolphin-2.9.2-qwen2-7b,Q4_0_4_4,7.616,4.425,15.58,2.387,llama_cpp
|
53 |
+
dolphin-2.9.3-mistral-7B-32k,Q4_K_M,7.248,4.372,4.387,2.227,llama_cpp
|
54 |
+
Mistral-7B-Instruct-v0.3,Q4_K_M,7.248,4.372,4.462,2.241,llama_cpp
|
55 |
+
Starling-LM-7B-beta,Q4_K_M,7.242,4.368,4.406,2.234,llama_cpp
|
56 |
+
Yarn-Mistral-7b-128k,Q4_K_M,7.242,4.368,4.434,2.245,llama_cpp
|
57 |
+
OLMoE-1B-7B-0924,Q4_K_M,6.919,4.212,26.902,12.119,llama_cpp
|
58 |
+
OLMo-7B-0724-hf,Q4_K_M,6.888,4.183,4.706,2.339,llama_cpp
|
59 |
+
dolphin-2.9.3-mistral-7B-32k,Q4_0_4_4,7.248,4.113,14.053,2.427,llama_cpp
|
60 |
+
Mistral-7B-Instruct-v0.3,Q4_0_4_4,7.248,4.113,14.177,2.43,llama_cpp
|
61 |
+
Starling-LM-7B-beta,Q4_0_4_4,7.242,4.108,14.068,2.427,llama_cpp
|
62 |
+
Yarn-Mistral-7b-128k,Q4_0_4_4,7.242,4.108,14.139,2.436,llama_cpp
|
63 |
+
Amber,Q4_K_M,6.738,4.08,4.594,2.351,llama_cpp
|
64 |
+
Phi-3.5-mini-instruct,Q8_0,3.821,4.06,7.951,2.423,llama_cpp
|
65 |
+
Phi-3-mini-128k-instruct,Q8_0,3.821,4.06,7.947,2.426,llama_cpp
|
66 |
+
mpt-7b-instruct,Q4_0_4_4,6.856,3.964,14.569,2.533,llama_cpp
|
67 |
+
OLMoE-1B-7B-0924,Q4_0_4_4,6.919,3.926,50.413,12.989,llama_cpp
|
68 |
+
Amber,Q4_0_4_4,6.738,3.825,14.442,2.57,llama_cpp
|
69 |
+
Yi-1.5-6B,Q4_K_M,6.061,3.672,5.58,2.72,llama_cpp
|
70 |
+
Qwen2.5-3B,Q8_0,3.397,3.61,10.473,2.939,llama_cpp
|
71 |
+
Yi-1.5-6B,Q4_0_4_4,6.061,3.478,17.017,2.945,llama_cpp
|
72 |
+
dolphin-2.9.4-gemma2-2b,Q8_0,3.204,3.405,13.966,3.381,llama_cpp
|
73 |
+
gemma-2-2b,Q8_0,3.204,3.405,13.996,3.385,llama_cpp
|
74 |
+
stable-code-instruct-3b,Q8_0,2.795,2.971,10.668,3.316,llama_cpp
|
75 |
+
Phi-3.5-mini-instruct,Q4_K_M,3.821,2.393,7.502,3.936,llama_cpp
|
76 |
+
Phi-3-mini-128k-instruct,Q4_K_M,3.821,2.393,7.519,3.938,llama_cpp
|
77 |
+
Llama-3.2-3B,Q4_K_M,3.607,2.335,10.691,4.674,llama_cpp
|
78 |
+
Llama-3.2-3B,Q4_0_4_4,3.607,2.233,31.72,5.025,llama_cpp
|
79 |
+
gemma-2-2b,Q4_K_M,3.204,2.186,14.202,5.253,llama_cpp
|
80 |
+
dolphin-2.9.4-gemma2-2b,Q4_K_M,3.204,2.186,14.218,5.253,llama_cpp
|
81 |
+
Qwen2.5-3B,Q4_K_M,3.397,2.179,10.638,4.808,llama_cpp
|
82 |
+
Phi-3.5-mini-instruct,Q4_0_4_4,3.821,2.175,23.369,4.428,llama_cpp
|
83 |
+
Phi-3-mini-128k-instruct,Q4_0_4_4,3.821,2.175,23.461,4.436,llama_cpp
|
84 |
+
gemma-2-2b,Q4_0_4_4,3.204,2.107,40.616,5.552,llama_cpp
|
85 |
+
dolphin-2.9.4-gemma2-2b,Q4_0_4_4,3.204,2.107,40.977,5.58,llama_cpp
|
86 |
+
Qwen2.5-3B,Q4_0_4_4,3.397,2.072,32.434,5.239,llama_cpp
|
87 |
+
internlm2_5-1_8b-chat,Q8_0,1.889,2.007,19.329,5.279,llama_cpp
|
88 |
+
SmolLM2-1.7B-Instruct,Q8_0,1.812,1.926,17.524,5.177,llama_cpp
|
89 |
+
Qwen2.5-1.5B,Q8_0,1.777,1.889,21.927,5.793,llama_cpp
|
90 |
+
stable-code-instruct-3b,Q4_K_M,2.795,1.707,10.803,5.564,llama_cpp
|
91 |
+
stable-code-instruct-3b,Q4_0_4_4,2.795,1.607,28.926,5.957,llama_cpp
|
92 |
+
Yi-Coder-1.5B,Q8_0,1.476,1.569,23.894,6.596,llama_cpp
|
93 |
+
OLMo-1B-0724-hf,Q8_0,1.28,1.36,27.787,7.591,llama_cpp
|
94 |
+
Qwen2.5-1.5B,Q4_K_M,1.777,1.172,22.326,9.56,llama_cpp
|
95 |
+
internlm2_5-1_8b-chat,Q4_K_M,1.889,1.17,19.453,8.56,llama_cpp
|
96 |
+
TinyLlama-1.1B-Chat-v1.0,Q8_0,1.1,1.169,28.472,8.637,llama_cpp
|
97 |
+
TinyLlama_v1.1,Q8_0,1.1,1.169,28.538,8.652,llama_cpp
|
98 |
+
SmolLM2-1.7B-Instruct,Q4_K_M,1.812,1.136,17.72,8.497,llama_cpp
|
99 |
+
Qwen2.5-1.5B,Q4_0_4_4,1.777,1.12,65.915,10.128,llama_cpp
|
100 |
+
internlm2_5-1_8b-chat,Q4_0_4_4,1.889,1.112,57.736,9.243,llama_cpp
|
101 |
+
SmolLM2-1.7B-Instruct,Q4_0_4_4,1.812,1.072,50.27,9.239,llama_cpp
|
102 |
+
Llama-3.2-1B,Q4_K_M,1.498,1.015,30.451,11.51,llama_cpp
|
103 |
+
Llama-3.2-1B,Q4_0_4_4,1.498,0.979,86.772,12.364,llama_cpp
|
104 |
+
Yi-Coder-1.5B,Q4_K_M,1.476,0.962,23.267,10.03,llama_cpp
|
105 |
+
Yi-Coder-1.5B,Q4_0_4_4,1.476,0.865,67.713,11.422,llama_cpp
|
106 |
+
OLMo-1B-0724-hf,Q4_K_M,1.28,0.79,28.276,12.321,llama_cpp
|
107 |
+
OLMo-1B-0724-hf,Q4_0_4_4,1.28,0.746,84.882,13.339,llama_cpp
|
108 |
+
Qwen2.5-0.5B,Q8_0,0.63,0.67,75.456,18.06,llama_cpp
|
109 |
+
TinyLlama-1.1B-Chat-v1.0,Q4_K_M,1.1,0.667,29.44,14.305,llama_cpp
|
110 |
+
TinyLlama_v1.1,Q4_K_M,1.1,0.667,29.397,14.346,llama_cpp
|
111 |
+
TinyLlama-1.1B-Chat-v1.0,Q4_0_4_4,1.1,0.636,77.823,15.509,llama_cpp
|
112 |
+
TinyLlama_v1.1,Q4_0_4_4,1.1,0.636,77.943,15.543,llama_cpp
|
113 |
+
Qwen2.5-0.5B,Q4_K_M,0.63,0.537,52.916,22.324,llama_cpp
|
114 |
+
Qwen2.5-0.5B,Q4_0_4_4,0.63,0.491,189.874,26.738,llama_cpp
|
115 |
+
gpt2-medium,Q8_0,0.406,0.436,83.423,23.016,llama_cpp
|
116 |
+
SmolLM2-360M-Instruct,Q8_0,0.409,0.435,79.518,22.857,llama_cpp
|
117 |
+
SmolLM2-360M-Instruct,Q4_K_M,0.409,0.319,55.774,30.718,llama_cpp
|
118 |
+
SmolLM2-360M-Instruct,Q4_0_4_4,0.409,0.277,173.275,37.176,llama_cpp
|
119 |
+
gpt2-medium,Q4_K_M,0.406,0.269,73.615,33.913,llama_cpp
|
120 |
+
gpt2-medium,Q4_0_4_4,0.406,0.247,178.73,37.89,llama_cpp
|
121 |
+
gpt2,Q8_0,0.163,0.176,302.932,68.191,llama_cpp
|
122 |
+
SmolLM2-135M-Instruct,Q8_0,0.163,0.173,212.146,57.992,llama_cpp
|
123 |
+
SmolLM2-135M-Instruct,Q4_K_M,0.163,0.134,153.439,73.272,llama_cpp
|
124 |
+
SmolLM2-135M-Instruct,Q4_0_4_4,0.163,0.12,381.667,86.735,llama_cpp
|
125 |
+
gpt2,Q4_K_M,0.163,0.111,269.906,92.707,llama_cpp
|
126 |
+
gpt2,Q4_0_4_4,0.163,0.105,582.32,101.509,llama_cpp
|
hardware.yaml
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
- machine: Raspberry Pi 5(8GB)
|
2 |
+
description: Cortex A76
|
3 |
+
hardware_provider: ARM
|
4 |
+
hardware_type: arm cortex a76
|
5 |
+
subsets:
|
6 |
+
- Q8_0
|
7 |
+
- Q4_K_M
|
8 |
+
- Q4_0_4_4
|
9 |
+
backends:
|
10 |
+
- llama_cpp
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
huggingface_hub
|
2 |
+
gradio>=5.0.0
|
3 |
+
pandas
|
src/assets.py
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Stylesheet handed to gr.Blocks(css=...) in app.py. Selectors target the
# elem_classes / elem_id values assigned to the Gradio components.
custom_css = """
.logo {
    width: 300px;
    height: auto;
    margin: 0 auto;
    max-width: 100%;
    object-fit: contain;
    overflow: visible !important;
}
.text {
    font-size: 16px !important;
}

.tabs button {
    font-size: 20px;
}
.subtabs button {
    font-size: 20px;
}

.descriptive-text span {
    font-size: 16px !important;
}

#control-panel span {
    font-size: 20px !important;
}
#search-bar span {
    font-size: 16px !important;
}
#threshold-slider span {
    font-size: 16px !important;
}
#memory-slider span {
    font-size: 16px !important;
}
#columns-checkboxes span {
    font-size: 16px !important;
}
#backend-checkboxes span {
    font-size: 16px !important;
}
#quantization-checkboxes span {
    font-size: 16px !important;
}

#leaderboard-table td:first-child,
#leaderboard-table th:first-child {
    max-width: 300px;
    overflow: auto;
    white-space: nowrap;
}
"""
|
src/content.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
LOGO = '<img src="https://nyunai.com/assets/images/logo.png">'
|
2 |
+
|
3 |
+
TITLE = """<h1 align="center" id="space-title"> Edge-LLM Leaderboard </h1>"""
|
4 |
+
|
5 |
+
# Markdown rendered in the "About" tab of the app (see gr.Markdown(ABOUT, ...)).
ABOUT = """
## π About
The Edge-LLM Leaderboard is a leaderboard to gauge practical performance and quality of edge LLMs.
Its aim is to benchmark the performance (throughput and memory)
of Large Language Models (LLMs) on Edge hardware - starting with a Raspberry Pi 5 (8GB) based on the ARM Cortex A76 CPU.

Anyone from the community can request a new base model or edge hardware/backend/optimization
configuration for automated benchmarking:

- Model evaluation requests will be made live soon, in the meantime feel free to email to - arnav[dot]chavan[@]nyunai[dot]com

## βοΈ Details

- To avoid multi-thread discrepancies, all 4 threads are used on the Pi 5.
- LLMs are running on a singleton batch with a prompt size of 512 and generating 128 tokens.

All of our throughput benchmarks are run by this single tool
[llama-bench](https://github.com/ggerganov/llama.cpp/tree/master/examples/llama-bench)
using the power of [llama.cpp](https://github.com/ggerganov/llama.cpp) to guarantee reproducibility and consistency.
"""
|
25 |
+
|
26 |
+
|
27 |
+
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results."
|
28 |
+
CITATION_BUTTON = r"""@misc{edge-llm-leaderboard,
|
29 |
+
author = {},
|
30 |
+
title = {Edge-LLM Leaderboard},
|
31 |
+
year = {2024},
|
32 |
+
publisher = {},
|
33 |
+
howpublished = "\url{https://huggingface.co/spaces/nyunai/edge-llm-leaderboard}",
|
34 |
+
}
|
35 |
+
"""
|
src/dependency.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
# Import-time side effect: app.py imports this module right after gradio and
# before the other src.* modules, so the variable is set before they load.
# NOTE(review): presumably silences advisory warnings from the `transformers`
# library - confirm it is still needed; requirements.txt does not list it.
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"
|
src/hardware.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Any, Dict, List, Optional
|
2 |
+
|
3 |
+
import yaml
|
4 |
+
|
5 |
+
|
6 |
+
class HardwareConfig:
    """One hardware entry of the leaderboard configuration.

    Args:
        data: Mapping holding the entry's fields. ``machine``,
            ``description``, ``hardware_provider``, ``hardware_type``,
            ``subsets`` and ``backends`` are required; ``detail`` is optional.

    Raises:
        KeyError: If a required field is missing from ``data``.
    """

    def __init__(self, data: Dict[str, Any]):
        self.machine: str = data["machine"]
        self.description: str = data["description"]
        self.hardware_provider: str = data["hardware_provider"]
        self.hardware_type: str = data["hardware_type"]
        self.subsets: List[str] = data["subsets"]
        self.backends: List[str] = data["backends"]
        # Optional free-form text; app.py renders it only when it is truthy.
        self.detail: Optional[str] = data.get("detail")

    def __repr__(self) -> str:
        # !r quotes every field the same way and makes None visible.
        return (
            f"HardwareConfig(machine={self.machine!r}, description={self.description!r}, "
            f"hardware_provider={self.hardware_provider!r}, hardware_type={self.hardware_type!r}, "
            f"subsets={self.subsets!r}, backends={self.backends!r}, detail={self.detail!r})"
        )
|
21 |
+
|
22 |
+
|
23 |
+
def load_hardware_configs(file_path: str) -> List[HardwareConfig]:
    """Read a YAML file containing a list of hardware entries.

    Args:
        file_path: Path of the YAML file to read.

    Returns:
        One ``HardwareConfig`` per entry, in file order; an empty list when
        the file contains no document.
    """
    # Explicit encoding so the result does not depend on the platform locale.
    with open(file_path, "r", encoding="utf-8") as file:
        data = yaml.safe_load(file)
    # safe_load gives None for an empty file; treat that as "no entries".
    return [HardwareConfig(config) for config in data or []]
|
src/leaderboard.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
|
3 |
+
from src.utils import model_hyperlink
|
4 |
+
|
5 |
+
LEADERBOARD_COLUMN_TO_DATATYPE = {
|
6 |
+
# open llm
|
7 |
+
"Model": "markdown",
|
8 |
+
"Quantization": "str",
|
9 |
+
# primary measurements
|
10 |
+
"Prefill (tokens/s)": "number",
|
11 |
+
"Decode (tokens/s)": "number",
|
12 |
+
"Model Size (GB)": "number",
|
13 |
+
# deployment settings
|
14 |
+
"Backend": "str",
|
15 |
+
# additional measurements
|
16 |
+
# "Reserved Memory (MB)": "number",
|
17 |
+
# "Used Memory (MB)": "number",
|
18 |
+
"Params (B)": "number",
|
19 |
+
}
|
20 |
+
|
21 |
+
PRIMARY_COLUMNS = [
|
22 |
+
"Model",
|
23 |
+
"Quantization",
|
24 |
+
"Prefill (tokens/s)",
|
25 |
+
"Decode (tokens/s)",
|
26 |
+
"Model Size (GB)",
|
27 |
+
]
|
28 |
+
|
29 |
+
|
30 |
+
def process_model(model_name):
    """Return ``model_hyperlink`` applied to the model's huggingface.co URL and its name."""
    return model_hyperlink(f"https://huggingface.co/{model_name}", model_name)
|
33 |
+
|
34 |
+
|
35 |
+
def get_leaderboard_df(llm_perf_df):
    """Return a copy of ``llm_perf_df`` with every "Model" value passed through ``process_model``.

    The input frame is left unmodified.
    """
    leaderboard_df = llm_perf_df.copy()
    linked_models = leaderboard_df["Model"].apply(process_model)
    leaderboard_df["Model"] = linked_models
    return leaderboard_df
|
40 |
+
|
41 |
+
|
42 |
+
def create_leaderboard_table(llm_perf_df):
    """Create the search bar, column selector and table of a leaderboard tab.

    Must be called inside an active Gradio layout context, since the
    components are instantiated here.

    Args:
        llm_perf_df: Performance frame; it needs at least the columns listed
            in ``PRIMARY_COLUMNS``.

    Returns:
        The tuple ``(search_bar, columns_checkboxes, leaderboard_table)``.
    """
    leaderboard_df = get_leaderboard_df(llm_perf_df)

    # create search bar
    with gr.Row():
        search_bar = gr.Textbox(
            label="Model",
            info="π Search for a model name",
            elem_id="search-bar",
        )
    # create checkboxes
    with gr.Row():
        columns_checkboxes = gr.CheckboxGroup(
            label="Columns π",
            value=PRIMARY_COLUMNS,
            choices=list(LEADERBOARD_COLUMN_TO_DATATYPE),
            info="βοΈ Select the columns to display",
            elem_id="columns-checkboxes",
        )
    # create table
    # Headers and datatypes are derived from the columns actually displayed,
    # so they always line up with the value one-to-one.
    leaderboard_table = gr.components.Dataframe(
        value=leaderboard_df[PRIMARY_COLUMNS],
        datatype=[LEADERBOARD_COLUMN_TO_DATATYPE[column] for column in PRIMARY_COLUMNS],
        headers=PRIMARY_COLUMNS,
        elem_id="leaderboard-table",
    )

    return search_bar, columns_checkboxes, leaderboard_table
|
src/llm_perf.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from typing import List
|
3 |
+
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
DATASET_DIRECTORY = "dataset"
|
7 |
+
|
8 |
+
COLUMNS_MAPPING = {
|
9 |
+
"config.name": "Quantization",
|
10 |
+
"config.backend.model": "Model",
|
11 |
+
# primary measurements
|
12 |
+
"report.prefill.throughput.value": "Prefill (tokens/s)",
|
13 |
+
"report.decode.throughput.value": "Decode (tokens/s)",
|
14 |
+
"report.memory": "Model Size (GB)",
|
15 |
+
# deployment settings
|
16 |
+
"config.backend.name": "Backend",
|
17 |
+
"quantization": "Quantization",
|
18 |
+
# additional information
|
19 |
+
"#Params (B)": "Params (B)",
|
20 |
+
}
|
21 |
+
SORTING_COLUMNS = ["Model Size (GB)", "Decode (tokens/s)", "Prefill (tokens/s)"]
|
22 |
+
SORTING_ASCENDING = [False, True, True]
|
23 |
+
|
24 |
+
|
25 |
+
def get_raw_llm_perf_df(
    machine: str, backends: List[str], hardware_type: str
):
    """Load the raw benchmark results as a DataFrame.

    The CSV location is read from the ``EDGE_LLM_LEADERBOARD_RESULTS_CSV``
    environment variable; when it is unset, the path that was previously
    hard-coded is used, so existing setups behave as before.

    Args:
        machine: Machine name; used only in the diagnostic output.
        backends: Backend names; used only in the diagnostic output.
        hardware_type: Hardware type; used only in the diagnostic output.

    Returns:
        The parsed CSV as a ``pandas.DataFrame``.

    Raises:
        ValueError: If the CSV cannot be read or parsed. The underlying error
            is attached as ``__cause__``.
    """
    results_path = os.environ.get(
        "EDGE_LLM_LEADERBOARD_RESULTS_CSV",
        "/Users/arnavchavan/leaderboard/benchmark_results.csv",
    )
    try:
        perf_df = pd.read_csv(results_path)
    except (OSError, ValueError) as error:
        # OSError: missing/unreadable file. ValueError: pandas parser and
        # empty-data errors as well as decoding errors.
        print("Dataset not found for:")
        print(f" β’ Machine: {machine}")
        print(f" β’ Hardware Type: {hardware_type}")
        url = f"https://huggingface.co/datasets/nyunai/edge-llm-leaderboard/blob/main/perf-df-{hardware_type}-{machine}-{backends}.csv"
        print(f" β’ URL: {url}")
        raise ValueError(
            f"No datasets found for machine {machine}, check your hardware.yaml config file or your dataset on huggingface"
        ) from error

    return perf_df
|
58 |
+
|
59 |
+
|
60 |
+
def processed_llm_perf_df(llm_perf_df):
    """Round the numeric columns to 3 decimals and sort the rows.

    Args:
        llm_perf_df: Frame containing at least the ``SORTING_COLUMNS``.

    Returns:
        A new frame, rounded and ordered by ``SORTING_COLUMNS`` with the
        directions given in ``SORTING_ASCENDING``. The input is not modified.
    """
    rounded_df = llm_perf_df.round(
        {
            "Prefill (tokens/s)": 3,
            "Decode (tokens/s)": 3,
            "Model Size (GB)": 3,
            # The dataset names this column "Params (B)"; the "#"-prefixed
            # spelling is kept for inputs that still use it. DataFrame.round
            # ignores keys that are not columns of the frame.
            "Params (B)": 3,
            "#Params (B)": 3,
        }
    )
    # sort by metric
    return rounded_df.sort_values(
        by=SORTING_COLUMNS,
        ascending=SORTING_ASCENDING,
    )
|
81 |
+
|
82 |
+
|
83 |
+
def get_llm_perf_df(
    machine: str, backends: List[str], hardware_type: str
):
    """Return the leaderboard frame for ``machine``, using a CSV cache on disk.

    The cache file is ``<DATASET_DIRECTORY>/llm-perf-leaderboard-<machine>.csv``.
    If it exists it is read and returned as-is; otherwise the raw results are
    loaded, processed, written to the cache and returned.

    Args:
        machine: Machine name; part of the cache file name.
        backends: Forwarded to ``get_raw_llm_perf_df`` on a cache miss.
        hardware_type: Forwarded to ``get_raw_llm_perf_df`` on a cache miss.

    Returns:
        The cached or freshly processed ``pandas.DataFrame``.
    """
    # exist_ok avoids the race between a separate exists() check and makedirs().
    os.makedirs(DATASET_DIRECTORY, exist_ok=True)
    cache_path = f"{DATASET_DIRECTORY}/llm-perf-leaderboard-{machine}.csv"

    if os.path.exists(cache_path):
        return pd.read_csv(cache_path)

    print(f"Dataset machine {machine} not found, downloading...")
    llm_perf_df = get_raw_llm_perf_df(machine, backends, hardware_type)
    llm_perf_df = processed_llm_perf_df(llm_perf_df)
    llm_perf_df.to_csv(cache_path, index=False)

    return llm_perf_df
|
src/panel.py
ADDED
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List
|
2 |
+
|
3 |
+
import gradio as gr
|
4 |
+
|
5 |
+
from src.leaderboard import get_leaderboard_df
|
6 |
+
from src.llm_perf import get_llm_perf_df
|
7 |
+
|
8 |
+
# from attention_implementations import get_attn_decode_fig, get_attn_prefill_fig
|
9 |
+
# from custom_kernels import get_kernel_decode_fig, get_kernel_prefill_fig
|
10 |
+
|
11 |
+
|
12 |
+
def create_control_panel(
    machine: str,
    backends: List[str],
    hardware_provider: str,
    hardware_type: str,
):
    """Create the collapsible control panel of one hardware tab.

    Must be called inside an active Gradio layout context, since the
    components are instantiated here.

    Args:
        machine: Stored in a ``gr.State`` for use by callbacks.
        backends: Stored in a ``gr.State`` for use by callbacks.
        hardware_provider: Selects the quantization choices; only ``"ARM"``
            is supported.
        hardware_type: Stored in a ``gr.State`` for use by callbacks.

    Returns:
        The tuple ``(filter_button, machine_value, backends_value,
        hardware_type_value, memory_slider, quantization_checkboxes)``.

    Raises:
        ValueError: If ``hardware_provider`` is not a known provider.
    """
    # Validate first so an unsupported provider fails before any component
    # is created.
    if hardware_provider != "ARM":
        raise ValueError(f"Unknown hardware provider: {hardware_provider}")
    quantizations = ["Q8_0", "Q4_K_M", "Q4_0_4_4"]

    # controls
    machine_value = gr.State(value=machine)
    backends_value = gr.State(value=backends)
    hardware_type_value = gr.State(value=hardware_type)

    with gr.Accordion("Control Panel", open=False, elem_id="control-panel"):
        with gr.Row():
            with gr.Column(scale=2, variant="panel"):
                memory_slider = gr.Slider(
                    label="Model Size (GB) π",
                    info="ποΈ Slide to maximum Model Size",
                    minimum=0,
                    maximum=16,
                    value=16,
                    elem_id="memory-slider",
                )
            with gr.Column(scale=1, variant="panel"):
                quantization_checkboxes = gr.CheckboxGroup(
                    label="Quantizations",
                    choices=quantizations,
                    value=quantizations,
                    info="βοΈ Select the quantization schemes",
                    elem_id="quantization-checkboxes",
                    elem_classes="boxed-option",
                )
        with gr.Row():
            filter_button = gr.Button(
                value="Filter π",
                elem_id="filter-button",
                elem_classes="boxed-option",
            )

    return (
        filter_button,
        machine_value,
        backends_value,
        hardware_type_value,
        memory_slider,
        quantization_checkboxes,
    )
|
64 |
+
|
65 |
+
def filter_rows_fn(
    machine,
    backends,
    hardware_type,
    # inputs
    memory,
    quantizations,
    # interactive
    columns,
    search,
):
    """Return the leaderboard rows that match the control-panel filters.

    Args:
        machine: Forwarded to ``get_llm_perf_df`` and ``select_columns_fn``.
        backends: Forwarded to ``get_llm_perf_df`` and ``select_columns_fn``.
        hardware_type: Forwarded to ``get_llm_perf_df`` and
            ``select_columns_fn``.
        memory: Upper bound (inclusive) on the "Model Size (GB)" column.
        quantizations: Allowed values of the "Quantization" column.
        columns: Columns to keep, forwarded to ``select_columns_fn``.
        search: Text matched case-insensitively against the "Model" column.
            It is used as a regular expression when it is a valid one and
            as literal text otherwise; ``None`` is treated as empty.

    Returns:
        A one-element list holding the filtered, column-selected table.
    """
    import re  # local import: only needed to validate the search text

    # Free-form user text such as "c++" or "(" is not a valid regex; fall
    # back to a literal match instead of raising from the callback.
    search = search or ""
    try:
        re.compile(search)
        pattern = search
    except re.error:
        pattern = re.escape(search)

    llm_perf_df = get_llm_perf_df(
        machine=machine, backends=backends, hardware_type=hardware_type
    )
    # na=False keeps the mask strictly boolean when a model name is missing.
    keep_rows = (
        llm_perf_df["Model"].str.contains(pattern, case=False, na=False)
        & llm_perf_df["Quantization"].isin(quantizations)
        & (llm_perf_df["Model Size (GB)"] <= memory)
    )
    filtered_llm_perf_df = llm_perf_df[keep_rows]

    # Pass the already-safe pattern on, so the downstream search filter
    # cannot fail on the same input.
    selected_filtered_llm_perf_df = select_columns_fn(
        machine, backends, hardware_type, columns, pattern, filtered_llm_perf_df
    )

    return [
        selected_filtered_llm_perf_df,
    ]
|
105 |
+
|
106 |
+
|
107 |
+
def create_control_callback(
    # button
    filter_button,
    # fixed
    machine_value,
    backends_value,
    hardware_type_value,
    # inputs
    memory_slider,
    quantization_checkboxes,
    # interactive
    columns_checkboxes,
    search_bar,
    # outputs
    leaderboard_table,
):
    """Attach ``filter_rows_fn`` to the filter button's click event.

    The inputs are passed in the same order as ``filter_rows_fn``'s
    parameters: fixed values, control-panel inputs, then the interactive
    widgets. The result is written to ``leaderboard_table``.
    """
    fixed_values = [machine_value, backends_value, hardware_type_value]
    panel_inputs = [memory_slider, quantization_checkboxes]
    interactive_inputs = [columns_checkboxes, search_bar]

    filter_button.click(
        fn=filter_rows_fn,
        inputs=fixed_values + panel_inputs + interactive_inputs,
        outputs=[leaderboard_table],
    )
|
153 |
+
|
154 |
+
|
155 |
+
def select_columns_fn(
    machine, backends, hardware_type, columns, search, llm_perf_df=None
):
    """Return the leaderboard restricted to matching models and columns.

    Args:
        machine: Passed to ``get_llm_perf_df`` when no table is supplied.
        backends: Passed to ``get_llm_perf_df`` when no table is supplied.
        hardware_type: Passed to ``get_llm_perf_df`` when no table is
            supplied.
        columns: Column labels to keep in the result.
        search: Text matched case-insensitively against the "Model" column.
            It is used as a regular expression when it is a valid one and
            as literal text otherwise; ``None`` is treated as empty.
        llm_perf_df: Optional pre-filtered table; loaded via
            ``get_llm_perf_df`` when ``None``.

    Returns:
        The table produced by ``get_leaderboard_df``, filtered by ``search``
        and reduced to ``columns``.
    """
    import re  # local import: only needed to validate the search text

    # Free-form user text such as "c++" or "(" is not a valid regex; fall
    # back to a literal match instead of raising from the callback.
    search = search or ""
    try:
        re.compile(search)
        pattern = search
    except re.error:
        pattern = re.escape(search)

    if llm_perf_df is None:
        llm_perf_df = get_llm_perf_df(
            machine=machine,
            backends=backends,
            hardware_type=hardware_type,
        )

    selected_leaderboard_df = get_leaderboard_df(llm_perf_df)
    # na=False keeps the mask strictly boolean when a model name is missing.
    matches = selected_leaderboard_df["Model"].str.contains(
        pattern, case=False, na=False
    )
    selected_leaderboard_df = selected_leaderboard_df[matches]
    selected_leaderboard_df = selected_leaderboard_df[columns]

    return selected_leaderboard_df
|
172 |
+
|
173 |
+
|
174 |
+
def create_select_callback(
    # fixed
    machine_value,
    backends_value,
    hardware_type_value,
    # interactive
    columns_checkboxes,
    search_bar,
    # outputs
    leaderboard_table,
):
    """Attach ``select_columns_fn`` to both interactive widgets.

    A change to either the column checkboxes or the search bar calls
    ``select_columns_fn`` with the same inputs and writes the result to
    ``leaderboard_table``.
    """
    # Registration order matches the widgets' order: checkboxes, then search.
    for widget in (columns_checkboxes, search_bar):
        widget.change(
            fn=select_columns_fn,
            # A fresh list per registration, so the two events share no state.
            inputs=[
                machine_value,
                backends_value,
                hardware_type_value,
                columns_checkboxes,
                search_bar,
            ],
            outputs=[leaderboard_table],
        )
|
src/utils.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
def model_hyperlink(link, model_name):
    """Return an HTML anchor that opens ``link`` in a new tab.

    ``model_name`` is the visible link text. Both values are inserted
    verbatim, without HTML escaping.
    """
    anchor_style = "color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"
    return f'<a target="_blank" href="{link}" style="{anchor_style}">{model_name}</a>'
|