Arnav Chavan commited on
Commit
2fcb72a
Β·
1 Parent(s): 88357e8

initial commit

Browse files
README.md CHANGED
@@ -6,7 +6,58 @@ colorTo: blue
6
  sdk: gradio
7
  sdk_version: 5.8.0
8
  app_file: app.py
9
- pinned: false
 
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  sdk: gradio
7
  sdk_version: 5.8.0
8
  app_file: app.py
9
+ pinned: true
10
+ license: apache-2.0
11
+ tags: [edge llm leaderboard, llm edge leaderboard, llm, edge, leaderboard]
12
  ---
13
 
14
+ # Edge-LLM Leaderboard
15
+
16
+ ## 📝 About
17
+ The Edge-LLM Leaderboard is a leaderboard to gauge practical performance and quality of edge LLMs.
18
+ Its aim is to benchmark the performance (throughput and memory)
19
+ of Large Language Models (LLMs) on Edge hardware - starting with a Raspberry Pi 5 (8GB) based on the ARM Cortex A76 CPU.
20
+
21
+ Anyone from the community can request a new base model or edge hardware/backend/optimization
22
+ configuration for automated benchmarking:
23
+
24
+ - Model evaluation requests will be made live soon; in the meantime, feel free to email arnav[dot]chavan[@]nyunai[dot]com
25
+
26
+ ## ✍️ Details
27
+
28
+ - To avoid multi-thread discrepancies, all 4 threads are used on the Pi 5.
29
+ - LLMs are running on a singleton batch with a prompt size of 512 and generating 128 tokens.
30
+
31
+ All of our throughput benchmarks are run by this single tool
32
+ [llama-bench](https://github.com/ggerganov/llama.cpp/tree/master/examples/llama-bench)
33
+ using the power of [llama.cpp](https://github.com/ggerganov/llama.cpp) to guarantee reproducibility and consistency.
34
+
35
+ ## 🏃 How to run locally
36
+
37
+ To run the Edge-LLM Leaderboard locally on your machine, follow these steps:
38
+
39
+ ### 1. Clone the Repository
40
+
41
+ First, clone the repository to your local machine:
42
+
43
+ ```bash
44
+ git clone https://huggingface.co/spaces/nyunai/edge-llm-leaderboard
45
+ cd edge-llm-leaderboard
46
+ ```
47
+
48
+ ### 2. Install the Required Dependencies
49
+
50
+ Install the necessary Python packages listed in the requirements.txt file:
51
+ `pip install -r requirements.txt`
52
+
53
+ ### 3. Run the Application
54
+
55
+ You can run the Gradio application in one of the following ways:
56
+ - Option 1: Using Python
57
+ `python app.py`
58
+ - Option 2: Using Gradio CLI (includes hot-reload)
59
+ `gradio app.py`
60
+
61
+ ### 4. Access the Application
62
+
63
+ Once the application is running, you can access it locally in your web browser at http://127.0.0.1:7860/
app.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ import src.dependency # noqa
4
+ from src.assets import custom_css
5
+
6
+ # from src.attention import create_attn_plots
7
+ from src.content import ABOUT, CITATION_BUTTON, CITATION_BUTTON_LABEL, LOGO, TITLE
8
+ from src.hardware import load_hardware_configs
9
+ from src.leaderboard import create_leaderboard_table
10
+ from src.llm_perf import get_llm_perf_df
11
+ from src.panel import (
12
+ create_control_callback,
13
+ create_control_panel,
14
+ create_select_callback,
15
+ )
16
+
17
# Hardware targets to display, one top-level tab each, loaded from the YAML
# file resolved relative to the current working directory.
configs = load_hardware_configs("hardware.yaml")


demo = gr.Blocks(
    css=custom_css,
    theme=gr.themes.Default(primary_hue="indigo", secondary_hue="indigo"),
)
with demo:
    gr.HTML(LOGO, elem_classes="logo")
    gr.HTML(TITLE, elem_classes="title")
    ####################### HARDWARE TABS #######################
    with gr.Tabs(elem_classes="tabs"):
        # Named `tab_index` rather than `id` so the builtin is not shadowed.
        for tab_index, config in enumerate(configs):
            with gr.TabItem(config.description, id=tab_index):
                ####################### HARDWARE DETAILS #######################
                if config.detail:
                    gr.Markdown(config.detail, elem_classes="descriptive-text")

                ######################## CONTROL PANEL #######################
                (
                    filter_button,
                    machine_value,
                    backends_value,
                    hardware_type_value,
                    memory_slider,
                    quantization_checkboxes,
                ) = create_control_panel(
                    machine=config.machine,
                    backends=config.backends,
                    hardware_provider=config.hardware_provider,
                    hardware_type=config.hardware_type,
                )
                ####################### HARDWARE SUBTABS #######################
                with gr.Tabs(elem_classes="subtabs"):
                    open_llm_perf_df = get_llm_perf_df(
                        machine=config.machine,
                        backends=config.backends,
                        hardware_type=config.hardware_type,
                    )
                    ####################### LEADERBOARD TAB #######################
                    with gr.TabItem("Leaderboard 🏅", id=0):
                        search_bar, columns_checkboxes, leaderboard_table = (
                            create_leaderboard_table(open_llm_perf_df)
                        )
                    ###################### ATTENTIONS SPEEDUP TAB #######################
                    # with gr.TabItem("Attention 📈", id=2):
                    #     attn_prefill_plot, attn_decode_plot = create_attn_plots(
                    #         open_llm_perf_df
                    #     )
                    # ####################### KERNELS SPEEDUP TAB #######################
                    # with gr.TabItem("Kernels 📈", id=4):
                    #     quant_krnl_prefill_plot, quant_krnl_decode_plot = (
                    #         create_quant_krnl_plots(llm_perf_df)
                    #     )
                ####################### CONTROL CALLBACK #######################
                # The "Filter" button re-queries and re-filters the table.
                create_control_callback(
                    filter_button,
                    # inputs
                    machine_value,
                    backends_value,
                    hardware_type_value,
                    memory_slider,
                    quantization_checkboxes,
                    # interactive
                    columns_checkboxes,
                    search_bar,
                    # outputs
                    leaderboard_table,
                    # attn_prefill_plot,
                    # attn_decode_plot,
                    # quant_krnl_prefill_plot,
                    # quant_krnl_decode_plot,
                )

                # Column selection and the search box refresh the table live.
                create_select_callback(
                    # inputs
                    machine_value,
                    backends_value,
                    hardware_type_value,
                    # interactive
                    columns_checkboxes,
                    search_bar,
                    # outputs
                    leaderboard_table,
                )

        ####################### ABOUT TAB #######################
        # Uses the first id after the hardware tabs.
        with gr.TabItem("About 📖", id=len(configs)):
            gr.Markdown(ABOUT, elem_classes="descriptive-text")
    ####################### CITATION
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
                show_copy_button=True,
            )

if __name__ == "__main__":
    # Launch demo
    demo.queue().launch()
dataset/llm-perf-leaderboard-Raspberry Pi 5(8GB).csv ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,Quantization,Params (B),Model Size (GB),Prefill (tokens/s),Decode (tokens/s),Backend
2
+ gemma-2-9b,Q8_0,10.159,10.796,2.169,0.012,llama_cpp
3
+ DeepSeek-V2-Lite,Q4_K_M,15.706,10.36,4.304,1.764,llama_cpp
4
+ aya-expanse-8b,Q8_0,9.077,9.644,3.1,0.027,llama_cpp
5
+ Yi-1.5-9B,Q8_0,8.829,9.382,2.585,0.019,llama_cpp
6
+ Qwen2.5-14B,Q4_K_M,14.77,8.982,1.916,0.018,llama_cpp
7
+ DeepSeek-V2-Lite,Q4_0_4_4,15.706,8.901,7.788,3.867,llama_cpp
8
+ Phi-3-medium-128k-instruct,Q4_K_M,13.96,8.566,1.819,0.02,llama_cpp
9
+ Hermes-3-Llama-3.1-8B,Q8_0,8.03,8.533,3.286,0.922,llama_cpp
10
+ Qwen2.5-14B,Q4_0_4_4,14.77,8.512,4.698,0.028,llama_cpp
11
+ internlm2_5-7b-chat,Q8_0,7.738,8.222,3.258,1.238,llama_cpp
12
+ dolphin-2.9.2-qwen2-7b,Q8_0,7.616,8.093,4.241,1.301,llama_cpp
13
+ Qwen2.5-7B,Q8_0,7.616,8.093,4.253,1.302,llama_cpp
14
+ Phi-3-medium-128k-instruct,Q4_0_4_4,13.96,7.896,4.715,0.038,llama_cpp
15
+ NexusRaven-V2-13B,Q4_K_M,13.016,7.865,2.066,0.035,llama_cpp
16
+ Mistral-7B-Instruct-v0.3,Q8_0,7.248,7.702,4.104,1.29,llama_cpp
17
+ dolphin-2.9.3-mistral-7B-32k,Q8_0,7.248,7.702,4.135,1.294,llama_cpp
18
+ Yarn-Mistral-7b-128k,Q8_0,7.242,7.695,4.082,1.292,llama_cpp
19
+ Starling-LM-7B-beta,Q8_0,7.242,7.695,4.132,1.296,llama_cpp
20
+ Mistral-Nemo-Base-2407,Q4_K_M,12.248,7.469,2.453,1.358,llama_cpp
21
+ NexusRaven-V2-13B,Q4_0_4_4,13.016,7.365,4.979,1.348,llama_cpp
22
+ OLMoE-1B-7B-0924,Q8_0,6.919,7.358,26.942,7.489,llama_cpp
23
+ OLMo-7B-0724-hf,Q8_0,6.888,7.319,4.515,1.371,llama_cpp
24
+ mpt-7b-instruct,Q8_0,6.856,7.285,4.287,1.367,llama_cpp
25
+ Amber,Q8_0,6.738,7.16,4.442,1.373,llama_cpp
26
+ Mistral-Nemo-Base-2407,Q4_0_4_4,12.248,7.064,9.103,1.48,llama_cpp
27
+ gemma-2-9b,Q4_K_M,10.159,6.508,3.531,1.629,llama_cpp
28
+ Yarn-Solar-10b-64k,Q4_K_M,10.732,6.461,2.905,1.503,llama_cpp
29
+ SOLAR-10.7B-v1.0,Q4_K_M,10.732,6.461,2.925,1.505,llama_cpp
30
+ SOLAR-10.7B-Instruct-v1.0,Q4_K_M,10.732,6.461,2.916,1.506,llama_cpp
31
+ Yi-1.5-6B,Q8_0,6.061,6.441,5.269,1.584,llama_cpp
32
+ gemma-2-9b,Q4_0_4_4,10.159,6.19,10.553,1.757,llama_cpp
33
+ SOLAR-10.7B-v1.0,Q4_0_4_4,10.732,6.072,9.315,1.635,llama_cpp
34
+ SOLAR-10.7B-Instruct-v1.0,Q4_0_4_4,10.732,6.072,9.332,1.635,llama_cpp
35
+ Yarn-Solar-10b-64k,Q4_0_4_4,10.732,6.072,9.352,1.638,llama_cpp
36
+ aya-expanse-8b,Q4_K_M,9.077,5.906,4.406,1.911,llama_cpp
37
+ aya-23-8B,Q4_K_M,9.077,5.906,4.428,1.914,llama_cpp
38
+ aya-expanse-8b,Q4_0_4_4,9.077,5.647,14.074,2.05,llama_cpp
39
+ aya-23-8B,Q4_0_4_4,9.077,5.647,14.113,2.051,llama_cpp
40
+ Yi-1.5-9B,Q4_K_M,8.829,5.327,3.681,1.85,llama_cpp
41
+ Yi-1.5-9B,Q4_0_4_4,8.829,5.035,11.33,2.0,llama_cpp
42
+ Hermes-3-Llama-3.1-8B,Q4_K_M,8.03,4.913,4.375,2.078,llama_cpp
43
+ Llama-3.1-8B,Q4_K_M,8.03,4.913,4.403,2.086,llama_cpp
44
+ internlm2_5-7b-chat,Q4_K_M,7.738,4.711,4.4,2.133,llama_cpp
45
+ Qwen2.5-7B,Q4_K_M,7.616,4.677,4.769,2.201,llama_cpp
46
+ dolphin-2.9.2-qwen2-7b,Q4_K_M,7.616,4.677,4.759,2.204,llama_cpp
47
+ Llama-3.1-8B,Q4_0_4_4,8.03,4.653,13.99,2.245,llama_cpp
48
+ Hermes-3-Llama-3.1-8B,Q4_0_4_4,8.03,4.653,14.006,2.245,llama_cpp
49
+ internlm2_5-7b-chat,Q4_0_4_4,7.738,4.451,14.036,2.31,llama_cpp
50
+ mpt-7b-instruct,Q4_K_M,6.856,4.442,4.162,2.213,llama_cpp
51
+ Qwen2.5-7B,Q4_0_4_4,7.616,4.425,15.563,2.386,llama_cpp
52
+ dolphin-2.9.2-qwen2-7b,Q4_0_4_4,7.616,4.425,15.58,2.387,llama_cpp
53
+ dolphin-2.9.3-mistral-7B-32k,Q4_K_M,7.248,4.372,4.387,2.227,llama_cpp
54
+ Mistral-7B-Instruct-v0.3,Q4_K_M,7.248,4.372,4.462,2.241,llama_cpp
55
+ Starling-LM-7B-beta,Q4_K_M,7.242,4.368,4.406,2.234,llama_cpp
56
+ Yarn-Mistral-7b-128k,Q4_K_M,7.242,4.368,4.434,2.245,llama_cpp
57
+ OLMoE-1B-7B-0924,Q4_K_M,6.919,4.212,26.902,12.119,llama_cpp
58
+ OLMo-7B-0724-hf,Q4_K_M,6.888,4.183,4.706,2.339,llama_cpp
59
+ dolphin-2.9.3-mistral-7B-32k,Q4_0_4_4,7.248,4.113,14.053,2.427,llama_cpp
60
+ Mistral-7B-Instruct-v0.3,Q4_0_4_4,7.248,4.113,14.177,2.43,llama_cpp
61
+ Starling-LM-7B-beta,Q4_0_4_4,7.242,4.108,14.068,2.427,llama_cpp
62
+ Yarn-Mistral-7b-128k,Q4_0_4_4,7.242,4.108,14.139,2.436,llama_cpp
63
+ Amber,Q4_K_M,6.738,4.08,4.594,2.351,llama_cpp
64
+ Phi-3.5-mini-instruct,Q8_0,3.821,4.06,7.951,2.423,llama_cpp
65
+ Phi-3-mini-128k-instruct,Q8_0,3.821,4.06,7.947,2.426,llama_cpp
66
+ mpt-7b-instruct,Q4_0_4_4,6.856,3.964,14.569,2.533,llama_cpp
67
+ OLMoE-1B-7B-0924,Q4_0_4_4,6.919,3.926,50.413,12.989,llama_cpp
68
+ Amber,Q4_0_4_4,6.738,3.825,14.442,2.57,llama_cpp
69
+ Yi-1.5-6B,Q4_K_M,6.061,3.672,5.58,2.72,llama_cpp
70
+ Qwen2.5-3B,Q8_0,3.397,3.61,10.473,2.939,llama_cpp
71
+ Yi-1.5-6B,Q4_0_4_4,6.061,3.478,17.017,2.945,llama_cpp
72
+ dolphin-2.9.4-gemma2-2b,Q8_0,3.204,3.405,13.966,3.381,llama_cpp
73
+ gemma-2-2b,Q8_0,3.204,3.405,13.996,3.385,llama_cpp
74
+ stable-code-instruct-3b,Q8_0,2.795,2.971,10.668,3.316,llama_cpp
75
+ Phi-3.5-mini-instruct,Q4_K_M,3.821,2.393,7.502,3.936,llama_cpp
76
+ Phi-3-mini-128k-instruct,Q4_K_M,3.821,2.393,7.519,3.938,llama_cpp
77
+ Llama-3.2-3B,Q4_K_M,3.607,2.335,10.691,4.674,llama_cpp
78
+ Llama-3.2-3B,Q4_0_4_4,3.607,2.233,31.72,5.025,llama_cpp
79
+ gemma-2-2b,Q4_K_M,3.204,2.186,14.202,5.253,llama_cpp
80
+ dolphin-2.9.4-gemma2-2b,Q4_K_M,3.204,2.186,14.218,5.253,llama_cpp
81
+ Qwen2.5-3B,Q4_K_M,3.397,2.179,10.638,4.808,llama_cpp
82
+ Phi-3.5-mini-instruct,Q4_0_4_4,3.821,2.175,23.369,4.428,llama_cpp
83
+ Phi-3-mini-128k-instruct,Q4_0_4_4,3.821,2.175,23.461,4.436,llama_cpp
84
+ gemma-2-2b,Q4_0_4_4,3.204,2.107,40.616,5.552,llama_cpp
85
+ dolphin-2.9.4-gemma2-2b,Q4_0_4_4,3.204,2.107,40.977,5.58,llama_cpp
86
+ Qwen2.5-3B,Q4_0_4_4,3.397,2.072,32.434,5.239,llama_cpp
87
+ internlm2_5-1_8b-chat,Q8_0,1.889,2.007,19.329,5.279,llama_cpp
88
+ SmolLM2-1.7B-Instruct,Q8_0,1.812,1.926,17.524,5.177,llama_cpp
89
+ Qwen2.5-1.5B,Q8_0,1.777,1.889,21.927,5.793,llama_cpp
90
+ stable-code-instruct-3b,Q4_K_M,2.795,1.707,10.803,5.564,llama_cpp
91
+ stable-code-instruct-3b,Q4_0_4_4,2.795,1.607,28.926,5.957,llama_cpp
92
+ Yi-Coder-1.5B,Q8_0,1.476,1.569,23.894,6.596,llama_cpp
93
+ OLMo-1B-0724-hf,Q8_0,1.28,1.36,27.787,7.591,llama_cpp
94
+ Qwen2.5-1.5B,Q4_K_M,1.777,1.172,22.326,9.56,llama_cpp
95
+ internlm2_5-1_8b-chat,Q4_K_M,1.889,1.17,19.453,8.56,llama_cpp
96
+ TinyLlama-1.1B-Chat-v1.0,Q8_0,1.1,1.169,28.472,8.637,llama_cpp
97
+ TinyLlama_v1.1,Q8_0,1.1,1.169,28.538,8.652,llama_cpp
98
+ SmolLM2-1.7B-Instruct,Q4_K_M,1.812,1.136,17.72,8.497,llama_cpp
99
+ Qwen2.5-1.5B,Q4_0_4_4,1.777,1.12,65.915,10.128,llama_cpp
100
+ internlm2_5-1_8b-chat,Q4_0_4_4,1.889,1.112,57.736,9.243,llama_cpp
101
+ SmolLM2-1.7B-Instruct,Q4_0_4_4,1.812,1.072,50.27,9.239,llama_cpp
102
+ Llama-3.2-1B,Q4_K_M,1.498,1.015,30.451,11.51,llama_cpp
103
+ Llama-3.2-1B,Q4_0_4_4,1.498,0.979,86.772,12.364,llama_cpp
104
+ Yi-Coder-1.5B,Q4_K_M,1.476,0.962,23.267,10.03,llama_cpp
105
+ Yi-Coder-1.5B,Q4_0_4_4,1.476,0.865,67.713,11.422,llama_cpp
106
+ OLMo-1B-0724-hf,Q4_K_M,1.28,0.79,28.276,12.321,llama_cpp
107
+ OLMo-1B-0724-hf,Q4_0_4_4,1.28,0.746,84.882,13.339,llama_cpp
108
+ Qwen2.5-0.5B,Q8_0,0.63,0.67,75.456,18.06,llama_cpp
109
+ TinyLlama-1.1B-Chat-v1.0,Q4_K_M,1.1,0.667,29.44,14.305,llama_cpp
110
+ TinyLlama_v1.1,Q4_K_M,1.1,0.667,29.397,14.346,llama_cpp
111
+ TinyLlama-1.1B-Chat-v1.0,Q4_0_4_4,1.1,0.636,77.823,15.509,llama_cpp
112
+ TinyLlama_v1.1,Q4_0_4_4,1.1,0.636,77.943,15.543,llama_cpp
113
+ Qwen2.5-0.5B,Q4_K_M,0.63,0.537,52.916,22.324,llama_cpp
114
+ Qwen2.5-0.5B,Q4_0_4_4,0.63,0.491,189.874,26.738,llama_cpp
115
+ gpt2-medium,Q8_0,0.406,0.436,83.423,23.016,llama_cpp
116
+ SmolLM2-360M-Instruct,Q8_0,0.409,0.435,79.518,22.857,llama_cpp
117
+ SmolLM2-360M-Instruct,Q4_K_M,0.409,0.319,55.774,30.718,llama_cpp
118
+ SmolLM2-360M-Instruct,Q4_0_4_4,0.409,0.277,173.275,37.176,llama_cpp
119
+ gpt2-medium,Q4_K_M,0.406,0.269,73.615,33.913,llama_cpp
120
+ gpt2-medium,Q4_0_4_4,0.406,0.247,178.73,37.89,llama_cpp
121
+ gpt2,Q8_0,0.163,0.176,302.932,68.191,llama_cpp
122
+ SmolLM2-135M-Instruct,Q8_0,0.163,0.173,212.146,57.992,llama_cpp
123
+ SmolLM2-135M-Instruct,Q4_K_M,0.163,0.134,153.439,73.272,llama_cpp
124
+ SmolLM2-135M-Instruct,Q4_0_4_4,0.163,0.12,381.667,86.735,llama_cpp
125
+ gpt2,Q4_K_M,0.163,0.111,269.906,92.707,llama_cpp
126
+ gpt2,Q4_0_4_4,0.163,0.105,582.32,101.509,llama_cpp
hardware.yaml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ - machine: Raspberry Pi 5(8GB)
2
+ description: Cortex A76
3
+ hardware_provider: ARM
4
+ hardware_type: arm cortex a76
5
+ subsets:
6
+ - Q8_0
7
+ - Q4_K_M
8
+ - Q4_0_4_4
9
+ backends:
10
+ - llama_cpp
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ huggingface_hub
2
+ gradio>=5.0.0
3
+ pandas
src/assets.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Stylesheet handed to the Gradio app. Selectors target the `elem_classes`
# and `elem_id` values set on components elsewhere in the project.
custom_css = """
.logo {
    width: 300px;
    height: auto;
    margin: 0 auto;
    max-width: 100%;
    object-fit: contain;
    overflow: visible !important;
}
.text {
    font-size: 16px !important;
}

.tabs button {
    font-size: 20px;
}
.subtabs button {
    font-size: 20px;
}

.descriptive-text span {
    font-size: 16px !important;
}

#control-panel span {
    font-size: 20px !important;
}
#search-bar span {
    font-size: 16px !important;
}
#threshold-slider span {
    font-size: 16px !important;
}
#memory-slider span {
    font-size: 16px !important;
}
#columns-checkboxes span {
    font-size: 16px !important;
}
#backend-checkboxes span {
    font-size: 16px !important;
}
#quantization-checkboxes span {
    font-size: 16px !important;
}

#leaderboard-table td:first-child,
#leaderboard-table th:first-child {
    max-width: 300px;
    overflow: auto;
    white-space: nowrap;
}
"""
src/content.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Static HTML / Markdown snippets rendered by the Gradio app.

LOGO = '<img src="https://nyunai.com/assets/images/logo.png">'

TITLE = """<h1 align="center" id="space-title"> Edge-LLM Leaderboard </h1>"""

# Markdown body of the "About" tab.
ABOUT = """
## 📝 About
The Edge-LLM Leaderboard is a leaderboard to gauge practical performance and quality of edge LLMs.
Its aim is to benchmark the performance (throughput and memory)
of Large Language Models (LLMs) on Edge hardware - starting with a Raspberry Pi 5 (8GB) based on the ARM Cortex A76 CPU.

Anyone from the community can request a new base model or edge hardware/backend/optimization
configuration for automated benchmarking:

- Model evaluation requests will be made live soon, in the meantime feel free to email to - arnav[dot]chavan[@]nyunai[dot]com

## ✍️ Details

- To avoid multi-thread discrepancies, all 4 threads are used on the Pi 5.
- LLMs are running on a singleton batch with a prompt size of 512 and generating 128 tokens.

All of our throughput benchmarks are run by this single tool
[llama-bench](https://github.com/ggerganov/llama.cpp/tree/master/examples/llama-bench)
using the power of [llama.cpp](https://github.com/ggerganov/llama.cpp) to guarantee reproducibility and consistency.
"""


CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results."
# Raw string so the BibTeX `\url` is kept verbatim rather than parsed as an
# escape sequence.
CITATION_BUTTON = r"""@misc{edge-llm-leaderboard,
  author = {},
  title = {Edge-LLM Leaderboard},
  year = {2024},
  publisher = {},
  howpublished = "\url{https://huggingface.co/spaces/nyunai/edge-llm-leaderboard}",
}
"""
src/dependency.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ import os
2
+
3
# Set at import time so the flag is in the environment before other modules
# load (presumably it is read by the `transformers` library -- verify).
os.environ.update({"TRANSFORMERS_NO_ADVISORY_WARNINGS": "1"})
src/hardware.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict, List, Optional
2
+
3
+ import yaml
4
+
5
+
6
class HardwareConfig:
    """One hardware target, built from a mapping such as a parsed YAML entry.

    All keys listed in ``_REQUIRED_KEYS`` must be present in the mapping
    (a missing one raises ``KeyError``); ``detail`` is optional and defaults
    to ``None``.
    """

    # Mandatory keys, copied onto the instance under the same names.
    _REQUIRED_KEYS = (
        "machine",
        "description",
        "hardware_provider",
        "hardware_type",
        "subsets",
        "backends",
    )

    machine: str
    description: str
    hardware_provider: str
    hardware_type: str
    subsets: List[str]
    backends: List[str]
    detail: Optional[str]

    def __init__(self, data: Dict[str, Any]):
        for key in self._REQUIRED_KEYS:
            setattr(self, key, data[key])
        self.detail = data.get("detail")

    def __repr__(self) -> str:
        # `detail` is intentionally left out, as in the original output.
        parts = (
            f"machine='{self.machine}'",
            f"description='{self.description}'",
            f"hardware_provider={self.hardware_provider}",
            f"hardware_type={self.hardware_type}",
            f"subsets={self.subsets}",
            f"backends={self.backends}",
        )
        return "HardwareConfig(" + ", ".join(parts) + ")"
21
+
22
+
23
def load_hardware_configs(file_path: str) -> List[HardwareConfig]:
    """Parse a YAML file into a list of ``HardwareConfig`` objects.

    Args:
        file_path: Path to a YAML document whose top level is a list of
            mappings, one per hardware target.

    Returns:
        One ``HardwareConfig`` per entry, in file order. An empty document
        yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If an entry lacks a key required by ``HardwareConfig``.
    """
    # Explicit encoding so parsing does not depend on the platform default.
    with open(file_path, "r", encoding="utf-8") as file:
        data = yaml.safe_load(file)
    # safe_load returns None for an empty document; treat it as "no targets".
    return [HardwareConfig(config) for config in data or []]
src/leaderboard.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from src.utils import model_hyperlink
4
+
5
# Display columns mapped to the datatype string passed to the table
# component. Insertion order is the order in which columns are offered.
LEADERBOARD_COLUMN_TO_DATATYPE = dict(
    [
        # model identity
        ("Model", "markdown"),
        ("Quantization", "str"),
        # primary measurements
        ("Prefill (tokens/s)", "number"),
        ("Decode (tokens/s)", "number"),
        ("Model Size (GB)", "number"),
        # deployment settings
        ("Backend", "str"),
        # additional measurements
        # ("Reserved Memory (MB)", "number"),
        # ("Used Memory (MB)", "number"),
        ("Params (B)", "number"),
    ]
)

# Columns shown before the user changes the column selection.
PRIMARY_COLUMNS = list(
    (
        "Model",
        "Quantization",
        "Prefill (tokens/s)",
        "Decode (tokens/s)",
        "Model Size (GB)",
    )
)
28
+
29
+
30
def process_model(model_name):
    """Return an HTML anchor for *model_name* pointing at huggingface.co.

    The target URL is ``https://huggingface.co/`` followed by the name as
    given; the name is also used as the visible link text.
    """
    return model_hyperlink("https://huggingface.co/" + f"{model_name}", model_name)
33
+
34
+
35
def get_leaderboard_df(llm_perf_df):
    """Return a copy of *llm_perf_df* prepared for display.

    The "Model" column is replaced by the result of ``process_model`` for
    each value; the input frame is left untouched.
    """
    display_df = llm_perf_df.copy()
    linked_models = display_df["Model"].apply(process_model)
    display_df["Model"] = linked_models
    return display_df
40
+
41
+
42
def create_leaderboard_table(llm_perf_df):
    """Build the search box, column selector and results table.

    Must be called inside an active Gradio layout context, because the
    components are created as a side effect.

    Args:
        llm_perf_df: Performance data with at least the columns named in
            ``PRIMARY_COLUMNS``.

    Returns:
        Tuple ``(search_bar, columns_checkboxes, leaderboard_table)``.
    """
    # get dataframe
    leaderboard_df = get_leaderboard_df(llm_perf_df)

    # create search bar
    with gr.Row():
        search_bar = gr.Textbox(
            label="Model",
            info="🔍 Search for a model name",
            elem_id="search-bar",
        )
    # create checkboxes
    with gr.Row():
        columns_checkboxes = gr.CheckboxGroup(
            label="Columns 📊",
            value=PRIMARY_COLUMNS,
            choices=list(LEADERBOARD_COLUMN_TO_DATATYPE.keys()),
            info="☑️ Select the columns to display",
            elem_id="columns-checkboxes",
        )
    # create table; only the primary columns are shown initially, while
    # datatype/headers describe every selectable column
    leaderboard_table = gr.components.Dataframe(
        value=leaderboard_df[PRIMARY_COLUMNS],
        datatype=list(LEADERBOARD_COLUMN_TO_DATATYPE.values()),
        headers=list(LEADERBOARD_COLUMN_TO_DATATYPE.keys()),
        elem_id="leaderboard-table",
    )

    return search_bar, columns_checkboxes, leaderboard_table
src/llm_perf.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import List
3
+
4
+ import pandas as pd
5
+
6
# Directory (relative to the working directory) holding the processed CSVs.
DATASET_DIRECTORY = "dataset"

# Raw benchmark column name -> display column name.
COLUMNS_MAPPING = dict(
    [
        ("config.name", "Quantization"),
        ("config.backend.model", "Model"),
        # primary measurements
        ("report.prefill.throughput.value", "Prefill (tokens/s)"),
        ("report.decode.throughput.value", "Decode (tokens/s)"),
        ("report.memory", "Model Size (GB)"),
        # deployment settings
        ("config.backend.name", "Backend"),
        ("quantization", "Quantization"),
        # additional information
        ("#Params (B)", "Params (B)"),
    ]
)

# Sort keys and their directions, kept as two parallel lists:
# model size descending, then decode and prefill throughput ascending.
_SORT_SPEC = (
    ("Model Size (GB)", False),
    ("Decode (tokens/s)", True),
    ("Prefill (tokens/s)", True),
)
SORTING_COLUMNS = [column for column, _ in _SORT_SPEC]
SORTING_ASCENDING = [ascending for _, ascending in _SORT_SPEC]
23
+
24
+
25
def get_raw_llm_perf_df(
    machine: str, backends: List[str], hardware_type: str
):
    """Load the raw benchmark results for one hardware target.

    The results are read from a CSV file on the local filesystem. The path
    comes from the ``EDGE_LLM_BENCHMARK_CSV`` environment variable and falls
    back to the location that was previously hard-coded here.

    Args:
        machine: Machine name; used only in diagnostics.
        backends: Backend names; used only in diagnostics.
        hardware_type: Hardware type; used only in diagnostics.

    Returns:
        A DataFrame with the contents of the CSV file.

    Raises:
        ValueError: If the CSV file cannot be opened or parsed. The
            underlying error is chained as ``__cause__``.
    """
    # NOTE(review): the diagnostics below point at the
    # nyunai/edge-llm-leaderboard dataset on the Hugging Face Hub, which
    # suggests the Hub is the intended source -- confirm and switch over:
    #   pd.read_csv(
    #       f"hf://datasets/nyunai/edge-llm-leaderboard/perf-df-{hardware_type}-{machine}-{backends}.csv"
    #   )
    csv_path = os.environ.get(
        "EDGE_LLM_BENCHMARK_CSV",
        "/Users/arnavchavan/leaderboard/benchmark_results.csv",
    )
    try:
        perf_df = pd.read_csv(csv_path)
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
        print("Dataset not found for:")
        print(f"  • Machine: {machine}")
        print(f"  • Hardware Type: {hardware_type}")
        url = f"https://huggingface.co/datasets/nyunai/edge-llm-leaderboard/blob/main/perf-df-{hardware_type}-{machine}-{backends}.csv"
        print(f"  • URL: {url}")
        raise ValueError(
            f"No datasets found for machine {machine}, check your hardware.yaml config file or your dataset on huggingface"
        ) from err

    return perf_df
58
+
59
+
60
def processed_llm_perf_df(llm_perf_df):
    """Round the numeric columns and sort the rows for display.

    Args:
        llm_perf_df: Raw performance data. It is not modified.

    Returns:
        A new DataFrame with the measurement columns rounded to 3 decimals
        and the rows ordered by ``SORTING_COLUMNS`` / ``SORTING_ASCENDING``.
    """
    rounded_df = llm_perf_df.round(
        {
            "Prefill (tokens/s)": 3,
            "Decode (tokens/s)": 3,
            "Model Size (GB)": 3,
            # The stored data names this column "Params (B)"; the old
            # "#Params (B)" spelling is kept as well because DataFrame.round
            # ignores keys that are not columns.
            "Params (B)": 3,
            "#Params (B)": 3,
        }
    )
    # sort by metric
    return rounded_df.sort_values(
        by=SORTING_COLUMNS,
        ascending=SORTING_ASCENDING,
    )
81
+
82
+
83
def get_llm_perf_df(
    machine: str, backends: List[str], hardware_type: str
):
    """Return the processed performance table for *machine*.

    The table is cached as ``llm-perf-leaderboard-<machine>.csv`` inside
    ``DATASET_DIRECTORY``. If that file exists it is read and returned;
    otherwise the raw data is loaded, processed, written to the cache file
    and returned.

    Args:
        machine: Machine name; also part of the cache file name.
        backends: Forwarded to ``get_raw_llm_perf_df``.
        hardware_type: Forwarded to ``get_raw_llm_perf_df``.

    Returns:
        The processed performance DataFrame.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(DATASET_DIRECTORY, exist_ok=True)
    cache_path = os.path.join(
        DATASET_DIRECTORY, f"llm-perf-leaderboard-{machine}.csv"
    )

    if os.path.exists(cache_path):
        return pd.read_csv(cache_path)

    print(f"Dataset machine {machine} not found, downloading...")
    llm_perf_df = get_raw_llm_perf_df(machine, backends, hardware_type)
    llm_perf_df = processed_llm_perf_df(llm_perf_df)
    llm_perf_df.to_csv(cache_path, index=False)

    return llm_perf_df
src/panel.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
3
+ import gradio as gr
4
+
5
+ from src.leaderboard import get_leaderboard_df
6
+ from src.llm_perf import get_llm_perf_df
7
+
8
+ # from attention_implementations import get_attn_decode_fig, get_attn_prefill_fig
9
+ # from custom_kernels import get_kernel_decode_fig, get_kernel_prefill_fig
10
+
11
+
12
def create_control_panel(
    machine: str,
    backends: List[str],
    hardware_provider: str,
    hardware_type: str,
):
    """Create the collapsible filter panel for one hardware tab.

    Must be called inside an active Gradio layout context, because the
    components are created as a side effect.

    Args:
        machine: Machine name, stored in a ``gr.State`` for the callbacks.
        backends: Backend names, stored in a ``gr.State`` for the callbacks.
        hardware_provider: Selects the offered quantization schemes; only
            ``"ARM"`` is supported.
        hardware_type: Hardware type, stored in a ``gr.State``.

    Returns:
        Tuple ``(filter_button, machine_value, backends_value,
        hardware_type_value, memory_slider, quantization_checkboxes)``.

    Raises:
        ValueError: If *hardware_provider* is not ``"ARM"``.
    """
    # controls
    machine_value = gr.State(value=machine)
    backends_value = gr.State(value=backends)
    hardware_type_value = gr.State(value=hardware_type)

    if hardware_provider != "ARM":
        raise ValueError(f"Unknown hardware provider: {hardware_provider}")
    quantizations = ["Q8_0", "Q4_K_M", "Q4_0_4_4"]

    with gr.Accordion("Control Panel", open=False, elem_id="control-panel"):
        with gr.Row():
            with gr.Column(scale=2, variant="panel"):
                memory_slider = gr.Slider(
                    label="Model Size (GB) 📈",
                    info="🎚️ Slide to maximum Model Size",
                    minimum=0,
                    maximum=16,
                    value=16,
                    elem_id="memory-slider",
                )
            with gr.Column(scale=1, variant="panel"):
                quantization_checkboxes = gr.CheckboxGroup(
                    label="Quantizations",
                    choices=quantizations,
                    value=quantizations,
                    info="☑️ Select the quantization schemes",
                    elem_id="quantization-checkboxes",
                    elem_classes="boxed-option",
                )
        with gr.Row():
            filter_button = gr.Button(
                value="Filter 🚀",
                elem_id="filter-button",
                elem_classes="boxed-option",
            )

    return (
        filter_button,
        machine_value,
        backends_value,
        hardware_type_value,
        memory_slider,
        quantization_checkboxes,
    )
64
+
65
def filter_rows_fn(
    machine,
    backends,
    hardware_type,
    # inputs
    memory,
    quantizations,
    # interactive
    columns,
    search,
):
    """Return the leaderboard rows that match the control-panel filters.

    Args:
        machine: Machine name, forwarded to ``get_llm_perf_df``.
        backends: Backend names, forwarded to ``get_llm_perf_df``.
        hardware_type: Hardware type, forwarded to ``get_llm_perf_df``.
        memory: Upper bound (inclusive) on "Model Size (GB)".
        quantizations: Accepted values of the "Quantization" column.
        columns: Columns to keep, in display order.
        search: Text that must occur in the model name; matched literally
            and case-insensitively.

    Returns:
        A one-element list holding the filtered display DataFrame.
    """
    llm_perf_df = get_llm_perf_df(
        machine=machine, backends=backends, hardware_type=hardware_type
    )
    # regex=False: the search box holds plain text, so characters such as
    # "(" or "." must not be interpreted as a pattern.
    keep_mask = (
        llm_perf_df["Model"].str.contains(search, case=False, regex=False)
        & llm_perf_df["Quantization"].isin(quantizations)
        & (llm_perf_df["Model Size (GB)"] <= memory)
    )
    filtered_llm_perf_df = llm_perf_df[keep_mask]
    # The rows are already restricted by the search text, so only the display
    # transformation and the column selection remain to be applied.
    selected_filtered_llm_perf_df = get_leaderboard_df(filtered_llm_perf_df)[
        columns
    ]

    return [
        selected_filtered_llm_perf_df,
    ]
105
+
106
+
107
def create_control_callback(
    # button
    filter_button,
    # fixed
    machine_value,
    backends_value,
    hardware_type_value,
    # inputs
    memory_slider,
    quantization_checkboxes,
    # interactive
    columns_checkboxes,
    search_bar,
    # outputs
    leaderboard_table,
):
    """Wire the filter button to ``filter_rows_fn``.

    On click, the fixed state values, the filter controls and the
    interactive controls are passed (in that order) to ``filter_rows_fn``,
    and its result is written to *leaderboard_table*.
    """
    fixed_state = [machine_value, backends_value, hardware_type_value]
    filter_controls = [memory_slider, quantization_checkboxes]
    view_controls = [columns_checkboxes, search_bar]

    filter_button.click(
        fn=filter_rows_fn,
        inputs=fixed_state + filter_controls + view_controls,
        outputs=[leaderboard_table],
    )
153
+
154
+
155
def select_columns_fn(
    machine, backends, hardware_type, columns, search, llm_perf_df=None
):
    """Return the display table restricted to *columns* and *search*.

    Args:
        machine: Machine name, used when the data has to be loaded.
        backends: Backend names, used when the data has to be loaded.
        hardware_type: Hardware type, used when the data has to be loaded.
        columns: Columns to keep, in display order.
        search: Text that must occur in the model name; matched literally
            and case-insensitively.
        llm_perf_df: Optional pre-loaded performance data. When ``None`` it
            is fetched with ``get_llm_perf_df``.

    Returns:
        The display DataFrame containing only the matching rows and the
        requested columns.
    """
    if llm_perf_df is None:
        llm_perf_df = get_llm_perf_df(
            machine=machine,
            backends=backends,
            hardware_type=hardware_type,
        )

    # Match against the plain model names *before* they are turned into HTML
    # links; otherwise fragments of the markup (e.g. "href") match every row.
    # regex=False because the search box holds plain text, not a pattern.
    name_matches = llm_perf_df["Model"].str.contains(
        search, case=False, regex=False
    )
    selected_leaderboard_df = get_leaderboard_df(llm_perf_df[name_matches])

    return selected_leaderboard_df[columns]
172
+
173
+
174
def create_select_callback(
    # fixed
    machine_value,
    backends_value,
    hardware_type_value,
    # interactive
    columns_checkboxes,
    search_bar,
    # outputs
    leaderboard_table,
):
    """Refresh the table whenever the column selection or search text changes.

    Both components get the same ``change`` handler, ``select_columns_fn``,
    with identical inputs and *leaderboard_table* as the only output.
    """
    # Column selector first, then search bar: same registration order as before.
    for trigger in (columns_checkboxes, search_bar):
        trigger.change(
            fn=select_columns_fn,
            # fresh lists per registration, so nothing is shared between them
            inputs=[
                machine_value,
                backends_value,
                hardware_type_value,
                columns_checkboxes,
                search_bar,
            ],
            outputs=[leaderboard_table],
        )
src/utils.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+
2
+
3
def model_hyperlink(link, model_name):
    """Return an HTML anchor that opens *link* in a new tab, labelled *model_name*.

    Both values are inserted into the markup as-is, without HTML escaping.
    """
    anchor_style = (
        "color: var(--link-text-color); "
        "text-decoration: underline;"
        "text-decoration-style: dotted;"
    )
    opening_tag = '<a target="_blank" href="' + f"{link}" + '" style="' + anchor_style + '">'
    return opening_tag + f"{model_name}" + "</a>"