MirakramAghalarov commited on
Commit
a76b907
1 Parent(s): 994653a

Production Commit

Browse files
.gitattributes CHANGED
@@ -25,7 +25,6 @@
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
  *.tflite filter=lfs diff=lfs merge=lfs -text
30
  *.tgz filter=lfs diff=lfs merge=lfs -text
31
  *.wasm filter=lfs diff=lfs merge=lfs -text
@@ -33,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
 
28
  *.tflite filter=lfs diff=lfs merge=lfs -text
29
  *.tgz filter=lfs diff=lfs merge=lfs -text
30
  *.wasm filter=lfs diff=lfs merge=lfs -text
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,23 @@
1
+ auto_evals/
2
+ venv/
3
+ hfvenv/
4
+ __pycache__/
5
+ .env
6
+ .ipynb_checkpoints
7
+ *ipynb
8
+ .vscode/
9
+
10
+ gpt_4_evals/
11
+ human_evals/
12
+ eval-queue/
13
+ eval-results/
14
+ eval-results-group/
15
+ auto_evals/
16
+
17
+ src/assets/model_counts.html
18
+
19
+ test
20
+ env
21
+ a.py
22
+ testing.py
23
+ frontend
.pre-commit-config.yaml ADDED
@@ -0,0 +1,53 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ default_language_version:
16
+ python: python3
17
+
18
+ ci:
19
+ autofix_prs: true
20
+ autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
21
+ autoupdate_schedule: quarterly
22
+
23
+ repos:
24
+ - repo: https://github.com/pre-commit/pre-commit-hooks
25
+ rev: v4.3.0
26
+ hooks:
27
+ - id: check-yaml
28
+ - id: check-case-conflict
29
+ - id: detect-private-key
30
+ - id: check-added-large-files
31
+ args: ['--maxkb=1000']
32
+ - id: requirements-txt-fixer
33
+ - id: end-of-file-fixer
34
+ - id: trailing-whitespace
35
+
36
+ - repo: https://github.com/PyCQA/isort
37
+ rev: 5.12.0
38
+ hooks:
39
+ - id: isort
40
+ name: Format imports
41
+
42
+ - repo: https://github.com/psf/black
43
+ rev: 22.12.0
44
+ hooks:
45
+ - id: black
46
+ name: Format code
47
+ additional_dependencies: ['click==8.0.2']
48
+
49
+ - repo: https://github.com/charliermarsh/ruff-pre-commit
50
+ # Ruff version.
51
+ rev: 'v0.0.267'
52
+ hooks:
53
+ - id: ruff
Makefile ADDED
@@ -0,0 +1,13 @@
1
+ .PHONY: style format
2
+
3
+
4
+ style:
5
+ python -m black --line-length 119 .
6
+ python -m isort .
7
+ ruff check --fix .
8
+
9
+
10
+ quality:
11
+ python -m black --check --line-length 119 .
12
+ python -m isort --check-only .
13
+ ruff check .
README.md CHANGED
@@ -1,11 +1,36 @@
1
  ---
2
- title: Leaderboard Frontend
3
- emoji: 👀
4
- colorFrom: purple
5
- colorTo: purple
6
- sdk: static
7
- pinned: false
8
- license: cc-by-nc-sa-4.0
 
 
9
  ---
10
 
11
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
+ title: Azerbaijani LLM Leaderboard
3
+ emoji: 🥇
4
+ colorFrom: green
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 4.36.1
8
+ app_file: app.py
9
+ pinned: true
10
+ license: apache-2.0
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
14
+
15
+ Most of the variables to change for a default leaderboard are in `src/envs.py` (replace the repository paths for your leaderboard) and `src/display/about.py`.
16
+
17
+ Results files should have the following format:
18
+ ```
19
+ {
20
+ "config": {
21
+ "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
22
+ "model_name": "path of the model on the hub: org/model",
23
+ "model_sha": "revision on the hub",
24
+ },
25
+ "results": {
26
+ "task_name": {
27
+ "metric_name": score,
28
+ },
29
+ "task_name2": {
30
+ "metric_name": score,
31
+ }
32
+ }
33
+ }
34
+ ```
35
+
36
+ Request files are created automatically by this tool.
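For illustration only, a results file in this format could be generated with a short script like the one below; the model name, task names, scores, and output filename are hypothetical, and `"metric_name"` is the placeholder metric key used by `create_task_list` in `src/display/about.py`.

```python
import json

# Hypothetical results file matching the format described above.
result = {
    "config": {
        "model_dtype": "torch.float16",    # or torch.bfloat16, 8bit, 4bit
        "model_name": "my-org/my-model",   # path of the model on the hub
        "model_sha": "main",               # revision on the hub
    },
    "results": {
        "ARC": {"metric_name": 0.42},      # one entry per evaluated task
        "GSM8K": {"metric_name": 0.31},
    },
}

# Write the file; the "results_" prefix mirrors how result files are named when read back.
with open("results_my-org_my-model.json", "w") as f:
    json.dump(result, f, indent=2)
```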
app.py ADDED
@@ -0,0 +1,403 @@
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from apscheduler.schedulers.background import BackgroundScheduler
4
+ from huggingface_hub import snapshot_download
5
+ import os
6
+ os.environ['CURL_CA_BUNDLE'] = ''
7
+
8
+ from src.display.about import (
9
+ EVALUATION_QUEUE_TEXT,
10
+ INTRODUCTION_TEXT,
11
+ LLM_BENCHMARKS_TEXT,
12
+ LLM_DATASET_TEXT,
13
+ TITLE,
14
+ )
15
+ from src.display.css_html_js import custom_css
16
+ from src.display.utils import (
17
+ BENCHMARK_COLS,
18
+ COLS,
19
+ EVAL_COLS,
20
+ EVAL_TYPES,
21
+ TYPES,
22
+ AutoEvalColumn,
23
+ fields,
24
+ BENCHMARK_COLS_GROUP,
25
+ COLS_GROUP,
26
+ EVAL_COLS_GROUP,
27
+ EVAL_TYPES_GROUP,
28
+ TYPES_GROUP,
29
+ AutoEvalColumnGroup,
30
+ )
31
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, TOKEN, QUEUE_REPO, REPO_ID, RESULTS_REPO, EVAL_RESULTS_GROUP_PATH, RESULTS_GROUP_REPO
32
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_evaluation_queue_df_group, get_leaderboard_group_df
33
+ from src.submission.submit import add_new_eval
34
+
35
+
36
+ def restart_space():
37
+ API.restart_space(repo_id=REPO_ID, token=TOKEN)
38
+
39
+ try:
40
+ print(EVAL_REQUESTS_PATH)
41
+ snapshot_download(
42
+ repo_id=QUEUE_REPO,
43
+ local_dir=EVAL_REQUESTS_PATH,
44
+ repo_type="dataset",
45
+ tqdm_class=None,
46
+ etag_timeout=30,
47
+ force_download=True,
48
+ token=TOKEN
49
+ )
50
+ except Exception:
51
+ restart_space()
52
+ try:
53
+ print(EVAL_RESULTS_PATH)
54
+ snapshot_download(
55
+ repo_id=RESULTS_REPO,
56
+ local_dir=EVAL_RESULTS_PATH,
57
+ repo_type="dataset",
58
+ tqdm_class=None,
59
+ etag_timeout=30,
60
+ force_download=True,
61
+ token=TOKEN
62
+ )
63
+ snapshot_download(
64
+ repo_id=RESULTS_GROUP_REPO,
65
+ local_dir=EVAL_RESULTS_GROUP_PATH,
66
+ repo_type="dataset",
67
+ tqdm_class=None,
68
+ etag_timeout=30,
69
+ force_download=True,
70
+ token=TOKEN)
71
+ except Exception:
72
+ restart_space()
73
+
74
+
75
+ raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
76
+ raw_data_grouped, original_df_grouped = get_leaderboard_group_df(EVAL_RESULTS_GROUP_PATH, COLS_GROUP, BENCHMARK_COLS_GROUP)
77
+
78
+ leaderboard_grouped_df = original_df_grouped.copy()
79
+ leaderboard_df = original_df.copy()
80
+
81
+ (
82
+ finished_eval_queue_df,
83
+ running_eval_queue_df,
84
+ pending_eval_queue_df,
85
+ ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
86
+
87
+
88
+ (
89
+ finished_eval_queue_g_df,
90
+ running_eval_queue_g_df,
91
+ pending_eval_queue_g_df,
92
+ ) = get_evaluation_queue_df_group(EVAL_REQUESTS_PATH, EVAL_COLS_GROUP)
93
+
94
+ # Searching and filtering
95
+ def update_table(
96
+ hidden_df: pd.DataFrame,
97
+ columns: list,
98
+ query: str,
99
+ ):
100
+ filtered_df = filter_queries(query, hidden_df)
101
+ df = select_columns(filtered_df, columns)
102
+ return df
103
+
104
+
105
+ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
106
+ return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
107
+
108
+
109
+ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
110
+ always_here_cols = [
111
+ AutoEvalColumn.model_submission_date.name,
112
+ AutoEvalColumn.model.name,
113
+ ]
114
+ # We use COLS to maintain sorting
115
+ filtered_df = df[
116
+ always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
117
+ ]
118
+ return filtered_df
119
+
120
+
121
+ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
122
+ final_df = []
123
+ if query != "":
124
+ queries = [q.strip() for q in query.split(";")]
125
+ for _q in queries:
126
+ if _q != "":
127
+ temp_filtered_df = search_table(filtered_df, _q)
128
+ if len(temp_filtered_df) > 0:
129
+ final_df.append(temp_filtered_df)
130
+ if len(final_df) > 0:
131
+ filtered_df = pd.concat(final_df)
132
+ filtered_df = filtered_df.drop_duplicates(
133
+ subset=[AutoEvalColumn.model.name, AutoEvalColumn.model_submission_date.name]
134
+ )
135
+
136
+ return filtered_df
137
+
138
+
139
+ demo = gr.Blocks(css=custom_css)
140
+ with demo:
141
+ gr.HTML(TITLE)
142
+ with gr.Row():
143
+ with gr.Column(scale=9):
144
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
145
+ with gr.Column(scale=1, min_width=1):
146
+ gr.Image('src/display/kapital.jpg', scale=1,
147
+ show_label=False,
148
+ interactive=False,
149
+ show_share_button=False,
150
+ show_download_button=False)
151
+
152
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
153
+ with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
154
+ with gr.Row():
155
+ with gr.Row():
156
+ search_bar = gr.Textbox(
157
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
158
+ show_label=False,
159
+ elem_id="search-bar",
160
+ )
161
+ with gr.Row():
162
+ shown_columns = gr.CheckboxGroup(
163
+ choices=[
164
+ c.name
165
+ for c in fields(AutoEvalColumnGroup)
166
+ if not c.hidden and not c.never_hidden and not c.dummy
167
+ ],
168
+ value=[
169
+ c.name
170
+ for c in fields(AutoEvalColumnGroup)
171
+ if c.displayed_by_default and not c.hidden and not c.never_hidden
172
+ ],
173
+ label="Select columns to show",
174
+ elem_id="column-select",
175
+ interactive=True,
176
+ )
177
+
178
+ leaderboard_table = gr.components.Dataframe(
179
+ value=leaderboard_grouped_df[
180
+ [c.name for c in fields(AutoEvalColumnGroup) if c.never_hidden]
181
+ + shown_columns.value
182
+ + [AutoEvalColumnGroup.dummy.name]
183
+ ],
184
+ headers=[c.name for c in fields(AutoEvalColumnGroup) if c.never_hidden] + shown_columns.value + [AutoEvalColumnGroup.dummy.name],
185
+ datatype=TYPES_GROUP,
186
+ elem_id="leaderboard-table",
187
+ interactive=False,
188
+ visible=True,
189
+ column_widths=["15%", "30%"]
190
+ )
191
+
192
+ # Dummy leaderboard for handling the case when the user uses backspace key
193
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
194
+ value=original_df_grouped[COLS_GROUP],
195
+ headers=COLS_GROUP,
196
+ datatype=TYPES_GROUP,
197
+ visible=False,
198
+ )
199
+ search_bar.submit(
200
+ update_table,
201
+ [
202
+ hidden_leaderboard_table_for_search,
203
+ shown_columns,
204
+ search_bar,
205
+ ],
206
+ leaderboard_table,
207
+ )
208
+ for selector in [shown_columns]:
209
+ selector.change(
210
+ update_table,
211
+ [
212
+ hidden_leaderboard_table_for_search,
213
+ shown_columns,
214
+ search_bar,
215
+ ],
216
+ leaderboard_table,
217
+ queue=True,
218
+ )
219
+
220
+ with gr.TabItem("🏅 LLM Benchmark FineGrained", elem_id="llm-benchmark-tab-table-1", id=1):
221
+ with gr.Row():
222
+ with gr.Row():
223
+ search_bar = gr.Textbox(
224
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
225
+ show_label=False,
226
+ elem_id="search-bar",
227
+ )
228
+ with gr.Row():
229
+ shown_columns = gr.CheckboxGroup(
230
+ choices=[
231
+ c.name
232
+ for c in fields(AutoEvalColumn)
233
+ if not c.hidden and not c.never_hidden and not c.dummy
234
+ ],
235
+ value=[
236
+ c.name
237
+ for c in fields(AutoEvalColumn)
238
+ if c.displayed_by_default and not c.hidden and not c.never_hidden
239
+ ],
240
+ label="Select columns to show",
241
+ elem_id="column-select",
242
+ interactive=True,
243
+ )
244
+
245
+ leaderboard_table = gr.components.Dataframe(
246
+ value=leaderboard_df[
247
+ [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
248
+ + shown_columns.value
249
+ + [AutoEvalColumn.dummy.name]
250
+ ],
251
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value + [AutoEvalColumn.dummy.name],
252
+ datatype=TYPES,
253
+ elem_id="leaderboard-table",
254
+ interactive=False,
255
+ visible=True,
256
+ column_widths=["15%", "30%"]
257
+ )
258
+
259
+ # Dummy leaderboard for handling the case when the user uses backspace key
260
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
261
+ value=original_df[COLS],
262
+ headers=COLS,
263
+ datatype=TYPES,
264
+ visible=False,
265
+ )
266
+ search_bar.submit(
267
+ update_table,
268
+ [
269
+ hidden_leaderboard_table_for_search,
270
+ shown_columns,
271
+ search_bar,
272
+ ],
273
+ leaderboard_table,
274
+ )
275
+ for selector in [shown_columns]:
276
+ selector.change(
277
+ update_table,
278
+ [
279
+ hidden_leaderboard_table_for_search,
280
+ shown_columns,
281
+ search_bar,
282
+ ],
283
+ leaderboard_table,
284
+ queue=True,
285
+ )
286
+
287
+ with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=2):
288
+ with gr.Column():
289
+ with gr.Row():
290
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
291
+
292
+ with gr.Column():
293
+ with gr.Accordion(
294
+ f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
295
+ open=False,
296
+ ):
297
+ with gr.Row():
298
+ finished_eval_table = gr.components.Dataframe(
299
+ value=finished_eval_queue_df,
300
+ headers=EVAL_COLS,
301
+ datatype=EVAL_TYPES,
302
+ row_count=5,
303
+ )
304
+ with gr.Accordion(
305
+ f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
306
+ open=False,
307
+ ):
308
+ with gr.Row():
309
+ running_eval_table = gr.components.Dataframe(
310
+ value=running_eval_queue_df,
311
+ headers=EVAL_COLS,
312
+ datatype=EVAL_TYPES,
313
+ row_count=5,
314
+ )
315
+
316
+ with gr.Accordion(
317
+ f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
318
+ open=False,
319
+ ):
320
+ with gr.Row():
321
+ pending_eval_table = gr.components.Dataframe(
322
+ value=pending_eval_queue_df,
323
+ headers=EVAL_COLS,
324
+ datatype=EVAL_TYPES,
325
+ row_count=5,
326
+ )
327
+
328
+
329
+
330
+ with gr.Row():
331
+ gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
332
+
333
+ with gr.Row():
334
+ with gr.Column():
335
+ with gr.Row():
336
+ model_name_textbox = gr.Textbox(label="Model name")
337
+
338
+ with gr.Column():
339
+ with gr.Row():
340
+ weight_type = gr.Dropdown(
341
+ choices=['safetensors', 'gguf'],
342
+ label="Weights type",
343
+ multiselect=False,
344
+ value='safetensors',
345
+ interactive=True,
346
+ )
347
+
348
+ with gr.Column():
349
+ with gr.Row():
350
+ gguf_filename_textbox = gr.Textbox(label="GGUF filename")
351
+
352
+ submit_button = gr.Button("Submit Eval")
353
+ submission_result = gr.Markdown()
354
+ submit_button.click(
355
+ add_new_eval,
356
+ [
357
+ model_name_textbox,
358
+ weight_type,
359
+ gguf_filename_textbox
360
+ ],
361
+ submission_result,
362
+ )
363
+
364
+ with gr.TabItem("📝 Evaluation Datasets", elem_id="llm-benchmark-tab-table", id=4):
365
+ gr.Markdown(LLM_DATASET_TEXT, elem_classes="markdown-text")
366
+ gr.HTML("""<h1 align="center" id="space-title"> Contributor Companies and Teams </h1>""")
367
+ with gr.Row():
368
+ with gr.Column(scale=35):
369
+ pass
370
+ with gr.Column(scale=10, min_width=1, elem_classes='center-column'):
371
+ gr.Image('src/display/localdocs.jpeg',
372
+ scale = 1,
373
+ height=160,
374
+ show_label=False,
375
+ interactive=False,
376
+ show_share_button=False,
377
+ show_download_button=False)
378
+ gr.HTML("""<h1 align="center" id="company tile"> LocalDocs </h1>""")
379
+ with gr.Column(scale=10, min_width=1, elem_classes='center-column'):
380
+ gr.Image('src/display/prodata.png',
381
+ scale = 1,
382
+ height=160,
383
+ show_label=False,
384
+ interactive=False,
385
+ show_share_button=False,
386
+ show_download_button=False)
387
+ gr.HTML("""<h1 align="center" id="company tile"> PRODATA </h1>""")
388
+ with gr.Column(scale=10, min_width=1, elem_classes='center-column'):
389
+ gr.Image('src/display/bhosai.jpeg',
390
+ scale = 1,
391
+ height=160,
392
+ show_label=False,
393
+ interactive=False,
394
+ show_share_button=False,
395
+ show_download_button=False)
396
+ gr.HTML("""<h1 align="center" id="company tile"> BHOSAI </h1>""")
397
+ with gr.Column(scale=35):
398
+ pass
399
+
400
+ scheduler = BackgroundScheduler()
401
+ scheduler.add_job(restart_space, "interval", seconds=1000)
402
+ scheduler.start()
403
+ demo.queue(default_concurrency_limit=40).launch()
index.html DELETED
@@ -1,19 +0,0 @@
1
- <!doctype html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>My static Space</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div class="card">
11
- <h1>Welcome to your static Space!</h1>
12
- <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
- <p>
14
- Also don't forget to check the
15
- <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
- </p>
17
- </div>
18
- </body>
19
- </html>
pyproject.toml ADDED
@@ -0,0 +1,13 @@
1
+ [tool.ruff]
2
+ # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
3
+ select = ["E", "F"]
4
+ ignore = ["E501"] # line too long (black is taking care of this)
5
+ line-length = 119
6
+ fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
7
+
8
+ [tool.isort]
9
+ profile = "black"
10
+ line_length = 119
11
+
12
+ [tool.black]
13
+ line-length = 119
requirements.txt ADDED
@@ -0,0 +1,88 @@
1
+ aiofiles==23.2.1
2
+ aiohappyeyeballs==2.4.2
3
+ aiohttp==3.10.8
4
+ aiosignal==1.3.1
5
+ altair==5.4.1
6
+ annotated-types==0.7.0
7
+ anyio==4.6.0
8
+ APScheduler==3.10.1
9
+ async-timeout==4.0.3
10
+ attrs==24.2.0
11
+ black==23.11.0
12
+ certifi==2024.8.30
13
+ charset-normalizer==3.3.2
14
+ click==8.1.3
15
+ contourpy==1.3.0
16
+ cycler==0.12.1
17
+ datasets==2.14.5
18
+ dill==0.3.7
19
+ exceptiongroup==1.2.2
20
+ fastapi==0.115.3
21
+ ffmpy==0.4.0
22
+ filelock==3.16.1
23
+ fonttools==4.54.1
24
+ frozenlist==1.4.1
25
+ fsspec==2023.6.0
26
+ gradio==5.3.0
27
+ gradio_client==1.4.2
28
+ h11==0.14.0
29
+ httpcore==1.0.5
30
+ httpx==0.27.2
31
+ huggingface-hub==0.25.1
32
+ idna==3.10
33
+ importlib_resources==6.4.5
34
+ Jinja2==3.1.4
35
+ jsonschema==4.23.0
36
+ jsonschema-specifications==2023.12.1
37
+ kiwisolver==1.4.7
38
+ markdown-it-py==3.0.0
39
+ MarkupSafe==2.1.5
40
+ matplotlib==3.7.1
41
+ mdurl==0.1.2
42
+ multidict==6.1.0
43
+ multiprocess==0.70.15
44
+ mypy-extensions==1.0.0
45
+ narwhals==1.8.4
46
+ numpy==1.26.4
47
+ orjson==3.10.7
48
+ packaging==24.1
49
+ pandas==2.0.0
50
+ pathspec==0.12.1
51
+ pillow==10.4.0
52
+ platformdirs==4.3.6
53
+ pyarrow==17.0.0
54
+ pydantic==2.9.2
55
+ pydantic_core==2.23.4
56
+ pydub==0.25.1
57
+ Pygments==2.18.0
58
+ pyparsing==3.1.4
59
+ python-dateutil==2.8.2
60
+ python-multipart==0.0.12
61
+ pytz==2024.2
62
+ PyYAML==6.0.2
63
+ referencing==0.35.1
64
+ regex==2024.9.11
65
+ requests==2.32.3
66
+ rich==13.8.1
67
+ rpds-py==0.20.0
68
+ ruff==0.6.8
69
+ safetensors==0.4.5
70
+ semantic-version==2.10.0
71
+ shellingham==1.5.4
72
+ six==1.16.0
73
+ sniffio==1.3.1
74
+ starlette==0.41.0
75
+ tokenizers==0.15.2
76
+ tomli==2.0.1
77
+ tomlkit==0.12.0
78
+ tqdm==4.65.0
79
+ transformers==4.35.2
80
+ typer==0.12.5
81
+ typing_extensions==4.12.2
82
+ tzdata==2024.2
83
+ tzlocal==5.2
84
+ urllib3==2.2.3
85
+ uvicorn==0.31.0
86
+ websockets==11.0.3
87
+ xxhash==3.5.0
88
+ yarl==1.13.1
src/datasets.json ADDED
@@ -0,0 +1,130 @@
1
+ [
2
+ {
3
+ "task_type": "mmlu",
4
+ "dstype": "mc",
5
+ "group": "Banking",
6
+ "subtext": "You are an AI that selects the most accurate answer in Azerbaijani based on a given question. You will be provided with a question in Azerbaijani and multiple options in Azerbaijani. Choose the single letter (A, B, C, D) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
7
+ "data": "LLM-Beetle/Banking_Exam_MCQ",
8
+ "name": "Banking_Exam_MCQ"
9
+ },
10
+ {
11
+ "task_type": "mmlu",
12
+ "dstype": "kmc_azerbaycan_dili",
13
+ "group": "MMLU",
14
+ "subtext": "You are an AI designed to answer questions in Azerbaijani based on grammatical concepts and linguistics. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
15
+ "data": "LLM-Beetle/Azerbaijani_Lang_MC",
16
+ "name": "Azerbaijani_Lang_MC"
17
+ },
18
+ {
19
+ "task_type": "mmlu",
20
+ "dstype": "kmc_edebiyyat",
21
+ "group": "MMLU",
22
+ "subtext": "You are an AI designed to answer questions in Azerbaijani based on literary and historical facts. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
23
+ "data": "LLM-Beetle/Literature_MC",
24
+ "name": "Azerbaijani_Lit_MC"
25
+ },
26
+ {
27
+ "task_type": "mmlu",
28
+ "dstype": "kmc_biologiya",
29
+ "group": "MMLU",
30
+ "subtext": "You are an AI designed to answer questions in Azerbaijani based on biology. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
31
+ "data": "LLM-Beetle/Biology_MC",
32
+ "name": "Biology_MC"
33
+ },
34
+ {
35
+ "task_type": "mmlu",
36
+ "dstype": "kmc_cografiya",
37
+ "group": "MMLU",
38
+ "subtext": "You are an AI designed to answer questions in Azerbaijani based on geographical and environmental knowledge. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
39
+ "data": "LLM-Beetle/Geography_MC",
40
+ "name": "Geography_MC"
41
+ },
42
+ {
43
+ "task_type": "mmlu",
44
+ "dstype": "kmc_mentiq",
45
+ "group": "MMLU",
46
+ "subtext": "You are an AI designed to answer questions in Azerbaijani based on logical reasoning and problem-solving. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
47
+ "data": "LLM-Beetle/Logic_MC",
48
+ "name": "Logic_MC"
49
+ },
50
+ {
51
+ "task_type": "mmlu",
52
+ "dstype": "kmc_tarix",
53
+ "group": "MMLU",
54
+ "subtext": "You are an AI designed to answer questions in Azerbaijani based on historical and cultural facts. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
55
+ "data": "LLM-Beetle/History_MC",
56
+ "name": "History_MC"
57
+ },
58
+ {
59
+ "task_type": "mmlu",
60
+ "dstype": "kmc_informatika",
61
+ "group": "MMLU",
62
+ "subtext": "You are an AI designed to answer questions in Azerbaijani based on technology and computer science. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
63
+ "data": "LLM-Beetle/Informatics_MC",
64
+ "name": "Informatics_MC"
65
+ },
66
+ {
67
+ "task_type": "mmlu",
68
+ "dstype": "kmc_fizika",
69
+ "group": "MMLU",
70
+ "subtext": "You are an AI designed to answer questions in Azerbaijani based on physics concepts. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
71
+ "data": "LLM-Beetle/Physics_MC",
72
+ "name": "Physics_MC"
73
+ },
74
+ {
75
+ "task_type": "mmlu",
76
+ "dstype": "kmc_kimya",
77
+ "group": "MMLU",
78
+ "subtext": "You are an AI designed to answer questions in Azerbaijani based on chemistry and scientific concepts. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
79
+ "data": "LLM-Beetle/Chemistry_MC",
80
+ "name": "Chemistry_MC"
81
+ },
82
+ {
83
+ "task_type": "mmlu",
84
+ "dstype": "kmc_azerbaycan_tarixi",
85
+ "group": "MMLU",
86
+ "subtext": "You are an AI designed to answer questions in Azerbaijani based on historical facts. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
87
+ "data": "LLM-Beetle/Azerbaijani_Hist_MC",
88
+ "name": "Azerbaijani_Hist_MC"
89
+ },
90
+ {
91
+ "task_type": "mmlu",
92
+ "dstype": "tc",
93
+ "group": "Banking",
94
+ "subtext": "You are an AI designed to answer questions in Azerbaijani. Your task is to select the correct option from the given question and answer choices. You are given a statement along with multiple options that represent different topics. Choose the option that best categorizes the statement based on its topic. Choose the single letter (A, B, C, D, E, F, G, H, I, J) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
95
+ "data": "LLM-Beetle/Banking_Call_Classification_MC",
96
+ "name": "Banking_Call_Classification_MC"
97
+ },
98
+ {
99
+ "task_type": "arc",
100
+ "dstype": "arc",
101
+ "group": "ARC",
102
+ "subtext": "You are an AI designed to answer questions in Azerbaijani based on reasoning and knowledge. Your task is to select the correct option from the given question and answer choices. You are given a question along with multiple options. Choose the correct option. Choose the single letter (A, B, C, D) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
103
+ "data": "LLM-Beetle/ARC",
104
+ "name": "ARC"
105
+ },
106
+ {
107
+ "task_type": "gsm8k",
108
+ "dstype": "mmc",
109
+ "group": "GSM8K",
110
+ "subtext": "You are an AI designed to solve mathematical word problems in Azerbaijani. Your task is to analyze the given question and select the correct option from the provided choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
111
+ "data": "LLM-Beetle/GSM8K",
112
+ "name": "GSM8K"
113
+ },
114
+ {
115
+ "task_type": "qa",
116
+ "dstype": "qa",
117
+ "group": "Banking",
118
+ "subtext": "",
119
+ "data": "LLM-Beetle/Banking_QA",
120
+ "name": "Banking_QA"
121
+ },
122
+ {
123
+ "task_type": "rag",
124
+ "dstype": "cqa",
125
+ "group": "CQA",
126
+ "subtext": "",
127
+ "data": "LLM-Beetle/Wiki_CQA",
128
+ "name": "Wiki_CQA"
129
+ }
130
+ ]
src/display/about.py ADDED
@@ -0,0 +1,103 @@
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+ import json
4
+
5
+
6
+ @dataclass
7
+ class Task:
8
+ benchmark: str
9
+ metric: str
10
+ col_name: str
11
+
12
+
13
+ # Init: to update with your specific keys
14
+ def create_task_list():
15
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
16
+ with open("src/datasets.json") as f:
17
+
18
+ data = json.load(f)
19
+
20
+ groups = []
21
+ names = []
22
+ for d in data:
23
+ groups.append(d['group'])
24
+ names.append(d['name'])
25
+ groups = list(set(groups))
26
+ tasks = []
27
+ grouped_tasks = []
28
+ for name in names:
29
+ tasks.append(Task(name, "metric_name", name))
30
+ for group in groups:
31
+ grouped_tasks.append(Task(group, "metric_name", group))
32
+
33
+ return tasks, grouped_tasks
34
+
35
+
36
+
37
+ # Your leaderboard name
38
+ TITLE = """<h1 align="center" id="space-title"> Azerbaijani LLM Leaderboard</h1>"""
39
+
40
+ # What does your leaderboard evaluate?
41
+ INTRODUCTION_TEXT = """
42
+ ## Azerbaijani Open LLM Leaderboard sponsored by Kapital Bank
43
+
44
+ The Azerbaijani Open LLM Leaderboard is sponsored by Kapital Bank to support and develop Azerbaijani language NLP. This leaderboard offers a clear and fair ranking of open-source Azerbaijani LLMs, helping researchers, developers, and the AI community work together to improve the quality and use of Azerbaijani language models.
45
+ Through this platform, we hope to bring useful AI technology to the Azerbaijani language and encourage models that are both locally relevant and internationally competitive.
46
+
47
+ ## Partners
48
+
49
+ This leaderboard is supported by Kapital Bank, LocalDocs, PRODATA LLC, and the R&D Center of Baku Higher Oil School.
50
+
51
+ """
52
+
53
+ LLM_BENCHMARKS_TEXT = f"""
54
+ ## Azerbaijani Open LLM Leaderboard sponsored by Kapital Bank
55
+
56
+ The Azerbaijani Open LLM Leaderboard is sponsored by Kapital Bank to support and develop Azerbaijani language NLP. This leaderboard offers a clear and fair ranking of open-source Azerbaijani LLMs, helping researchers, developers, and the AI community work together to improve the quality and use of Azerbaijani language models.
57
+ Through this platform, we hope to bring useful AI technology to the Azerbaijani language and encourage models that are both locally relevant and internationally competitive. Partners: this leaderboard is supported by Kapital Bank, LocalDocs, PRODATA LLC, and the R&D Center of Baku Higher Oil School.
58
+
59
+ """
60
+
61
+ LLM_DATASET_TEXT = f"""
62
+ ## Banking Call Classification MC:
63
+
64
+ 192 entries; multiple-choice classification for bank-client requests.
65
+
66
+ ## Banking Exam MCQ:
67
+
68
+ 200–300 multiple-choice questions based on university banking exam materials.
69
+
70
+ ## Banking QA:
71
+
72
+ 97 question-answer pairs on Azerbaijani banking topics.
73
+
74
+ ## Wiki CQA:
75
+
76
+ 97 entries from Azerbaijani Wikipedia, with context, questions, and answers.
77
+
78
+ ## GSM8K:
79
+
80
+ 44 grade-school math problems to test multi-step reasoning.
81
+
82
+ ## ARC:
83
+
84
+ Elementary science questions in Azerbaijani, testing knowledge and reasoning.
85
+
86
+ ## Subject-Specific MCQs:
87
+
88
+ Questions across topics like informatics, history, physics, and more, each with 100 multiple-choice questions for specific subject knowledge.
89
+ """
90
+
91
+
92
+ EVALUATION_QUEUE_TEXT = """
93
+ ## Some good practices before submitting a model
94
+
95
+ ### 1) Make sure your model exists on the Hub.
96
+ ### 2) Make sure your model is public.
97
+
98
+
99
+ ## In case of model failure
100
+ If your model is displayed in the `FAILED` category, its execution stopped.
101
+ Make sure you have followed the above steps first.
102
+ Please contact us if you are facing any trouble!
103
+ """
src/display/bhosai.jpeg ADDED
src/display/css_html_js.py ADDED
@@ -0,0 +1,98 @@
1
+ custom_css = """
2
+
3
+ .markdown-text {
4
+ font-size: 16px !important;
5
+ }
6
+
7
+ #models-to-add-text {
8
+ font-size: 18px !important;
9
+ }
10
+
11
+ #leaderboard-table {
12
+ margin-top: 15px
13
+ }
14
+
15
+ #leaderboard-table-lite {
16
+ margin-top: 15px
17
+ }
18
+
19
+ #search-bar-table-box > div:first-child {
20
+ background: none;
21
+ border: none;
22
+ }
23
+
24
+ #search-bar {
25
+ padding: 0px;
26
+ }
27
+
28
+ /* Hides the final AutoEvalColumn */
29
+ #llm-benchmark-tab-table table td:last-child,
30
+ #llm-benchmark-tab-table table th:last-child {
31
+ display: none;
32
+ }
33
+
34
+ /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
35
+ table td:first-child,
36
+ table th:first-child {
37
+ max-width: 400px;
38
+ overflow: auto;
39
+ white-space: nowrap;
40
+ }
41
+
42
+ .tab-buttons button {
43
+ font-size: 20px;
44
+ }
45
+
46
+ #scale-logo {
47
+ border-style: none !important;
48
+ box-shadow: none;
49
+ display: block;
50
+ margin-left: auto;
51
+ margin-right: auto;
52
+ max-width: 600px;
53
+ }
54
+
55
+ #scale-logo .download {
56
+ display: none;
57
+ }
58
+ #filter_type{
59
+ border: 0;
60
+ padding-left: 0;
61
+ padding-top: 0;
62
+ }
63
+ #filter_type label {
64
+ display: flex;
65
+ }
66
+ #filter_type label > span{
67
+ margin-top: var(--spacing-lg);
68
+ margin-right: 0.5em;
69
+ }
70
+ #filter_type label > .wrap{
71
+ width: 103px;
72
+ }
73
+ #filter_type label > .wrap .wrap-inner{
74
+ padding: 2px;
75
+ }
76
+ #filter_type label > .wrap .wrap-inner input{
77
+ width: 1px
78
+ }
79
+ #filter-columns-type{
80
+ border:0;
81
+ padding:0.5;
82
+ }
83
+ #filter-columns-size{
84
+ border:0;
85
+ padding:0.5;
86
+ }
87
+ #box-filter > .form{
88
+ border: 0
89
+ }
90
+ """
91
+
92
+ get_window_url_params = """
93
+ function(url_params) {
94
+ const params = new URLSearchParams(window.location.search);
95
+ url_params = Object.fromEntries(params);
96
+ return url_params;
97
+ }
98
+ """
src/display/formatting.py ADDED
@@ -0,0 +1,36 @@
1
+ import os
2
+ from datetime import datetime, timezone
3
+
4
+ from huggingface_hub import HfApi
5
+ from huggingface_hub.hf_api import ModelInfo
6
+
7
+
8
+ API = HfApi()
9
+
10
+ def model_hyperlink(link, model_name):
11
+ return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
12
+
13
+
14
+ def make_clickable_model(model_name):
15
+ link = f"https://huggingface.co/{model_name}"
16
+ return model_hyperlink(link, model_name)
17
+
18
+
19
+ def styled_error(error):
20
+ return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
21
+
22
+
23
+ def styled_warning(warn):
24
+ return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
25
+
26
+
27
+ def styled_message(message):
28
+ return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
29
+
30
+
31
+ def has_no_nan_values(df, columns):
32
+ return df[columns].notna().all(axis=1)
33
+
34
+
35
+ def has_nan_values(df, columns):
36
+ return df[columns].isna().any(axis=1)
src/display/kapital.jpg ADDED
src/display/localdocs.jpeg ADDED
src/display/prodata.png ADDED
src/display/utils.py ADDED
@@ -0,0 +1,94 @@
1
+ from dataclasses import dataclass, make_dataclass
2
+
3
+ from src.display.about import create_task_list
4
+
5
+ def fields(raw_class):
6
+ return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
7
+
8
+
9
+ # These classes are for user facing column names,
10
+ # to avoid having to change them all around the code
11
+ # when a modif is needed
12
+ @dataclass
13
+ class ColumnContent:
14
+ name: str
15
+ type: str
16
+ displayed_by_default: bool
17
+ hidden: bool = False
18
+ never_hidden: bool = False
19
+ dummy: bool = False
20
+
21
+ Tasks, Groups = create_task_list()
22
+
23
+ ## Leaderboard columns
24
+ auto_eval_column_dict = []
25
+ # Init
26
+ auto_eval_column_dict.append(["model_submission_date", ColumnContent, ColumnContent("Submission Date", "str", True, never_hidden=True)])
27
+ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
+ #Scores
29
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
30
+
31
+
32
+ for task in Tasks:
33
+ auto_eval_column_dict.append([task.benchmark, ColumnContent, ColumnContent(task.col_name, "number", True)])
34
+ # Dummy column for the search bar (hidden by the custom CSS)
35
+ auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
36
+
37
+ # We use make dataclass to dynamically fill the scores from Tasks
38
+ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
39
+
40
+ ## For the queue columns in the submission tab
41
+ @dataclass(frozen=True)
42
+ class EvalQueueColumn: # Queue column
43
+ model = ColumnContent("model", "markdown", True)
44
+ submitted_time = ColumnContent("submitted_time", "str", True)
45
+ status = ColumnContent("status", "str", True)
46
+
47
+ # Column selection
48
+ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
49
+ TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
50
+
51
+ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
52
+ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
53
+
54
+ BENCHMARK_COLS = [t.col_name for t in Tasks]
55
+
56
+
57
+
58
+
59
+
60
+ #for grouping
61
+
62
+
63
+ ## Leaderboard columns
64
+ auto_eval_group_dict = []
65
+ # Init
66
+ auto_eval_group_dict.append(["model_submission_date", ColumnContent, ColumnContent("Submission Date", "str", True, never_hidden=True)])
67
+ auto_eval_group_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
68
+ #Scores
69
+ auto_eval_group_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
70
+
71
+
72
+ for task in Groups:
73
+ auto_eval_group_dict.append([task.benchmark, ColumnContent, ColumnContent(task.col_name, "number", True)])
74
+ # Dummy column for the search bar (hidden by the custom CSS)
75
+ auto_eval_group_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
76
+
77
+ # We use make dataclass to dynamically fill the scores from Tasks
78
+ AutoEvalColumnGroup = make_dataclass("AutoEvalColumnGroup", auto_eval_group_dict, frozen=True)
79
+
80
+ ## For the queue columns in the submission tab
81
+ @dataclass(frozen=True)
82
+ class EvalQueueColumnGroup: # Queue column
83
+ model = ColumnContent("model", "markdown", True)
84
+ submitted_time = ColumnContent("submitted_time", "str", True)
85
+ status = ColumnContent("status", "str", True)
86
+
87
+ # Column selection
88
+ COLS_GROUP = [c.name for c in fields(AutoEvalColumnGroup) if not c.hidden]
89
+ TYPES_GROUP = [c.type for c in fields(AutoEvalColumnGroup) if not c.hidden]
90
+
91
+ EVAL_COLS_GROUP = [c.name for c in fields(EvalQueueColumnGroup)]
92
+ EVAL_TYPES_GROUP = [c.type for c in fields(EvalQueueColumnGroup)]
93
+
94
+ BENCHMARK_COLS_GROUP = [t.col_name for t in Groups]
src/envs.py ADDED
@@ -0,0 +1,20 @@
1
+ import os
2
+ from huggingface_hub import HfApi
3
+
4
+ # clone / pull the lmeh eval data
5
+ TOKEN = os.environ.get("HF_TOKEN", None)
6
+
7
+ OWNER = "LLM-Beetle"
8
+ REPO_ID = f"{OWNER}/frontend"
9
+ QUEUE_REPO = f"{OWNER}/requests"
10
+ RESULTS_REPO = f"{OWNER}/results"
11
+ RESULTS_GROUP_REPO = f"{OWNER}/grouped"
12
+
13
+ CACHE_PATH=os.getenv("HF_HOME", ".")
14
+
15
+ # Local caches
16
+ EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
17
+ EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
18
+ EVAL_RESULTS_GROUP_PATH = os.path.join(CACHE_PATH, "eval-results-group")
19
+
20
+ API = HfApi(token=TOKEN)
src/leaderboard/read_evals.py ADDED
@@ -0,0 +1,225 @@
1
+ import glob
2
+ import json
3
+ import math
4
+ import os
5
+ from dataclasses import dataclass
6
+
7
+ import dateutil
8
+ import numpy as np
9
+
10
+ from src.display.formatting import make_clickable_model
11
+ from src.display.utils import AutoEvalColumn, Tasks, Groups
12
+
13
+ @dataclass
14
+ class EvalResult:
15
+ eval_name: str # org_model_date (uid)
16
+ full_model: str # org/model (path on hub)
17
+ org: str
18
+ model: str
19
+ results: dict
20
+ date: str = "" # submission date of request file
21
+
22
+ @classmethod
23
+ def init_from_json_file(self, json_filepath):
24
+ """Inits the result from the specific model result file"""
25
+ with open(json_filepath) as fp:
26
+ data = json.load(fp)
27
+
28
+ config = data.get("config")
29
+
30
+ # Get model and org
31
+ org_and_model = config.get("model_name", None)
32
+ org_and_model = org_and_model.split("/", 1)
33
+
34
+ org = org_and_model[0]
35
+ model = org_and_model[1]
36
+ date = config.get("submitted_time", None)
37
+ result_key = f"{org}_{model}_{date}"
38
+ full_model = "/".join(org_and_model)
39
+
40
+ # Extract results available in this file (some results are split in several files)
41
+ results = {}
42
+ for task in Tasks:
43
+
44
+ # We average all scores of a given metric (not all metrics are present in all files)
45
+ accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
46
+ if accs.size == 0 or any([acc is None for acc in accs]):
47
+ continue
48
+
49
+ mean_acc = np.mean(accs) * 100.0
50
+ results[task.benchmark] = mean_acc
51
+
52
+ return self(
53
+ eval_name=result_key,
54
+ full_model=full_model,
55
+ org=org,
56
+ model=model,
57
+ results=results,
58
+ date=date
59
+ )
60
+
61
+
62
+ def to_dict(self):
63
+ """Converts the Eval Result to a dict compatible with our dataframe display"""
64
+ average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
65
+ data_dict = {
66
+ "eval_name": self.eval_name, # not a column, just a save name,
67
+ AutoEvalColumn.model_submission_date.name: self.date,
68
+ AutoEvalColumn.model.name: make_clickable_model(self.full_model),
69
+ AutoEvalColumn.dummy.name: self.full_model,
70
+ AutoEvalColumn.average.name: average,
71
+ }
72
+
73
+ for task in Tasks:
74
+ data_dict[task.col_name] = self.results[task.benchmark]
75
+
76
+ return data_dict
77
+
78
+
79
+
80
+
81
+ @dataclass
82
+ class EvalResultGroup:
83
+ eval_name: str # org_model_date (uid)
84
+ full_model: str # org/model (path on hub)
85
+ org: str
86
+ model: str
87
+ results: dict
88
+ date: str = "" # submission date of request file
89
+
90
+ @classmethod
91
+ def init_from_json_file(self, json_filepath):
92
+ """Inits the result from the specific model result file"""
93
+ with open(json_filepath) as fp:
94
+ data = json.load(fp)
95
+
96
+ config = data.get("config")
97
+
98
+ # Get model and org
99
+ org_and_model = config.get("model_name", None)
100
+ org_and_model = org_and_model.split("/", 1)
101
+
102
+ org = org_and_model[0]
103
+ model = org_and_model[1]
104
+ date = config.get("submitted_time", None)
105
+ result_key = f"{org}_{model}_{date}"
106
+ full_model = "/".join(org_and_model)
107
+
108
+ # Extract results available in this file (some results are split in several files)
109
+ results = {}
110
+ for task in Groups:
111
+
112
+ # We average all scores of a given metric (not all metrics are present in all files)
113
+ accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
114
+ if accs.size == 0 or any([acc is None for acc in accs]):
115
+ continue
116
+
117
+ mean_acc = np.mean(accs) * 100.0
118
+ results[task.benchmark] = mean_acc
119
+
120
+ return self(
121
+ eval_name=result_key,
122
+ full_model=full_model,
123
+ org=org,
124
+ model=model,
125
+ results=results,
126
+ date=date
127
+ )
128
+
129
+
130
+ def to_dict(self):
131
+ """Converts the Eval Result to a dict compatible with our dataframe display"""
132
+ average = sum([v for v in self.results.values() if v is not None]) / len(Groups)
133
+ data_dict = {
134
+ "eval_name": self.eval_name, # not a column, just a save name,
135
+ AutoEvalColumn.model_submission_date.name: self.date,
136
+ AutoEvalColumn.model.name: make_clickable_model(self.full_model),
137
+ AutoEvalColumn.dummy.name: self.full_model,
138
+ AutoEvalColumn.average.name: average,
139
+ }
140
+
141
+ for task in Groups:
142
+ data_dict[task.col_name] = self.results[task.benchmark]
143
+
144
+ return data_dict
145
+
146
+
147
+
148
+
149
+
150
+
151
+
152
+
153
+ def get_raw_eval_results(results_path: str) -> list[EvalResult]:
154
+ """From the path of the results folder root, extract all needed info for results"""
155
+ model_result_filepaths = []
156
+
157
+ for root, _, files in os.walk(results_path):
158
+ # We should only have json files in model results
159
+ if len(files) == 0 or any([not f.endswith(".json") for f in files]):
160
+ continue
161
+
162
+ # Sort the files by date
163
+ files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
164
+
165
+ for file in files:
166
+ model_result_filepaths.append(os.path.join(root, file))
167
+
168
+ eval_results = {}
169
+ for model_result_filepath in model_result_filepaths:
170
+ # Creation of result
171
+ eval_result = EvalResult.init_from_json_file(model_result_filepath)
172
+
173
+ # Store results of same eval together
174
+ eval_name = eval_result.eval_name
175
+ eval_results[eval_name] = eval_result
176
+
177
+ results = []
178
+ for v in eval_results.values():
179
+ try:
180
+ v.to_dict() # we test if the dict version is complete
181
+ results.append(v)
182
+ except KeyError: # not all eval values present
183
+ continue
184
+
185
+ return results
186
+
187
+
188
+
189
+
190
+ def get_group_eval_results(results_path: str) -> list[EvalResultGroup]:
191
+ """From the path of the results folder root, extract all needed info for results"""
192
+ model_result_filepaths = []
193
+
194
+ for root, _, files in os.walk(results_path):
195
+ # We should only have json files in model results
196
+ if len(files) == 0 or any([not f.endswith(".json") for f in files]):
197
+ continue
198
+
199
+ # Sort the files by date
200
+ files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
201
+
202
+ for file in files:
203
+ model_result_filepaths.append(os.path.join(root, file))
204
+
205
+ eval_results = {}
206
+ for model_result_filepath in model_result_filepaths:
207
+ # Creation of result
208
+ eval_result = EvalResultGroup.init_from_json_file(model_result_filepath)
209
+
210
+ # Store results of same eval together
211
+ eval_name = eval_result.eval_name
212
+ eval_results[eval_name] = eval_result
213
+
214
+ results = []
215
+ print(eval_results)
216
+ for v in eval_results.values():
217
+ try:
218
+ v.to_dict() # we test if the dict version is complete
219
+ results.append(v)
220
+ except KeyError: # not all eval values present
221
+ print("key error")
222
+ continue
223
+
224
+ return results
225
+
src/populate.py ADDED
@@ -0,0 +1,108 @@
1
+ import json
2
+ import os
3
+
4
+ import pandas as pd
5
+
6
+ from src.display.formatting import has_no_nan_values, make_clickable_model
7
+ from src.display.utils import AutoEvalColumn, EvalQueueColumn, AutoEvalColumnGroup, EvalQueueColumnGroup
8
+ from src.leaderboard.read_evals import get_raw_eval_results, get_group_eval_results
9
+
10
+
11
+ def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
+ raw_data = get_raw_eval_results(results_path)
13
+ all_data_json = [v.to_dict() for v in raw_data]
14
+
15
+ df = pd.DataFrame.from_records(all_data_json)
16
+ df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
17
+ df = df[cols].round(decimals=2)
18
+
19
+ # filter out if any of the benchmarks have not been produced
20
+ df = df[has_no_nan_values(df, benchmark_cols)]
21
+ return raw_data, df
22
+
23
+
24
+ def get_leaderboard_group_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
25
+ raw_data = get_group_eval_results(results_path)
26
+ all_data_json = [v.to_dict() for v in raw_data]
27
+ df = pd.DataFrame.from_records(all_data_json)
28
+ df = df.sort_values(by=[AutoEvalColumnGroup.average.name], ascending=False)
29
+ df = df[cols].round(decimals=2)
30
+
31
+ # filter out if any of the benchmarks have not been produced
32
+ df = df[has_no_nan_values(df, benchmark_cols)]
33
+ return raw_data, df
34
+
35
+
36
+
37
+
38
+ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
39
+ entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
40
+ all_evals = []
41
+
42
+ for entry in entries:
43
+ if ".json" in entry:
44
+ file_path = os.path.join(save_path, entry)
45
+ with open(file_path) as fp:
46
+ data = json.load(fp)
47
+
48
+ data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
49
+
50
+ all_evals.append(data)
51
+ elif ".md" not in entry:
52
+ # this is a folder
53
+ sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
54
+ for sub_entry in sub_entries:
55
+ file_path = os.path.join(save_path, entry, sub_entry)
56
+ with open(file_path) as fp:
57
+ data = json.load(fp)
58
+
59
+ data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
60
+ all_evals.append(data)
61
+
62
+ pending_list = [e for e in all_evals if e["status"] == "PENDING"]
63
+ running_list = [e for e in all_evals if e["status"] == "RUNNING"]
64
+ finished_list = [e for e in all_evals if e["status"] == "FINISHED"]
65
+
66
+ df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
67
+ df_running = pd.DataFrame.from_records(running_list, columns=cols)
68
+ df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
69
+
70
+ return df_finished[cols], df_running[cols], df_pending[cols]
71
+
72
+
73
+
74
+
75
+
76
+ def get_evaluation_queue_df_group(save_path: str, cols: list) -> list[pd.DataFrame]:
77
+ entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
78
+ all_evals = []
79
+
80
+ for entry in entries:
81
+ if ".json" in entry:
82
+ file_path = os.path.join(save_path, entry)
83
+ with open(file_path) as fp:
84
+ data = json.load(fp)
85
+
86
+ data[EvalQueueColumnGroup.model.name] = make_clickable_model(data["model"])
87
+
88
+ all_evals.append(data)
89
+ elif ".md" not in entry:
90
+ # this is a folder
91
+ sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
92
+ for sub_entry in sub_entries:
93
+ file_path = os.path.join(save_path, entry, sub_entry)
94
+ with open(file_path) as fp:
95
+ data = json.load(fp)
96
+
97
+ data[EvalQueueColumnGroup.model.name] = make_clickable_model(data["model"])
98
+ all_evals.append(data)
99
+
100
+ pending_list = [e for e in all_evals if e["status"] == "PENDING"]
101
+ running_list = [e for e in all_evals if e["status"] == "RUNNING"]
102
+ finished_list = [e for e in all_evals if e["status"] == "FINISHED"]
103
+
104
+ df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
105
+ df_running = pd.DataFrame.from_records(running_list, columns=cols)
106
+ df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
107
+
108
+ return df_finished[cols], df_running[cols], df_pending[cols]
src/submission/submit.py ADDED
@@ -0,0 +1,61 @@
1
+ import json
2
+ import os
3
+ from datetime import datetime, timezone
4
+
5
+ from src.display.formatting import styled_error, styled_message
6
+ from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO
7
+
8
+
9
+ def add_new_eval(model: str, weight_type: str, gguf_filename=None):
10
+ user_name = ""
11
+ model_path = model
12
+ if "/" in model:
13
+ user_name = model.split("/")[0]
14
+ model_path = model.split("/")[1]
15
+
16
+ current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
17
+
18
+ # Is the model info correctly filled?
19
+ try:
20
+ model_info = API.model_info(repo_id=model, revision='main')
21
+ except Exception:
22
+ return styled_error("Could not get your model information.")
23
+
24
+ if weight_type=="safetensors":
25
+ if len(gguf_filename)!=0:
26
+ return styled_error("GGUF filename should be empty when using safetensors.")
27
+
28
+ # Seems good, creating the eval
29
+ print("Adding new eval")
30
+
31
+ eval_entry = {
32
+ "model": model,
33
+ "weight_type": weight_type,
34
+ "gguf_filename": gguf_filename,
35
+ "status": "PENDING",
36
+ "submitted_time": current_time,
37
+ }
38
+
39
+ print("Creating eval file")
40
+ OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
41
+ os.makedirs(OUT_DIR, exist_ok=True)
42
+ out_path = f"{OUT_DIR}/{model_path}_eval_request_{current_time}.json"
43
+
44
+ with open(out_path, "w") as f:
45
+ f.write(json.dumps(eval_entry))
46
+
47
+ print("Uploading eval file")
48
+ API.upload_file(
49
+ path_or_fileobj=out_path,
50
+ path_in_repo=out_path.split("eval-queue/")[1],
51
+ repo_id=QUEUE_REPO,
52
+ repo_type="dataset",
53
+ commit_message=f"Add {model} to eval queue",
54
+ )
55
+
56
+ # Remove the local file
57
+ os.remove(out_path)
58
+
59
+ return styled_message(
60
+ "Your request has been submitted to the evaluation queue!\nPlease wait for up to five minutes for the model to show in the PENDING list."
61
+ )
style.css DELETED
@@ -1,28 +0,0 @@
1
- body {
2
- padding: 2rem;
3
- font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
4
- }
5
-
6
- h1 {
7
- font-size: 16px;
8
- margin-top: 0;
9
- }
10
-
11
- p {
12
- color: rgb(107, 114, 128);
13
- font-size: 15px;
14
- margin-bottom: 10px;
15
- margin-top: 5px;
16
- }
17
-
18
- .card {
19
- max-width: 620px;
20
- margin: 0 auto;
21
- padding: 16px;
22
- border: 1px solid lightgray;
23
- border-radius: 16px;
24
- }
25
-
26
- .card p:last-child {
27
- margin-bottom: 0;
28
- }