MirakramAghalarov committed
Commit a76b907 • 1 Parent(s): 994653a
Production Commit
Browse files
- .gitattributes +1 -1
- .gitignore +23 -0
- .pre-commit-config.yaml +53 -0
- Makefile +13 -0
- README.md +32 -7
- app.py +403 -0
- index.html +0 -19
- pyproject.toml +13 -0
- requirements.txt +88 -0
- src/datasets.json +130 -0
- src/display/about.py +103 -0
- src/display/bhosai.jpeg +0 -0
- src/display/css_html_js.py +98 -0
- src/display/formatting.py +36 -0
- src/display/kapital.jpg +0 -0
- src/display/localdocs.jpeg +0 -0
- src/display/prodata.png +0 -0
- src/display/utils.py +94 -0
- src/envs.py +20 -0
- src/leaderboard/read_evals.py +225 -0
- src/populate.py +108 -0
- src/submission/submit.py +61 -0
- style.css +0 -28
.gitattributes
CHANGED
@@ -25,7 +25,6 @@
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
@@ -33,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,23 @@
auto_evals/
venv/
hfvenv/
__pycache__/
.env
.ipynb_checkpoints
*ipynb
.vscode/

gpt_4_evals/
human_evals/
eval-queue/
eval-results/
eval-results-group/
auto_evals/

src/assets/model_counts.html

test
env
a.py
testing.py
frontend
.pre-commit-config.yaml
ADDED
@@ -0,0 +1,53 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

default_language_version:
  python: python3

ci:
  autofix_prs: true
  autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
  autoupdate_schedule: quarterly

repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.3.0
    hooks:
      - id: check-yaml
      - id: check-case-conflict
      - id: detect-private-key
      - id: check-added-large-files
        args: ['--maxkb=1000']
      - id: requirements-txt-fixer
      - id: end-of-file-fixer
      - id: trailing-whitespace

  - repo: https://github.com/PyCQA/isort
    rev: 5.12.0
    hooks:
      - id: isort
        name: Format imports

  - repo: https://github.com/psf/black
    rev: 22.12.0
    hooks:
      - id: black
        name: Format code
        additional_dependencies: ['click==8.0.2']

  - repo: https://github.com/charliermarsh/ruff-pre-commit
    # Ruff version.
    rev: 'v0.0.267'
    hooks:
      - id: ruff
Makefile
ADDED
@@ -0,0 +1,13 @@
.PHONY: style quality


style:
	python -m black --line-length 119 .
	python -m isort .
	ruff check --fix .


quality:
	python -m black --check --line-length 119 .
	python -m isort --check-only .
	ruff check .
README.md
CHANGED
@@ -1,11 +1,36 @@
 ---
-title: Leaderboard
-emoji:
-colorFrom:
-colorTo:
-sdk:
-
-
+title: Azerbaijani LLM Leaderboard
+emoji: 🥇
+colorFrom: green
+colorTo: indigo
+sdk: gradio
+sdk_version: 4.36.1
+app_file: app.py
+pinned: true
+license: apache-2.0
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+Most of the variables to change for a default leaderboard are in src/envs.py (replace the paths with your leaderboard's) and src/display/about.py.
+
+Results files should have the following format:
+```
+{
+    "config": {
+        "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
+        "model_name": "path of the model on the hub: org/model",
+        "model_sha": "revision on the hub",
+    },
+    "results": {
+        "task_name": {
+            "metric_name": score,
+        },
+        "task_name2": {
+            "metric_name": score,
+        }
+    }
+}
+```
+
+Request files are created automatically by this tool.
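For reference, the schema above can be instantiated with a short Python sketch; the model path, task keys, and scores below are hypothetical placeholders, not values this repository prescribes:

```
import json

# A minimal, hypothetical results file following the documented schema.
results_payload = {
    "config": {
        "model_dtype": "torch.float16",
        "model_name": "my-org/my-model",  # placeholder org/model path
        "model_sha": "main",              # placeholder revision
    },
    "results": {
        "Banking_QA": {"metric_name": 0.50},  # illustrative task/metric/score
        "ARC": {"metric_name": 0.75},
    },
}

with open("results_example.json", "w") as f:
    json.dump(results_payload, f, indent=4)
```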
app.py
ADDED
@@ -0,0 +1,403 @@
import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
import os
os.environ['CURL_CA_BUNDLE'] = ''

from src.display.about import (
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    LLM_DATASET_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS,
    COLS,
    EVAL_COLS,
    EVAL_TYPES,
    TYPES,
    AutoEvalColumn,
    fields,
    BENCHMARK_COLS_GROUP,
    COLS_GROUP,
    EVAL_COLS_GROUP,
    EVAL_TYPES_GROUP,
    TYPES_GROUP,
    AutoEvalColumnGroup,
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, TOKEN, QUEUE_REPO, REPO_ID, RESULTS_REPO, EVAL_RESULTS_GROUP_PATH, RESULTS_GROUP_REPO
from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_evaluation_queue_df_group, get_leaderboard_group_df
from src.submission.submit import add_new_eval


def restart_space():
    API.restart_space(repo_id=REPO_ID, token=TOKEN)

try:
    print(EVAL_REQUESTS_PATH)
    snapshot_download(
        repo_id=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        force_download=True,
        token=TOKEN
    )
except Exception:
    restart_space()
try:
    print(EVAL_RESULTS_PATH)
    snapshot_download(
        repo_id=RESULTS_REPO,
        local_dir=EVAL_RESULTS_PATH,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        force_download=True,
        token=TOKEN
    )
    snapshot_download(
        repo_id=RESULTS_GROUP_REPO,
        local_dir=EVAL_RESULTS_GROUP_PATH,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        force_download=True,
        token=TOKEN,
    )
except Exception:
    restart_space()


raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
raw_data_grouped, original_df_grouped = get_leaderboard_group_df(EVAL_RESULTS_GROUP_PATH, COLS_GROUP, BENCHMARK_COLS_GROUP)

leaderboard_grouped_df = original_df_grouped.copy()
leaderboard_df = original_df.copy()

(
    finished_eval_queue_df,
    running_eval_queue_df,
    pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)


(
    finished_eval_queue_g_df,
    running_eval_queue_g_df,
    pending_eval_queue_g_df,
) = get_evaluation_queue_df_group(EVAL_REQUESTS_PATH, EVAL_COLS_GROUP)


# Searching and filtering
def update_table(
    hidden_df: pd.DataFrame,
    columns: list,
    query: str,
):
    filtered_df = filter_queries(query, hidden_df)
    df = select_columns(filtered_df, columns)
    return df


def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
    return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]


def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    always_here_cols = [
        AutoEvalColumn.model_submission_date.name,
        AutoEvalColumn.model.name,
    ]
    # We use COLS to maintain sorting
    filtered_df = df[
        always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
    ]
    return filtered_df


def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
    final_df = []
    if query != "":
        queries = [q.strip() for q in query.split(";")]
        for _q in queries:
            if _q != "":
                temp_filtered_df = search_table(filtered_df, _q)
                if len(temp_filtered_df) > 0:
                    final_df.append(temp_filtered_df)
        if len(final_df) > 0:
            filtered_df = pd.concat(final_df)
            filtered_df = filtered_df.drop_duplicates(
                subset=[AutoEvalColumn.model.name, AutoEvalColumn.model_submission_date.name]
            )

    return filtered_df


demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    with gr.Row():
        with gr.Column(scale=9):
            gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
        with gr.Column(scale=1, min_width=1):
            gr.Image('src/display/kapital.jpg', scale=1,
                     show_label=False,
                     interactive=False,
                     show_share_button=False,
                     show_download_button=False)

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            with gr.Row():
                with gr.Row():
                    search_bar = gr.Textbox(
                        placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
                        show_label=False,
                        elem_id="search-bar",
                    )
                with gr.Row():
                    shown_columns = gr.CheckboxGroup(
                        choices=[
                            c.name
                            for c in fields(AutoEvalColumnGroup)
                            if not c.hidden and not c.never_hidden and not c.dummy
                        ],
                        value=[
                            c.name
                            for c in fields(AutoEvalColumnGroup)
                            if c.displayed_by_default and not c.hidden and not c.never_hidden
                        ],
                        label="Select columns to show",
                        elem_id="column-select",
                        interactive=True,
                    )

            leaderboard_table = gr.components.Dataframe(
                value=leaderboard_grouped_df[
                    [c.name for c in fields(AutoEvalColumnGroup) if c.never_hidden]
                    + shown_columns.value
                    + [AutoEvalColumnGroup.dummy.name]
                ],
                headers=[c.name for c in fields(AutoEvalColumnGroup) if c.never_hidden] + shown_columns.value + [AutoEvalColumnGroup.dummy.name],
                datatype=TYPES_GROUP,
                elem_id="leaderboard-table",
                interactive=False,
                visible=True,
                column_widths=["15%", "30%"]
            )

            # Dummy leaderboard for handling the case when the user uses backspace key
            hidden_leaderboard_table_for_search = gr.components.Dataframe(
                value=original_df_grouped[COLS_GROUP],
                headers=COLS_GROUP,
                datatype=TYPES_GROUP,
                visible=False,
            )
            search_bar.submit(
                update_table,
                [
                    hidden_leaderboard_table_for_search,
                    shown_columns,
                    search_bar,
                ],
                leaderboard_table,
            )
            for selector in [shown_columns]:
                selector.change(
                    update_table,
                    [
                        hidden_leaderboard_table_for_search,
                        shown_columns,
                        search_bar,
                    ],
                    leaderboard_table,
                    queue=True,
                )

        with gr.TabItem("🏅 LLM Benchmark FineGrained", elem_id="llm-benchmark-tab-table-1", id=1):
            with gr.Row():
                with gr.Row():
                    search_bar = gr.Textbox(
                        placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
                        show_label=False,
                        elem_id="search-bar",
                    )
                with gr.Row():
                    shown_columns = gr.CheckboxGroup(
                        choices=[
                            c.name
                            for c in fields(AutoEvalColumn)
                            if not c.hidden and not c.never_hidden and not c.dummy
                        ],
                        value=[
                            c.name
                            for c in fields(AutoEvalColumn)
                            if c.displayed_by_default and not c.hidden and not c.never_hidden
                        ],
                        label="Select columns to show",
                        elem_id="column-select",
                        interactive=True,
                    )

            leaderboard_table = gr.components.Dataframe(
                value=leaderboard_df[
                    [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
                    + shown_columns.value
                    + [AutoEvalColumn.dummy.name]
                ],
                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value + [AutoEvalColumn.dummy.name],
                datatype=TYPES,
                elem_id="leaderboard-table",
                interactive=False,
                visible=True,
                column_widths=["15%", "30%"]
            )

            # Dummy leaderboard for handling the case when the user uses backspace key
            hidden_leaderboard_table_for_search = gr.components.Dataframe(
                value=original_df[COLS],
                headers=COLS,
                datatype=TYPES,
                visible=False,
            )
            search_bar.submit(
                update_table,
                [
                    hidden_leaderboard_table_for_search,
                    shown_columns,
                    search_bar,
                ],
                leaderboard_table,
            )
            for selector in [shown_columns]:
                selector.change(
                    update_table,
                    [
                        hidden_leaderboard_table_for_search,
                        shown_columns,
                        search_bar,
                    ],
                    leaderboard_table,
                    queue=True,
                )

        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=2):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

                with gr.Column():
                    with gr.Accordion(
                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            finished_eval_table = gr.components.Dataframe(
                                value=finished_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
                    with gr.Accordion(
                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            running_eval_table = gr.components.Dataframe(
                                value=running_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )

                    with gr.Accordion(
                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            pending_eval_table = gr.components.Dataframe(
                                value=pending_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )

            with gr.Row():
                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")

            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        model_name_textbox = gr.Textbox(label="Model name")

                with gr.Column():
                    with gr.Row():
                        weight_type = gr.Dropdown(
                            choices=['safetensors', 'gguf'],
                            label="Weights type",
                            multiselect=False,
                            value='safetensors',
                            interactive=True,
                        )

                with gr.Column():
                    with gr.Row():
                        gguf_filename_textbox = gr.Textbox(label="GGUF filename")

            submit_button = gr.Button("Submit Eval")
            submission_result = gr.Markdown()
            submit_button.click(
                add_new_eval,
                [
                    model_name_textbox,
                    weight_type,
                    gguf_filename_textbox
                ],
                submission_result,
            )

        with gr.TabItem("📝 Evaluation Datasets", elem_id="llm-benchmark-tab-table", id=4):
            gr.Markdown(LLM_DATASET_TEXT, elem_classes="markdown-text")
            gr.HTML("""<h1 align="center" id="space-title"> Contributor Companies and Teams </h1>""")
            with gr.Row():
                with gr.Column(scale=35):
                    pass
                with gr.Column(scale=10, min_width=1, elem_classes='center-column'):
                    gr.Image('src/display/localdocs.jpeg',
                             scale=1,
                             height=160,
                             show_label=False,
                             interactive=False,
                             show_share_button=False,
                             show_download_button=False)
                    gr.HTML("""<h1 align="center" id="company tile"> LocalDocs </h1>""")
                with gr.Column(scale=10, min_width=1, elem_classes='center-column'):
                    gr.Image('src/display/prodata.png',
                             scale=1,
                             height=160,
                             show_label=False,
                             interactive=False,
                             show_share_button=False,
                             show_download_button=False)
                    gr.HTML("""<h1 align="center" id="company tile"> PRODATA </h1>""")
                with gr.Column(scale=10, min_width=1, elem_classes='center-column'):
                    gr.Image('src/display/bhosai.jpeg',
                             scale=1,
                             height=160,
                             show_label=False,
                             interactive=False,
                             show_share_button=False,
                             show_download_button=False)
                    gr.HTML("""<h1 align="center" id="company tile"> BHOSAI </h1>""")
                with gr.Column(scale=35):
                    pass

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1000)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch()
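The search logic above splits the query on `;`, matches each fragment case-insensitively against the hidden model-name column, then concatenates and de-duplicates the hits. A self-contained sketch of the same semantics on a toy DataFrame (the column name mirrors the `model_name_for_query` dummy column; the rows are invented):

```
import pandas as pd

df = pd.DataFrame({"model_name_for_query": ["org-a/base", "org-a/chat", "org-b/base"]})

def toy_filter(query: str, df: pd.DataFrame) -> pd.DataFrame:
    # Split on ";", keep rows matching any non-empty fragment, drop duplicates.
    hits = []
    for q in (s.strip() for s in query.split(";")):
        if q:
            match = df[df["model_name_for_query"].str.contains(q, case=False)]
            if len(match) > 0:
                hits.append(match)
    return pd.concat(hits).drop_duplicates() if hits else df

print(toy_filter("org-a; base", df))  # union of both fragments' matches: all three rows
```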
index.html
DELETED
@@ -1,19 +0,0 @@
|
|
1 |
-
<!doctype html>
|
2 |
-
<html>
|
3 |
-
<head>
|
4 |
-
<meta charset="utf-8" />
|
5 |
-
<meta name="viewport" content="width=device-width" />
|
6 |
-
<title>My static Space</title>
|
7 |
-
<link rel="stylesheet" href="style.css" />
|
8 |
-
</head>
|
9 |
-
<body>
|
10 |
-
<div class="card">
|
11 |
-
<h1>Welcome to your static Space!</h1>
|
12 |
-
<p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
|
13 |
-
<p>
|
14 |
-
Also don't forget to check the
|
15 |
-
<a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
|
16 |
-
</p>
|
17 |
-
</div>
|
18 |
-
</body>
|
19 |
-
</html>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pyproject.toml
ADDED
@@ -0,0 +1,13 @@
[tool.ruff]
# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
select = ["E", "F"]
ignore = ["E501"] # line too long (black is taking care of this)
line-length = 119
fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]

[tool.isort]
profile = "black"
line_length = 119

[tool.black]
line-length = 119
requirements.txt
ADDED
@@ -0,0 +1,88 @@
aiofiles==23.2.1
aiohappyeyeballs==2.4.2
aiohttp==3.10.8
aiosignal==1.3.1
altair==5.4.1
annotated-types==0.7.0
anyio==4.6.0
APScheduler==3.10.1
async-timeout==4.0.3
attrs==24.2.0
black==23.11.0
certifi==2024.8.30
charset-normalizer==3.3.2
click==8.1.3
contourpy==1.3.0
cycler==0.12.1
datasets==2.14.5
dill==0.3.7
exceptiongroup==1.2.2
fastapi==0.115.3
ffmpy==0.4.0
filelock==3.16.1
fonttools==4.54.1
frozenlist==1.4.1
fsspec==2023.6.0
gradio==5.3.0
gradio_client==1.4.2
h11==0.14.0
httpcore==1.0.5
httpx==0.27.2
huggingface-hub==0.25.1
idna==3.10
importlib_resources==6.4.5
Jinja2==3.1.4
jsonschema==4.23.0
jsonschema-specifications==2023.12.1
kiwisolver==1.4.7
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib==3.7.1
mdurl==0.1.2
multidict==6.1.0
multiprocess==0.70.15
mypy-extensions==1.0.0
narwhals==1.8.4
numpy==1.26.4
orjson==3.10.7
packaging==24.1
pandas==2.0.0
pathspec==0.12.1
pillow==10.4.0
platformdirs==4.3.6
pyarrow==17.0.0
pydantic==2.9.2
pydantic_core==2.23.4
pydub==0.25.1
Pygments==2.18.0
pyparsing==3.1.4
python-dateutil==2.8.2
python-multipart==0.0.12
pytz==2024.2
PyYAML==6.0.2
referencing==0.35.1
regex==2024.9.11
requests==2.32.3
rich==13.8.1
rpds-py==0.20.0
ruff==0.6.8
safetensors==0.4.5
semantic-version==2.10.0
shellingham==1.5.4
six==1.16.0
sniffio==1.3.1
starlette==0.41.0
tokenizers==0.15.2
tomli==2.0.1
tomlkit==0.12.0
tqdm==4.65.0
transformers==4.35.2
typer==0.12.5
typing_extensions==4.12.2
tzdata==2024.2
tzlocal==5.2
urllib3==2.2.3
uvicorn==0.31.0
websockets==11.0.3
xxhash==3.5.0
yarl==1.13.1
src/datasets.json
ADDED
@@ -0,0 +1,130 @@
[
    {
        "task_type": "mmlu",
        "dstype": "mc",
        "group": "Banking",
        "subtext": "You are an AI that selects the most accurate answer in Azerbaijani based on a given question. You will be provided with a question in Azerbaijani and multiple options in Azerbaijani. Choose the single letter (A, B, C, D) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
        "data": "LLM-Beetle/Banking_Exam_MCQ",
        "name": "Banking_Exam_MCQ"
    },
    {
        "task_type": "mmlu",
        "dstype": "kmc_azerbaycan_dili",
        "group": "MMLU",
        "subtext": "You are an AI designed to answer questions in Azerbaijani based on grammatical concepts and linguistics. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
        "data": "LLM-Beetle/Azerbaijani_Lang_MC",
        "name": "Azerbaijani_Lang_MC"
    },
    {
        "task_type": "mmlu",
        "dstype": "kmc_edebiyyat",
        "group": "MMLU",
        "subtext": "You are an AI designed to answer questions in Azerbaijani based on literary and historical facts. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
        "data": "LLM-Beetle/Literature_MC",
        "name": "Azerbaijani_Lit_MC"
    },
    {
        "task_type": "mmlu",
        "dstype": "kmc_biologiya",
        "group": "MMLU",
        "subtext": "You are an AI designed to answer questions in Azerbaijani based on biology. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
        "data": "LLM-Beetle/Biology_MC",
        "name": "Biology_MC"
    },
    {
        "task_type": "mmlu",
        "dstype": "kmc_cografiya",
        "group": "MMLU",
        "subtext": "You are an AI designed to answer questions in Azerbaijani based on geographical and environmental knowledge. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
        "data": "LLM-Beetle/Geography_MC",
        "name": "Geography_MC"
    },
    {
        "task_type": "mmlu",
        "dstype": "kmc_mentiq",
        "group": "MMLU",
        "subtext": "You are an AI designed to answer questions in Azerbaijani based on logical reasoning and problem-solving. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
        "data": "LLM-Beetle/Logic_MC",
        "name": "Logic_MC"
    },
    {
        "task_type": "mmlu",
        "dstype": "kmc_tarix",
        "group": "MMLU",
        "subtext": "You are an AI designed to answer questions in Azerbaijani based on historical and cultural facts. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
        "data": "LLM-Beetle/History_MC",
        "name": "History_MC"
    },
    {
        "task_type": "mmlu",
        "dstype": "kmc_informatika",
        "group": "MMLU",
        "subtext": "You are an AI designed to answer questions in Azerbaijani based on technology and computer science. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
        "data": "LLM-Beetle/Informatics_MC",
        "name": "Informatics_MC"
    },
    {
        "task_type": "mmlu",
        "dstype": "kmc_fizika",
        "group": "MMLU",
        "subtext": "You are an AI designed to answer questions in Azerbaijani based on physics concepts. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
        "data": "LLM-Beetle/Physics_MC",
        "name": "Physics_MC"
    },
    {
        "task_type": "mmlu",
        "dstype": "kmc_kimya",
        "group": "MMLU",
        "subtext": "You are an AI designed to answer questions in Azerbaijani based on chemistry and scientific concepts. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
        "data": "LLM-Beetle/Chemistry_MC",
        "name": "Chemistry_MC"
    },
    {
        "task_type": "mmlu",
        "dstype": "kmc_azerbaycan_tarixi",
        "group": "MMLU",
        "subtext": "You are an AI designed to answer questions in Azerbaijani based on historical facts. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
        "data": "LLM-Beetle/Azerbaijani_Hist_MC",
        "name": "Azerbaijani_Hist_MC"
    },
    {
        "task_type": "mmlu",
        "dstype": "tc",
        "group": "Banking",
        "subtext": "You are an AI designed to answer questions in Azerbaijani. Your task is to select the correct option from the given question and answer choices. You are given a statement along with multiple options that represent different topics. Choose the option that best categorizes the statement based on its topic. Choose the single letter (A, B, C, D, E, F, G, H, I, J) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
        "data": "LLM-Beetle/Banking_Call_Classification_MC",
        "name": "Banking_Call_Classification_MC"
    },
    {
        "task_type": "arc",
        "dstype": "arc",
        "group": "ARC",
        "subtext": "You are an AI designed to answer questions in Azerbaijani based on reasoning and knowledge. Your task is to select the correct option from the given question and answer choices. You are given a question along with multiple options. Choose the correct option. Choose the single letter (A, B, C, D) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
        "data": "LLM-Beetle/ARC",
        "name": "ARC"
    },
    {
        "task_type": "gsm8k",
        "dstype": "mmc",
        "group": "GSM8K",
        "subtext": "You are an AI designed to solve mathematical word problems in Azerbaijani. Your task is to analyze the given question and select the correct option from the provided choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
        "data": "LLM-Beetle/GSM8K",
        "name": "GSM8K"
    },
    {
        "task_type": "qa",
        "dstype": "qa",
        "group": "Banking",
        "subtext": "",
        "data": "LLM-Beetle/Banking_QA",
        "name": "Banking_QA"
    },
    {
        "task_type": "rag",
        "dstype": "cqa",
        "group": "CQA",
        "subtext": "",
        "data": "LLM-Beetle/Wiki_CQA",
        "name": "Wiki_CQA"
    }
]
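Each entry above pairs a Hub dataset with the system prompt ("subtext") used for that task. The evaluation harness itself is not part of this commit, so the following is only a hypothetical sketch of how such a config could drive prompt construction; `build_prompt` and the question/option values are assumptions:

```
import json

with open("src/datasets.json") as f:
    task_configs = json.load(f)

def build_prompt(config, question, options):
    # Prefix the task's system prompt, then enumerate lettered options.
    letters = "ABCDEFGHIJ"
    lines = [config["subtext"], question]
    lines += [f"{letters[i]}) {opt}" for i, opt in enumerate(options)]
    return "\n".join(lines)

banking = next(c for c in task_configs if c["name"] == "Banking_Exam_MCQ")
print(build_prompt(banking, "Sample question?", ["option 1", "option 2", "option 3", "option 4"]))
```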
src/display/about.py
ADDED
@@ -0,0 +1,103 @@
from dataclasses import dataclass
from enum import Enum
import json


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


# Init: to update with your specific keys
def create_task_list():
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    with open("src/datasets.json") as f:
        data = json.load(f)

    groups = []
    names = []
    for d in data:
        groups.append(d['group'])
        names.append(d['name'])
    groups = list(set(groups))
    tasks = []
    grouped_tasks = []
    for name in names:
        tasks.append(Task(name, "metric_name", name))
    for group in groups:
        grouped_tasks.append(Task(group, "metric_name", group))

    return tasks, grouped_tasks


# Your leaderboard name
TITLE = """<h1 align="center" id="space-title"> Azerbaijani LLM Leaderboard</h1>"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
## Azerbaijani Open LLM Leaderboard sponsored by Kapital Bank

The Azerbaijani Open LLM Leaderboard is sponsored by Kapital Bank to support and develop Azerbaijani language NLP. This leaderboard offers a clear and fair ranking of open-source Azerbaijani LLMs, helping researchers, developers, and the AI community work together to improve the quality and use of Azerbaijani language models.
Through this platform, we hope to bring useful AI technology to the Azerbaijani language and encourage models that are both locally relevant and internationally competitive.

## Partners

This leaderboard is supported by Kapital Bank, LocalDocs, PRODATA LLC, and the R&D Center of Baku Higher Oil School.

"""

LLM_BENCHMARKS_TEXT = f"""
## Azerbaijani Open LLM Leaderboard sponsored by Kapital Bank

The Azerbaijani Open LLM Leaderboard is sponsored by Kapital Bank to support and develop Azerbaijani language NLP. This leaderboard offers a clear and fair ranking of open-source Azerbaijani LLMs, helping researchers, developers, and the AI community work together to improve the quality and use of Azerbaijani language models.
Through this platform, we hope to bring useful AI technology to the Azerbaijani language and encourage models that are both locally relevant and internationally competitive. This leaderboard is supported by Kapital Bank, LocalDocs, PRODATA LLC, and the R&D Center of Baku Higher Oil School.

"""

LLM_DATASET_TEXT = f"""
## Banking Call Classification MC:

192 entries; multiple-choice classification for bank-client requests.

## Banking Exam MCQ:

200–300 multiple-choice questions based on university banking exam materials.

## Banking QA:

97 entries; question-answer pairs on Azerbaijani banking topics.

## Wiki CQA:

97 entries from Azerbaijani Wikipedia, with context, questions, and answers.

## GSM8K:

44 grade-school math problems to test multi-step reasoning.

## ARC:

Elementary science questions in Azerbaijani, testing knowledge and reasoning.

## Subject-Specific MCQs:

Questions across topics like informatics, history, physics, and more, each with 100 multiple-choice questions for specific subject knowledge.
"""


EVALUATION_QUEUE_TEXT = """
## Some good practices before submitting a model

### 1) Make sure your model exists on the Hub.
### 2) Make sure your model is public.


## In case of model failure
If your model is displayed in the `FAILED` category, its execution stopped.
Make sure you have followed the above steps first.
Please contact us if you are facing any trouble!
"""
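The two submission checks above can be run locally before submitting; below is a hedged sketch using `huggingface_hub` (the model id is a placeholder, not a real submission):

```
from huggingface_hub import HfApi

api = HfApi()
model_id = "my-org/my-model"  # placeholder model id
try:
    info = api.model_info(model_id)  # raises if the repo cannot be resolved
    print(f"{model_id} found on the Hub; private={info.private}")
except Exception as err:
    print(f"Could not resolve {model_id}: {err}")
```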
src/display/bhosai.jpeg
ADDED
src/display/css_html_js.py
ADDED
@@ -0,0 +1,98 @@
custom_css = """

.markdown-text {
    font-size: 16px !important;
}

#models-to-add-text {
    font-size: 18px !important;
}

#leaderboard-table {
    margin-top: 15px
}

#leaderboard-table-lite {
    margin-top: 15px
}

#search-bar-table-box > div:first-child {
    background: none;
    border: none;
}

#search-bar {
    padding: 0px;
}

/* Hides the final AutoEvalColumn */
#llm-benchmark-tab-table table td:last-child,
#llm-benchmark-tab-table table th:last-child {
    display: none;
}

/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
table td:first-child,
table th:first-child {
    max-width: 400px;
    overflow: auto;
    white-space: nowrap;
}

.tab-buttons button {
    font-size: 20px;
}

#scale-logo {
    border-style: none !important;
    box-shadow: none;
    display: block;
    margin-left: auto;
    margin-right: auto;
    max-width: 600px;
}

#scale-logo .download {
    display: none;
}
#filter_type{
    border: 0;
    padding-left: 0;
    padding-top: 0;
}
#filter_type label {
    display: flex;
}
#filter_type label > span{
    margin-top: var(--spacing-lg);
    margin-right: 0.5em;
}
#filter_type label > .wrap{
    width: 103px;
}
#filter_type label > .wrap .wrap-inner{
    padding: 2px;
}
#filter_type label > .wrap .wrap-inner input{
    width: 1px
}
#filter-columns-type{
    border:0;
    padding:0.5;
}
#filter-columns-size{
    border:0;
    padding:0.5;
}
#box-filter > .form{
    border: 0
}
"""

get_window_url_params = """
function(url_params) {
    const params = new URLSearchParams(window.location.search);
    url_params = Object.fromEntries(params);
    return url_params;
}
"""
src/display/formatting.py
ADDED
@@ -0,0 +1,36 @@
import os
from datetime import datetime, timezone

from huggingface_hub import HfApi
from huggingface_hub.hf_api import ModelInfo


API = HfApi()

def model_hyperlink(link, model_name):
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


def make_clickable_model(model_name):
    link = f"https://huggingface.co/{model_name}"
    return model_hyperlink(link, model_name)


def styled_error(error):
    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"


def styled_warning(warn):
    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"


def styled_message(message):
    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"


def has_no_nan_values(df, columns):
    return df[columns].notna().all(axis=1)


def has_nan_values(df, columns):
    return df[columns].isna().any(axis=1)
src/display/kapital.jpg
ADDED
src/display/localdocs.jpeg
ADDED
src/display/prodata.png
ADDED
src/display/utils.py
ADDED
@@ -0,0 +1,94 @@
from dataclasses import dataclass, make_dataclass

from src.display.about import create_task_list

def fields(raw_class):
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]


# These classes are for user facing column names,
# to avoid having to change them all around the code
# when a modif is needed
@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False

Tasks, Groups = create_task_list()

## Leaderboard columns
auto_eval_column_dict = []
# Init
auto_eval_column_dict.append(["model_submission_date", ColumnContent, ColumnContent("Submission Date", "str", True, never_hidden=True)])
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
# Scores
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])


for task in Tasks:
    auto_eval_column_dict.append([task.benchmark, ColumnContent, ColumnContent(task.col_name, "number", True)])
# Dummy column for the search bar (hidden by the custom CSS)
auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])

# We use make dataclass to dynamically fill the scores from Tasks
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

## For the queue columns in the submission tab
@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    model = ColumnContent("model", "markdown", True)
    submitted_time = ColumnContent("submitted_time", "str", True)
    status = ColumnContent("status", "str", True)

# Column selection
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]

EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

BENCHMARK_COLS = [t.col_name for t in Tasks]


# for grouping

## Leaderboard columns
auto_eval_group_dict = []
# Init
auto_eval_group_dict.append(["model_submission_date", ColumnContent, ColumnContent("Submission Date", "str", True, never_hidden=True)])
auto_eval_group_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
# Scores
auto_eval_group_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])


for task in Groups:
    auto_eval_group_dict.append([task.benchmark, ColumnContent, ColumnContent(task.col_name, "number", True)])
# Dummy column for the search bar (hidden by the custom CSS)
auto_eval_group_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])

# We use make dataclass to dynamically fill the scores from Tasks
AutoEvalColumnGroup = make_dataclass("AutoEvalColumnGroup", auto_eval_group_dict, frozen=True)

## For the queue columns in the submission tab
@dataclass(frozen=True)
class EvalQueueColumnGroup:  # Queue column
    model = ColumnContent("model", "markdown", True)
    submitted_time = ColumnContent("submitted_time", "str", True)
    status = ColumnContent("status", "str", True)

# Column selection
COLS_GROUP = [c.name for c in fields(AutoEvalColumnGroup) if not c.hidden]
TYPES_GROUP = [c.type for c in fields(AutoEvalColumnGroup) if not c.hidden]

EVAL_COLS_GROUP = [c.name for c in fields(EvalQueueColumnGroup)]
EVAL_TYPES_GROUP = [c.type for c in fields(EvalQueueColumnGroup)]

BENCHMARK_COLS_GROUP = [t.col_name for t in Groups]
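The module above builds its column classes at import time with `make_dataclass`, and `fields()` then reads the `ColumnContent` defaults back off the class. A self-contained sketch of that pattern with made-up columns (it does not import the leaderboard's real definitions):

```
from dataclasses import dataclass, make_dataclass

@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool

def fields(raw_class):
    # Non-dunder class attributes are exactly the ColumnContent defaults.
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

cols = [
    ["model", ColumnContent, ColumnContent("Model", "markdown", True)],
    ["arc", ColumnContent, ColumnContent("ARC", "number", True)],  # made-up task column
]
DemoColumn = make_dataclass("DemoColumn", cols, frozen=True)

print([c.name for c in fields(DemoColumn)])  # -> ['Model', 'ARC']
```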
src/envs.py
ADDED
@@ -0,0 +1,20 @@
import os
from huggingface_hub import HfApi

# clone / pull the lmeh eval data
TOKEN = os.environ.get("HF_TOKEN", None)

OWNER = "LLM-Beetle"
REPO_ID = f"{OWNER}/frontend"
QUEUE_REPO = f"{OWNER}/requests"
RESULTS_REPO = f"{OWNER}/results"
RESULTS_GROUP_REPO = f"{OWNER}/grouped"

CACHE_PATH = os.getenv("HF_HOME", ".")

# Local caches
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
EVAL_RESULTS_GROUP_PATH = os.path.join(CACHE_PATH, "eval-results-group")

API = HfApi(token=TOKEN)
src/leaderboard/read_evals.py
ADDED
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import glob
|
2 |
+
import json
|
3 |
+
import math
|
4 |
+
import os
|
5 |
+
from dataclasses import dataclass
|
6 |
+
|
7 |
+
import dateutil
|
8 |
+
import numpy as np
|
9 |
+
|
10 |
+
from src.display.formatting import make_clickable_model
|
11 |
+
from src.display.utils import AutoEvalColumn, Tasks, Groups
|
12 |
+
|
13 |
+
@dataclass
|
14 |
+
class EvalResult:
|
15 |
+
eval_name: str # org_model_date (uid)
|
16 |
+
full_model: str # org/model (path on hub)
|
17 |
+
org: str
|
18 |
+
model: str
|
19 |
+
results: dict
|
20 |
+
date: str = "" # submission date of request file
|
21 |
+
|
22 |
+
@classmethod
|
23 |
+
def init_from_json_file(self, json_filepath):
|
24 |
+
"""Inits the result from the specific model result file"""
|
25 |
+
with open(json_filepath) as fp:
|
26 |
+
data = json.load(fp)
|
27 |
+
|
28 |
+
config = data.get("config")
|
29 |
+
|
30 |
+
# Get model and org
|
31 |
+
org_and_model = config.get("model_name", None)
|
32 |
+
org_and_model = org_and_model.split("/", 1)
|
33 |
+
|
34 |
+
org = org_and_model[0]
|
35 |
+
model = org_and_model[1]
|
36 |
+
date = config.get("submitted_time", None)
|
37 |
+
result_key = f"{org}_{model}_{date}"
|
38 |
+
full_model = "/".join(org_and_model)
|
39 |
+
|
40 |
+
# Extract results available in this file (some results are split in several files)
|
41 |
+
results = {}
|
42 |
+
for task in Tasks:
|
43 |
+
|
44 |
+
# We average all scores of a given metric (not all metrics are present in all files)
|
45 |
+
accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
|
46 |
+
if accs.size == 0 or any([acc is None for acc in accs]):
|
47 |
+
continue
|
48 |
+
|
49 |
+
mean_acc = np.mean(accs) * 100.0
|
50 |
+
results[task.benchmark] = mean_acc
|
51 |
+
|
52 |
+
return self(
|
53 |
+
eval_name=result_key,
|
54 |
+
full_model=full_model,
|
55 |
+
org=org,
|
56 |
+
model=model,
|
57 |
+
results=results,
|
58 |
+
date=date
|
59 |
+
)
|
60 |
+
|
61 |
+
|
62 |
+
def to_dict(self):
|
63 |
+
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
64 |
+
average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
|
65 |
+
data_dict = {
|
66 |
+
"eval_name": self.eval_name, # not a column, just a save name,
|
67 |
+
AutoEvalColumn.model_submission_date.name: self.date,
|
68 |
+
AutoEvalColumn.model.name: make_clickable_model(self.full_model),
|
69 |
+
AutoEvalColumn.dummy.name: self.full_model,
|
70 |
+
AutoEvalColumn.average.name: average,
|
71 |
+
}
|
72 |
+
|
73 |
+
for task in Tasks:
|
74 |
+
data_dict[task.col_name] = self.results[task.benchmark]
|
75 |
+
|
76 |
+
return data_dict
|
77 |
+
|
78 |
+
|
79 |
+
|
80 |
+
|
81 |
+
@dataclass
|
82 |
+
class EvalResultGroup:
|
83 |
+
eval_name: str # org_model_date (uid)
|
+    full_model: str  # org/model (path on hub)
+    org: str
+    model: str
+    results: dict
+    date: str = ""  # submission date of request file
+
+    @classmethod
+    def init_from_json_file(cls, json_filepath):
+        """Inits the result from the specific model result file"""
+        with open(json_filepath) as fp:
+            data = json.load(fp)
+
+        config = data.get("config")
+
+        # Get model and org
+        org_and_model = config.get("model_name", None)
+        org_and_model = org_and_model.split("/", 1)
+
+        org = org_and_model[0]
+        model = org_and_model[1]
+        date = config.get("submitted_time", None)
+        result_key = f"{org}_{model}_{date}"
+        full_model = "/".join(org_and_model)
+
+        # Extract results available in this file (some results are split in several files)
+        results = {}
+        for task in Groups:
+            # We average all scores of a given metric (not all metrics are present in all files)
+            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            if accs.size == 0 or any(acc is None for acc in accs):
+                continue
+
+            mean_acc = np.mean(accs) * 100.0
+            results[task.benchmark] = mean_acc
+
+        return cls(
+            eval_name=result_key,
+            full_model=full_model,
+            org=org,
+            model=model,
+            results=results,
+            date=date,
+        )
+
+    def to_dict(self):
+        """Converts the Eval Result to a dict compatible with our dataframe display"""
+        average = sum(v for v in self.results.values() if v is not None) / len(Groups)
+        data_dict = {
+            "eval_name": self.eval_name,  # not a column, just a save name
+            AutoEvalColumn.model_submission_date.name: self.date,
+            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
+            AutoEvalColumn.dummy.name: self.full_model,
+            AutoEvalColumn.average.name: average,
+        }
+
+        for task in Groups:
+            data_dict[task.col_name] = self.results[task.benchmark]
+
+        return data_dict
+
+
+def get_raw_eval_results(results_path: str) -> list[EvalResult]:
+    """From the path of the results folder root, extract all needed info for results"""
+    model_result_filepaths = []
+
+    for root, _, files in os.walk(results_path):
+        # We should only have json files in model results
+        if len(files) == 0 or any(not f.endswith(".json") for f in files):
+            continue
+
+        # Sort the files by date
+        files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+
+        for file in files:
+            model_result_filepaths.append(os.path.join(root, file))
+
+    eval_results = {}
+    for model_result_filepath in model_result_filepaths:
+        # Creation of result
+        eval_result = EvalResult.init_from_json_file(model_result_filepath)
+
+        # Store results of same eval together
+        eval_name = eval_result.eval_name
+        eval_results[eval_name] = eval_result
+
+    results = []
+    for v in eval_results.values():
+        try:
+            v.to_dict()  # we test if the dict version is complete
+            results.append(v)
+        except KeyError:  # not all eval values present
+            continue
+
+    return results
+
+
+def get_group_eval_results(results_path: str) -> list[EvalResultGroup]:
+    """From the path of the results folder root, extract all needed info for results"""
+    model_result_filepaths = []
+
+    for root, _, files in os.walk(results_path):
+        # We should only have json files in model results
+        if len(files) == 0 or any(not f.endswith(".json") for f in files):
+            continue
+
+        # Sort the files by date
+        files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+
+        for file in files:
+            model_result_filepaths.append(os.path.join(root, file))
+
+    eval_results = {}
+    for model_result_filepath in model_result_filepaths:
+        # Creation of result
+        eval_result = EvalResultGroup.init_from_json_file(model_result_filepath)
+
+        # Store results of same eval together
+        eval_name = eval_result.eval_name
+        eval_results[eval_name] = eval_result
+
+    results = []
+    for v in eval_results.values():
+        try:
+            v.to_dict()  # we test if the dict version is complete
+            results.append(v)
+        except KeyError:  # not all eval values present
+            continue
+
+    return results
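A minimal sketch of how these readers could be driven locally (the "eval-results" path is an assumption for illustration, not part of this commit):

# Hypothetical smoke test for get_raw_eval_results; assumes a local folder
# of results_*.json files laid out the way the os.walk loop above expects.
from src.leaderboard.read_evals import get_raw_eval_results

for res in get_raw_eval_results("eval-results"):  # assumed local path
    print(res.full_model, res.date, res.results)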
src/populate.py
ADDED
@@ -0,0 +1,108 @@
+import json
+import os
+
+import pandas as pd
+
+from src.display.formatting import has_no_nan_values, make_clickable_model
+from src.display.utils import AutoEvalColumn, EvalQueueColumn, AutoEvalColumnGroup, EvalQueueColumnGroup
+from src.leaderboard.read_evals import get_raw_eval_results, get_group_eval_results
+
+
+def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> tuple[list, pd.DataFrame]:
+    raw_data = get_raw_eval_results(results_path)
+    all_data_json = [v.to_dict() for v in raw_data]
+
+    df = pd.DataFrame.from_records(all_data_json)
+    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    df = df[cols].round(decimals=2)
+
+    # filter out if any of the benchmarks have not been produced
+    df = df[has_no_nan_values(df, benchmark_cols)]
+    return raw_data, df
+
+
+def get_leaderboard_group_df(results_path: str, cols: list, benchmark_cols: list) -> tuple[list, pd.DataFrame]:
+    raw_data = get_group_eval_results(results_path)
+    all_data_json = [v.to_dict() for v in raw_data]
+
+    df = pd.DataFrame.from_records(all_data_json)
+    df = df.sort_values(by=[AutoEvalColumnGroup.average.name], ascending=False)
+    df = df[cols].round(decimals=2)
+
+    # filter out if any of the benchmarks have not been produced
+    df = df[has_no_nan_values(df, benchmark_cols)]
+    return raw_data, df
+
+
+def get_evaluation_queue_df(save_path: str, cols: list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
+    all_evals = []
+
+    for entry in entries:
+        if ".json" in entry:
+            file_path = os.path.join(save_path, entry)
+            with open(file_path) as fp:
+                data = json.load(fp)
+
+            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+
+            all_evals.append(data)
+        elif ".md" not in entry:
+            # this is a folder
+            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
+            for sub_entry in sub_entries:
+                file_path = os.path.join(save_path, entry, sub_entry)
+                with open(file_path) as fp:
+                    data = json.load(fp)
+
+                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+                all_evals.append(data)
+
+    pending_list = [e for e in all_evals if e["status"] == "PENDING"]
+    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
+    finished_list = [e for e in all_evals if e["status"] == "FINISHED"]
+
+    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
+    df_running = pd.DataFrame.from_records(running_list, columns=cols)
+    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
+
+    return df_finished[cols], df_running[cols], df_pending[cols]
+
+
+def get_evaluation_queue_df_group(save_path: str, cols: list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
+    all_evals = []
+
+    for entry in entries:
+        if ".json" in entry:
+            file_path = os.path.join(save_path, entry)
+            with open(file_path) as fp:
+                data = json.load(fp)
+
+            data[EvalQueueColumnGroup.model.name] = make_clickable_model(data["model"])
+
+            all_evals.append(data)
+        elif ".md" not in entry:
+            # this is a folder
+            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
+            for sub_entry in sub_entries:
+                file_path = os.path.join(save_path, entry, sub_entry)
+                with open(file_path) as fp:
+                    data = json.load(fp)
+
+                data[EvalQueueColumnGroup.model.name] = make_clickable_model(data["model"])
+                all_evals.append(data)
+
+    pending_list = [e for e in all_evals if e["status"] == "PENDING"]
+    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
+    finished_list = [e for e in all_evals if e["status"] == "FINISHED"]
+
+    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
+    df_running = pd.DataFrame.from_records(running_list, columns=cols)
+    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
+
+    return df_finished[cols], df_running[cols], df_pending[cols]
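A hedged sketch of how these helpers might be wired together (the path and column names below are placeholders, not values from this commit; the real ones come from src/envs.py and src/display/utils.py):

# Hypothetical call; COLS and BENCHMARK_COLS stand in for the generated column lists.
from src.populate import get_leaderboard_df

COLS = ["Model", "Average"]          # placeholder display columns
BENCHMARK_COLS = ["Some Benchmark"]  # placeholder benchmark columns
raw_data, df = get_leaderboard_df("eval-results", COLS, BENCHMARK_COLS)
print(df.head())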
src/submission/submit.py
ADDED
@@ -0,0 +1,61 @@
+import json
+import os
+from datetime import datetime, timezone
+
+from src.display.formatting import styled_error, styled_message
+from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO
+
+
+def add_new_eval(model: str, weight_type: str, gguf_filename=None):
+    user_name = ""
+    model_path = model
+    if "/" in model:
+        user_name = model.split("/")[0]
+        model_path = model.split("/")[1]
+
+    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+    # Is the model info correctly filled?
+    try:
+        model_info = API.model_info(repo_id=model, revision="main")
+    except Exception:
+        return styled_error("Could not get your model information.")
+
+    # gguf_filename may be None, so guard with a truthiness check instead of len()
+    if weight_type == "safetensors" and gguf_filename:
+        return styled_error("GGUF filename should be empty when using safetensors.")
+
+    # Seems good, creating the eval
+    print("Adding new eval")
+
+    eval_entry = {
+        "model": model,
+        "weight_type": weight_type,
+        "gguf_filename": gguf_filename,
+        "status": "PENDING",
+        "submitted_time": current_time,
+    }
+
+    print("Creating eval file")
+    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
+    os.makedirs(OUT_DIR, exist_ok=True)
+    out_path = f"{OUT_DIR}/{model_path}_eval_request_{current_time}.json"
+
+    with open(out_path, "w") as f:
+        f.write(json.dumps(eval_entry))
+
+    print("Uploading eval file")
+    API.upload_file(
+        path_or_fileobj=out_path,
+        path_in_repo=out_path.split("eval-queue/")[1],
+        repo_id=QUEUE_REPO,
+        repo_type="dataset",
+        commit_message=f"Add {model} to eval queue",
+    )
+
+    # Remove the local file
+    os.remove(out_path)
+
+    return styled_message(
+        "Your request has been submitted to the evaluation queue!\nPlease wait for up to five minutes for the model to show in the PENDING list."
+    )
style.css
DELETED
@@ -1,28 +0,0 @@
-body {
-  padding: 2rem;
-  font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
-}
-
-h1 {
-  font-size: 16px;
-  margin-top: 0;
-}
-
-p {
-  color: rgb(107, 114, 128);
-  font-size: 15px;
-  margin-bottom: 10px;
-  margin-top: 5px;
-}
-
-.card {
-  max-width: 620px;
-  margin: 0 auto;
-  padding: 16px;
-  border: 1px solid lightgray;
-  border-radius: 16px;
-}
-
-.card p:last-child {
-  margin-bottom: 0;
-}