Commit d5b71b9
Parent(s): dfc075f

Added information for second table

Files changed:
- .gitignore +2 -1
- app.py +75 -34
- src/datasets.json +130 -0
- src/display/about.py +16 -20
- src/display/utils.py +47 -3
- src/envs.py +2 -0
.gitignore  CHANGED

@@ -17,4 +17,5 @@ src/assets/model_counts.html
 
 test
 env
-a.py
+a.py
+testing.py
app.py  CHANGED

@@ -20,9 +20,15 @@ from src.display.utils import (
     EVAL_TYPES,
     TYPES,
     AutoEvalColumn,
-    fields
+    fields,
+    BENCHMARK_COLS_GROUP,
+    COLS_GROUP,
+    EVAL_COLS_GROUP,
+    EVAL_TYPES_GROUP,
+    TYPES_GROUP,
+    AutoEvalColumnGroup,
 )
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, TOKEN, QUEUE_REPO, REPO_ID, RESULTS_REPO
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, TOKEN, QUEUE_REPO, REPO_ID, RESULTS_REPO, EVAL_RESULTS_GROUP_PATH, RESULTS_GROUP_REPO
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 
@@ -59,6 +65,9 @@ except Exception:
 
 
 raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
+raw_data_grouped, original_df_grouped = get_leaderboard_df(EVAL_RESULTS_GROUP_PATH, COLS_GROUP, BENCHMARK_COLS_GROUP)
+
+leaderboard_grouped_df = original_df_grouped.copy()
 leaderboard_df = original_df.copy()
 
 (
@@ -68,6 +77,12 @@ leaderboard_df = original_df.copy()
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
+(
+    finished_eval_queue_g_df,
+    running_eval_queue_g_df,
+    pending_eval_queue_g_df,
+) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS_GROUP)
+
 # Searching and filtering
 def update_table(
     hidden_df: pd.DataFrame,
@@ -193,38 +208,54 @@ with demo:
                     leaderboard_table,
                     queue=True,
                 )
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        with gr.TabItem("🏅 LLM Benchmark FineGrained", elem_id="llm-benchmark-tab-table", id=0):
+            with gr.Row():
+                with gr.Row():
+                    search_bar = gr.Textbox(
+                        placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                        show_label=False,
+                        elem_id="search-bar",
+                    )
+                with gr.Row():
+                    shown_columns = gr.CheckboxGroup(
+                        choices=[
+                            c.name
+                            for c in fields(AutoEvalColumn)
+                            if not c.hidden and not c.never_hidden and not c.dummy
+                        ],
+                        value=[
+                            c.name
+                            for c in fields(AutoEvalColumn)
+                            if c.displayed_by_default and not c.hidden and not c.never_hidden
+                        ],
+                        label="Select columns to show",
+                        elem_id="column-select",
+                        interactive=True,
+                    )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            leaderboard_table = gr.components.Dataframe(
+                value=leaderboard_df[
+                    [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
+                    + shown_columns.value
+                    + [AutoEvalColumn.dummy.name]
+                ],
+                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value + [AutoEvalColumn.dummy.name],
+                datatype=TYPES,
+                elem_id="leaderboard-table",
+                interactive=False,
+                visible=True,
+                column_widths=["15%", "30%"]
+            )
+
+            # Dummy leaderboard for handling the case when the user uses backspace key
+            hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                value=original_df[COLS],
+                headers=COLS,
+                datatype=TYPES,
+                visible=False,
+            )
+            search_bar.submit(
                 update_table,
                 [
                     hidden_leaderboard_table_for_search,
@@ -232,8 +263,18 @@ with demo:
                     search_bar,
                 ],
                 leaderboard_table,
-                queue=True,
             )
+            for selector in [shown_columns]:
+                selector.change(
+                    update_table,
+                    [
+                        hidden_leaderboard_table_for_search,
+                        shown_columns,
+                        search_bar,
+                    ],
+                    leaderboard_table,
+                    queue=True,
+                )
 
     with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
         with gr.Column():
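The rebuilt tab wires search_bar.submit and shown_columns.change to the same update_table callback, feeding it the hidden full-width dataframe so a narrowed view can always be recomputed from scratch. Note that, as committed, the FineGrained tab still renders leaderboard_df with AutoEvalColumn and TYPES; the grouped objects loaded above (leaderboard_grouped_df, COLS_GROUP, TYPES_GROUP, AutoEvalColumnGroup) are imported and populated but not yet displayed. The body of update_table is outside this diff; the sketch below is an assumed reconstruction of the usual template behaviour (filter on the hidden model_name_for_query column, then project onto the selected columns), not the repository's exact code.

    import pandas as pd

    def update_table(hidden_df: pd.DataFrame, columns: list, query: str) -> pd.DataFrame:
        # Keep rows whose hidden query column matches any `;`-separated term.
        if query:
            terms = [t.strip() for t in query.split(";") if t.strip()]
            mask = pd.Series(False, index=hidden_df.index)
            for term in terms:
                mask |= hidden_df["model_name_for_query"].str.contains(term, case=False, regex=False)
            hidden_df = hidden_df[mask]
        # Never-hidden columns first, then the user's current selection.
        keep = ["Model"] + [c for c in columns if c in hidden_df.columns]
        return hidden_df[keep]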
src/datasets.json  ADDED

@@ -0,0 +1,130 @@
+[
+    {
+        "task_type": "mmlu",
+        "dstype": "mc",
+        "group": "Banking",
+        "subtext": "You are an AI that selects the most accurate answer in Azerbaijani based on a given question. You will be provided with a question in Azerbaijani and multiple options in Azerbaijani. Choose the single letter (A, B, C, D) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/Banking_Exam_MCQ",
+        "name": "Banking_Exam_MCQ"
+    },
+    {
+        "task_type": "mmlu",
+        "dstype": "kmc_azerbaycan_dili",
+        "group": "MMLU",
+        "subtext": "You are an AI designed to answer questions in Azerbaijani based on grammatical concepts and linguistics. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/Azerbaijani_Lang_MC",
+        "name": "Azerbaijani_Lang_MC"
+    },
+    {
+        "task_type": "mmlu",
+        "dstype": "kmc_edebiyyat",
+        "group": "MMLU",
+        "subtext": "You are an AI designed to answer questions in Azerbaijani based on literary and historical facts. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/Literature_MC",
+        "name": "Azerbaijani_Lit_MC"
+    },
+    {
+        "task_type": "mmlu",
+        "dstype": "kmc_biologiya",
+        "group": "MMLU",
+        "subtext": "You are an AI designed to answer questions in Azerbaijani based on biology. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/Biology_MC",
+        "name": "Biology_MC"
+    },
+    {
+        "task_type": "mmlu",
+        "dstype": "kmc_cografiya",
+        "group": "MMLU",
+        "subtext": "You are an AI designed to answer questions in Azerbaijani based on geographical and environmental knowledge. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/Geography_MC",
+        "name": "Geography_MC"
+    },
+    {
+        "task_type": "mmlu",
+        "dstype": "kmc_mentiq",
+        "group": "MMLU",
+        "subtext": "You are an AI designed to answer questions in Azerbaijani based on logical reasoning and problem-solving. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/Logic_MC",
+        "name": "Logic_MC"
+    },
+    {
+        "task_type": "mmlu",
+        "dstype": "kmc_tarix",
+        "group": "MMLU",
+        "subtext": "You are an AI designed to answer questions in Azerbaijani based on historical and cultural facts. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/History_MC",
+        "name": "History_MC"
+    },
+    {
+        "task_type": "mmlu",
+        "dstype": "kmc_informatika",
+        "group": "MMLU",
+        "subtext": "You are an AI designed to answer questions in Azerbaijani based on technology and computer science. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/Informatics_MC",
+        "name": "Informatics_MC"
+    },
+    {
+        "task_type": "mmlu",
+        "dstype": "kmc_fizika",
+        "group": "MMLU",
+        "subtext": "You are an AI designed to answer questions in Azerbaijani based on physics concepts. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/Physics_MC",
+        "name": "Physics_MC"
+    },
+    {
+        "task_type": "mmlu",
+        "dstype": "kmc_kimya",
+        "group": "MMLU",
+        "subtext": "You are an AI designed to answer questions in Azerbaijani based on chemistry and scientific concepts. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/Chemistry_MC",
+        "name": "Chemistry_MC"
+    },
+    {
+        "task_type": "mmlu",
+        "dstype": "kmc_azerbaycan_tarixi",
+        "group": "MMLU",
+        "subtext": "You are an AI designed to answer questions in Azerbaijani based on historical facts. Your task is to select the correct option from the given question and answer choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/Azerbaijani_Hist_MC",
+        "name": "Azerbaijani_Hist_MC"
+    },
+    {
+        "task_type": "mmlu",
+        "dstype": "tc",
+        "group": "Banking",
+        "subtext": "You are an AI designed to answer questions in Azerbaijani. Your task is to select the correct option from the given question and answer choices. You are given a statement along with multiple options that represent different topics. Choose the option that best categorizes the statement based on its topic. Choose the single letter (A, B, C, D, E, F, G, H, I, J) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/Banking_Call_Classification_MC",
+        "name": "Banking_Call_Classification_MC"
+    },
+    {
+        "task_type": "arc",
+        "dstype": "arc",
+        "group": "ARC",
+        "subtext": "You are an AI designed to answer questions in Azerbaijani based on reasoning and knowledge. Your task is to select the correct option from the given question and answer choices. You are given a question along with multiple options. Choose the correct option. Choose the single letter (A, B, C, D) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/ARC",
+        "name": "ARC"
+    },
+    {
+        "task_type": "gsm8k",
+        "dstype": "mmc",
+        "group": "GSM8K",
+        "subtext": "You are an AI designed to solve mathematical word problems in Azerbaijani. Your task is to analyze the given question and select the correct option from the provided choices. Choose the single letter (A, B, C, D, E) that best answers the question. Respond with only the letter of the chosen answer, without any additional text.",
+        "data": "LLM-Beetle/GSM8K",
+        "name": "GSM8K"
+    },
+    {
+        "task_type": "qa",
+        "dstype": "qa",
+        "group": "Banking",
+        "subtext": "",
+        "data": "LLM-Beetle/Banking_QA",
+        "name": "Banking_QA"
+    },
+    {
+        "task_type": "rag",
+        "dstype": "cqa",
+        "group": "CQA",
+        "subtext": "",
+        "data": "LLM-Beetle/Wiki_CQA",
+        "name": "Wiki_CQA"
+    }
+]
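Every entry in the new file carries the same six keys (task_type, dstype, group, subtext, data, name); the group values are what the second, grouped table aggregates over. A quick sanity check against the file as committed (the script itself is illustrative, not part of the repo):

    import json

    REQUIRED_KEYS = {"task_type", "dstype", "group", "subtext", "data", "name"}

    with open("src/datasets.json") as f:
        entries = json.load(f)

    for entry in entries:
        missing = REQUIRED_KEYS - entry.keys()
        assert not missing, f"{entry.get('name', '<unnamed>')}: missing {missing}"

    print(len(entries))                           # 16
    print(sorted({e["group"] for e in entries}))  # ['ARC', 'Banking', 'CQA', 'GSM8K', 'MMLU']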
src/display/about.py  CHANGED

@@ -1,5 +1,7 @@
 from dataclasses import dataclass
 from enum import Enum
+import json
+
 
 @dataclass
 class Task:
@@ -9,30 +11,24 @@ class Task:
 
 
 # Init: to update with your specific keys
-class Tasks(Enum):
+def create_task_list():
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-
-    task1 = Task("Synthetic_QA", "metric_name", "Synthetic_QA")
-    task2 = Task("Support_MC", "metric_name", "Support_MC")
-    task3 = Task("Context_QA", "metric_name", "Context_QA")
-    task4 = Task("Banking_MC", "metric_name", "Banking_MC")
-    task5 = Task("ARC", "metric_name", "ARC")
-    task6 = Task("Binary_QA", "metric_name", "Binary_QA")
-    task7 = Task("ANL_Quad", "metric_name", "ANL_Quad")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    with open("src/datasets.json") as f:
+        data = json.load(f)
+
+    groups = []
+    names = []
+    for d in data:
+        groups.append(d['group'])
+        names.append(d['name'])
+
+    tasks = []
+    for name in names:
+        tasks.append(Task(name, "metric_name", name))
+
+    return tasks, list(set(groups))
+
+
 
 
 # Your leaderboard name
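create_task_list() swaps the hard-coded Tasks enum for tasks generated from src/datasets.json. A self-contained re-run of the same logic, assuming the template's Task fields (benchmark, metric, col_name):

    import json
    from dataclasses import dataclass

    @dataclass
    class Task:
        benchmark: str
        metric: str
        col_name: str

    with open("src/datasets.json") as f:
        data = json.load(f)

    # Mirrors create_task_list(): one Task per dataset, plus the distinct groups.
    tasks = [Task(d["name"], "metric_name", d["name"]) for d in data]
    groups = list({d["group"] for d in data})

    print(len(tasks))      # 16
    print(sorted(groups))  # ['ARC', 'Banking', 'CQA', 'GSM8K', 'MMLU']

One side effect of list(set(groups)): set iteration order over strings varies between interpreter runs, so the grouped table's column order is not deterministic; returning sorted(set(groups)) would pin it down.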
src/display/utils.py  CHANGED

@@ -1,6 +1,6 @@
 from dataclasses import dataclass, make_dataclass
 
-from src.display.about import Tasks
+from src.display.about import create_task_list
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -18,6 +18,8 @@ class ColumnContent:
     never_hidden: bool = False
     dummy: bool = False
 
+Tasks, Groups = create_task_list()
+
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
@@ -25,8 +27,10 @@ auto_eval_column_dict.append(["model_submission_date", ColumnContent, ColumnContent("Submission Date", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+
+
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    auto_eval_column_dict.append([task.benchmark, ColumnContent, ColumnContent(task.col_name, "number", True)])
 # Dummy column for the search bar (hidden by the custom CSS)
 auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
 
@@ -47,4 +51,44 @@ TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
+BENCHMARK_COLS = [t.col_name for t in Tasks]
+
+
+
+
+
+# for grouping
+
+
+## Leaderboard columns
+auto_eval_group_dict = []
+# Init
+auto_eval_group_dict.append(["model_submission_date", ColumnContent, ColumnContent("Submission Date", "str", True, never_hidden=True)])
+auto_eval_group_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+#Scores
+auto_eval_group_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+
+
+for group in Groups:
+    auto_eval_group_dict.append([group, ColumnContent, ColumnContent(group, "number", True)])
+# Dummy column for the search bar (hidden by the custom CSS)
+auto_eval_group_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
+
+# We use make dataclass to dynamically fill the scores from Groups
+AutoEvalColumnGroup = make_dataclass("AutoEvalColumnGroup", auto_eval_group_dict, frozen=True)
+
+## For the queue columns in the submission tab
+@dataclass(frozen=True)
+class EvalQueueColumnGroup:  # Queue column
+    model = ColumnContent("model", "markdown", True)
+    submitted_time = ColumnContent("submitted_time", "str", True)
+    status = ColumnContent("status", "str", True)
+
+# Column selection
+COLS_GROUP = [c.name for c in fields(AutoEvalColumnGroup) if not c.hidden]
+TYPES_GROUP = [c.type for c in fields(AutoEvalColumnGroup) if not c.hidden]
+
+EVAL_COLS_GROUP = [c.name for c in fields(EvalQueueColumnGroup)]
+EVAL_TYPES_GROUP = [c.type for c in fields(EvalQueueColumnGroup)]
+
+BENCHMARK_COLS_GROUP = list(Groups)
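The grouped column machinery repeats the make_dataclass pattern used for the per-task columns: a list of [attribute_name, type, default] triples becomes a frozen dataclass whose class attributes are ColumnContent instances, which fields() then enumerates. A toy, self-contained version (the ColumnContent here is assumed frozen, since its instances serve as hashable field defaults; the real definition lives earlier in this file):

    from dataclasses import dataclass, make_dataclass

    @dataclass(frozen=True)
    class ColumnContent:
        name: str
        type: str
        displayed_by_default: bool
        hidden: bool = False
        never_hidden: bool = False
        dummy: bool = False

    def fields(raw_class):
        # Same helper as above: collect the non-dunder class attributes.
        return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

    group_dict = [
        ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
        ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)],
        ["Banking", ColumnContent, ColumnContent("Banking", "number", True)],
    ]
    # make_dataclass builds the class dynamically; the defaults double as metadata.
    AutoEvalColumnGroup = make_dataclass("AutoEvalColumnGroup", group_dict, frozen=True)

    COLS_GROUP = [c.name for c in fields(AutoEvalColumnGroup) if not c.hidden]
    print(COLS_GROUP)  # ['Model', 'Average ⬆️', 'Banking']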
src/envs.py  CHANGED

@@ -8,11 +8,13 @@ OWNER = "LLM-Beetle"
 REPO_ID = f"{OWNER}/frontend"
 QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
+RESULTS_GROUP_REPO = "Emirrv/results"
 
 CACHE_PATH=os.getenv("HF_HOME", ".")
 
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+EVAL_RESULTS_GROUP_PATH = os.path.join(CACHE_PATH, "eval-results")
 
 API = HfApi(token=TOKEN)
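The new constants pair a second results repository (RESULTS_GROUP_REPO) with a local cache path; note that EVAL_RESULTS_GROUP_PATH points at the same eval-results directory as EVAL_RESULTS_PATH, so both repos would share one folder. The download call site is not part of this diff; presumably it mirrors the template's snapshot_download usage, roughly:

    from huggingface_hub import snapshot_download

    from src.envs import EVAL_RESULTS_GROUP_PATH, RESULTS_GROUP_REPO, TOKEN

    # Assumed mirror of the existing results download, pointed at the new repo.
    snapshot_download(
        repo_id=RESULTS_GROUP_REPO,
        local_dir=EVAL_RESULTS_GROUP_PATH,
        repo_type="dataset",
        token=TOKEN,
    )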