Cleanup
Signed-off-by: Jonathan Bnayahu <[email protected]>
- app.py +2 -140
- src/about.py +2 -33
- src/display/formatting.py +0 -9
- src/display/utils.py +0 -51
- src/envs.py +0 -9
- src/leaderboard/read_evals.py +0 -95
- src/populate.py +2 -41
app.py
CHANGED
@@ -1,13 +1,10 @@
 import gradio as gr
-from gradio_leaderboard import Leaderboard
-import pandas as pd
+from gradio_leaderboard import Leaderboard
 from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
 
 from src.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
-    # EVALUATION_QUEUE_TEXT,
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
@@ -17,73 +14,25 @@ from src.display.css_html_js import custom_css
 from src.display.utils import (
     BENCHMARK_COLS,
     COLS,
-    EVAL_COLS,
-    # EVAL_TYPES,
     AutoEvalColumn,
-    # ModelType,
     fields,
-    # WeightType,
-    # Precision
 )
-from src.envs import API, EVAL_RESULTS_PATH, REPO_ID
+from src.envs import API, EVAL_RESULTS_PATH, REPO_ID
 from src.populate import get_leaderboard_df
 
 
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
-### Space initialisation
-# try:
-#     print(EVAL_REQUESTS_PATH)
-#     snapshot_download(
-#         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-#     )
-# except Exception:
-#     restart_space()
-# try:
-#     print(EVAL_RESULTS_PATH)
-#     snapshot_download(
-#         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-#     )
-# except Exception:
-#     restart_space()
-
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
 
-# (
-#     finished_eval_queue_df,
-#     running_eval_queue_df,
-#     pending_eval_queue_df,
-# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
-        # select_columns=SelectColumns(
-        #     default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-        #     cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-        #     label="Select Columns to Display:",
-        # ),
         search_columns=[AutoEvalColumn.model.name],
-        # hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        # filter_columns=[
-        #     ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-        #     ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-        #     ColumnFilter(
-        #         AutoEvalColumn.params.name,
-        #         type="slider",
-        #         min=0.01,
-        #         max=150,
-        #         label="Select the number of parameters (B)",
-        #     ),
-        #     ColumnFilter(
-        #         AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-        #     ),
-        # ],
-        # bool_checkboxgroup_label="Hide models",
         interactive=False,
     )
 
@@ -101,93 +50,6 @@ with demo:
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
-        # with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-        #     with gr.Column():
-        #         with gr.Row():
-        #             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-        #         with gr.Column():
-        #             with gr.Accordion(
-        #                 f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-        #                 open=False,
-        #             ):
-        #                 with gr.Row():
-        #                     finished_eval_table = gr.components.Dataframe(
-        #                         value=finished_eval_queue_df,
-        #                         headers=EVAL_COLS,
-        #                         datatype=EVAL_TYPES,
-        #                         row_count=5,
-        #                     )
-        #             with gr.Accordion(
-        #                 f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-        #                 open=False,
-        #             ):
-        #                 with gr.Row():
-        #                     running_eval_table = gr.components.Dataframe(
-        #                         value=running_eval_queue_df,
-        #                         headers=EVAL_COLS,
-        #                         datatype=EVAL_TYPES,
-        #                         row_count=5,
-        #                     )
-
-        #             with gr.Accordion(
-        #                 f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-        #                 open=False,
-        #             ):
-        #                 with gr.Row():
-        #                     pending_eval_table = gr.components.Dataframe(
-        #                         value=pending_eval_queue_df,
-        #                         headers=EVAL_COLS,
-        #                         datatype=EVAL_TYPES,
-        #                         row_count=5,
-        #                     )
-        #     with gr.Row():
-        #         gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-        #     with gr.Row():
-        #         with gr.Column():
-        #             model_name_textbox = gr.Textbox(label="Model name")
-        #             revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-        #             model_type = gr.Dropdown(
-        #                 choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-        #                 label="Model type",
-        #                 multiselect=False,
-        #                 value=None,
-        #                 interactive=True,
-        #             )
-
-        #         with gr.Column():
-        #             precision = gr.Dropdown(
-        #                 choices=[i.value.name for i in Precision if i != Precision.Unknown],
-        #                 label="Precision",
-        #                 multiselect=False,
-        #                 value="float16",
-        #                 interactive=True,
-        #             )
-        #             weight_type = gr.Dropdown(
-        #                 choices=[i.value.name for i in WeightType],
-        #                 label="Weights type",
-        #                 multiselect=False,
-        #                 value="Original",
-        #                 interactive=True,
-        #             )
-        #             base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-        #     submit_button = gr.Button("Submit Eval")
-        #     submission_result = gr.Markdown()
-        #     submit_button.click(
-        #         add_new_eval,
-        #         [
-        #             model_name_textbox,
-        #             base_model_name_textbox,
-        #             revision_name_textbox,
-        #             precision,
-        #             weight_type,
-        #             model_type,
-        #         ],
-        #         submission_result,
-        #     )
-
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
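Note on the removed "Space initialisation" block: after this cleanup app.py no longer calls snapshot_download at startup, so the result JSON files have to be present under EVAL_RESULTS_PATH before get_leaderboard_df runs. If the results ever move back into a separate dataset repo, the download could be restored along the lines of the deleted code; a minimal sketch follows, where RESULTS_REPO and TOKEN only appear in the deleted lines and their values here are assumptions:

```python
import os

from huggingface_hub import snapshot_download

from src.envs import API, EVAL_RESULTS_PATH, REPO_ID

# Assumptions: neither of these is defined in the cleaned-up sources shown in this commit.
RESULTS_REPO = "jbnayahu/bluebench-results"   # hypothetical dataset repo id
TOKEN = os.environ.get("HF_TOKEN")            # the deleted code passed a TOKEN; its source is assumed


def download_results():
    """Mirror the results dataset into the local cache; restart the Space if the download fails."""
    try:
        snapshot_download(
            repo_id=RESULTS_REPO,
            local_dir=EVAL_RESULTS_PATH,
            repo_type="dataset",
            etag_timeout=30,
            token=TOKEN,
        )
    except Exception:
        API.restart_space(repo_id=REPO_ID)
```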
src/about.py
CHANGED
@@ -27,7 +27,6 @@ class Tasks(Enum):
     task11 = Task("summarization", "score", "Summarization")
     task12 = Task("translation", "score", "Translation")
 
-NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
 
 
@@ -39,7 +38,7 @@ TITLE = """<h1 align="center" id="space-title">BlueBench Leaderboard</h1>"""
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
 BlueBench is an open-source benchmark developed by domain experts to represent required needs of Enterprise users.
-It is constructed using state-of-the-art benchmarking methodologies to ensure validity, robustness, and efficiency by utilizing unitxt
+It is constructed using state-of-the-art benchmarking methodologies to ensure validity, robustness, and efficiency by utilizing <a href="https://www.unitxt.ai">unitxt</a>’s abilities for dynamic and flexible text processing.
 As a dynamic and evolving benchmark, BlueBench currently encompasses diverse domains such as legal, finance, customer support, and news. It also evaluates a range of capabilities, including RAG, pro-social behavior, summarization, and chatbot performance, with additional tasks and domains to be integrated over time.
 """
 
@@ -66,7 +65,7 @@ table th:nth-of-type(3) {
 | Reasoning | <pre><p><b>Hellaswag</b></p>[Dataset](https://huggingface.co/datasets/Rowan/hellaswag), [Paper](https://arxiv.org/abs/1905.07830), [Unitxt Card](https://www.unitxt.ai/en/latest/catalog/catalog.cards.hellaswag.html)</pre> | <p>Commonsense natural language inference</p>Given an event description such as "A woman sits at a piano," a machine must select the most likely followup: "She sets her fingers on the keys." Gatherd via Adversarial Filtering (AF), a data collection paradigm wherein a series of discriminators iteratively select an adversarial set of machine-generated wrong answers. |
 | Reasoning | <pre><p><b>Openbook QA</b></p>[Dataset](https://huggingface.co/datasets/allenai/openbookqa), [Paper](https://aclanthology.org/D18-1260/), [Unitxt Card](https://www.unitxt.ai/en/latest/catalog/catalog.cards.openbook_qa.html)</pre> | <p>Question answering dataset using open book exams</p>Comes with our questions is a set of 1326 elementary level science facts. Roughly 6000 questions probe an understanding of these facts and their application to novel situations. This requires combining an open book fact (e.g., metals conduct electricity) with broad common knowledge (e.g., a suit of armor is made of metal) obtained from other sources. |
 | Machine Translation | <pre><p><b>Flores 101</b></p>[Dataset](https://huggingface.co/datasets/gsarti/flores_101), [Paper](https://arxiv.org/abs/2106.03193), [Unitxt Card](https://www.unitxt.ai/en/latest/catalog/catalog.cards.mt.flores_101.__dir__.html)</pre> | <p>Benchmark dataset for machine translation</p>There are 101 lanugages in this dataset, each sentence appears in all languages, and all a total of `2k` sentences. We use the following language pairs: `["ara_eng", "deu_eng", "eng_ara", "eng_deu", "eng_fra", "eng_kor", "eng_por", "eng_ron", "eng_spa", "fra_eng", "jpn_eng", "kor_eng", "por_eng", "ron_eng", "spa_eng"]`. |
-| Chatbot Abilities | <pre><p><b>Arena Hard</b></p>[Dataset](https://huggingface.co/datasets/lmsys/arena-hard-auto-v0.1), [Paper](https://arxiv.org/abs/2406.11939), [Unitxt Card](https://www.unitxt.ai/en/latest/catalog/catalog.cards.arena_hard.generation.english_gpt_4_0314_reference.html)</pre> | <p>An automatic evaluation tool for instruction-tuned LLMs</p>Contains 500 challenging user queries sourced from Chatbot Arena. We prompt GPT-4-Turbo as judge to compare the models" responses against a baseline model (default: GPT-4-0314 for here we are using `llama-3.
+| Chatbot Abilities | <pre><p><b>Arena Hard</b></p>[Dataset](https://huggingface.co/datasets/lmsys/arena-hard-auto-v0.1), [Paper](https://arxiv.org/abs/2406.11939), [Unitxt Card](https://www.unitxt.ai/en/latest/catalog/catalog.cards.arena_hard.generation.english_gpt_4_0314_reference.html)</pre> | <p>An automatic evaluation tool for instruction-tuned LLMs</p>Contains 500 challenging user queries sourced from Chatbot Arena. We prompt GPT-4-Turbo as judge to compare the models" responses against a baseline model (default: GPT-4-0314 for here we are using `llama-3.3-70b`). |
 | Classification | <pre><p><b>20_newsgroups</b></p>[Dataset](https://huggingface.co/datasets/SetFit/20_newsgroups), [Unitxt Card](https://www.unitxt.ai/en/latest/catalog/catalog.cards.20_newsgroups.html)</pre> | <p>News article classification</p>The 20 newsgroups dataset comprises around 18000 newsgroups posts on 20 topics split in two subsets: one for training (or development) and the other one for testing (or for performance evaluation). The split between the train and test set is based upon a messages posted before and after a specific date. |
 | Bias | <pre><p><b>BBQ</b></p>[Dataset](https://huggingface.co/datasets/heegyu/bbq), [Paper](https://arxiv.org/abs/2110.08193), [Unitxt Card](https://www.unitxt.ai/en/latest/catalog/catalog.cards.safety.bbq.__dir__.html)</pre> | <p>Question sets constructed to highlight attested social biases against people belonging to protected classes along nine social dimensions relevant for U.S. English-speaking contexts.</p>It is well documented that NLP models learn social biases, but little work has been done on how these biases manifest in model outputs for applied tasks like question answering (QA). We introduce the Bias Benchmark for QA (BBQ), a dataset of question sets constructed by the authors that highlight attested social biases against people belonging to protected classes along nine social dimensions relevant for U.S. English-speaking contexts. Our task evaluates model responses at two levels: (i) given an under-informative context, we test how strongly responses refect social biases, and (ii) given an adequately informative context, we test whether the model's biases override a correct answer choice. We fnd that models often rely on stereotypes when the context is under-informative, meaning the model's outputs consistently reproduce harmful biases in this setting. Though models are more accurate when the context provides an informative answer, they still rely on stereotypes and average up to 3.4 percentage points higher accuracy when the correct answer aligns with a social bias than when it conficts, with this difference widening to over 5 points on examples targeting gender for most models tested. |
 | Legal Reasoning | <pre><p><b>Legalbench</b></p>[Dataset](https://huggingface.co/datasets/nguha/legalbench), [Paper](https://arxiv.org/abs/2308.11462), [Unitxt Card](https://www.unitxt.ai/en/latest/catalog/catalog.cards.legalbench.__dir__.html)</pre> | <p>Evaluating legal reasoning in English large language models (LLMs).</p>LegalBench tasks span multiple types (binary classification, multi-class classification, extraction, generation, entailment), multiple types of text (statutes, judicial opinions, contracts, etc.), and multiple areas of law (evidence, contracts, civil procedure, etc.). For more information on tasks, we recommend visiting the website, where you can search through task descriptions, or the Github repository, which contains more granular task descriptions. We also recommend reading the paper, which provides more background on task significance and construction process. |
@@ -88,36 +87,6 @@ unitxt-evaluate --tasks "benchmarks.bluebench" --model cross_provider --model_ar
 ```
 """
 
-EVALUATION_QUEUE_TEXT = """
-## Some good practices before submitting a model
-
-### 1) Make sure you can load your model and tokenizer using AutoClasses:
-```python
-from transformers import AutoConfig, AutoModel, AutoTokenizer
-config = AutoConfig.from_pretrained("your model name", revision=revision)
-model = AutoModel.from_pretrained("your model name", revision=revision)
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
-```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
-
-Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
-
-### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
-
-### 3) Make sure your model has an open license!
-This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
-
-### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
-
-## In case of model failure
-If your model is displayed in the `FAILED` category, its execution stopped.
-Make sure you have followed the above steps first.
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
-"""
-
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
 """
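The hunk headers above show that the remaining Tasks enum drives everything downstream: each entry ties a key in the result files to a metric and a leaderboard column header. A minimal sketch of that structure; the Task field names are assumed from the standard leaderboard template, and only the two enum entries visible in the hunk are taken from this diff:

```python
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str  # key looked up in a result file (assumed field name)
    metric: str     # metric under that key, "score" for BlueBench entries (assumed field name)
    col_name: str   # column header shown on the leaderboard


class Tasks(Enum):
    # Last two entries of the enum, as visible in the hunk above.
    task11 = Task("summarization", "score", "Summarization")
    task12 = Task("translation", "score", "Translation")
```

src/display/utils.py turns each `task.value.col_name` into a numeric leaderboard column, so adding or removing a benchmark is a change to this enum plus the corresponding result files.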
src/display/formatting.py
CHANGED
@@ -1,12 +1,3 @@
-# def model_hyperlink(link, model_name):
-#     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
-
-
-# def make_clickable_model(model_name):
-#     link = f"https://huggingface.co/{model_name}"
-#     return model_hyperlink(link, model_name)
-
-
 def styled_error(error):
     return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
 
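With the clickable-model helpers gone, formatting.py keeps only plain HTML helpers such as styled_error. A trivial usage sketch (the message text is illustrative):

```python
from src.display.formatting import styled_error

# styled_error wraps a message in a red, centred <p> tag for display in the Gradio UI.
html = styled_error("No evaluation results found.")
print(html)  # <p style='color: red; font-size: 20px; text-align: center;'>No evaluation results found.</p>
```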
src/display/utils.py
CHANGED
@@ -23,22 +23,11 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-# auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-# Model information
-# auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-# auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
@@ -60,46 +49,6 @@ class ModelDetails:
     display_name: str = ""
     symbol: str = "" # emoji
 
-
-# class ModelType(Enum):
-#     PT = ModelDetails(name="pretrained", symbol="🟢")
-#     FT = ModelDetails(name="fine-tuned", symbol="🔶")
-#     IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-#     RL = ModelDetails(name="RL-tuned", symbol="🟦")
-#     Unknown = ModelDetails(name="", symbol="?")
-
-#     def to_str(self, separator=" "):
-#         return f"{self.value.symbol}{separator}{self.value.name}"
-
-#     @staticmethod
-#     def from_str(type):
-#         if "fine-tuned" in type or "🔶" in type:
-#             return ModelType.FT
-#         if "pretrained" in type or "🟢" in type:
-#             return ModelType.PT
-#         if "RL-tuned" in type or "🟦" in type:
-#             return ModelType.RL
-#         if "instruction-tuned" in type or "⭕" in type:
-#             return ModelType.IFT
-#         return ModelType.Unknown
-
-# class WeightType(Enum):
-#     Adapter = ModelDetails("Adapter")
-#     Original = ModelDetails("Original")
-#     Delta = ModelDetails("Delta")
-
-# class Precision(Enum):
-#     float16 = ModelDetails("float16")
-#     bfloat16 = ModelDetails("bfloat16")
-#     Unknown = ModelDetails("?")
-
-#     def from_str(precision):
-#         if precision in ["torch.float16", "float16"]:
-#             return Precision.float16
-#         if precision in ["torch.bfloat16", "bfloat16"]:
-#             return Precision.bfloat16
-#         return Precision.Unknown
-
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
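After this change AutoEvalColumn is built from just a Model column, an Average column, and one numeric column per Task; the ColumnContent dataclass itself and the derivation of BENCHMARK_COLS sit outside the visible hunks. A sketch of that surrounding machinery, with the parts not shown in the diff assumed from the standard leaderboard template:

```python
from dataclasses import dataclass, fields, make_dataclass

from src.about import Tasks


@dataclass(frozen=True)
class ColumnContent:            # assumed shape; only its usage appears in the hunk
    name: str
    type: str                   # "markdown", "number", ...
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False


auto_eval_column_dict = []
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
for task in Tasks:
    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])

# make_dataclass turns each [field_name, type, default] triple into a frozen dataclass attribute.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
BENCHMARK_COLS = [t.value.col_name for t in Tasks]  # assumed; defined outside the visible hunks
```

app.py feeds COLS and BENCHMARK_COLS into get_leaderboard_df and uses fields(AutoEvalColumn) to set the Leaderboard column types.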
src/envs.py
CHANGED
@@ -10,18 +10,9 @@ OWNER = "jbnayahu" # Change to your org - don't forget to create a results and r
 # ----------------------------------
 
 REPO_ID = f"{OWNER}/bluebench"
-# QUEUE_REPO = f"{OWNER}/bluebench-requests"
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
-
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "results/bluebench")
 
-
-# Local caches
-# EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
-# EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
-# EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
-# EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
-
 API = HfApi(token=TOKEN)
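The remaining module is small enough to read in one piece; a sketch of the surviving envs.py, with the TOKEN line marked as an assumption (it is referenced by HfApi(token=TOKEN) but defined outside the visible hunks):

```python
import os

from huggingface_hub import HfApi

# Assumed (defined outside the visible hunks): the Hub token comes from the Space secrets.
TOKEN = os.environ.get("HF_TOKEN")

OWNER = "jbnayahu"
REPO_ID = f"{OWNER}/bluebench"

# If you setup a cache later, just change HF_HOME
CACHE_PATH = os.getenv("HF_HOME", ".")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "results/bluebench")

API = HfApi(token=TOKEN)
```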
src/leaderboard/read_evals.py
CHANGED
@@ -1,6 +1,4 @@
-import glob
 import json
-import math
 import os
 from dataclasses import dataclass
 
@@ -8,8 +6,6 @@ import dateutil
 import numpy as np
 
 from src.display.utils import AutoEvalColumn, Tasks
-# from src.submission.check_validity import is_model_on_hub
-
 
 @dataclass
 class EvalResult:
@@ -17,19 +13,8 @@ class EvalResult:
     """
     eval_name: str # org_model_precision (uid)
     full_model: str # org/model (path on hub)
-    # org: str
-    # model: str
-    # revision: str # commit hash, "" if main
     results: dict
-    # precision: Precision = Precision.Unknown
-    # model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
-    # weight_type: WeightType = WeightType.Original # Original or Adapter
-    # architecture: str = "Unknown"
-    # license: str = "?"
-    # likes: int = 0
-    # num_params: int = 0
     date: str = "" # submission date of request file
-    # still_on_hub: bool = False
 
     @classmethod
     def init_from_json_file(self, json_filepath):
@@ -41,32 +26,6 @@ class EvalResult:
 
         full_model = env_info.get("model")
 
-        # Precision
-        # precision = Precision.from_str(config.get("model_dtype"))
-
-        # Get model and org
-        # org_and_model = config.get("model_name", config.get("model_args", None))
-        # org_and_model = org_and_model.split("/", 1)
-
-        # if len(org_and_model) == 1:
-        #     org = None
-        #     model = org_and_model[0]
-        #     result_key = f"{model}_{precision.value.name}"
-        # else:
-        #     org = org_and_model[0]
-        #     model = org_and_model[1]
-        #     result_key = f"{org}_{model}_{precision.value.name}"
-        # full_model = "/".join(org_and_model)
-
-        # still_on_hub, _, model_config = is_model_on_hub(
-        #     full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
-        # )
-        # architecture = "?"
-        # if model_config is not None:
-        #     architectures = getattr(model_config, "architectures", None)
-        #     if architectures:
-        #         architecture = ";".join(architectures)
-
         # Extract results available in this file (some results are split in several files)
         results = {}
         for task in Tasks:
@@ -83,48 +42,16 @@ class EvalResult:
         return self(
             eval_name=full_model,
             full_model=full_model,
-            # org=org,
-            # model=model,
             results=results,
-            # precision=precision,
-            # revision= config.get("model_sha", ""),
-            # still_on_hub=still_on_hub,
-            # architecture=architecture
         )
 
-    # def update_with_request_file(self, requests_path):
-    #     """Finds the relevant request file for the current model and updates info with it"""
-    #     request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
-    #     try:
-    #         with open(request_file, "r") as f:
-    #             request = json.load(f)
-    #         self.model_type = ModelType.from_str(request.get("model_type", ""))
-    #         self.weight_type = WeightType[request.get("weight_type", "Original")]
-    #         self.license = request.get("license", "?")
-    #         self.likes = request.get("likes", 0)
-    #         self.num_params = request.get("params", 0)
-    #         self.date = request.get("submitted_time", "")
-    #     except Exception:
-    #         print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
-
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
-            # AutoEvalColumn.precision.name: self.precision.value.name,
-            # AutoEvalColumn.model_type.name: self.model_type.value.name,
-            # AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            # AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            # AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: self.full_model,
-            # AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average.name: average,
-            # AutoEvalColumn.license.name: self.license,
-            # AutoEvalColumn.likes.name: self.likes,
-            # AutoEvalColumn.params.name: self.num_params,
-            # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }
 
         for task in Tasks:
@@ -133,28 +60,6 @@ class EvalResult:
         return data_dict
 
 
-# def get_request_file_for_model(requests_path, model_name, precision):
-#     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-#     request_files = os.path.join(
-#         requests_path,
-#         f"{model_name}_eval_request_*.json",
-#     )
-#     request_files = glob.glob(request_files)
-
-#     # Select correct request file (precision)
-#     request_file = ""
-#     request_files = sorted(request_files, reverse=True)
-#     for tmp_request_file in request_files:
-#         with open(tmp_request_file, "r") as f:
-#             req_content = json.load(f)
-#             if (
-#                 req_content["status"] in ["FINISHED"]
-#                 and req_content["precision"] == precision.split(".")[-1]
-#             ):
-#                 request_file = tmp_request_file
-#     return request_file
-
-
 def get_raw_eval_results(results_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
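After the cleanup an EvalResult carries only what the table needs: the model identifier, a per-task results dict, and an optional date. A usage sketch of the surviving path from one result JSON to a dataframe row; the file name below is hypothetical and the JSON layout itself is not visible in this diff:

```python
from src.leaderboard.read_evals import EvalResult, get_raw_eval_results

# Hypothetical file name; results live under EVAL_RESULTS_PATH ("results/bluebench").
result = EvalResult.init_from_json_file("results/bluebench/some-org__some-model.json")
row = result.to_dict()  # keys: "eval_name", "Model", "Average ⬆️", plus one entry per Task column

# get_raw_eval_results walks the results folder and returns one EvalResult per model.
all_results = get_raw_eval_results("results/bluebench")
```

Note that to_dict divides by len(Tasks), so a missing score lowers the average rather than being skipped; get_leaderboard_df later drops any row that still has a NaN benchmark column anyway.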
src/populate.py
CHANGED
@@ -1,10 +1,7 @@
-import json
-import os
-
 import pandas as pd
 
 from src.display.formatting import has_no_nan_values
-from src.display.utils import AutoEvalColumn
+from src.display.utils import AutoEvalColumn
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
@@ -19,40 +16,4 @@ def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> p
 
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
-    return df
-
-
-# def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-#     """Creates the different dataframes for the evaluation queues requestes"""
-#     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
-#     all_evals = []
-
-#     for entry in entries:
-#         if ".json" in entry:
-#             file_path = os.path.join(save_path, entry)
-#             with open(file_path) as fp:
-#                 data = json.load(fp)
-
-#             data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-#             data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-
-#             all_evals.append(data)
-#         elif ".md" not in entry:
-#             # this is a folder
-#             sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
-#             for sub_entry in sub_entries:
-#                 file_path = os.path.join(save_path, entry, sub_entry)
-#                 with open(file_path) as fp:
-#                     data = json.load(fp)
-
-#                 data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-#                 data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-#                 all_evals.append(data)
-
-#     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-#     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-#     finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
-#     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-#     df_running = pd.DataFrame.from_records(running_list, columns=cols)
-#     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-#     return df_finished[cols], df_running[cols], df_pending[cols]
+    return df
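Only the tail of get_leaderboard_df appears in the hunk. A sketch of the whole surviving function for context, with the lines outside the hunk assumed from the standard leaderboard template (marked inline):

```python
import pandas as pd

from src.display.formatting import has_no_nan_values
from src.display.utils import AutoEvalColumn
from src.leaderboard.read_evals import get_raw_eval_results


def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Creates a dataframe from all the individual experiment results."""
    raw_data = get_raw_eval_results(results_path)                           # assumed (outside the hunk)
    all_data_json = [v.to_dict() for v in raw_data]                         # assumed (outside the hunk)

    df = pd.DataFrame.from_records(all_data_json)                           # assumed (outside the hunk)
    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)  # assumed (outside the hunk)
    df = df[cols].round(decimals=2)                                         # assumed (outside the hunk)

    # filter out if any of the benchmarks have not been produced
    df = df[has_no_nan_values(df, benchmark_cols)]
    return df
```

app.py calls this once at start-up: LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS).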