jbnayahu committed
Commit 460efe2 · unverified · 1 Parent(s): c47d3ec

Signed-off-by: Jonathan Bnayahu <[email protected]>

app.py CHANGED
@@ -1,13 +1,10 @@
 import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
-import pandas as pd
+from gradio_leaderboard import Leaderboard
 from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
 
 from src.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
-    # EVALUATION_QUEUE_TEXT,
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
@@ -17,73 +14,25 @@ from src.display.css_html_js import custom_css
 from src.display.utils import (
     BENCHMARK_COLS,
     COLS,
-    EVAL_COLS,
-    # EVAL_TYPES,
     AutoEvalColumn,
-    # ModelType,
     fields,
-    # WeightType,
-    # Precision
 )
-from src.envs import API, EVAL_RESULTS_PATH, REPO_ID, TOKEN
+from src.envs import API, EVAL_RESULTS_PATH, REPO_ID
 from src.populate import get_leaderboard_df
 
 
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
-### Space initialisation
-# try:
-#     print(EVAL_REQUESTS_PATH)
-#     snapshot_download(
-#         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-#     )
-# except Exception:
-#     restart_space()
-# try:
-#     print(EVAL_RESULTS_PATH)
-#     snapshot_download(
-#         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-#     )
-# except Exception:
-#     restart_space()
-
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
 
-# (
-#     finished_eval_queue_df,
-#     running_eval_queue_df,
-#     pending_eval_queue_df,
-# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
-        # select_columns=SelectColumns(
-        #     default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-        #     cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-        #     label="Select Columns to Display:",
-        # ),
         search_columns=[AutoEvalColumn.model.name],
-        # hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        # filter_columns=[
-        #     ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-        #     ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-        #     ColumnFilter(
-        #         AutoEvalColumn.params.name,
-        #         type="slider",
-        #         min=0.01,
-        #         max=150,
-        #         label="Select the number of parameters (B)",
-        #     ),
-        #     ColumnFilter(
-        #         AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-        #     ),
-        # ],
-        # bool_checkboxgroup_label="Hide models",
         interactive=False,
     )
89
 
@@ -101,93 +50,6 @@ with demo:
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
-        # with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-        #     with gr.Column():
-        #         with gr.Row():
-        #             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-        #     with gr.Column():
-        #         with gr.Accordion(
-        #             f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-        #             open=False,
-        #         ):
-        #             with gr.Row():
-        #                 finished_eval_table = gr.components.Dataframe(
-        #                     value=finished_eval_queue_df,
-        #                     headers=EVAL_COLS,
-        #                     datatype=EVAL_TYPES,
-        #                     row_count=5,
-        #                 )
-        #         with gr.Accordion(
-        #             f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-        #             open=False,
-        #         ):
-        #             with gr.Row():
-        #                 running_eval_table = gr.components.Dataframe(
-        #                     value=running_eval_queue_df,
-        #                     headers=EVAL_COLS,
-        #                     datatype=EVAL_TYPES,
-        #                     row_count=5,
-        #                 )
-
-        #         with gr.Accordion(
-        #             f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-        #             open=False,
-        #         ):
-        #             with gr.Row():
-        #                 pending_eval_table = gr.components.Dataframe(
-        #                     value=pending_eval_queue_df,
-        #                     headers=EVAL_COLS,
-        #                     datatype=EVAL_TYPES,
-        #                     row_count=5,
-        #                 )
-        #     with gr.Row():
-        #         gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-        #     with gr.Row():
-        #         with gr.Column():
-        #             model_name_textbox = gr.Textbox(label="Model name")
-        #             revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-        #             model_type = gr.Dropdown(
-        #                 choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-        #                 label="Model type",
-        #                 multiselect=False,
-        #                 value=None,
-        #                 interactive=True,
-        #             )
-
-        #         with gr.Column():
-        #             precision = gr.Dropdown(
-        #                 choices=[i.value.name for i in Precision if i != Precision.Unknown],
-        #                 label="Precision",
-        #                 multiselect=False,
-        #                 value="float16",
-        #                 interactive=True,
-        #             )
-        #             weight_type = gr.Dropdown(
-        #                 choices=[i.value.name for i in WeightType],
-        #                 label="Weights type",
-        #                 multiselect=False,
-        #                 value="Original",
-        #                 interactive=True,
-        #             )
-        #             base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-        #         submit_button = gr.Button("Submit Eval")
-        #         submission_result = gr.Markdown()
-        #         submit_button.click(
-        #             add_new_eval,
-        #             [
-        #                 model_name_textbox,
-        #                 base_model_name_textbox,
-        #                 revision_name_textbox,
-        #                 precision,
-        #                 weight_type,
-        #                 model_type,
-        #             ],
-        #             submission_result,
-        #         )
-
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
 
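With the request queue and submission tab gone, the Space reduces to loading the results dataframe, rendering it read-only with gradio_leaderboard, and periodically restarting itself. A condensed sketch of that remaining flow, for orientation only (the Blocks wiring and the 30-minute restart interval follow the stock leaderboard template and are assumptions, not something this diff shows):

```python
import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler
from gradio_leaderboard import Leaderboard

from src.display.utils import AutoEvalColumn, BENCHMARK_COLS, COLS, fields
from src.envs import API, EVAL_RESULTS_PATH, REPO_ID
from src.populate import get_leaderboard_df

# Build the leaderboard dataframe once at startup from the synced results folder.
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)

demo = gr.Blocks()
with demo:
    # Read-only table; only the model-name search box survives from the template.
    Leaderboard(
        value=LEADERBOARD_DF,
        datatype=[c.type for c in fields(AutoEvalColumn)],
        search_columns=[AutoEvalColumn.model.name],
        interactive=False,
    )

# Restart the Space on an interval so it picks up newly published results.
scheduler = BackgroundScheduler()
scheduler.add_job(lambda: API.restart_space(repo_id=REPO_ID), "interval", seconds=1800)
scheduler.start()
demo.launch()
```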
src/about.py CHANGED
@@ -27,7 +27,6 @@ class Tasks(Enum):
     task11 = Task("summarization", "score", "Summarization")
     task12 = Task("translation", "score", "Translation")
 
-NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
 
 
@@ -39,7 +38,7 @@ TITLE = """<h1 align="center" id="space-title">BlueBench Leaderboard</h1>"""
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
 BlueBench is an open-source benchmark developed by domain experts to represent required needs of Enterprise users.
-It is constructed using state-of-the-art benchmarking methodologies to ensure validity, robustness, and efficiency by utilizing unitxts abilities for dynamic and flexible text processing.
+It is constructed using state-of-the-art benchmarking methodologies to ensure validity, robustness, and efficiency by utilizing <a href="https://www.unitxt.ai">unitxt</a>’s abilities for dynamic and flexible text processing.
 As a dynamic and evolving benchmark, BlueBench currently encompasses diverse domains such as legal, finance, customer support, and news. It also evaluates a range of capabilities, including RAG, pro-social behavior, summarization, and chatbot performance, with additional tasks and domains to be integrated over time.
 """
 
@@ -66,7 +65,7 @@ table th:nth-of-type(3) {
  | Reasoning | <pre><p><b>Hellaswag</b></p>[Dataset](https://huggingface.co/datasets/Rowan/hellaswag), [Paper](https://arxiv.org/abs/1905.07830), [Unitxt Card](https://www.unitxt.ai/en/latest/catalog/catalog.cards.hellaswag.html)</pre> | <p>Commonsense natural language inference</p>Given an event description such as "A woman sits at a piano," a machine must select the most likely followup: "She sets her fingers on the keys." Gatherd via Adversarial Filtering (AF), a data collection paradigm wherein a series of discriminators iteratively select an adversarial set of machine-generated wrong answers. |
  | Reasoning | <pre><p><b>Openbook QA</b></p>[Dataset](https://huggingface.co/datasets/allenai/openbookqa), [Paper](https://aclanthology.org/D18-1260/), [Unitxt Card](https://www.unitxt.ai/en/latest/catalog/catalog.cards.openbook_qa.html)</pre> | <p>Question answering dataset using open book exams</p>Comes with our questions is a set of 1326 elementary level science facts. Roughly 6000 questions probe an understanding of these facts and their application to novel situations. This requires combining an open book fact (e.g., metals conduct electricity) with broad common knowledge (e.g., a suit of armor is made of metal) obtained from other sources. |
  | Machine Translation | <pre><p><b>Flores 101</b></p>[Dataset](https://huggingface.co/datasets/gsarti/flores_101), [Paper](https://arxiv.org/abs/2106.03193), [Unitxt Card](https://www.unitxt.ai/en/latest/catalog/catalog.cards.mt.flores_101.__dir__.html)</pre> | <p>Benchmark dataset for machine translation</p>There are 101 lanugages in this dataset, each sentence appears in all languages, and all a total of `2k` sentences. We use the following language pairs: `["ara_eng", "deu_eng", "eng_ara", "eng_deu", "eng_fra", "eng_kor", "eng_por", "eng_ron", "eng_spa", "fra_eng", "jpn_eng", "kor_eng", "por_eng", "ron_eng", "spa_eng"]`. |
- | Chatbot Abilities | <pre><p><b>Arena Hard</b></p>[Dataset](https://huggingface.co/datasets/lmsys/arena-hard-auto-v0.1), [Paper](https://arxiv.org/abs/2406.11939), [Unitxt Card](https://www.unitxt.ai/en/latest/catalog/catalog.cards.arena_hard.generation.english_gpt_4_0314_reference.html)</pre> | <p>An automatic evaluation tool for instruction-tuned LLMs</p>Contains 500 challenging user queries sourced from Chatbot Arena. We prompt GPT-4-Turbo as judge to compare the models" responses against a baseline model (default: GPT-4-0314 for here we are using `llama-3.1-70b`). |
+ | Chatbot Abilities | <pre><p><b>Arena Hard</b></p>[Dataset](https://huggingface.co/datasets/lmsys/arena-hard-auto-v0.1), [Paper](https://arxiv.org/abs/2406.11939), [Unitxt Card](https://www.unitxt.ai/en/latest/catalog/catalog.cards.arena_hard.generation.english_gpt_4_0314_reference.html)</pre> | <p>An automatic evaluation tool for instruction-tuned LLMs</p>Contains 500 challenging user queries sourced from Chatbot Arena. We prompt GPT-4-Turbo as judge to compare the models" responses against a baseline model (default: GPT-4-0314 for here we are using `llama-3.3-70b`). |
  | Classification | <pre><p><b>20_newsgroups</b></p>[Dataset](https://huggingface.co/datasets/SetFit/20_newsgroups), [Unitxt Card](https://www.unitxt.ai/en/latest/catalog/catalog.cards.20_newsgroups.html)</pre> | <p>News article classification</p>The 20 newsgroups dataset comprises around 18000 newsgroups posts on 20 topics split in two subsets: one for training (or development) and the other one for testing (or for performance evaluation). The split between the train and test set is based upon a messages posted before and after a specific date. |
  | Bias | <pre><p><b>BBQ</b></p>[Dataset](https://huggingface.co/datasets/heegyu/bbq), [Paper](https://arxiv.org/abs/2110.08193), [Unitxt Card](https://www.unitxt.ai/en/latest/catalog/catalog.cards.safety.bbq.__dir__.html)</pre> | <p>Question sets constructed to highlight attested social biases against people belonging to protected classes along nine social dimensions relevant for U.S. English-speaking contexts.</p>It is well documented that NLP models learn social biases, but little work has been done on how these biases manifest in model outputs for applied tasks like question answering (QA). We introduce the Bias Benchmark for QA (BBQ), a dataset of question sets constructed by the authors that highlight attested social biases against people belonging to protected classes along nine social dimensions relevant for U.S. English-speaking contexts. Our task evaluates model responses at two levels: (i) given an under-informative context, we test how strongly responses refect social biases, and (ii) given an adequately informative context, we test whether the model's biases override a correct answer choice. We fnd that models often rely on stereotypes when the context is under-informative, meaning the model's outputs consistently reproduce harmful biases in this setting. Though models are more accurate when the context provides an informative answer, they still rely on stereotypes and average up to 3.4 percentage points higher accuracy when the correct answer aligns with a social bias than when it conficts, with this difference widening to over 5 points on examples targeting gender for most models tested. |
  | Legal Reasoning | <pre><p><b>Legalbench</b></p>[Dataset](https://huggingface.co/datasets/nguha/legalbench), [Paper](https://arxiv.org/abs/2308.11462), [Unitxt Card](https://www.unitxt.ai/en/latest/catalog/catalog.cards.legalbench.__dir__.html)</pre> | <p>Evaluating legal reasoning in English large language models (LLMs).</p>LegalBench tasks span multiple types (binary classification, multi-class classification, extraction, generation, entailment), multiple types of text (statutes, judicial opinions, contracts, etc.), and multiple areas of law (evidence, contracts, civil procedure, etc.). For more information on tasks, we recommend visiting the website, where you can search through task descriptions, or the Github repository, which contains more granular task descriptions. We also recommend reading the paper, which provides more background on task significance and construction process. |
@@ -88,36 +87,6 @@ unitxt-evaluate --tasks "benchmarks.bluebench" --model cross_provider --model_ar
 ```
 """
 
-EVALUATION_QUEUE_TEXT = """
-## Some good practices before submitting a model
-
-### 1) Make sure you can load your model and tokenizer using AutoClasses:
-```python
-from transformers import AutoConfig, AutoModel, AutoTokenizer
-config = AutoConfig.from_pretrained("your model name", revision=revision)
-model = AutoModel.from_pretrained("your model name", revision=revision)
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
-```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
-
-Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
-
-### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
-
-### 3) Make sure your model has an open license!
-This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
-
-### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
-
-## In case of model failure
-If your model is displayed in the `FAILED` category, its execution stopped.
-Make sure you have followed the above steps first.
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
-"""
-
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
 """
 
src/display/formatting.py CHANGED
@@ -1,12 +1,3 @@
-# def model_hyperlink(link, model_name):
-#     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
-
-
-# def make_clickable_model(model_name):
-#     link = f"https://huggingface.co/{model_name}"
-#     return model_hyperlink(link, model_name)
-
-
 def styled_error(error):
     return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
 
src/display/utils.py CHANGED
@@ -23,22 +23,11 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-# auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-# Model information
-# auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-# auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
@@ -60,46 +49,6 @@ class ModelDetails:
     display_name: str = ""
     symbol: str = "" # emoji
 
-
-# class ModelType(Enum):
-#     PT = ModelDetails(name="pretrained", symbol="🟢")
-#     FT = ModelDetails(name="fine-tuned", symbol="🔶")
-#     IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-#     RL = ModelDetails(name="RL-tuned", symbol="🟦")
-#     Unknown = ModelDetails(name="", symbol="?")
-
-#     def to_str(self, separator=" "):
-#         return f"{self.value.symbol}{separator}{self.value.name}"
-
-#     @staticmethod
-#     def from_str(type):
-#         if "fine-tuned" in type or "🔶" in type:
-#             return ModelType.FT
-#         if "pretrained" in type or "🟢" in type:
-#             return ModelType.PT
-#         if "RL-tuned" in type or "🟦" in type:
-#             return ModelType.RL
-#         if "instruction-tuned" in type or "⭕" in type:
-#             return ModelType.IFT
-#         return ModelType.Unknown
-
-# class WeightType(Enum):
-#     Adapter = ModelDetails("Adapter")
-#     Original = ModelDetails("Original")
-#     Delta = ModelDetails("Delta")
-
-# class Precision(Enum):
-#     float16 = ModelDetails("float16")
-#     bfloat16 = ModelDetails("bfloat16")
-#     Unknown = ModelDetails("?")
-
-#     def from_str(precision):
-#         if precision in ["torch.float16", "float16"]:
-#             return Precision.float16
-#         if precision in ["torch.bfloat16", "bfloat16"]:
-#             return Precision.bfloat16
-#         return Precision.Unknown
-
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
 
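The surviving column setup leans on `dataclasses.make_dataclass`: each `[attribute_name, ColumnContent, ColumnContent(...)]` triple becomes a field whose default is a `ColumnContent` instance, and those defaults are readable as class attributes, which is what `AutoEvalColumn.model.name` relies on (the `fields(...)` used by `COLS` reads `.hidden` and `.type`, so it is evidently the module's own helper over these attributes rather than `dataclasses.fields`). A reduced, self-contained sketch of the pattern, with a simplified stand-in for `ColumnContent`:

```python
from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)
class ColumnContent:
    # Simplified stand-in for the ColumnContent dataclass defined in this file.
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False


auto_eval_column_dict = [
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)],
]
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

# Each default ends up as a class attribute, so display metadata is available
# without ever instantiating AutoEvalColumn:
print(AutoEvalColumn.model.name)    # "Model"
print(AutoEvalColumn.average.type)  # "number"
```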
src/envs.py CHANGED
@@ -10,18 +10,9 @@ OWNER = "jbnayahu" # Change to your org - don't forget to create a results and r
 # ----------------------------------
 
 REPO_ID = f"{OWNER}/bluebench"
-# QUEUE_REPO = f"{OWNER}/bluebench-requests"
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
-
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "results/bluebench")
 
-
-# Local caches
-# EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
-# EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
-# EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
-# EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
-
 API = HfApi(token=TOKEN)
 
src/leaderboard/read_evals.py CHANGED
@@ -1,6 +1,4 @@
-import glob
 import json
-import math
 import os
 from dataclasses import dataclass
 
@@ -8,8 +6,6 @@ import dateutil
 import numpy as np
 
 from src.display.utils import AutoEvalColumn, Tasks
-# from src.submission.check_validity import is_model_on_hub
-
 
 @dataclass
 class EvalResult:
@@ -17,19 +13,8 @@ class EvalResult:
     """
     eval_name: str # org_model_precision (uid)
     full_model: str # org/model (path on hub)
-    # org: str
-    # model: str
-    # revision: str # commit hash, "" if main
     results: dict
-    # precision: Precision = Precision.Unknown
-    # model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
-    # weight_type: WeightType = WeightType.Original # Original or Adapter
-    # architecture: str = "Unknown"
-    # license: str = "?"
-    # likes: int = 0
-    # num_params: int = 0
     date: str = "" # submission date of request file
-    # still_on_hub: bool = False
 
     @classmethod
     def init_from_json_file(self, json_filepath):
@@ -41,32 +26,6 @@ class EvalResult:
 
         full_model = env_info.get("model")
 
-        # Precision
-        # precision = Precision.from_str(config.get("model_dtype"))
-
-        # Get model and org
-        # org_and_model = config.get("model_name", config.get("model_args", None))
-        # org_and_model = org_and_model.split("/", 1)
-
-        # if len(org_and_model) == 1:
-        #     org = None
-        #     model = org_and_model[0]
-        #     result_key = f"{model}_{precision.value.name}"
-        # else:
-        #     org = org_and_model[0]
-        #     model = org_and_model[1]
-        #     result_key = f"{org}_{model}_{precision.value.name}"
-        # full_model = "/".join(org_and_model)
-
-        # still_on_hub, _, model_config = is_model_on_hub(
-        #     full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
-        # )
-        # architecture = "?"
-        # if model_config is not None:
-        #     architectures = getattr(model_config, "architectures", None)
-        #     if architectures:
-        #         architecture = ";".join(architectures)
-
         # Extract results available in this file (some results are split in several files)
         results = {}
         for task in Tasks:
@@ -83,48 +42,16 @@ class EvalResult:
         return self(
             eval_name=full_model,
             full_model=full_model,
-            # org=org,
-            # model=model,
             results=results,
-            # precision=precision,
-            # revision= config.get("model_sha", ""),
-            # still_on_hub=still_on_hub,
-            # architecture=architecture
         )
 
-    # def update_with_request_file(self, requests_path):
-    #     """Finds the relevant request file for the current model and updates info with it"""
-    #     request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
-    #     try:
-    #         with open(request_file, "r") as f:
-    #             request = json.load(f)
-    #         self.model_type = ModelType.from_str(request.get("model_type", ""))
-    #         self.weight_type = WeightType[request.get("weight_type", "Original")]
-    #         self.license = request.get("license", "?")
-    #         self.likes = request.get("likes", 0)
-    #         self.num_params = request.get("params", 0)
-    #         self.date = request.get("submitted_time", "")
-    #     except Exception:
-    #         print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
-
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
-            # AutoEvalColumn.precision.name: self.precision.value.name,
-            # AutoEvalColumn.model_type.name: self.model_type.value.name,
-            # AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            # AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            # AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: self.full_model,
-            # AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average.name: average,
-            # AutoEvalColumn.license.name: self.license,
-            # AutoEvalColumn.likes.name: self.likes,
-            # AutoEvalColumn.params.name: self.num_params,
-            # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }
 
         for task in Tasks:
@@ -133,28 +60,6 @@ class EvalResult:
         return data_dict
 
 
-# def get_request_file_for_model(requests_path, model_name, precision):
-#     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-#     request_files = os.path.join(
-#         requests_path,
-#         f"{model_name}_eval_request_*.json",
-#     )
-#     request_files = glob.glob(request_files)
-
-#     # Select correct request file (precision)
-#     request_file = ""
-#     request_files = sorted(request_files, reverse=True)
-#     for tmp_request_file in request_files:
-#         with open(tmp_request_file, "r") as f:
-#             req_content = json.load(f)
-#             if (
-#                 req_content["status"] in ["FINISHED"]
-#                 and req_content["precision"] == precision.split(".")[-1]
-#             ):
-#                 request_file = tmp_request_file
-#     return request_file
-
-
 def get_raw_eval_results(results_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
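
A note on the `to_dict` logic kept above: the average divides by the total number of tasks rather than by the number of scores present, so a missing score effectively counts as zero (models with missing benchmarks are filtered out later in `get_leaderboard_df` anyway). A toy illustration with made-up task names:

```python
# Toy illustration of the averaging in EvalResult.to_dict; the task names are invented.
results = {"task1": 0.8, "task2": None, "task3": 0.4}
num_tasks = 3  # stands in for len(Tasks)

average = sum([v for v in results.values() if v is not None]) / num_tasks
print(average)  # 0.4 rather than 0.6: the missing task2 score counts as zero
```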
 
 
src/populate.py CHANGED
@@ -1,10 +1,7 @@
-import json
-import os
-
 import pandas as pd
 
 from src.display.formatting import has_no_nan_values
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
+from src.display.utils import AutoEvalColumn
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
@@ -19,40 +16,4 @@ def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> p
 
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
-    return df
-
-
-# def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-#     """Creates the different dataframes for the evaluation queues requestes"""
-#     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
-#     all_evals = []
-
-#     for entry in entries:
-#         if ".json" in entry:
-#             file_path = os.path.join(save_path, entry)
-#             with open(file_path) as fp:
-#                 data = json.load(fp)
-
-#             data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-#             data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-
-#             all_evals.append(data)
-#         elif ".md" not in entry:
-#             # this is a folder
-#             sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
-#             for sub_entry in sub_entries:
-#                 file_path = os.path.join(save_path, entry, sub_entry)
-#                 with open(file_path) as fp:
-#                     data = json.load(fp)
-
-#                 data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-#                 data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-#                 all_evals.append(data)
-
-#     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-#     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-#     finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
-#     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-#     df_running = pd.DataFrame.from_records(running_list, columns=cols)
-#     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-#     return df_finished[cols], df_running[cols], df_pending[cols]
+    return df
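
The retained filter is the counterpart to the averaging noted above: any model missing one of the benchmark columns is dropped before display. A small self-contained illustration (column names are invented, and the boolean mask stands in for `has_no_nan_values`, whose implementation in src/display/formatting.py is not part of this diff):

```python
import pandas as pd

# Hypothetical leaderboard rows; the two benchmark columns are made up.
df = pd.DataFrame(
    {
        "Model": ["org/model-a", "org/model-b"],
        "Summarization": [0.41, None],
        "Translation": [0.37, 0.52],
    }
)
benchmark_cols = ["Summarization", "Translation"]

# Stand-in for has_no_nan_values(df, benchmark_cols): keep rows with every benchmark present.
mask = df[benchmark_cols].notna().all(axis=1)
print(df[mask])  # only org/model-a remains; org/model-b lacks a Summarization score
```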