Signed-off-by: Jonathan Bnayahu <[email protected]>
src/about.py  (+6 -4)
```diff
@@ -12,8 +12,6 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    # task0 = Task("anli_r1", "acc", "ANLI")
-    # task1 = Task("logiqa", "acc_norm", "LogiQA")
 
     task0 = Task("bias", "score", "Bias")
     task1 = Task("chatbot_abilities", "score", "Chatbot Abilities")
```
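For orientation: in the Hugging Face leaderboard template this file comes from, `Task` is a small dataclass whose three fields match the comment above (task key, metric key, display name), and the app builds its result columns by iterating `Tasks`. A minimal sketch, assuming the stock template layout; the field names `benchmark`, `metric`, and `col_name` are the template's usual ones, not something this diff confirms:

```python
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str  # task_key in the results json (assumed field name)
    metric: str     # metric_key in the results json (assumed field name)
    col_name: str   # column name displayed in the leaderboard (assumed field name)


class Tasks(Enum):
    task0 = Task("bias", "score", "Bias")
    task1 = Task("chatbot_abilities", "score", "Chatbot Abilities")


# The app typically derives one leaderboard column per enum member:
for task in Tasks:
    print(f"{task.value.benchmark} ({task.value.metric}) -> {task.value.col_name}")
```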
```diff
@@ -35,16 +33,20 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">
+TITLE = """<h1 align="center" id="space-title">BlueBench Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-
+BlueBench is an open-source benchmark developed by domain experts to represent required needs of Enterprise users.
+
+It is constructed using state-of-the-art benchmarking methodologies to ensure validity, robustness, and efficiency by utilizing unitxt’s abilities for dynamic and flexible text processing.
+
+As a dynamic and evolving benchmark, BlueBench currently encompasses diverse domains such as legal, finance, customer support, and news. It also evaluates a range of capabilities, including RAG, pro-social behavior, summarization, and chatbot performance, with additional tasks and domains to be integrated over time.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
+<a href="https://www.unitxt.ai/en/latest/catalog/catalog.benchmarks.bluebench.html">See here</a>
 
 ## Reproducibility
 To reproduce our results, here is the commands you can run:
```
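As a usage note: `TITLE`, `INTRODUCTION_TEXT`, and `LLM_BENCHMARKS_TEXT` are plain strings that the Space's `app.py` renders into the Gradio UI. A hedged sketch of that wiring, following the demo-leaderboard template's usual pattern (the `gr.HTML`/`gr.Markdown`/`gr.Accordion` calls below are the template's convention, not part of this diff):

```python
import gradio as gr

from src.about import TITLE, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT

demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)  # renders the <h1> heading
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    with gr.Accordion("About", open=False):  # collapsible "how it works" section
        gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

demo.launch()
```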