jbnayahu commited on
Commit
50cdce4
·
unverified ·
1 Parent(s): ab870dd

Signed-off-by: Jonathan Bnayahu <[email protected]>

Files changed (1) hide show
  1. src/about.py +6 -4
src/about.py CHANGED
@@ -12,8 +12,6 @@ class Task:
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
- # task0 = Task("anli_r1", "acc", "ANLI")
16
- # task1 = Task("logiqa", "acc_norm", "LogiQA")
17
 
18
  task0 = Task("bias", "score", "Bias")
19
  task1 = Task("chatbot_abilities", "score", "Chatbot Abilities")
@@ -35,16 +33,20 @@ NUM_FEWSHOT = 0 # Change with your few shot
35
 
36
 
37
  # Your leaderboard name
38
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
39
 
40
  # What does your leaderboard evaluate?
41
  INTRODUCTION_TEXT = """
42
- Intro text
 
 
 
43
  """
44
 
45
  # Which evaluations are you running? how can people reproduce what you have?
46
  LLM_BENCHMARKS_TEXT = f"""
47
  ## How it works
 
48
 
49
  ## Reproducibility
50
  To reproduce our results, here are the commands you can run:
 
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
 
 
15
 
16
  task0 = Task("bias", "score", "Bias")
17
  task1 = Task("chatbot_abilities", "score", "Chatbot Abilities")
 
33
 
34
 
35
  # Your leaderboard name
36
+ TITLE = """<h1 align="center" id="space-title">BlueBench Leaderboard</h1>"""
37
 
38
  # What does your leaderboard evaluate?
39
  INTRODUCTION_TEXT = """
40
+ BlueBench is an open-source benchmark developed by domain experts to represent the needs of Enterprise users.
41
+
42
+ It is constructed using state-of-the-art benchmarking methodologies to ensure validity, robustness, and efficiency by utilizing unitxt’s abilities for dynamic and flexible text processing.
43
+ As a dynamic and evolving benchmark, BlueBench currently encompasses diverse domains such as legal, finance, customer support, and news. It also evaluates a range of capabilities, including RAG, pro-social behavior, summarization, and chatbot performance, with additional tasks and domains to be integrated over time.
44
  """
45
 
46
  # Which evaluations are you running? how can people reproduce what you have?
47
  LLM_BENCHMARKS_TEXT = f"""
48
  ## How it works
49
+ <a href="https://www.unitxt.ai/en/latest/catalog/catalog.benchmarks.bluebench.html">See here</a>
50
 
51
  ## Reproducibility
52
  To reproduce our results, here are the commands you can run: