Signed-off-by: Jonathan Bnayahu <[email protected]>
src/about.py  (+6 -4)
```diff
@@ -12,8 +12,6 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    # task0 = Task("anli_r1", "acc", "ANLI")
-    # task1 = Task("logiqa", "acc_norm", "LogiQA")
 
     task0 = Task("bias", "score", "Bias")
     task1 = Task("chatbot_abilities", "score", "Chatbot Abilities")
```
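For orientation: in the Hugging Face leaderboard template this file comes from, `Task` is a small dataclass whose three fields match the comment above (task key, metric key, display name), and the app builds its result columns by iterating `Tasks`. A minimal sketch, assuming the stock template layout; the field names `benchmark`, `metric`, and `col_name` are the template's usual ones, not something this diff confirms:

```python
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str  # task_key in the results json (assumed field name)
    metric: str     # metric_key in the results json (assumed field name)
    col_name: str   # column name displayed in the leaderboard (assumed field name)


class Tasks(Enum):
    task0 = Task("bias", "score", "Bias")
    task1 = Task("chatbot_abilities", "score", "Chatbot Abilities")


# The app typically derives one leaderboard column per enum member:
for task in Tasks:
    print(f"{task.value.benchmark} ({task.value.metric}) -> {task.value.col_name}")
```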
```diff
@@ -35,16 +33,20 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">
+TITLE = """<h1 align="center" id="space-title">BlueBench Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-
+BlueBench is an open-source benchmark developed by domain experts to represent required needs of Enterprise users.
+
+It is constructed using state-of-the-art benchmarking methodologies to ensure validity, robustness, and efficiency by utilizing unitxt’s abilities for dynamic and flexible text processing.
+
+As a dynamic and evolving benchmark, BlueBench currently encompasses diverse domains such as legal, finance, customer support, and news. It also evaluates a range of capabilities, including RAG, pro-social behavior, summarization, and chatbot performance, with additional tasks and domains to be integrated over time.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
+<a href="https://www.unitxt.ai/en/latest/catalog/catalog.benchmarks.bluebench.html">See here</a>
 
 ## Reproducibility
 To reproduce our results, here is the commands you can run:
```
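As a usage note: `TITLE`, `INTRODUCTION_TEXT`, and `LLM_BENCHMARKS_TEXT` are plain strings that the Space's `app.py` renders into the Gradio UI. A hedged sketch of that wiring, following the demo-leaderboard template's usual pattern (the `gr.HTML`/`gr.Markdown`/`gr.Accordion` calls below are the template's convention, not part of this diff):

```python
import gradio as gr

from src.about import TITLE, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT

demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)  # renders the <h1> heading
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    with gr.Accordion("About", open=False):  # collapsible "how it works" section
        gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

demo.launch()
```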