xeon27 committed
Commit · ba2f546
Parent(s): e004342
Add title and required text

Changed files: src/about.py (+6 -3)
src/about.py CHANGED
@@ -38,20 +38,23 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title"></h1>"""
+TITLE = """<h1 align="center" id="space-title">LLM Evaluation Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-
+This leaderboard presents the performance of selected LLM models on a set of tasks. The tasks are divided into two categories: base and agentic. The base tasks are ARC-Easy, ARC-Challenge, DROP, WinoGrande, GSM8K, HellaSwag, HumanEval, IFEval, MATH, MMLU, MMLU-Pro, GPQA-Diamond. The agentic tasks are GAIA and GDM-InterCode-CTF.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
+The following benchmarks are included:
+Base: ARC-Easy, ARC-Challenge, DROP, WinoGrande, GSM8K, HellaSwag, HumanEval, IFEval, MATH, MMLU, MMLU-Pro, GPQA-Diamond
+Agentic: GAIA, GDM-InterCode-CTF
 
 ## Reproducibility
 To reproduce our results, here is the commands you can run:
+TBD
 """
 
 EVALUATION_QUEUE_TEXT = """
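For context on where these strings end up: Spaces like this one are usually built from the stock Hugging Face demo-leaderboard template, in which app.py renders the constants from src/about.py directly into the Gradio layout. The sketch below is an assumption based on that template, not code from this commit; the tab names and layout are illustrative.

# Minimal sketch, assuming the stock demo-leaderboard layout:
# TITLE is raw HTML, the other constants are Markdown.
import gradio as gr

from src.about import (
    TITLE,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    EVALUATION_QUEUE_TEXT,
)

demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)                  # the <h1> added in this commit
    gr.Markdown(INTRODUCTION_TEXT)  # intro paragraph added in this commit
    with gr.Tabs():
        with gr.TabItem("About"):   # hypothetical tab name
            gr.Markdown(LLM_BENCHMARKS_TEXT)  # "How it works" / "Reproducibility"
        with gr.TabItem("Submit"):  # hypothetical tab name
            gr.Markdown(EVALUATION_QUEUE_TEXT)

if __name__ == "__main__":
    demo.launch()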