xeon27 committed
Commit ba2f546 · 1 Parent(s): e004342

Add title and required text

Files changed (1)
  1. src/about.py +6 -3
src/about.py CHANGED
@@ -38,20 +38,23 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">LLM Evaluation Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Intro text
+This leaderboard presents the performance of selected LLM models on a set of tasks. The tasks are divided into two categories: base and agentic. The base tasks are ARC-Easy, ARC-Challenge, DROP, WinoGrande, GSM8K, HellaSwag, HumanEval, IFEval, MATH, MMLU, MMLU-Pro, GPQA-Diamond. The agentic tasks are GAIA and GDM-InterCode-CTF.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
+The following benchmarks are included:
+Base: ARC-Easy, ARC-Challenge, DROP, WinoGrande, GSM8K, HellaSwag, HumanEval, IFEval, MATH, MMLU, MMLU-Pro, GPQA-Diamond
+Agentic: GAIA, GDM-InterCode-CTF
 
 ## Reproducibility
 To reproduce our results, here is the commands you can run:
-
+TBD
 """
 
 EVALUATION_QUEUE_TEXT = """
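For reference, a minimal sketch of where these constants end up: in the standard Hugging Face demo-leaderboard template that this Space appears to follow, app.py imports the strings from src/about.py and renders them with Gradio components. The specific component calls and the "About" tab shown here are assumptions based on that template, not part of this commit.

# Minimal sketch, assuming the standard demo-leaderboard app.py layout (not part of this commit).
import gradio as gr

from src.about import TITLE, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT

with gr.Blocks() as demo:
    gr.HTML(TITLE)                        # the <h1> heading added in this commit
    gr.Markdown(INTRODUCTION_TEXT)        # intro describing the base and agentic task sets
    with gr.Tab("About"):
        gr.Markdown(LLM_BENCHMARKS_TEXT)  # benchmark list and reproducibility notes

if __name__ == "__main__":
    demo.launch()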