Spaces:
Running
Running
Commit
·
472c111
1
Parent(s):
4a49ee3
Added asterisk for closed models
Browse files
- leaderboard/md.py +1 -1
- leaderboard/utils.py +7 -0
leaderboard/md.py
CHANGED
@@ -178,7 +178,7 @@ TOP_TEXT = """# RewardBench: Evaluating Reward Models"""
|
|
178 |
|
179 |
CAPTION_V2 = f"""The *new version* of RewardBench that is based on unseen human data and designed to be substantially more difficult!
|
180 |
|
181 |
-
[Code](https://github.com/allenai/reward-bench) | [Eval. Dataset v2](https://huggingface.co/datasets/allenai/reward-bench-2) | [Results v2](https://huggingface.co/datasets/allenai/reward-bench-2-results) | [Paper](https://arxiv.org/abs/2506.01937) | Total models: {{}} |
|
182 |
|
183 |
CAPTION_V1 = f"""The original RewardBench -- the first reward model evaluation.
|
184 |
|
|
|
178 |
|
179 |
CAPTION_V2 = f"""The *new version* of RewardBench that is based on unseen human data and designed to be substantially more difficult!
|
180 |
|
181 |
+
[Code](https://github.com/allenai/reward-bench) | [Eval. Dataset v2](https://huggingface.co/datasets/allenai/reward-bench-2) | [Results v2](https://huggingface.co/datasets/allenai/reward-bench-2-results) | [Paper](https://arxiv.org/abs/2506.01937) | Total models: {{}} | | * Closed models not run on Ai2 infrastructure | Last restart (PST): {current_time}"""
|
182 |
|
183 |
CAPTION_V1 = f"""The original RewardBench -- the first reward model evaluation.
|
184 |
|
leaderboard/utils.py
CHANGED
@@ -43,6 +43,13 @@ CONTAMINATED_MODELS_V1 = [
|
|
43 |
"Ray2333/GRM-Gemma-2B-rewardmodel-ft",
|
44 |
]
|
45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
# From Open LLM Leaderboard
|
48 |
def model_hyperlink(link, model_name):
|
|
|
43 |
"Ray2333/GRM-Gemma-2B-rewardmodel-ft",
|
44 |
]
|
45 |
|
46 |
+
UNVERIFIED_MODELS_V2 = [
|
47 |
+
"ContextualAI/LMUnit-llama3.1-70b",
|
48 |
+
"ContextualAI/LMUnit-qwen2.5-72b",
|
49 |
+
]
|
50 |
+
|
51 |
+
UNVERIFIED_MODELS = UNVERIFIED_MODELS_V2
|
52 |
+
|
53 |
|
54 |
# From Open LLM Leaderboard
|
55 |
def model_hyperlink(link, model_name):
|