Spaces:

allenai
/

reward-bench

Running

saumyamalik committed on Jun 23

Commit

472c111

1 Parent(s): 4a49ee3

Added asterisk for closed models

Files changed (2) hide show

leaderboard/md.py CHANGED Viewed

@@ -178,7 +178,7 @@ TOP_TEXT = """# RewardBench: Evaluating Reward Models"""
 CAPTION_V2 = f"""The *new version* of RewardBench that is based on unseen human data and designed to be substantially more difficult!
-[Code](https://github.com/allenai/reward-bench) |  [Eval. Dataset v2](https://huggingface.co/datasets/allenai/reward-bench-2) | [Results v2](https://huggingface.co/datasets/allenai/reward-bench-2-results) | [Paper](https://arxiv.org/abs/2506.01937) | Total models: {{}} |  Last restart (PST): {current_time}"""
 CAPTION_V1 = f"""The original RewardBench -- the first reward model evaluation.

 CAPTION_V2 = f"""The *new version* of RewardBench that is based on unseen human data and designed to be substantially more difficult!
+[Code](https://github.com/allenai/reward-bench) |  [Eval. Dataset v2](https://huggingface.co/datasets/allenai/reward-bench-2) | [Results v2](https://huggingface.co/datasets/allenai/reward-bench-2-results) | [Paper](https://arxiv.org/abs/2506.01937) | Total models: {{}} | | * Closed models not run on Ai2 infrastructure | Last restart (PST): {current_time}"""
 CAPTION_V1 = f"""The original RewardBench -- the first reward model evaluation.

leaderboard/utils.py CHANGED Viewed

@@ -43,6 +43,13 @@ CONTAMINATED_MODELS_V1 = [
     "Ray2333/GRM-Gemma-2B-rewardmodel-ft",
 ]
 # From Open LLM Leaderboard
 def model_hyperlink(link, model_name):

     "Ray2333/GRM-Gemma-2B-rewardmodel-ft",
 ]
+UNVERIFIED_MODELS_V2 = [
+    "ContextualAI/LMUnit-llama3.1-70b",
+    "ContextualAI/LMUnit-qwen2.5-72b",
+]
+UNVERIFIED_MODELS = UNVERIFIED_MODELS_V2
 # From Open LLM Leaderboard
 def model_hyperlink(link, model_name):